In [1]:
from torchvision import models
from torch import nn
from data_loader import OpenImagesDataset
from model_utils import device
from model_transformations import Transformations
from torch.utils.data import DataLoader
from params import NUM_EPOCHS, IMAGE_SIZE, GRID_SIZE, NUM_ANCHOR_BOXES
import torch.optim as optim
import torch

In [2]:
# Reading in the training data
# NOTE(review): dataType is 'validation' here too, so the "training" set is
# built from the same arguments as the validation set below. Confirm whether
# this is a deliberate debugging shortcut or should point at the training split.
trainingData = OpenImagesDataset(rootDirectory='open-images-v6',
                                 anchorBoxes='centroids.npy',
                                 transform=Transformations,
                                 dataType='validation',
                                 gridSize=GRID_SIZE,
                                 imageSize=IMAGE_SIZE)

# Defining the training data loader
trainDataLoader = DataLoader(dataset=trainingData,
                             batch_size=2,
                             num_workers=1,
                             shuffle=True)

# Reading in the validation data
validationData = OpenImagesDataset(rootDirectory='open-images-v6',
                                   anchorBoxes='centroids.npy',
                                   transform=Transformations,
                                   dataType='validation',
                                   gridSize=GRID_SIZE,
                                   imageSize=IMAGE_SIZE)

# Defining the validation data loader.
# It must wrap validationData: it previously wrapped trainingData, which left
# validationData unused and made the validation loop iterate the training set.
validationDataLoader = DataLoader(dataset=validationData,
                                  batch_size=2,
                                  num_workers=1,
                                  shuffle=True)

In [3]:
def space_to_depth(x, block_size):
    """Fold non-overlapping ``block_size`` x ``block_size`` spatial patches into channels.

    Adapted from the Stack Overflow answer at
    https://stackoverflow.com/questions/58857720/is-there-an-equivalent-pytorch-function-for-tf-nn-space-to-depth

    Args:
        x: 4-D tensor unpacked as (batch, channels, height, width).
        block_size: side length of each square patch; also used as the stride.

    Returns:
        Tensor of shape
        (batch, channels * block_size ** 2, height // block_size, width // block_size).
    """
    batch, channels, height, width = x.size()

    # Each column of `patches` holds one flattened block_size x block_size patch
    patches = torch.nn.functional.unfold(x, kernel_size=block_size, stride=block_size)

    out_shape = (
        batch,
        channels * block_size ** 2,
        height // block_size,
        width // block_size,
    )
    return patches.view(*out_shape)

class DogDetectorModel(nn.Module):
    """Object detector: frozen pre-trained VGG19-BN features plus a YOLOv2-style head.

    The head follows the Darknet-19 detection layers (Table 6 of the YOLOv2
    paper) and includes a pass-through (skip) connection from the features
    taken just before the final max-pool of the backbone.

    Args:
        modelPath: currently unused by this class; kept for interface
            compatibility.
        gridSize: side length of the square output grid used when shaping the
            final prediction tensor.
        numAnchorBoxes: number of anchor boxes per grid cell; the output layer
            emits 5 values per anchor box.
    """

    @staticmethod
    def _convBlock(inChannels, outChannels):
        """Return the [Conv3x3 (no bias), BatchNorm, LeakyReLU(0.1)] layers of one block."""
        return [
            nn.Conv2d(in_channels=inChannels, out_channels=outChannels, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(outChannels, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True),
            nn.LeakyReLU(0.1),
        ]

    def __init__(self, modelPath=None, gridSize=13, numAnchorBoxes=7):
        super(DogDetectorModel, self).__init__()

        # Recording the grid size
        self.gridSize = gridSize

        # Recording the number of anchor boxes
        self.numAnchorBoxes = numAnchorBoxes

        # Reading in the pre-trained feature extractor
        self.featureExtractor = models.vgg19_bn(pretrained=True).features

        # Freezing the weights of the pre-trained feature extractor
        # NOTE(review): requires_grad=False does not stop the BatchNorm layers
        # of the extractor from updating their running statistics while the
        # model is in train() mode -- confirm whether that is intended.
        for parameter in self.featureExtractor.parameters():
            parameter.requires_grad = False

        # Defining the object detection layers
        # These layers follow right up to the line in Table 6 (Darknet-19) in YoloV2 Paper
        self.objectDetectorPart1 = nn.Sequential(
            *self._convBlock(512, 1024),
            *self._convBlock(1024, 512),
            *self._convBlock(512, 1024),
            *self._convBlock(1024, 512),
            *self._convBlock(512, 1024),
        )

        # Additional 2 of 3 layers
        self.objectDetectorPart2 = nn.Sequential(
            *self._convBlock(1024, 1024),
            *self._convBlock(1024, 1024),
        )

        # Additional 3 of 3 layers (for after skip connection).
        # 3072 input channels = 1024 (from part 2) + 2048 (512 skip channels
        # after space_to_depth with block size 2).
        self.objectDetectorPart3 = nn.Sequential(
            *self._convBlock(3072, 1024),
        )

        # Adding output layer: 1x1 convolution producing 5 values per anchor box
        self.output = nn.Sequential(
            nn.Conv2d(in_channels=1024, out_channels=numAnchorBoxes*5, kernel_size=1, stride=1, padding=0, bias=True)
        )

    def forward(self, modelInput):
        """Run the detector on a batch of images.

        Args:
            modelInput: image batch in (N, C, H, W) layout. The spatial size is
                presumably chosen so the backbone yields a gridSize x gridSize
                map (e.g. 26x26 before / 13x13 after the last max-pool) --
                verify against the data loader.

        Returns:
            Tensor of shape (N, gridSize, gridSize, 5 * numAnchorBoxes), with
            the per-cell predictions in the last dimension.
        """
        # Not executing the MaxPool that is at #52 yet, need to record the skip connection
        highResFeatures = self.featureExtractor[0:52](modelInput)

        # Folding the high resolution features into channels
        # (changes from 512*26*26 -> 2048*13*13)
        skipConnection = space_to_depth(highResFeatures, 2)

        # Executing the MaxPool at #52
        tmp = self.featureExtractor[52](highResFeatures)

        tmp = self.objectDetectorPart1(tmp)

        tmp = self.objectDetectorPart2(tmp)

        # Concatenating the low/high resolution features along the channel axis
        tmp = torch.cat([tmp, skipConnection], dim=1)

        tmp = self.objectDetectorPart3(tmp)

        tmp = self.output(tmp)

        # The head output is channel-first (N, 5*A, G, G). The channel axis has
        # to be moved last BEFORE reshaping; reshaping the channel-first tensor
        # directly yields the right shape but mixes values from different
        # channels and grid cells.
        tmp = tmp.permute(0, 2, 3, 1)

        return tmp.reshape((-1, self.gridSize, self.gridSize, 5*self.numAnchorBoxes))

In [4]:
# Building the detector with the grid size and anchor-box count taken from params
model = DogDetectorModel(numAnchorBoxes=NUM_ANCHOR_BOXES, gridSize=GRID_SIZE)

In [5]:
# Transferring the model onto the device exported by model_utils
model = model.to(device)

In [6]:
# Setting up SGD with momentum over the model's parameters
optimizer = optim.SGD(params=model.parameters(), momentum=0.9, lr=0.001)

In [7]:
# Smoke test of the pipeline: push one validation batch through the model and
# print the prediction shape next to the label shape.
for epoch in range(0, NUM_EPOCHS):
    # ****************** TRAINING ******************
    # TODO: the training step is not implemented yet. Drafted outline:
    #     model.train()
    #     for batchIndex, [modelInput, label] in enumerate(trainDataLoader):
    #         modelInput = modelInput.to(device)
    #         label = label.to(device)
    #         optimizer.zero_grad()
    #         output = model(modelInput)
    #         (loss computation, backward pass and optimizer step still missing)

    # ****************** VALIDATION ******************

    # Setting the model to evaluation mode
    model.eval()

    # Turning off gradient tracking
    with torch.no_grad():

        # Iterating through the batches of the validation data
        for batchIndex, [modelInput, label] in enumerate(validationDataLoader):

            # Moving the model input/label to the device
            modelInput = modelInput.to(device)
            label = label.to(device)

            print(model(modelInput).shape)
            print(label.shape)

            # Only the first batch is inspected for now
            break

    # Stopping after the first epoch while the loop is still a smoke test
    # (the second, unreachable `break` that followed this one was removed)
    break

torch.Size([2, 13, 13, 35])
torch.Size([2, 13, 13, 35])
