In [9]:
import torch
import torch.nn as nn


class YOLOv1(nn.Module):
    """
    YOLOv1 detection network.

    The model is made of a convolutional backbone (24 convolutions, each 
    followed by batch normalization and a LeakyReLU with slope 0.1, plus four
    2x2 max-pooling layers) and a head of 2 fully-connected layers. The head
    emits, for every cell of a split_size x split_size grid, num_classes
    class scores and num_boxes bounding boxes of 5 values each.
    """

    # Marker used in the layout table for a 2x2 max-pooling layer.
    _POOL = "pool"

    # Backbone description in execution order. Every tuple is one convolution
    # given as (out_channels, kernel_size, stride); its padding is always
    # kernel_size // 2 (3 for 7x7, 1 for 3x3, 0 for 1x1).
    _BACKBONE_LAYOUT = (
        (64, 7, 2), _POOL,
        (192, 3, 1), _POOL,
        (128, 1, 1), (256, 3, 1), (256, 1, 1), (512, 3, 1), _POOL,
        *(((256, 1, 1), (512, 3, 1)) * 4),
        (512, 1, 1), (1024, 3, 1), _POOL,
        *(((512, 1, 1), (1024, 3, 1)) * 2),
        (1024, 3, 1), (1024, 3, 2),
        (1024, 3, 1), (1024, 3, 1),
    )

    def __init__(self, split_size, num_boxes, num_classes):
        """
        Builds the backbone and the fully-connected head.

        Parameters:
            split_size (int): Size of the grid which is applied to the image.
                The first fully-connected layer expects the backbone to
                deliver a 1024 x split_size x split_size feature map.
            num_boxes (int): Amount of bounding boxes which are predicted per 
                grid cell.
            num_classes (int): Amount of different classes which are being 
                predicted by the model.
        """

        super().__init__()

        grid_cells = split_size * split_size
        values_per_cell = num_classes + num_boxes * 5

        # The layers are unpacked into one flat Sequential so that every
        # module keeps its positional index (and therefore its state_dict key).
        self.darkNet = nn.Sequential(*self._backbone_layers())
        self.fc = nn.Sequential(
            nn.Flatten(),
            nn.Linear(1024 * grid_cells, 4096),
            nn.Dropout(0.5),
            nn.LeakyReLU(0.1),
            nn.Linear(4096, grid_cells * values_per_cell),
        )

    @classmethod
    def _backbone_layers(cls):
        """
        Creates the backbone modules described by _BACKBONE_LAYOUT.

        Returns:
            list: The modules in execution order. Each convolution entry 
                expands to Conv2d (without bias), BatchNorm2d and LeakyReLU;
                each pool marker expands to a single MaxPool2d(2, 2).
        """
        modules = []
        in_channels = 3  # the first convolution consumes 3-channel images
        for entry in cls._BACKBONE_LAYOUT:
            if entry == cls._POOL:
                modules.append(nn.MaxPool2d(2, 2))
                continue

            out_channels, kernel_size, stride = entry
            modules.append(
                nn.Conv2d(
                    in_channels,
                    out_channels,
                    kernel_size,
                    stride=stride,
                    padding=kernel_size // 2,
                    bias=False,
                )
            )
            modules.append(nn.BatchNorm2d(out_channels))
            modules.append(nn.LeakyReLU(0.1))
            in_channels = out_channels
        return modules

    def forward(self, x):
        """
        Forwards the input tensor through the model to produce the predictions.

        Parameters:
            x (tensor): A tensor of shape (batch_size, 3, 448, 448) which 
                represents a batch of images.

        Returns:
            tensor: A tensor of shape 
                (batch_size, split_size * split_size * (num_classes + num_boxes*5))
                holding the flattened predictions of all grid cells.
        """
        features = self.darkNet(x)
        return self.fc(features)

In [10]:
# Smoke test: build the model for a 7x7 grid with 2 boxes per cell and
# 13 classes, then run a random batch of two 448x448 RGB images through it.
model = YOLOv1(split_size=7, num_boxes=2, num_classes=13)
x = torch.randn(2, 3, 448, 448)
# The printed shape should be (2, 7 * 7 * (13 + 2 * 5)) = (2, 1127).
print(model(x).shape)

torch.Size([2, 1127])
