# Some Basics to YOLO-V1

- YOLO was trained on the PASCAL VOC dataset (20 classes). This is a bit outdated; COCO is more popular nowadays.

- The goal is to output bounding boxes on detected objects.
- Split the image into an $S \times S$ grid, giving $S^2$ cells in the image.
- Each cell will output a prediction with a corresponding bounding box.



An object may appear in multiple cells, i.e. its body may span several cells. We don't want to output duplicate boxes for the object though, so we say that the cell responsible for outputting the bounding box on a detected object is the one which contains the object's midpoint (mp). Each output will be relative to the cell, i.e. each bounding box will be expressed in the form $(x, y, w, h)$ or $(x_{mp}, y_{mp}, width, height)$. These are all relative to a given cell: the midpoint coordinates are normalised between 0 and 1, whereas the width and height can be greater than 1 if the object extends beyond the cell.


- Each cell can only detect one object.

- The target shape will be $(S, S, 25)$. The $S \times S$ part indicates that for each cell there will be a length-25 vector, where the first 20 entries correspond to the class probabilities, the next 1 corresponds to the probability score (whether the cell contains an object), and the remaining 4 correspond to the bounding box $(x, y, w, h)$.
- The prediction shape will be $(S, S, 30)$; the extra 5 is to accommodate another probability score and bounding box prediction. The hope is that these two boxes will get good at different things.

In [1]:
# Importing necessary libraries
import torch
import torch.nn as nn
from torch.nn import MaxPool2d
from torch.nn import Linear
import torchvision
import numpy as np

In [16]:
class CNNBlock(nn.Module):
    """Convolution -> batch normalisation -> LeakyReLU(0.1) building block.

    Batch normalisation is not part of the original YOLOv1 network; it is
    included here anyway.

    Args:
        in_channels: Number of channels of the incoming feature map.
        out_channels: Number of channels produced by the convolution.
        kernel_size: Convolution kernel size.
        stride: Convolution stride.
        padding: Zero-padding applied by the convolution.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride, padding):
        super().__init__()

        # The convolution has no bias term: the batch norm that follows
        # already provides a learnable per-channel shift.
        self.conv = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            bias=False,
        )
        self.batchnorm = nn.BatchNorm2d(out_channels)
        self.leaky_relu = nn.LeakyReLU(0.1)

    def forward(self, x):
        """Apply convolution, batch norm and the activation, in that order."""
        return self.leaky_relu(self.batchnorm(self.conv(x)))
    


    

class Yolov1(nn.Module):
    """YOLOv1 detector: a Darknet-style convolutional backbone plus a small FC head.

    The backbone contains two stride-2 convolutions and four 2x2 max-pools, so
    a 448x448 input leaves it as a 7x7 map with 1024 channels. The first
    linear layer of the head expects exactly ``1024 * S * S`` features, so the
    input size must be chosen such that the backbone emits an ``S x S`` map
    (e.g. 448x448 for ``S = 7``).

    Args:
        split_size: Grid size ``S`` (the image is divided into ``S x S`` cells).
        num_boxes: Number of bounding boxes ``B`` predicted per cell.
        num_classes: Number of object classes ``C``.
        in_channels: Number of channels of the input image (3 by default).
    """

    # Backbone description, read top to bottom:
    #   (kernel_size, out_channels, stride, padding) -> one CNNBlock
    #   "M"                                          -> 2x2 max-pool, stride 2
    #   [block, block, ..., repeats]                 -> the blocks, repeated
    # The spatial sizes in the comments assume a 448x448 input.
    _ARCHITECTURE = (
        (7, 64, 2, 3),                            # 448 -> 224
        "M",                                      # 224 -> 112
        (3, 192, 1, 1),
        "M",                                      # 112 -> 56
        (1, 128, 1, 0),
        (3, 256, 1, 1),
        (1, 256, 1, 0),
        (3, 512, 1, 1),
        "M",                                      # 56 -> 28
        [(1, 256, 1, 0), (3, 512, 1, 1), 4],
        (1, 512, 1, 0),
        (3, 1024, 1, 1),
        "M",                                      # 28 -> 14
        [(1, 512, 1, 0), (3, 1024, 1, 1), 2],
        (3, 1024, 1, 1),
        (3, 1024, 2, 1),                          # 14 -> 7 (the final S x S grid)
        (3, 1024, 1, 1),
        (3, 1024, 1, 1),
    )

    def __init__(self, split_size, num_boxes, num_classes, in_channels=3):
        super().__init__()

        self.in_channels = in_channels
        self.darknet = self.create_conv_layers()
        self.fc = self.create_fc(split_size=split_size, num_boxes=num_boxes, num_classes=num_classes)

    def forward(self, x):
        """Return the raw predictions, shape ``(batch, S * S * (C + B * 5))``."""
        features = self.darknet(x)
        # Keep the batch dimension and collapse (channels, height, width) into
        # a single feature vector per image, which is what the FC head consumes.
        features = torch.flatten(features, start_dim=1)
        return self.fc(features)

    def create_conv_layers(self):
        """Build the convolutional backbone described by ``_ARCHITECTURE``.

        Returns:
            An ``nn.Sequential`` of ``CNNBlock`` and ``nn.MaxPool2d`` modules,
            in the order listed in the table.
        """
        layers = []
        channels = self.in_channels

        for spec in self._ARCHITECTURE:
            if spec == "M":
                layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
                continue

            if isinstance(spec, tuple):
                blocks, repeats = [spec], 1
            else:
                *blocks, repeats = spec

            for _ in range(repeats):
                for kernel_size, out_channels, stride, padding in blocks:
                    layers.append(
                        CNNBlock(
                            in_channels=channels,
                            out_channels=out_channels,
                            kernel_size=kernel_size,
                            stride=stride,
                            padding=padding,
                        )
                    )
                    # The next block consumes what this one produced.
                    channels = out_channels

        return nn.Sequential(*layers)

    def create_fc(self, split_size, num_boxes, num_classes):
        """Build the fully connected head.

        Args:
            split_size: Grid size ``S``.
            num_boxes: Boxes per cell ``B``.
            num_classes: Number of classes ``C``.

        Returns:
            An ``nn.Sequential`` mapping ``1024 * S * S`` flattened features to
            ``S * S * (C + B * 5)`` outputs.
        """
        S, B, C = split_size, num_boxes, num_classes

        return nn.Sequential(
            # The original paper uses 4096 hidden units; 2048 keeps testing lighter.
            nn.Linear(1024 * S * S, 2048),
            nn.Dropout(0.5),
            nn.LeakyReLU(0.1),
            nn.Linear(2048, S * S * (C + B * 5)),
        )





        

In [17]:

# Pass a random tensor through the model to check that it works
def test(split_size=7, num_boxes=2, num_classes=20):
    """Build a Yolov1 model, feed it a random batch of three 3-channel 448x448 images and print the output shape."""
    net = Yolov1(split_size, num_boxes, num_classes)
    batch = torch.randn(3, 3, 448, 448)

    predictions = net(batch)
    print(predictions.shape)


test()

torch.Size([3, 1470])
