In [1]:
import os
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
from PIL import Image
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.utils.data import Dataset
import torchvision.transforms as Tr
from torch.optim.lr_scheduler import StepLR

  warn(f"Failed to load image Python extension: {e}")


In [2]:
class ConvLayer(nn.Module):
    """Convolution block: bias-free ``Conv2d`` -> ``BatchNorm2d`` -> activation.

    Parameters:
        cin (int): Number of input channels.
        parameters (dict): Convolution settings, read from the keys
            ``'cout'``, ``'kernel_size'``, ``'padding'`` and ``'stride'``.
        activation (nn.Module): Module appended after the batch norm. The
            default instance is created once, when the class is defined.
    """

    def __init__(self, cin, parameters, activation=nn.LeakyReLU(0.1)):
        super().__init__()

        # The convolution carries no bias; the batch norm that follows has
        # its own learnable shift.
        convolution = nn.Conv2d(
            cin,
            parameters['cout'],
            kernel_size=parameters['kernel_size'],
            padding=parameters['padding'],
            stride=parameters['stride'],
            bias=False,
        )
        normalisation = nn.BatchNorm2d(parameters['cout'])

        self.layer = nn.Sequential(convolution, normalisation, activation)

    def forward(self, x):
        """Run ``x`` through convolution, batch norm and activation."""
        return self.layer(x)
    
class FCLayers(nn.Module):
    """Fully connected detection head producing an ``S x S`` grid of outputs.

    Parameters:
        cin (int): Channel count of the incoming feature map; the first
            linear layer expects ``cin * S * S`` flattened features.
        fc_architecture (dict): Head settings, read from the keys ``'S'``,
            ``'B'``, ``'C'`` and ``'c_hidden'``.
        activation (nn.Module): Module placed between the two linear layers.
    """

    def __init__(self, cin, fc_architecture, activation=nn.LeakyReLU(0.1)):
        super().__init__()

        grid = fc_architecture['S']
        boxes = fc_architecture['B']
        classes = fc_architecture['C']
        hidden = fc_architecture['c_hidden']

        cells = grid * grid
        per_cell = classes + 5 * boxes  # class scores plus 5 numbers per box

        self.layers = nn.Sequential(
            nn.Flatten(),
            nn.Linear(cin * cells, hidden),
            activation,
            nn.Linear(hidden, per_cell * cells),
        )
        self.S, self.C, self.B = grid, classes, boxes

    def forward(self, x):
        """Return the head output reshaped to ``(-1, S, S, C + 5*B)``."""
        flat = self.layers(x)
        return flat.reshape(-1, self.S, self.S, self.C + 5 * self.B)

class YOLO1(nn.Module):
    """YOLO v1 style network: a convolutional backbone followed by an FC head.

    Parameters:
        cnn_architecture (list[dict]): Backbone description. Elements with
            ``'type' == 'cnn_block'`` contribute their ``'layers'`` list
            ``'repeat'`` times (one ``ConvLayer`` per entry); elements with
            ``'type' == 'maxpool'`` contribute an ``nn.MaxPool2d`` built from
            ``element['parameters']``. Other types add nothing.
        fc_architecture (dict): Settings forwarded to ``FCLayers``.
        cin (int): Number of channels of the input image.
        activation (nn.Module): Activation shared by every conv layer and the
            FC head.
    """

    def __init__(self, cnn_architecture, fc_architecture, cin=3, activation=nn.LeakyReLU(0.1)):
        super().__init__()

        backbone = []
        channels = cin  # running channel count fed to the next conv layer

        for spec in cnn_architecture:
            kind = spec['type']
            if kind == 'cnn_block':
                for _ in range(spec['repeat']):
                    for conv_params in spec['layers']:
                        backbone.append(ConvLayer(channels, conv_params, activation))
                        channels = conv_params['cout']
            elif kind == 'maxpool':
                pool_params = spec['parameters']
                backbone.append(
                    nn.MaxPool2d(kernel_size=pool_params['kernel_size'],
                                 stride=pool_params['stride'])
                )

        self.layers = nn.Sequential(*backbone)
        self.fc_layers = FCLayers(channels, fc_architecture, activation=activation)

    def forward(self, x):
        """Apply the backbone, then the fully connected head."""
        features = self.layers(x)
        return self.fc_layers(features)

In [3]:
class YoloLoss(nn.Module):
    """Vectorised YOLO v1 loss supporting an arbitrary number of boxes ``B``.

    Layout of the last dimension, as read by ``forward``:
        predictions: ``C`` class scores, then ``B`` groups of
            ``(confidence, x, y, w, h)``.
        target: ``C`` class indicators, then ``(objectness, x, y, w, h)``.

    Parameters:
        S (int): Grid size (stored; not used by the computation itself).
        B (int): Number of predicted boxes per cell.
        C (int): Number of classes.
    """

    def __init__(self, S=7, B=2, C=20):
        super(YoloLoss, self).__init__()

        self.mse = nn.MSELoss(reduction="sum")

        self.S = S
        self.B = B
        self.C = C

        # These are from Yolo paper, signifying how much we should
        # pay loss for no object (noobj) and the box coordinates (coord)
        self.lambda_noobj = 0.5
        self.lambda_coord = 5

    def forward(self, predictions, target):
        """Return the summed (not averaged) loss as a scalar tensor.

        Parameters:
            predictions (tensor): 4-D tensor whose last dimension holds at
                least ``C + 5*B`` values in the layout described on the class.
            target (tensor): 4-D tensor whose last dimension holds at least
                ``C + 5`` values in the layout described on the class. Target
                widths and heights go through ``torch.sqrt`` and are therefore
                expected to be non-negative.
        """
        # Only the first C + 5*B channels take part in the loss.
        end = self.C + 5 * self.B

        class_probs = predictions[:, :, :, 0:self.C]
        Probs = predictions[:, :, :, self.C:end:5]
        allbox_x = predictions[:, :, :, self.C + 1:end:5]
        allbox_y = predictions[:, :, :, self.C + 2:end:5]
        allbox_w = predictions[:, :, :, self.C + 3:end:5]
        allbox_h = predictions[:, :, :, self.C + 4:end:5]
        # Signed square root: predictions are unconstrained and may be negative.
        allbox_w_sqrt = torch.sqrt(torch.abs(allbox_w)) * torch.sign(allbox_w)
        allbox_h_sqrt = torch.sqrt(torch.abs(allbox_h)) * torch.sign(allbox_h)

        # List indexing keeps a trailing dimension of size 1 so the target
        # tensors broadcast against the B predicted boxes.
        targetclass = target[:, :, :, 0:self.C]
        exists_box = target[:, :, :, [self.C]]
        targetbox_x = target[:, :, :, [self.C + 1]]
        targetbox_y = target[:, :, :, [self.C + 2]]
        targetbox_w = target[:, :, :, [self.C + 3]]
        targetbox_h = target[:, :, :, [self.C + 4]]
        targetbox_w_sqrt = torch.sqrt(targetbox_w)
        targetbox_h_sqrt = torch.sqrt(targetbox_h)

        ### IOU Score ###
        allbox_x1 = allbox_x - 0.5 * allbox_w
        allbox_x2 = allbox_x + 0.5 * allbox_w
        allbox_y1 = allbox_y - 0.5 * allbox_h
        allbox_y2 = allbox_y + 0.5 * allbox_h

        targetbox_x1 = targetbox_x - 0.5 * targetbox_w
        targetbox_x2 = targetbox_x + 0.5 * targetbox_w
        targetbox_y1 = targetbox_y - 0.5 * targetbox_h
        targetbox_y2 = targetbox_y + 0.5 * targetbox_h

        intersetbox_x1 = torch.maximum(allbox_x1, targetbox_x1)
        intersetbox_x2 = torch.minimum(allbox_x2, targetbox_x2)
        intersetbox_y1 = torch.maximum(allbox_y1, targetbox_y1)
        intersetbox_y2 = torch.minimum(allbox_y2, targetbox_y2)

        # .clamp(0) handles boxes that do not overlap.
        intersection_area = (intersetbox_x2 - intersetbox_x1).clamp(0) * (intersetbox_y2 - intersetbox_y1).clamp(0)
        allbox_area = torch.abs(allbox_w * allbox_h)
        targetbox_area = torch.abs(targetbox_w * targetbox_h)
        all_iou = intersection_area / (allbox_area + targetbox_area - intersection_area + 1e-6)
        maxiou_idx = all_iou.argmax(dim=-1)
        # Select the box with the highest IoU in every cell. num_classes must
        # be given explicitly: without it one_hot infers the width from the
        # largest index present, so a batch where box 0 wins everywhere would
        # yield a width-1 mask that broadcasts over (and penalises) all boxes.
        maxiou_onehot = nn.functional.one_hot(maxiou_idx, num_classes=self.B)

        # Mask that is 1 only for the responsible box of cells holding an object.
        responsible = maxiou_onehot * exists_box

        # ======================== #
        #   FOR BOX COORDINATES    #
        # ======================== #

        box_loss = self.mse(responsible * targetbox_x, responsible * allbox_x)
        box_loss += self.mse(responsible * targetbox_y, responsible * allbox_y)
        box_loss += self.mse(responsible * targetbox_w_sqrt, responsible * allbox_w_sqrt)
        box_loss += self.mse(responsible * targetbox_h_sqrt, responsible * allbox_h_sqrt)

        # ======================= #
        #   FOR OBJECT LOSS       #
        # ======================= #

        # The confidence target of the responsible box is 1.
        object_loss = self.mse(responsible, responsible * Probs)

        # ================== #
        #   FOR CLASS LOSS   #
        # ================== #

        class_loss = self.mse(exists_box * targetclass, exists_box * class_probs)

        # ======================= #
        #   FOR NO OBJECT LOSS    #
        # ======================= #

        # In cells without an object every box confidence is pushed towards 0.
        noobj_conf = (1 - exists_box) * Probs
        no_object_loss = self.mse(torch.zeros_like(noobj_conf), noobj_conf)

        # ======================= #
        #   Total LOSS            #
        # ======================= #

        loss = (
                self.lambda_coord * box_loss  # first two rows in paper
                + object_loss  # third row in paper
                + self.lambda_noobj * no_object_loss  # forth row
                + class_loss  # fifth row
                )

        return loss


In [4]:
def intersection_over_union(boxes_preds, boxes_labels, box_format="midpoint"):
    """
    Calculates intersection over union

    Parameters:
        boxes_preds (tensor): Predictions of Bounding Boxes (..., 4)
        boxes_labels (tensor): Correct labels of Bounding Boxes (..., 4)
        box_format (str): midpoint/corners, if boxes (x,y,w,h) or (x1,y1,x2,y2)

    Returns:
        tensor: Intersection over union for all examples, shape (..., 1)

    Raises:
        ValueError: If ``box_format`` is neither "midpoint" nor "corners".
    """

    if box_format == "midpoint":
        # Convert (centre x, centre y, width, height) to corner coordinates.
        box1_x1 = boxes_preds[..., 0:1] - boxes_preds[..., 2:3] / 2
        box1_y1 = boxes_preds[..., 1:2] - boxes_preds[..., 3:4] / 2
        box1_x2 = boxes_preds[..., 0:1] + boxes_preds[..., 2:3] / 2
        box1_y2 = boxes_preds[..., 1:2] + boxes_preds[..., 3:4] / 2
        box2_x1 = boxes_labels[..., 0:1] - boxes_labels[..., 2:3] / 2
        box2_y1 = boxes_labels[..., 1:2] - boxes_labels[..., 3:4] / 2
        box2_x2 = boxes_labels[..., 0:1] + boxes_labels[..., 2:3] / 2
        box2_y2 = boxes_labels[..., 1:2] + boxes_labels[..., 3:4] / 2
    elif box_format == "corners":
        # Slices (rather than integer indices) keep the trailing dimension.
        box1_x1 = boxes_preds[..., 0:1]
        box1_y1 = boxes_preds[..., 1:2]
        box1_x2 = boxes_preds[..., 2:3]
        box1_y2 = boxes_preds[..., 3:4]
        box2_x1 = boxes_labels[..., 0:1]
        box2_y1 = boxes_labels[..., 1:2]
        box2_x2 = boxes_labels[..., 2:3]
        box2_y2 = boxes_labels[..., 3:4]
    else:
        # Previously an unknown format surfaced as an UnboundLocalError below.
        raise ValueError(
            f"box_format must be 'midpoint' or 'corners', got {box_format!r}"
        )

    x1 = torch.max(box1_x1, box2_x1)
    y1 = torch.max(box1_y1, box2_y1)
    x2 = torch.min(box1_x2, box2_x2)
    y2 = torch.min(box1_y2, box2_y2)

    # .clamp(0) is for the case when they do not intersect
    intersection = (x2 - x1).clamp(0) * (y2 - y1).clamp(0)

    box1_area = abs((box1_x2 - box1_x1) * (box1_y2 - box1_y1))
    box2_area = abs((box2_x2 - box2_x1) * (box2_y2 - box2_y1))

    # The small constant avoids a division by zero for two empty boxes.
    return intersection / (box1_area + box2_area - intersection + 1e-6)

In [5]:
class VOCDataset(Dataset):
    """Dataset yielding ``(image, label grid)`` pairs for YOLO v1 training.

    Parameters:
        path (str): Root folder containing the index CSV plus the ``images``
            and ``labels`` sub-folders.
        file (str): Name of the header-less CSV; column 0 is the image file
            name and column 1 the label file name.
        S (int): Grid size of the label tensor.
        C (int): Number of classes.
        imagesize (int): Images are resized to ``imagesize x imagesize``.
    """

    def __init__(self, path, file, S=7, C=20, imagesize=448):

        self.data = pd.read_csv(os.path.join(path, file), header=None).values
        self.sample_path = os.path.join(path, 'images')
        self.label_path = os.path.join(path, 'labels')
        self.S = S
        self.C = C
        self.imagesize = imagesize

    def __len__(self):
        """Number of rows in the index CSV."""
        return len(self.data)

    def _encode_label(self, label_raw):
        """Turn raw label rows into an ``(S, S, C + 5)`` tensor.

        Each row is ``(class, x, y, w, h)``; x, y, w, h are multiplied by
        ``S``, so they are presumably normalised to the image size (confirm
        against the label files). A single row may be passed as a 1-D array
        and an empty array yields an all-zero grid.
        """
        # reshape(-1, 5) covers the 1-D single-object case and the empty file.
        rows = np.asarray(label_raw, dtype=float).reshape(-1, 5)
        label = torch.zeros((self.S, self.S, self.C + 5))

        for row in rows:
            c = int(row[0])
            scaled_center = self.S * row[1:3]
            # Clip so that a centre lying exactly on the right/bottom border
            # (coordinate 1.0) lands in the last cell instead of index S.
            loc = np.clip(scaled_center.astype(int), 0, self.S - 1)
            box_center = scaled_center - loc  # offset relative to the cell
            box_size = self.S * row[3:5]      # size measured in cell units
            box = torch.tensor(np.concatenate((box_center, box_size)), dtype=torch.float)
            # Grid is indexed [row (y), column (x)]. If several objects share
            # a cell, class flags accumulate and the last box wins.
            label[loc[1], loc[0], c] = 1
            label[loc[1], loc[0], self.C] = 1
            label[loc[1], loc[0], self.C+1:] = box

        return label

    def __getitem__(self, index):
        """Load sample ``index`` as a ``(3, imagesize, imagesize)`` float
        tensor scaled to [0, 1] together with its label grid."""
        image_file = os.path.join(self.sample_path, self.data[index, 0])
        # The context manager closes the file handle; convert('RGB') makes
        # grayscale / palette / RGBA images compatible with the transpose.
        with Image.open(image_file) as img:
            img = img.convert('RGB').resize((self.imagesize, self.imagesize))
            sample = np.asarray(img) / 255
        sample = torch.tensor(np.transpose(sample, (2, 0, 1)), dtype=torch.float)

        label_raw = np.loadtxt(os.path.join(self.label_path, self.data[index, 1]))
        label = self._encode_label(label_raw)

        return sample, label

In [6]:
# Root folder of the Pascal VOC data prepared for YOLO (index CSVs, images, labels).
path_data = './data/PascalVOC_YOLO/'
batch_size = 16

# Build the train and test datasets (in that order) from their index files.
train_dataset, test_dataset = (
    VOCDataset(path_data, split_file, imagesize=448)
    for split_file in ('train.csv', 'test.csv')
)

# Only the training data is shuffled; the test order stays fixed.
train_loader = torch.utils.data.DataLoader(train_dataset,
                                           batch_size=batch_size,
                                           shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset,
                                          batch_size=batch_size,
                                          shuffle=False)

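- [source #1: YOLO v1 paper, "You Only Look Once: Unified, Real-Time Object Detection" (arXiv:1506.02640)](https://arxiv.org/abs/1506.02640)

- [source #2 (YouTube video)](https://www.youtube.com/watch?v=n9_XyCGr-MI&list=PLy5rjn5-uSPAKe2PfszYRqNY7JJC45P1d&index=7&t=1920s)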

In [7]:
def _conv(cout, kernel_size, padding=0, stride=1):
    """Describe one convolution layer (a fresh dict on every call)."""
    return {'cout': cout,
            'kernel_size': kernel_size,
            'padding': padding,
            'stride': stride}


def _cnn_block(layers, repeat=1):
    """Describe a group of convolution layers applied ``repeat`` times."""
    return {'type': 'cnn_block', 'layers': layers, 'repeat': repeat}


def _maxpool(kernel_size=2, stride=2):
    """Describe a max-pooling layer."""
    return {'type': 'maxpool',
            'parameters': {'kernel_size': kernel_size, 'stride': stride}}


# Backbone description, listed from input to output.
element_1 = _cnn_block([_conv(64, 7, padding=3, stride=2)])
element_2 = _maxpool()

element_3 = _cnn_block([_conv(192, 3, padding=1)])
element_4 = _maxpool()

element_5 = _cnn_block([_conv(128, 1),
                        _conv(256, 3, padding=1),
                        _conv(256, 1),
                        _conv(512, 3, padding=1)])
element_6 = _maxpool()

element_7 = _cnn_block([_conv(256, 1),
                        _conv(512, 3, padding=1)],
                       repeat=4)
element_8 = _cnn_block([_conv(512, 1),
                        _conv(1024, 3, padding=1)])
element_9 = _maxpool()

element_10 = _cnn_block([_conv(512, 1),
                         _conv(1024, 3, padding=1)],
                        repeat=2)
element_11 = _cnn_block([_conv(1024, 3, padding=1),
                         _conv(1024, 3, padding=1, stride=2),
                         _conv(1024, 3, padding=1),
                         _conv(1024, 3, padding=1)])

cnn_architecture = [element_1, element_2, element_3, element_4,
                    element_5, element_6, element_7, element_8,
                    element_9, element_10, element_11]

# Detection head: S x S grid, B boxes per cell, C classes, hidden width.
fc_architecture = {'S': 7, 'B': 2, 'C': 20, 'c_hidden': 496}

In [8]:
# Build the network from the backbone and head descriptions defined above.
model = YOLO1(cnn_architecture, fc_architecture)

In [9]:
# Grab a single batch for the smoke tests below. next(iter(...)) fails right
# here if the loader is empty instead of leaving x and y undefined.
x, y = next(iter(train_loader))

In [10]:
# Shape of the image batch drawn from train_loader.
x.shape

torch.Size([16, 3, 448, 448])

In [11]:
# Shape of the label batch; the last dimension is C + 5 (see VOCDataset).
y.shape

torch.Size([16, 7, 7, 25])

In [12]:
# Forward pass of the freshly constructed model on the sample batch.
y_pred = model(x)

In [13]:
# Shape of the model output; the last dimension is C + 5*B (see FCLayers).
y_pred.shape

torch.Size([16, 7, 7, 30])

In [14]:
# Rename the sample batch to the argument names used by the loss functions.
target = y
predictions = y_pred

In [15]:
class YoloLoss_v0(nn.Module):
    """
    Calculate the loss for yolo (v1) model

    S is split size of image (in paper 7),
    B is number of boxes (in paper 2),
    C is number of classes (in paper and VOC dataset is 20).

    Layout of the last dimension, as read by ``forward``:
        predictions: C class scores, then (confidence, x, y, w, h) for the
            first box and the same five values for the second box.
        target: C class indicators, then (objectness, x, y, w, h).

    Only two predicted boxes are read, regardless of the value of B.
    """

    def __init__(self, S=7, B=2, C=20):
        super(YoloLoss_v0, self).__init__()
        self.mse = nn.MSELoss(reduction="sum")

        self.S = S
        self.B = B
        self.C = C

        # These are from Yolo paper, signifying how much we should
        # pay loss for no object (noobj) and the box coordinates (coord)
        self.lambda_noobj = 0.5
        self.lambda_coord = 5

    def forward(self, predictions, target):
        """Return the summed loss for 4-D ``predictions`` and ``target``."""

        # Channel positions are derived from the number of classes instead of
        # being hard-coded for C = 20.
        C = self.C
        conf1, box1 = slice(C, C + 1), slice(C + 1, C + 5)
        conf2, box2 = slice(C + 5, C + 6), slice(C + 6, C + 10)

        # Calculate IoU for the two predicted bounding boxes with target bbox
        iou_b1 = intersection_over_union(predictions[..., box1], target[..., box1])
        iou_b2 = intersection_over_union(predictions[..., box2], target[..., box1])
        ious = torch.cat([iou_b1.unsqueeze(0), iou_b2.unsqueeze(0)], dim=0)

        # Take the box with highest IoU out of the two prediction
        # Note that bestbox will be indices of 0, 1 for which bbox was best
        iou_maxes, bestbox = torch.max(ious, dim=0)
        exists_box = target[..., C].unsqueeze(3)  # in paper this is Iobj_i

        # ======================== #
        #   FOR BOX COORDINATES    #
        # ======================== #

        # Set boxes with no object in them to 0. We only take out one of the two
        # predictions, which is the one with highest Iou calculated previously.
        box_predictions = exists_box * (
            (
                bestbox * predictions[..., box2]
                + (1 - bestbox) * predictions[..., box1]
            )
        )

        box_targets = exists_box * target[..., box1]

        # Compare square roots of width and height, as in the paper's loss.
        # Predictions may be negative, hence the signed square root; the small
        # constant keeps the argument of sqrt away from exactly zero.
        box_predictions[..., 2:4] = torch.sign(box_predictions[..., 2:4]) * torch.sqrt(
            torch.abs(box_predictions[..., 2:4] + 1e-6)
        )
        box_targets[..., 2:4] = torch.sqrt(box_targets[..., 2:4])

        box_loss = self.mse(
            torch.flatten(box_predictions, end_dim=-2),
            torch.flatten(box_targets, end_dim=-2),
        )

        # ==================== #
        #   FOR OBJECT LOSS    #
        # ==================== #

        # pred_box is the confidence score for the bbox with highest IoU
        pred_box = (
            bestbox * predictions[..., conf2] + (1 - bestbox) * predictions[..., conf1]
        )

        object_loss = self.mse(
            torch.flatten(exists_box * pred_box),
            torch.flatten(exists_box * target[..., conf1]),
        )

        # ======================= #
        #   FOR NO OBJECT LOSS    #
        # ======================= #

        # Both box confidences are penalised in cells without an object.
        no_object_loss = self.mse(
            torch.flatten((1 - exists_box) * predictions[..., conf1], start_dim=1),
            torch.flatten((1 - exists_box) * target[..., conf1], start_dim=1),
        )

        no_object_loss += self.mse(
            torch.flatten((1 - exists_box) * predictions[..., conf2], start_dim=1),
            torch.flatten((1 - exists_box) * target[..., conf1], start_dim=1)
        )

        # ================== #
        #   FOR CLASS LOSS   #
        # ================== #

        class_loss = self.mse(
            torch.flatten(exists_box * predictions[..., :C], end_dim=-2,),
            torch.flatten(exists_box * target[..., :C], end_dim=-2,),
        )

        loss = (
            self.lambda_coord * box_loss  # first two rows in paper
            + object_loss  # third row in paper
            + self.lambda_noobj * no_object_loss  # forth row
            + class_loss  # fifth row
        )

        return loss

In [16]:
# Reference loss (two-box implementation) evaluated on the sample batch.
lossfn_0 = YoloLoss_v0()
lossfn_0(predictions, target)

tensor(1247.9930, grad_fn=<AddBackward0>)

In [17]:
# Vectorised loss on the same batch, for comparison with the YoloLoss_v0 value.
lossfn = YoloLoss()
lossfn(predictions, target)

tensor(1247.9955, grad_fn=<AddBackward0>)