<a href="https://colab.research.google.com/github/adnan119/Pytorch-Projects/blob/main/Object_Detection/YOLO/YOLO_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn as nn
import os
import pandas as pd
from PIL import Image

In [2]:
# Layer-by-layer description of the convolutional backbone.
#
# Entry kinds:
#   tuple -> one conv block: (kernel_size, num_filters, stride, padding)
#   "M"   -> 2x2 max-pool with stride 2
#   list  -> [conv tuple, conv tuple, repeat count]; the two convs are
#            stacked `repeat count` times in a row
#
# Every stride-1 conv is size preserving: 1x1 kernels use padding 0 and
# 3x3 kernels use padding 1 (the earlier revision had these swapped for the
# four convs after the second max-pool, which made the 1x1 convs operate on a
# zero-padded border after an unpadded 3x3 conv had already cropped the map).
architecture_config = [
    (7, 64, 2, 3),
    "M",
    (3, 192, 1, 1),
    "M",
    (1, 128, 1, 0),
    (3, 256, 1, 1),
    (1, 256, 1, 0),
    (3, 512, 1, 1),
    "M",
    [(1, 256, 1, 0), (3, 512, 1, 1), 4],
    (1, 512, 1, 0),
    (3, 1024, 1, 1),
    "M",
    [(1, 512, 1, 0), (3, 1024, 1, 1), 2],
    (3, 1024, 1, 1),
    (3, 1024, 2, 1),
    (3, 1024, 1, 1),
    (3, 1024, 1, 1),
]

In [3]:
class CNNBlock(nn.Module):
  """Conv2d -> BatchNorm2d -> LeakyReLU(0.1) building block.

  Args:
    in_channels: number of channels of the incoming feature map.
    out_channels: number of filters of the convolution.
    **kwargs: forwarded unchanged to ``nn.Conv2d`` (kernel_size, stride,
      padding, ...).
  """

  def __init__(self, in_channels, out_channels, **kwargs):
    super().__init__()
    # The convolution is created without a bias term; it is followed
    # directly by a batch-norm layer.
    self.conv = nn.Conv2d(in_channels, out_channels, bias=False, **kwargs)
    self.batchnorm = nn.BatchNorm2d(out_channels)
    self.leakyrelu = nn.LeakyReLU(0.1)

  def forward(self, x):
    """Apply convolution, normalisation and activation, in that order."""
    convolved = self.conv(x)
    normalised = self.batchnorm(convolved)
    return self.leakyrelu(normalised)

In [4]:
class yolov1(nn.Module):
  """YOLOv1-style detector: a convolutional backbone plus a dense head.

  Args:
    in_channels: number of channels of the input image (default 3).
    **kwargs: forwarded to ``_create_fcs``; must provide ``split_size``,
      ``num_boxes`` and ``num_classes``.

  The forward pass returns a tensor of shape
  ``(batch, S * S * (C + B * 5))`` where S, B, C are the three keyword
  arguments above.
  """

  def __init__(self, in_channels=3, **kwargs):
    super().__init__()
    self.architecture = architecture_config
    self.in_channels = in_channels
    self.darknet = self._create_conv_layers(self.architecture)
    self.fcs = self._create_fcs(**kwargs)

  def forward(self, x):
    """Run the backbone, flatten per sample, and apply the dense head."""
    x = self.darknet(x)
    return self.fcs(torch.flatten(x, start_dim=1))

  @staticmethod
  def _conv_block(in_channels, spec):
    """Build one CNNBlock from a (kernel_size, filters, stride, padding) tuple."""
    kernel_size, out_channels, stride, padding = spec
    return CNNBlock(in_channels,
                    out_channels,
                    kernel_size=kernel_size,
                    stride=stride,
                    padding=padding)

  def _create_conv_layers(self, architecture):
    """Translate the architecture description into an ``nn.Sequential``.

    Entry kinds:
      tuple -> a single conv block
      str   -> a 2x2 max-pool with stride 2
      list  -> ``[conv tuple, ..., repeat count]``; the conv tuples are
               stacked in order, ``repeat count`` times

    Raises:
      TypeError: if an entry is none of the kinds above (previously such
        entries were skipped silently).
    """
    layers = []
    in_channels = self.in_channels

    for entry in architecture:
      if isinstance(entry, tuple):
        layers.append(self._conv_block(in_channels, entry))
        in_channels = entry[1]
      elif isinstance(entry, str):
        layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
      elif isinstance(entry, list):
        *specs, num_repeats = entry
        for _ in range(num_repeats):
          for spec in specs:
            layers.append(self._conv_block(in_channels, spec))
            in_channels = spec[1]
      else:
        raise TypeError(
            "Unsupported architecture entry: {!r}".format(entry)
        )
    return nn.Sequential(*layers)

  def _create_fcs(self, split_size, num_boxes, num_classes):
    """Build the dense head mapping backbone features to predictions.

    Args:
      split_size: S, the grid size; the head expects 1024 * S * S inputs.
      num_boxes: B, boxes predicted per grid cell.
      num_classes: C, number of class scores per grid cell.
    """
    S, B, C = split_size, num_boxes, num_classes
    return nn.Sequential(
        # forward() already flattens, so this is a no-op; it is kept so
        # the positional indices (and state_dict keys) of the layers below
        # do not shift.
        nn.Flatten(),
        nn.Linear(1024 * S * S, 512),
        nn.Dropout(0.0),
        nn.LeakyReLU(0.1),
        nn.Linear(512, S * S * (C + B * 5)),
    )

In [5]:
def test(S = 7, B = 2, C = 20):
  """Smoke-test the model: push a random batch through it and print the shape.

  Args:
    S: grid size passed to the model as ``split_size``.
    B: boxes per cell passed as ``num_boxes``.
    C: class count passed as ``num_classes``.
  """
  net = yolov1(split_size=S, num_boxes=B, num_classes=C)
  # Two random RGB images of 448x448 pixels.
  batch = torch.randn((2, 3, 448, 448))
  output = net(batch)
  print(output.shape)

In [6]:
# Run the smoke test with its default S, B, C; it prints the output shape.
test()

torch.Size([2, 1470])


In [7]:
class yololoss(nn.Module):
  """Sum-squared-error YOLOv1 loss.

  Per-cell layout of the last dimension, as read by ``forward``:
    ``[0:C]``        class scores
    ``[C]``          objectness / confidence of box 1
    ``[C+1:C+5]``    box 1 as (x, y, w, h)
    ``[C+5]``        confidence of box 2        (predictions only)
    ``[C+6:C+10]``   box 2 as (x, y, w, h)      (predictions only)

  Only the first two predicted boxes take part in the loss, i.e. the
  implementation assumes ``B == 2``.

  Args:
    S: grid size.
    B: boxes predicted per cell (used for reshaping the prediction).
    C: number of classes.
  """

  def __init__(self, S = 7, B = 2, C = 20):
    super().__init__()
    # Summed (not averaged) squared error for every loss term.
    self.mse = nn.MSELoss(reduction="sum")
    self.S = S
    self.B = B
    self.C = C
    self.lambda_coord = 5
    self.lambda_noobj = 0.5

  def forward(self, prediction, target):
    """Return the scalar loss for a batch.

    Args:
      prediction: tensor reshapeable to ``(N, S, S, C + B * 5)``.
      target: tensor indexed as ``(N, S, S, >= C + 5)`` with the layout
        described on the class.
    """
    C = self.C
    prediction = prediction.reshape(-1, self.S, self.S, C + self.B * 5)

    conf_1, box_1 = slice(C, C + 1), slice(C + 1, C + 5)
    conf_2, box_2 = slice(C + 5, C + 6), slice(C + 6, C + 10)

    # IoU of each predicted box against the single target box of the cell.
    iou_b1 = intersection_over_union(prediction[..., box_1], target[..., box_1])
    iou_b2 = intersection_over_union(prediction[..., box_2], target[..., box_1])
    ious = torch.stack([iou_b1, iou_b2], dim=0)
    # best_box is 0 where box 1 has the higher IoU and 1 where box 2 does.
    _, best_box = torch.max(ious, dim=0)
    exists_box = target[..., conf_1]  # Iobj_i, shape (N, S, S, 1)

    ####  BOX LOSS  ####
    # Coordinates (mid-point, width & height) of the responsible box,
    # zeroed for cells that contain no object.
    box_predictions = exists_box * (
        best_box * prediction[..., box_2]
        + (1 - best_box) * prediction[..., box_1]
    )
    box_targets = exists_box * target[..., box_1]

    # Width/height are compared in square-root space. Predictions may be
    # negative, so take sqrt(|x|) and restore the sign; the epsilon keeps
    # the sqrt gradient finite at zero.
    pred_wh = box_predictions[..., 2:4]
    box_predictions = torch.cat(
        [
            box_predictions[..., 0:2],
            torch.sign(pred_wh) * torch.sqrt(torch.abs(pred_wh + 1e-6)),
        ],
        dim=-1,
    )
    box_targets = torch.cat(
        [box_targets[..., 0:2], torch.sqrt(box_targets[..., 2:4])],
        dim=-1,
    )

    # (N, S, S, 4) -> (N*S*S, 4)
    box_loss = self.mse(
        torch.flatten(box_predictions, end_dim=-2),
        torch.flatten(box_targets, end_dim=-2),
    )

    #### OBJECT LOSS ####
    # Confidence of the responsible box.
    pred_conf = (
        best_box * prediction[..., conf_2]
        + (1 - best_box) * prediction[..., conf_1]
    )
    object_loss = self.mse(
        torch.flatten(exists_box * pred_conf),
        torch.flatten(exists_box * target[..., conf_1]),
    )

    #### NO-OBJECT LOSS ####
    # Both predicted confidences are penalised in empty cells, so the two
    # terms are added together.
    no_obj = 1 - exists_box
    no_obj_target = torch.flatten(no_obj * target[..., conf_1], start_dim=1)
    no_object_loss = self.mse(
        torch.flatten(no_obj * prediction[..., conf_1], start_dim=1),
        no_obj_target,
    )
    no_object_loss = no_object_loss + self.mse(
        torch.flatten(no_obj * prediction[..., conf_2], start_dim=1),
        no_obj_target,
    )

    #### CLASS LOSS ####
    # (N, S, S, C) -> (N*S*S, C)
    class_loss = self.mse(
        torch.flatten(exists_box * prediction[..., :C], end_dim=-2),
        torch.flatten(exists_box * target[..., :C], end_dim=-2),
    )

    return (
        self.lambda_coord * box_loss
        + object_loss
        + self.lambda_noobj * no_object_loss
        + class_loss
    )

In [8]:
class COTSDataset(torch.utils.data.Dataset):
  """Dataset yielding ``(image, label_matrix)`` pairs for YOLOv1 training.

  Args:
    csv_file: CSV whose first column holds image file names and whose
      second column holds the matching label file names.
    img_dir: directory containing the images.
    label_dir: directory containing the label files. Each non-blank line
      is ``class x y width height`` separated by whitespace.
    S: grid size.
    B: boxes per cell (only affects the size of the label matrix).
    C: number of classes.
    transform: optional callable ``(image, boxes) -> (image, boxes)``.
  """

  def __init__(self, csv_file, img_dir, label_dir,
               S=7, B=2, C=20, transform=None):
    self.annotations = pd.read_csv(csv_file)
    self.img_dir = img_dir
    self.label_dir = label_dir
    self.transform = transform
    self.S = S
    self.C = C
    self.B = B

  def __len__(self):
    return len(self.annotations)

  def __getitem__(self, index):
    """Load sample ``index`` and encode all of its boxes.

    Returns:
      ``(image, label_matrix)`` where ``label_matrix`` has shape
      ``(S, S, C + 5 * B)``. An image without boxes yields an all-zero
      matrix.
    """
    label_path = os.path.join(self.label_dir, self.annotations.iloc[index, 1])
    boxes = self._read_boxes(label_path)

    img_path = os.path.join(self.img_dir, self.annotations.iloc[index, 0])
    image = Image.open(img_path)
    boxes = torch.Tensor(boxes)

    if self.transform:
      image, boxes = self.transform(image, boxes)

    return image, self._encode_boxes(boxes)

  @staticmethod
  def _read_boxes(label_path):
    """Parse a label file into a list of [class, x, y, width, height] floats."""
    boxes = []
    with open(label_path) as f:
      for line in f:
        fields = line.split()
        if not fields:
          continue  # tolerate blank / trailing empty lines
        # Everything ends up in a float tensor, so parse as float directly;
        # this also accepts class ids written as "1.0".
        class_label, x, y, width, height = (float(v) for v in fields)
        boxes.append([class_label, x, y, width, height])
    return boxes

  def _encode_boxes(self, boxes):
    """Scatter boxes into an ``(S, S, C + 5 * B)`` label matrix.

    Per-cell layout: one-hot class in ``[0:C]``, objectness at ``[C]``,
    cell-relative ``(x, y, w, h)`` in ``[C+1:C+5]``. Each cell stores at
    most one box; later boxes falling in an occupied cell are dropped.
    """
    S, C = self.S, self.C
    label_matrix = torch.zeros((S, S, C + 5 * self.B))

    for box in boxes:
      class_label, x, y, width, height = box.tolist()
      class_label = int(class_label)
      # Clamp so a coordinate of exactly 1.0 lands in the last cell
      # instead of indexing out of range.
      # NOTE(review): the first grid axis is indexed by x here; confirm
      # that whatever decodes predictions uses the same convention.
      i = min(int(S * x), S - 1)
      j = min(int(S * y), S - 1)
      x_cell, y_cell = S * x - i, S * y - j
      width_cell, height_cell = width * S, height * S

      if label_matrix[i, j, C] == 0:
        label_matrix[i, j, C] = 1
        label_matrix[i, j, C + 1:C + 5] = torch.Tensor(
            [x_cell, y_cell, width_cell, height_cell]
        )
        label_matrix[i, j, class_label] = 1

    return label_matrix



# **Utility Functions**

In [9]:
def intersection_over_union(boxes_preds, boxes_labels, box_format = "midpoint"):
  """Compute the intersection-over-union of paired boxes.

  Args:
    boxes_preds: tensor whose last dimension holds 4 box values.
    boxes_labels: tensor of matching layout for the reference boxes.
    box_format: ``"midpoint"`` for (x_center, y_center, width, height) or
      ``"corners"`` for (x1, y1, x2, y2).

  Returns:
    Tensor with the input's leading dimensions and a last dimension of
    size 1 holding the IoU of each pair.

  Raises:
    ValueError: if ``box_format`` is neither ``"midpoint"`` nor
      ``"corners"``.
  """
  if box_format == "midpoint":
    box1_x1 = boxes_preds[..., 0:1] - boxes_preds[..., 2:3] / 2
    box1_y1 = boxes_preds[..., 1:2] - boxes_preds[..., 3:4] / 2
    box1_x2 = boxes_preds[..., 0:1] + boxes_preds[..., 2:3] / 2
    box1_y2 = boxes_preds[..., 1:2] + boxes_preds[..., 3:4] / 2
    box2_x1 = boxes_labels[..., 0:1] - boxes_labels[..., 2:3] / 2
    box2_y1 = boxes_labels[..., 1:2] - boxes_labels[..., 3:4] / 2
    box2_x2 = boxes_labels[..., 0:1] + boxes_labels[..., 2:3] / 2
    box2_y2 = boxes_labels[..., 1:2] + boxes_labels[..., 3:4] / 2
  elif box_format == "corners":
    box1_x1 = boxes_preds[..., 0:1]
    box1_y1 = boxes_preds[..., 1:2]
    box1_x2 = boxes_preds[..., 2:3]
    box1_y2 = boxes_preds[..., 3:4]
    box2_x1 = boxes_labels[..., 0:1]
    box2_y1 = boxes_labels[..., 1:2]
    box2_x2 = boxes_labels[..., 2:3]
    box2_y2 = boxes_labels[..., 3:4]
  else:
    raise ValueError(
        "box_format must be 'midpoint' or 'corners', got {!r}".format(box_format)
    )

  # Corners of the overlap rectangle.
  x1 = torch.max(box1_x1, box2_x1)
  y1 = torch.max(box1_y1, box2_y1)
  x2 = torch.min(box1_x2, box2_x2)
  y2 = torch.min(box1_y2, box2_y2)

  # clamp(0) turns a negative extent (no overlap) into zero area.
  intersection = (x2 - x1).clamp(0) * (y2 - y1).clamp(0)

  box1_area = torch.abs((box1_x2 - box1_x1) * (box1_y2 - box1_y1))
  box2_area = torch.abs((box2_x2 - box2_x1) * (box2_y2 - box2_y1))
  union = box1_area + box2_area - intersection

  # Epsilon guards against division by zero for degenerate boxes.
  return intersection / (union + 1e-6)

In [11]:
def non_max_supression(
    bboxes,
    iou_threshold,
    prob_threshold,
    box_format = "corners"
                      ):
  """Filter detections with per-class non-maximum suppression.

  Args:
    bboxes: list of boxes, each ``[class, score, c0, c1, c2, c3]`` where
      the four trailing values are coordinates in ``box_format``.
    iou_threshold: a box is dropped when its IoU with an already chosen
      box of the same class is greater than or equal to this value.
    prob_threshold: boxes whose score is not strictly greater than this
      value are discarded up front.
    box_format: passed through to ``intersection_over_union``.

  Returns:
    The surviving boxes, ordered by descending score.

  Raises:
    TypeError: if ``bboxes`` is not a list.
  """
  if not isinstance(bboxes, list):
    raise TypeError(
        "bboxes must be a list, got {}".format(type(bboxes).__name__)
    )

  candidates = sorted(
      (box for box in bboxes if box[1] > prob_threshold),
      key=lambda box: box[1],
      reverse=True,
  )
  bboxes_aft_nms = []

  while candidates:
    # Highest-scoring remaining box always survives.
    chosen_box = candidates[0]

    # Keep boxes of other classes, and same-class boxes that do not
    # overlap the chosen one too much.
    candidates = [
        box for box in candidates[1:]
        if box[0] != chosen_box[0]
        or float(intersection_over_union(
            torch.Tensor(chosen_box[2:]),
            torch.Tensor(box[2:]),
            box_format=box_format,
        )) < iou_threshold
    ]

    bboxes_aft_nms.append(chosen_box)

  return bboxes_aft_nms