In [None]:
# Mount Google Drive at /content/drive; the dataset archive extracted below is read from there.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import torch
import torch.nn as nn

In [None]:
# Extract the dataset archive stored on the mounted Drive into ./data/ (relative to the working directory).
!unzip "/content/drive/MyDrive/archive.zip" -d "./data/"

In [None]:

# Backbone description consumed by Yolov1._create_conv_layers. Each entry is one of:
#   tuple (kernel_size, filters, stride, padding) -> one CNNBlock
#   "M"                                            -> a 2x2 max-pool with stride 2
#   list [conv_a, conv_b, repeats]                 -> the pair (conv_a, conv_b) stacked `repeats` times,
#                                                     where conv_a / conv_b are tuples as above
architecture_config = [
    (7, 64, 2, 3),
    "M",
    (3, 192, 1, 1),
    "M",
    (1, 128, 1, 0),
    (3, 256, 1, 1),
    (1, 256, 1, 0),
    (3, 512, 1, 1),
    "M",
    [(1, 256, 1, 0), (3, 512, 1, 1), 4],
    (1, 512, 1, 0),
    (3, 1024, 1, 1),
    "M",
    [(1, 512, 1, 0), (3, 1024, 1, 1), 2],
    (3, 1024, 1, 1),
    (3, 1024, 2, 1),
    (3, 1024, 1, 1),
    (3, 1024, 1, 1),
]


class CNNBlock(nn.Module):
    """Bias-free convolution followed by batch normalisation and LeakyReLU(0.1).

    Args:
        in_channels: number of channels of the incoming feature map.
        out_channels: number of filters produced by the convolution.
        **kwargs: forwarded unchanged to nn.Conv2d (kernel_size, stride, padding, ...).
    """

    def __init__(self, in_channels, out_channels, **kwargs):
        super().__init__()
        # The convolution carries no bias term; it is immediately followed by BatchNorm2d.
        self.conv = nn.Conv2d(in_channels, out_channels, bias=False, **kwargs)
        self.batchnorm = nn.BatchNorm2d(out_channels)
        self.leakyrelu = nn.LeakyReLU(0.1)

    def forward(self, x):
        """Apply conv -> batchnorm -> leaky ReLU to `x` and return the result."""
        convolved = self.conv(x)
        normalised = self.batchnorm(convolved)
        return self.leakyrelu(normalised)


In [None]:
class Yolov1(nn.Module):
    """YOLOv1 network: the convolutional backbone described by `architecture_config`
    followed by a fully connected detection head.

    Args:
        in_channels: channel count of the input images (default 3).
        **kwargs: forwarded to `_create_fcs`; must supply `split_size`,
            `num_boxes` and `num_classes`.
    """

    def __init__(self, in_channels=3, **kwargs):
        super().__init__()
        self.architecture = architecture_config
        self.in_channels = in_channels
        self.darknet = self._create_conv_layers(self.architecture)
        self.fcs = self._create_fcs(**kwargs)

    def forward(self, x):
        """Run the backbone, flatten everything but the batch dimension, apply the head."""
        features = self.darknet(x)
        flattened = torch.flatten(features, start_dim=1)
        return self.fcs(flattened)

    def _create_conv_layers(self, architecture):
        """Turn the architecture description into an nn.Sequential.

        Tuples become one CNNBlock, strings become a 2x2/stride-2 max-pool, and
        lists `[conv_a, conv_b, repeats]` become `repeats` copies of the two blocks.
        """

        def conv_from(channels_in, spec):
            # spec is (kernel_size, filters, stride, padding)
            return CNNBlock(
                channels_in, spec[1], kernel_size=spec[0], stride=spec[2], padding=spec[3],
            )

        blocks = []
        channels = self.in_channels

        for entry in architecture:
            kind = type(entry)

            if kind is tuple:
                blocks.append(conv_from(channels, entry))
                channels = entry[1]

            elif kind is str:
                blocks.append(nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2)))

            elif kind is list:
                first, second, num_repeats = entry[0], entry[1], entry[2]
                for _ in range(num_repeats):
                    blocks.append(conv_from(channels, first))
                    blocks.append(conv_from(first[1], second))
                    channels = second[1]

        return nn.Sequential(*blocks)

    def _create_fcs(self, split_size, num_boxes, num_classes):
        """Build the detection head.

        Layout: Flatten -> Linear(1024*S*S, 4096) -> Dropout(0.5) -> LeakyReLU(0.1)
        -> Linear(4096, S*S*(C + 5*B)).
        """
        grid_cells = split_size * split_size
        values_per_cell = num_classes + num_boxes * 5

        # forward() already flattens its input, so the leading nn.Flatten does no extra
        # work at runtime; it still occupies index 0 of the Sequential and therefore
        # determines the parameter names of the Linear layers in the state dict.
        head = [
            nn.Flatten(),
            nn.Linear(1024 * grid_cells, 4096),
            nn.Dropout(0.5),
            nn.LeakyReLU(0.1),
            nn.Linear(4096, grid_cells * values_per_cell),
        ]
        return nn.Sequential(*head)


In [None]:
# Testing the model
# Smoke test: a batch of two random 3x448x448 inputs should produce
# S*S*(C + 5*B) = 7*7*(20 + 10) = 1470 values per example.
model = Yolov1(split_size=7, num_boxes=2, num_classes=20)
x = torch.randn((2,3,448,448))
print(model(x).shape)

torch.Size([2, 1470])


In [None]:
# Generate the summary of the model
# `device` picks CUDA when available; model.to(device) moves the module's parameters
# in place, so `model` stays on that device after this cell.
from torchsummary import summary
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
summary(model.to(device), (3,448,448))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 64, 224, 224]           9,408
       BatchNorm2d-2         [-1, 64, 224, 224]             128
         LeakyReLU-3         [-1, 64, 224, 224]               0
          CNNBlock-4         [-1, 64, 224, 224]               0
         MaxPool2d-5         [-1, 64, 112, 112]               0
            Conv2d-6        [-1, 192, 112, 112]         110,592
       BatchNorm2d-7        [-1, 192, 112, 112]             384
         LeakyReLU-8        [-1, 192, 112, 112]               0
          CNNBlock-9        [-1, 192, 112, 112]               0
        MaxPool2d-10          [-1, 192, 56, 56]               0
           Conv2d-11          [-1, 128, 56, 56]          24,576
      BatchNorm2d-12          [-1, 128, 56, 56]             256
        LeakyReLU-13          [-1, 128, 56, 56]               0
         CNNBlock-14          [-1, 128,

### LOSS FUNCTION


In [None]:
import torch
import torch.nn as nn

class YoloLoss(nn.Module):
  """
  Sum-squared-error YOLOv1 loss.

  Expected layout of the last dimension of both `predictions` (after reshape)
  and `target`, with C = number of classes:
    [0:C]        class scores
    [C]          confidence of box 1
    [C+1:C+5]    box 1 as (x, y, w, h)
    [C+5]        confidence of box 2
    [C+6:C+10]   box 2 as (x, y, w, h)
  Only the first box slot of `target` is read. The loss itself handles exactly
  two predicted boxes per cell; `B` is only used to size the reshape.

  Parameters:
    S (int): grid size (the image is split into S x S cells)
    B (int): number of boxes predicted per cell
    C (int): number of classes
  """
  def __init__(self, S=7, B=2, C=20):
    super(YoloLoss, self).__init__()
    self.mse = nn.MSELoss(reduction="sum") # reduction="sum" indicates that the losses will be summed up
    self.S = S
    self.B = B
    self.C = C

    self.lambda_noobj = 0.5
    self.lambda_coord = 5

  def forward(self, predictions, target):
    """
    Parameters:
      predictions (tensor): network output, reshaped here to (N, S, S, C + 5*B)
      target (tensor): label tensor indexed as (N, S, S, >= C + 5)

    Returns:
      tensor: scalar loss = lambda_coord * box + object + lambda_noobj * no-object + class
    """
    # Channel offsets are derived from C so that the loss follows the configured
    # number of classes instead of assuming 20.
    conf1 = self.C
    box1 = slice(self.C + 1, self.C + 5)
    conf2 = self.C + 5
    box2 = slice(self.C + 6, self.C + 10)

    predictions = predictions.reshape(-1, self.S, self.S, self.C + self.B*5)

    iou_b1 = intersection_over_union(predictions[..., box1], target[..., box1])
    iou_b2 = intersection_over_union(predictions[..., box2], target[..., box1])

    ious = torch.cat([iou_b1.unsqueeze(0), iou_b2.unsqueeze(0)], dim=0)
    # bestbox is 0 where the first predicted box has the higher IoU, 1 where the second has
    iou_maxes, bestbox = torch.max(ious, dim=0)

    # Objectness indicator per cell, with a trailing dim so it broadcasts: (N, S, S, 1)
    exists_box = target[..., conf1].unsqueeze(3)

    #=======================#
    # FOR BOUNDING BOX LOSS #
    #=======================#
    box_predictions = exists_box * (
        (
          bestbox * predictions[..., box2] # if second bbox is the best
          + (1 - bestbox) * predictions[..., box1] # if first bbox is the best
        )
    )

    box_targets = exists_box * target[..., box1]

    # Square root of width and height; sign/abs keep the sqrt defined for negative
    # predictions and the 1e-6 keeps its gradient finite at zero.
    box_predictions[...,2:4] = torch.sign(box_predictions[...,2:4]) * torch.sqrt(
        torch.abs(box_predictions[...,2:4] + 1e-6))
    box_targets[..., 2:4] = torch.sqrt(box_targets[..., 2:4])

    # end_dim=-2 flattens every dimension except the last: (N,S,S,4) --> (N*S*S, 4)
    box_loss = self.mse(
            torch.flatten(box_predictions, end_dim=-2),
            torch.flatten(box_targets, end_dim=-2))

    #=================#
    # FOR OBJECT LOSS #
    #=================#
    pred_box = (
        bestbox*predictions[..., conf2:conf2 + 1] # if second box is best, take its confidence value
        + (1-bestbox)*predictions[..., conf1:conf1 + 1] # if first box is best, take its confidence value
    )

    object_loss = self.mse(
        torch.flatten(exists_box * pred_box),
        torch.flatten(exists_box * target[..., conf1:conf1 + 1])
    )

    #====================#
    # FOR NO OBJECT LOSS #
    #====================#

    # For first bbox
    no_object_loss = self.mse(
        torch.flatten((1 - exists_box) * predictions[..., conf1:conf1 + 1], start_dim=1), # (N,S,S,1) --> (N, S*S*1)
        torch.flatten((1 - exists_box) * target[..., conf1:conf1 + 1], start_dim=1)
    )

    # For second bbox
    no_object_loss += self.mse(
        torch.flatten((1 - exists_box) * predictions[..., conf2:conf2 + 1], start_dim=1),
        torch.flatten((1 - exists_box) * target[..., conf1:conf1 + 1], start_dim=1)
    )

    #================#
    # FOR CLASS LOSS #
    #================#
    class_loss = self.mse(
        torch.flatten(exists_box * predictions[..., :self.C], end_dim=-2), # (N,S,S,C)-->(N*S*S, C)
        torch.flatten(exists_box * target[..., :self.C], end_dim=-2)
    )

    # ACTUAL LOSS
    loss = (
        self.lambda_coord * box_loss
        + object_loss
        + self.lambda_noobj * no_object_loss
        + class_loss
    )

    return loss


### Utility functions

In [None]:
# Class index -> class name, used by plot_image to caption each box.
# The position of a name in the list is its class index.
img_labels = dict(enumerate([
    'aeroplane',
    'bicycle',
    'bird',
    'boat',
    'bottle',
    'bus',
    'car',
    'cat',
    'chair',
    'cow',
    'diningtable',
    'dog',
    'horse',
    'motorbike',
    'person',
    'pottedplant',
    'sheep',
    'sofa',
    'train',
    'tvmonitor',
]))

In [None]:
def intersection_over_union(boxes_preds, boxes_labels, box_format="midpoint"):
  """
  Calculates intersection over union

  Parameters:
    boxes_preds (tensor): Predictions of Bounding Boxes (..., 4)
    boxes_labels (tensor): Correct labels of Bounding Boxes (..., 4)
    box_format (str): "midpoint" if boxes are (x, y, w, h), "corners" if (x1, y1, x2, y2)

  Returns:
    tensor: Intersection over union for all examples, shape (..., 1)

  Raises:
    ValueError: if box_format is neither "midpoint" nor "corners"

  Note:
    The `...` is used for indexing all elements in the preceding dimensions
    and `0:1` represents the range of indices to extract along the last dimension,
    which keeps that dimension (size 1) in the result.
  """

  if box_format == 'midpoint':
    box1_x1 = boxes_preds[..., 0:1] - boxes_preds[..., 2:3] / 2
    box1_y1 = boxes_preds[..., 1:2] - boxes_preds[..., 3:4] / 2
    box1_x2 = boxes_preds[..., 0:1] + boxes_preds[..., 2:3] / 2
    box1_y2 = boxes_preds[..., 1:2] + boxes_preds[..., 3:4] / 2

    box2_x1 = boxes_labels[..., 0:1] - boxes_labels[..., 2:3] / 2
    box2_y1 = boxes_labels[..., 1:2] - boxes_labels[..., 3:4] / 2
    box2_x2 = boxes_labels[..., 0:1] + boxes_labels[..., 2:3] / 2
    box2_y2 = boxes_labels[..., 1:2] + boxes_labels[..., 3:4] / 2

  elif box_format == 'corners':
    box1_x1 = boxes_preds[..., 0:1]
    box1_y1 = boxes_preds[..., 1:2]
    box1_x2 = boxes_preds[..., 2:3]
    box1_y2 = boxes_preds[..., 3:4]

    box2_x1 = boxes_labels[..., 0:1]
    box2_y1 = boxes_labels[..., 1:2]
    box2_x2 = boxes_labels[..., 2:3]
    box2_y2 = boxes_labels[..., 3:4]

  else:
    raise ValueError(f"box_format must be 'midpoint' or 'corners', got {box_format!r}")

  # The intersection rectangle starts at the larger of the two top-left corners
  # and ends at the SMALLER of the two bottom-right corners.
  x1 = torch.max(box1_x1, box2_x1)
  y1 = torch.max(box1_y1, box2_y1)
  x2 = torch.min(box1_x2, box2_x2)
  y2 = torch.min(box1_y2, box2_y2)

  # .clamp(0) is for the case when they do not intersect
  intersection = (x2-x1).clamp(0) * (y2-y1).clamp(0) # length * breadth

  box1_area = abs((box1_x2 - box1_x1) * (box1_y2 - box1_y1)) # Absolute so that area is not negative
  box2_area = abs((box2_x2 - box2_x1) * (box2_y2 - box2_y1)) # Absolute so that area is not negative

  # IOU = Area of intersection / Area of Union (1e-6 avoids division by zero)
  return intersection / (box1_area + box2_area - intersection + 1e-6)


In [None]:
def non_max_supression(bboxes, iou_threshold, threshold, box_format="corners"):
  """
  Applies non-max suppression to the boxes of one image.

  Parameters:
    bboxes (list): list of bounding boxes, each [class, probability, b0, b1, b2, b3]
      where the last four values are interpreted according to box_format
    iou_threshold (float): a box is dropped when it has the same class as an
      already chosen box and its IoU with that box is >= this value
    threshold (float): boxes whose probability is not above this value are discarded first
    box_format (str): "midpoint" or "corners", forwarded to intersection_over_union

  Returns:
    list: the surviving boxes, ordered by descending probability
  """

  # Validate the input
  assert isinstance(bboxes, list)

  # Discard low-probability boxes, then process the rest from most to least confident
  candidates = sorted(
      (box for box in bboxes if box[1] > threshold),
      key=lambda box: box[1],
      reverse=True,
  )

  bboxes_after_nms = []

  while candidates:
    chosen_box = candidates.pop(0) # the most confident remaining box

    # Keep boxes of a different class, or of the same class with IoU below the threshold
    candidates = [
        box
        for box in candidates
        if box[0] != chosen_box[0]
        or intersection_over_union(
            torch.tensor(chosen_box[2:]),
            torch.tensor(box[2:]),
            box_format = box_format,
        ) < iou_threshold
    ]

    bboxes_after_nms.append(chosen_box)

  return bboxes_after_nms


# Correctly spelled alias; the original (misspelled) name is kept for existing callers.
non_max_suppression = non_max_supression

In [None]:
import torch
from collections import Counter

def mean_average_precision(pred_boxes, true_boxes, iou_threshold=0.5, box_format='midpoint', num_classes=20):
  """
  Calculates mean average precision over all classes that have ground truths.

  Parameters:
    pred_boxes (list): list of bounding boxes, each
      [train_idx, class_pred, prob_score, b0, b1, b2, b3]; the last four values
      are interpreted according to box_format
    true_boxes (list): same layout as pred_boxes, for the ground truths
    iou_threshold (float): a detection counts as correct when its best IoU exceeds this
    box_format (str): "midpoint" or "corners", forwarded to intersection_over_union
    num_classes (int): number of classes

  Returns:
    tensor: mean of the per-class average precisions; a zero tensor when no
      class has any ground-truth box
  """

  average_precisions = [] # To store average precisions of each class
  epsilon = 1e-6 # For numerical stability

  for c in range(num_classes):
    detections = [detection for detection in pred_boxes if detection[1] == c]
    ground_truths = [true_box for true_box in true_boxes if true_box[1] == c]

    total_true_bboxes = len(ground_truths)

    # If none exists for this class then we can safely skip
    if total_true_bboxes == 0:
      continue

    # Group the ground truths by image index once, keeping their order, so that a
    # position in gts_by_image[img] matches the same position in matched[img].
    gts_by_image = {}
    for gt in ground_truths:
      gts_by_image.setdefault(gt[0], []).append(gt)

    # matched[img][k] becomes 1 once ground truth k of that image has been claimed
    matched = {img: torch.zeros(len(gts)) for img, gts in gts_by_image.items()}

    # Sort the bboxes in descending order based on their probabilities
    detections.sort(key=lambda x: x[2], reverse=True)

    TP = torch.zeros((len(detections)))
    FP = torch.zeros((len(detections)))

    for detection_idx, detection in enumerate(detections):
      ground_truth_img = gts_by_image.get(detection[0], [])

      best_iou = 0
      best_gt_idx = -1

      # For selection of bbox having highest iou
      for idx, gt in enumerate(ground_truth_img):
        iou = intersection_over_union(
            torch.tensor(detection[3:]),
            torch.tensor(gt[3:]),
            box_format = box_format
        )

        if iou > best_iou:
          best_iou = iou
          best_gt_idx = idx

      # True positive only if the best ground truth overlaps enough AND has not
      # already been claimed by a more confident detection.
      if (
          best_gt_idx >= 0
          and best_iou > iou_threshold
          and matched[detection[0]][best_gt_idx] == 0
      ):
        TP[detection_idx] = 1
        matched[detection[0]][best_gt_idx] = 1 # Update that now it's covered
      else:
        FP[detection_idx] = 1

    # [1,1,0,1,0] --> [1,2,2,3,3]
    TP_cumsum = torch.cumsum(TP, dim=0)
    FP_cumsum = torch.cumsum(FP, dim=0)

    recalls = TP_cumsum / (total_true_bboxes + epsilon)
    precisions = torch.divide(TP_cumsum, (TP_cumsum + FP_cumsum + epsilon))

    # Start the precision-recall curve at (recall=0, precision=1)
    precisions = torch.cat((torch.tensor([1.0]), precisions))
    recalls = torch.cat((torch.tensor([0.0]), recalls))

    # Area under the precision-recall curve via the trapezoidal rule
    average_precisions.append(torch.trapz(precisions, recalls))

  # No class had ground truths: avoid dividing by zero
  if not average_precisions:
    return torch.tensor(0.0)

  # Return mAP
  return sum(average_precisions) / len(average_precisions)


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle

def plot_image(image, boxes):
    """Draw labelled bounding boxes on top of an image and show the figure.

    Args:
        image: anything np.array() turns into a (height, width, channels) array.
        boxes: iterable of [class, probability, x_mid, y_mid, width, height];
            the four box values are multiplied by the image width/height, so they
            are treated as fractions of the image size.

    Returns:
        The matplotlib figure that was drawn.
    """
    pixels = np.array(image)
    height, width, _ = pixels.shape

    fig, ax = plt.subplots(1)
    ax.imshow(pixels)

    label_style = {"facecolor": "white", "alpha": 0.7, "pad": 2}

    for entry in boxes:
        cls = int(entry[0])
        prob = entry[1]
        coords = entry[2:]
        assert len(coords) == 4, "Got more values than in x, y, w, h, in a box!"

        # coords[0]/coords[1] are the box midpoint, coords[2]/coords[3] its width/height;
        # matplotlib wants the upper-left corner instead of the midpoint.
        left = coords[0] - coords[2] / 2
        top = coords[1] - coords[3] / 2

        outline = Rectangle(
            (left * width, top * height),
            coords[2] * width,
            coords[3] * height,
            linewidth=1,
            edgecolor="r",
            facecolor="none",
        )
        ax.add_patch(outline)

        # Caption with class name and probability, placed 10 px above the box corner
        caption = f"{img_labels[cls]}: {prob:.2f}"
        ax.text(
            left * width,
            top * height - 10,
            caption,
            fontsize=10,
            color="r",
            verticalalignment="top",
            bbox=label_style,
        )

    plt.show()
    return fig

In [None]:
def get_bboxes(
    loader,
    model,
    iou_threshold,
    threshold,
    pred_format="cells",
    box_format="midpoint",
    device="cuda" if torch.cuda.is_available() else "cpu"):

  """
  Input images --> get all true boxes and predicted boxes

  Parameters:
    loader: iterable yielding (images, labels) batches
    model: network applied to each image batch
    iou_threshold (float): IoU threshold passed to non-max suppression
    threshold (float): confidence threshold for NMS; also the cut-off above
      which a label cell counts as a real ground-truth box
    pred_format (str): currently unused
    box_format (str): box format passed to non-max suppression
    device: device the batches are moved to before the forward pass

  Returns:
    (all_pred_boxes, all_true_boxes): two lists of [image_idx, class, score, 4 box values],
      where image_idx counts images across all batches

  The model is switched to eval mode while boxes are collected and is put back
  into whatever mode (train/eval) it was in when this function was called.
  """

  all_pred_boxes = []
  all_true_boxes = []

  was_training = model.training
  model.eval()
  train_idx = 0 # Running image index across all batches

  try:
    for x, labels in loader:
      x = x.to(device)
      labels = labels.to(device)

      with torch.no_grad():
        predictions = model(x)

      batch_size = x.shape[0]
      true_bboxes = cellboxes_to_boxes(labels)
      bboxes = cellboxes_to_boxes(predictions)

      # For every image in each batch --> NMS
      for idx in range(batch_size):
        nms_boxes = non_max_supression(
            bboxes[idx],
            iou_threshold=iou_threshold,
            threshold=threshold,
            box_format=box_format,
        )

        for nms_box in nms_boxes:
          all_pred_boxes.append([train_idx] + nms_box)

        for box in true_bboxes[idx]:
          # Label cells without an object have confidence 0 and are skipped here
          if box[1] > threshold:
            all_true_boxes.append([train_idx] + box)

        train_idx += 1
  finally:
    # Restore the caller's mode instead of forcing train mode
    model.train(was_training)

  return all_pred_boxes, all_true_boxes

In [None]:
def convert_cellboxes(predictions, S=7, C=20):
    """
    Converts bounding boxes output from Yolo with
    an image split size of S into entire image ratios
    rather than relative to cell ratios.

    Args:
        predictions: tensor with S*S*(C + 10) values per example (any shape that
            reshapes to (batch, S, S, C + 10)). Per cell: C class scores, then
            [confidence, x, y, w, h] for each of two boxes.
        S: grid size.
        C: number of classes.

    Returns:
        Tensor of shape (batch, S, S, 6) holding, per cell,
        [predicted class, best confidence, x, y, w, h]; the box is the one of the
        two candidates with the higher confidence, rescaled by 1/S.
    """

    predictions = predictions.to("cpu")
    batch_size = predictions.shape[0]
    predictions = predictions.reshape(batch_size, S, S, C + 10)

    conf1, conf2 = C, C + 5
    bboxes1 = predictions[..., C + 1:C + 5]
    bboxes2 = predictions[..., C + 6:C + 10]

    # Index (0 or 1) of the more confident of the two boxes in every cell
    scores = torch.stack((predictions[..., conf1], predictions[..., conf2]), dim=0)
    best_box = scores.argmax(0).unsqueeze(-1)
    best_boxes = bboxes1 * (1 - best_box) + best_box * bboxes2

    # cell_indices[b, i, j, 0] == j (column index); its permutation gives the row index
    cell_indices = torch.arange(S).repeat(batch_size, S, 1).unsqueeze(-1)
    x = 1 / S * (best_boxes[..., :1] + cell_indices)
    y = 1 / S * (best_boxes[..., 1:2] + cell_indices.permute(0, 2, 1, 3))
    w_y = 1 / S * best_boxes[..., 2:4]
    converted_bboxes = torch.cat((x, y, w_y), dim=-1)

    # argmax yields an integer tensor; cast it so the concatenation below is same-dtype
    predicted_class = predictions[..., :C].argmax(-1).unsqueeze(-1).to(predictions.dtype)
    best_confidence = torch.max(predictions[..., conf1], predictions[..., conf2]).unsqueeze(
        -1
    )
    converted_preds = torch.cat(
        (predicted_class, best_confidence, converted_bboxes), dim=-1
    )

    return converted_preds

In [None]:
def cellboxes_to_boxes(out, S=7):
    """Convert raw cell-wise output into per-image Python lists of boxes.

    Args:
        out: tensor accepted by convert_cellboxes; out.shape[0] is the batch size.
        S: grid size, forwarded to convert_cellboxes.

    Returns:
        A list with one entry per example; each entry is a list of S*S boxes and
        each box is a list of plain Python numbers as produced by convert_cellboxes.
    """
    num_examples = out.shape[0]
    converted_pred = convert_cellboxes(out, S=S).reshape(num_examples, S * S, -1)
    # Drop any fractional part from the class column
    converted_pred[..., 0] = converted_pred[..., 0].long()

    # tolist() yields the same nested lists as calling .item() on every element
    return converted_pred.tolist()

In [None]:
import torch
import os
import pandas as pd
from PIL import Image


class VOCDataset(torch.utils.data.Dataset):
  """
  Dataset pairing images with per-image label text files.

  Parameters:
    csv_file (str): CSV whose first column is the image file name and whose
      second column is the label file name
    img_dir (str): directory containing the images
    label_dir (str): directory containing the label files; each non-empty line
      holds `class x y width height`
    S (int): grid size
    B (int): boxes per cell (only sizes the label tensor)
    C (int): number of classes
    transform (callable): optional, applied to the image only

  Each item is (image, label_matrix) with label_matrix of shape (S, S, C + 5*B):
  [0:C] one-hot class, [C] objectness, [C+1:C+5] box (x, y, w, h) relative to the cell.
  """
  def __init__(self, csv_file, img_dir, label_dir, S=7, B=2, C=20, transform=None):
    self.annotations = pd.read_csv(csv_file)
    self.img_dir = img_dir
    self.label_dir = label_dir
    self.transform = transform
    self.S = S
    self.B = B
    self.C = C

  def __len__(self):
    return len(self.annotations)

  def __getitem__(self, index):
    label_path = os.path.join(self.label_dir, self.annotations.iloc[index,1])
    boxes = []

    with open(label_path) as f:
      for line in f:
        fields = line.split()
        if not fields:
          continue # tolerate blank lines
        # int(float(...)) accepts both "11" and "11.0" for the class index
        class_label = int(float(fields[0]))
        x, y, width, height = (float(value) for value in fields[1:])
        boxes.append((class_label, x, y, width, height))

    img_path = os.path.join(self.img_dir, self.annotations.iloc[index, 0])
    # Force three channels so every sample has the same channel count
    image = Image.open(img_path).convert("RGB")

    if self.transform:
      image = self.transform(image)

    label_matrix = torch.zeros((self.S, self.S, self.C + 5*self.B))
    for class_label, x, y, width, height in boxes:
      # i=cell row and j=cell column --> the cell in which the midpoint lies.
      # Presumably x and y are fractions of the image size in [0, 1]; a value of
      # exactly 1.0 would index one past the grid, hence the clamp.
      i = min(int(self.S * y), self.S - 1)
      j = min(int(self.S * x), self.S - 1)
      # Midpoint relative to the cell's own origin
      x_cell, y_cell = self.S * x - j, self.S * y - i
      # Size expressed in units of one cell
      width_cell, height_cell = (
          width * self.S,
          height * self.S
      )

      # Only one object per cell: the first box that lands in a cell wins
      if label_matrix[i, j, self.C] == 0:
        label_matrix[i, j, self.C] = 1 # This means that cell has object
        box_coordinates = torch.tensor(
            [x_cell, y_cell, width_cell, height_cell]
        )
        label_matrix[i, j, self.C + 1:self.C + 5] = box_coordinates
        label_matrix[i, j, class_label] = 1 # Specifying that particular class is present

    return image, label_matrix

### Model training

In [None]:
import torch
import torchvision.transforms as transforms
import torch.optim as optim
import torchvision.transforms.functional as FT
from tqdm import tqdm
from torch.utils.data import DataLoader
# from model import Yolov1
# from dataset import VOCDataset
# from utils import (
#     non_max_suppression,
#     mean_average_precision,
#     intersection_over_union,
#     cellboxes_to_boxes,
#     get_bboxes,
#     plot_image,
#     save_checkpoint,
#     load_checkpoint,
# )
# from loss import YoloLoss

# Fix the RNG seed so weight initialisation and shuffling are repeatable
seed = 123
torch.manual_seed(seed)

# Hyperparameters etc.
LEARNING_RATE = 0.001
# is_available must be *called*: the bare function object is always truthy,
# which selected "cuda" even on machines without a GPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
BATCH_SIZE = 64
PIN_MEMORY = True
LOAD_MODEL = False
WEIGHT_DECAY = 0
EPOCHS = 5
# Absolute path: Drive is mounted at /content/drive, so a leading "./" would
# resolve relative to the working directory instead.
MODEL_FILE = "/content/drive/MyDrive/weights/yolov1_pascal.pt"
IMG_DIR = "/content/data/images"
LABEL_DIR = "/content/data/labels"

In [None]:
# Resize every image to 448x448 (the input size used in the model smoke test above)
# and convert it to a tensor. Only the image is transformed; VOCDataset builds the
# label matrix from the label file independently.
transform = transforms.Compose([transforms.Resize((448, 448)), transforms.ToTensor()])

In [None]:
# Training and test datasets share the same image/label directories and transform;
# only the CSV listing the samples differs.
train_dataset = VOCDataset(
        "/content/data/train.csv",
        transform=transform,
        img_dir=IMG_DIR,
        label_dir=LABEL_DIR,
    )

test_dataset = VOCDataset(
        "/content/data/test.csv", transform=transform, img_dir=IMG_DIR, label_dir=LABEL_DIR,
    )

# Batches are reshuffled on every pass over the loader.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
)

# NOTE(review): the test loader is shuffled as well; it is not used by any cell
# visible in this notebook.
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
)

In [None]:
# 7x7 grid, 2 boxes per cell, 20 classes -> 1470 outputs per image.
model = Yolov1(split_size=7, num_boxes=2, num_classes=20).to(DEVICE)
optimizer = optim.Adam(
    model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY
)
loss_fn = YoloLoss()
# mode='max' because the scheduler is stepped with the training mAP (higher is better):
# the learning rate is multiplied by 0.1 after 3 epochs without improvement.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer, factor=0.1, patience=3, mode='max', verbose=True)


In [None]:
# One pass over train_loader per epoch, followed by an mAP evaluation on the
# same training data that drives the learning-rate scheduler.
for epoch in range(EPOCHS):
    loop = tqdm(train_loader, leave=True)
    mean_loss = []  # per-batch loss values of this epoch
    model.train()
    # for x,y in train_loader:
    for batch_idx, (x, y) in enumerate(loop):
        x = x.to(DEVICE)
        y = y.to(DEVICE)
        out = model(x)
        loss = loss_fn(out, y)
        mean_loss.append(loss.item())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()


        # Show the current batch loss in the progress bar
        loop.set_postfix(loss = loss.item())


    print(f"Mean loss was {sum(mean_loss)/len(mean_loss)}")

    # get_bboxes runs the model in eval mode internally and iterates the whole loader again
    pred_boxes, target_boxes = get_bboxes(
        train_loader, model, iou_threshold=0.5, threshold=0.4
    )

    mean_avg_prec = mean_average_precision(
        pred_boxes, target_boxes, iou_threshold=0.5, box_format="midpoint"
    )
    print(f"Train mAP: {mean_avg_prec}")
    # The scheduler was created with mode='max', so it tracks mAP rather than the loss
    scheduler.step(mean_avg_prec)


100%|██████████| 1/1 [00:00<00:00,  2.21it/s, loss=654]


Mean loss was 654.0750122070312
Train mAP: 0.0


100%|██████████| 1/1 [00:00<00:00,  2.63it/s, loss=2.32e+4]


Mean loss was 23201.404296875
Train mAP: 0.2376224249601364


100%|██████████| 1/1 [00:00<00:00,  2.69it/s, loss=1.01e+4]


Mean loss was 10123.587890625
Train mAP: 0.010914698243141174


100%|██████████| 1/1 [00:00<00:00,  2.70it/s, loss=4.4e+4]


Mean loss was 43997.53125
Train mAP: 0.05589340254664421


100%|██████████| 1/1 [00:00<00:00,  2.70it/s, loss=5.56e+4]


Mean loss was 55561.8046875
Train mAP: 0.041912227869033813


In [None]:
# Rebuild the training data pipeline used for the preview below.
train_dataset = VOCDataset(
        "/content/data/train.csv",
        transform=transform,
        img_dir=IMG_DIR,
        label_dir=LABEL_DIR,
    )


train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
)

# The class defined in this notebook is spelled `Yolov1`. num_classes must be 20:
# cellboxes_to_boxes decodes 30 values per cell (20 classes + 2 boxes * 5).
model = Yolov1(split_size=7, num_boxes=2, num_classes=20)
# Load onto the CPU first so the checkpoint can be read regardless of where it was
# saved, then move the model to the device the inputs are sent to.
model.load_state_dict(torch.load(MODEL_FILE, map_location=torch.device('cpu')))
model.to(DEVICE)


optimizer = optim.Adam(
    model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY
)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer, factor=0.1, patience=3, mode='max', verbose=True)
loss_fn = YoloLoss()

# Plot the predictions for the first image of each of the first few batches.
NUM_PREVIEW_BATCHES = 5

# Inference only: eval mode disables dropout and uses the batch-norm running
# statistics; no_grad avoids building the autograd graph.
model.eval()
with torch.no_grad():
  for index, (x, y) in enumerate(train_loader):
    if index == NUM_PREVIEW_BATCHES:
      break
    x = x.to(DEVICE)
    bboxes = cellboxes_to_boxes(model(x))
    # `non_max_supression` is the spelling under which this notebook defines the function
    bboxes = non_max_supression(bboxes[0], iou_threshold=0.5, threshold=0.4, box_format="midpoint")
    # plot_image expects (height, width, channels) on the CPU
    plot_image(x[0].permute(1,2,0).to("cpu"), bboxes)