In [None]:
import torch
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from collections import Counter
import torch.nn as nn
import torchvision.transforms as transforms
from torchvision.transforms import Compose, Resize, RandomHorizontalFlip, RandomVerticalFlip, RandomRotation, ColorJitter, RandomResizedCrop, RandomAffine, ToTensor
import torch.optim as optim
import torchvision.transforms.functional as FT
from tqdm import tqdm
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader
import torchvision.models as models
import os
import pandas as pd
from PIL import Image
from torch.optim.lr_scheduler import MultiStepLR
import torchvision.transforms.functional as TF
import random
import torch
import torchvision.models as models

# NOTE: no model is loaded in this cell; the pre-trained ResNet-50 backbone is created in the configuration cell below.

In [None]:

# Release cached, currently unused GPU memory held by PyTorch's CUDA allocator
# (useful when re-running the notebook in a long-lived kernel).
torch.cuda.empty_cache()


In [None]:
# ---------------------------------------------------------------------------
# Reproducibility: seed every RNG the notebook imports, not only torch.
# ---------------------------------------------------------------------------
seed = 123
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.empty_cache()

# Mark the start of a run in the (append-only) results log.
with open('results_new.txt', 'a') as file:
    file.write("Started\n")

# ---------------------------------------------------------------------------
# Hyperparameters etc.
# ---------------------------------------------------------------------------
LEARNING_RATE = 2e-5
# `is_available` must be *called*: the bare function object is always truthy,
# which previously forced DEVICE to "cuda" even on CPU-only machines.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
BATCH_SIZE = 16 # 64 in original paper but I don't have that much vram, grad accum?
WEIGHT_DECAY = 0.005
EPOCHS = 10
MOMEMTUM=0.9  # (sic) used as Adam's beta1; name kept because other cells reference it
NUM_WORKERS=2
MILESTONES=[75,105]  # only used by the (currently disabled) MultiStepLR scheduler
PIN_MEMORY = False
LOAD_MODEL = False
LOAD_MODEL_FILE = "over.pth.tar"
IMG_DIR = "data/images"
LABEL_DIR = "data/labels"

# ---------------------------------------------------------------------------
# Frozen feature extractor: ImageNet-pretrained ResNet-50 without its final
# average-pool and fully-connected layers (the last two children).
# ---------------------------------------------------------------------------
resnet=models.resnet50(pretrained=True).to(DEVICE)
for param in resnet.parameters():
    param.requires_grad = False
resnet=nn.Sequential(*list(resnet.children())[:-2]).to(DEVICE)
# The backbone is a module-level global, not a sub-module of the detector, so
# `model.train()` / `model.eval()` never reach it and its weights are not part
# of the saved checkpoint. Put it in eval mode once so its BatchNorm layers use
# the pretrained running statistics instead of silently updating them.
resnet.eval()

print(DEVICE)

In [None]:
def IOU(boxes_preds, boxes_labels, box_format="midpoint"):
    """
    Calculates intersection over union

    Parameters:
        boxes_preds (tensor): Predictions of Bounding Boxes (..., 4)
        boxes_labels (tensor): Correct labels of Bounding Boxes (..., 4)
        box_format (str): midpoint/corners, if boxes (x,y,w,h) or (x1,y1,x2,y2)

    Returns:
        tensor: Intersection over union for all examples, shape (..., 1)

    Raises:
        ValueError: if box_format is neither "midpoint" nor "corners"
    """

    if box_format == "midpoint":
        # Convert (centre x, centre y, width, height) to corner coordinates.
        box1_x1 = boxes_preds[..., 0:1] - boxes_preds[..., 2:3] / 2
        box1_y1 = boxes_preds[..., 1:2] - boxes_preds[..., 3:4] / 2
        box1_x2 = boxes_preds[..., 0:1] + boxes_preds[..., 2:3] / 2
        box1_y2 = boxes_preds[..., 1:2] + boxes_preds[..., 3:4] / 2
        box2_x1 = boxes_labels[..., 0:1] - boxes_labels[..., 2:3] / 2
        box2_y1 = boxes_labels[..., 1:2] - boxes_labels[..., 3:4] / 2
        box2_x2 = boxes_labels[..., 0:1] + boxes_labels[..., 2:3] / 2
        box2_y2 = boxes_labels[..., 1:2] + boxes_labels[..., 3:4] / 2
    elif box_format == "corners":
        # Slices (rather than integer indices) keep the trailing dimension.
        box1_x1 = boxes_preds[..., 0:1]
        box1_y1 = boxes_preds[..., 1:2]
        box1_x2 = boxes_preds[..., 2:3]
        box1_y2 = boxes_preds[..., 3:4]
        box2_x1 = boxes_labels[..., 0:1]
        box2_y1 = boxes_labels[..., 1:2]
        box2_x2 = boxes_labels[..., 2:3]
        box2_y2 = boxes_labels[..., 3:4]
    else:
        # Previously an unknown format fell through and failed later with an
        # unhelpful UnboundLocalError.
        raise ValueError(
            f"box_format must be 'midpoint' or 'corners', got {box_format!r}"
        )

    # Corners of the intersection rectangle.
    x1 = torch.max(box1_x1, box2_x1)
    y1 = torch.max(box1_y1, box2_y1)
    x2 = torch.min(box1_x2, box2_x2)
    y2 = torch.min(box1_y2, box2_y2)

    # .clamp(0) is for the case when they do not intersect
    intersection = (x2 - x1).clamp(0) * (y2 - y1).clamp(0)

    box1_area = abs((box1_x2 - box1_x1) * (box1_y2 - box1_y1))
    box2_area = abs((box2_x2 - box2_x1) * (box2_y2 - box2_y1))

    # The small constant avoids division by zero for degenerate boxes.
    return intersection / (box1_area + box2_area - intersection + 1e-6)

In [None]:

def NMS(bboxes, iou_threshold, threshold, box_format="corners"):
    """
    Does Non Max Suppression given bboxes

    Parameters:
        bboxes (list): list of lists containing all bboxes with each bboxes
        specified as [class_pred, prob_score, x1, y1, x2, y2]
        iou_threshold (float): a lower-scoring box of the same class is
        suppressed when its IoU with a kept box is >= this value
        threshold (float): threshold to remove predicted bboxes (independent of IoU)
        box_format (str): "midpoint" or "corners" used to specify bboxes

    Returns:
        list: bboxes after performing NMS given a specific IoU threshold
    """
    # Drop low-confidence boxes, then visit the rest from most to least confident.
    candidates = sorted(
        (box for box in bboxes if box[1] > threshold),
        key=lambda box: box[1],
        reverse=True,
    )
    bboxes_after_nms = []

    while candidates:
        chosen_box = candidates.pop(0)
        chosen_coords = torch.tensor(chosen_box[2:])
        # Keep boxes of a different class, or boxes of the same class that do
        # not overlap the chosen one too much. `box_format` is now forwarded:
        # it used to be ignored, so IoU was always computed as "midpoint".
        candidates = [
            box
            for box in candidates
            if box[0] != chosen_box[0]
            or IOU(chosen_coords, torch.tensor(box[2:]), box_format=box_format)
            < iou_threshold
        ]
        bboxes_after_nms.append(chosen_box)

    return bboxes_after_nms


In [None]:
def mean_average_precision(
    pred_boxes, true_boxes, iou_threshold=0.5, box_format="midpoint", num_classes=20
):
    """
    Calculates mean average precision

    Given pred_boxes is boxes over many images

    Parameters:
        pred_boxes (list): list of lists containing all bboxes with each bboxes
        specified as [train_idx, class_prediction, prob_score, x1, y1, x2, y2]
        true_boxes (list): Similar as pred_boxes except all the correct ones
        iou_threshold (float): threshold where predicted bboxes is correct
        box_format (str): "midpoint" or "corners" used to specify bboxes
        num_classes (int): number of classes

    Returns:
        tensor: scalar mAP averaged over the classes that have at least one
        ground-truth box; 0 when no class has any ground truth (this case
        previously raised ZeroDivisionError)
    """
    average_precisions = []  # one AP value per class that has ground truth
    epsilon = 1e-6

    for c in range(num_classes):
        # Only the predictions and targets that belong to the current class.
        detections = [det for det in pred_boxes if det[1] == c]
        ground_truths = [gt for gt in true_boxes if gt[1] == c]
        total_true_bboxes = len(ground_truths)

        # If none exists for this class then we can safely skip
        if total_true_bboxes == 0:
            continue

        # Group ground truths by image once (instead of rescanning the whole
        # list for every detection) and keep a per-image "already matched"
        # flag vector, e.g. {0: tensor([0,0,0]), 1: tensor([0,0,0,0,0])}.
        gts_by_image = {}
        for gt in ground_truths:
            gts_by_image.setdefault(gt[0], []).append(gt)
        matched = {
            img_idx: torch.zeros(len(img_gts))
            for img_idx, img_gts in gts_by_image.items()
        }

        # sort by box probabilities which is index 2
        detections.sort(key=lambda x: x[2], reverse=True)
        TP = torch.zeros((len(detections)))
        FP = torch.zeros((len(detections)))

        for detection_idx, detection in enumerate(detections):
            # Only the ground truths from the same image as the detection.
            image_gts = gts_by_image.get(detection[0], [])
            detection_box = torch.tensor(detection[3:])

            best_iou = 0
            best_gt_idx = None  # stays None when nothing overlaps
            for idx, gt in enumerate(image_gts):
                iou = IOU(detection_box, torch.tensor(gt[3:]), box_format=box_format)
                if iou > best_iou:
                    best_iou = iou
                    best_gt_idx = idx

            # A true positive needs enough overlap with a ground truth that
            # has not already been claimed by a higher-scoring detection.
            if (
                best_gt_idx is not None
                and best_iou > iou_threshold
                and matched[detection[0]][best_gt_idx] == 0
            ):
                TP[detection_idx] = 1
                matched[detection[0]][best_gt_idx] = 1
            else:
                FP[detection_idx] = 1

        TP_cumsum = torch.cumsum(TP, dim=0)
        FP_cumsum = torch.cumsum(FP, dim=0)
        recalls = TP_cumsum / (total_true_bboxes + epsilon)
        precisions = torch.divide(TP_cumsum, (TP_cumsum + FP_cumsum + epsilon))
        # Prepend the (recall=0, precision=1) point so the curve starts on the axis.
        precisions = torch.cat((torch.tensor([1.0]), precisions))
        recalls = torch.cat((torch.tensor([0.0]), recalls))
        # torch.trapz for numerical integration
        average_precisions.append(torch.trapz(precisions, recalls))

    if not average_precisions:
        return torch.tensor(0.0)

    return sum(average_precisions) / len(average_precisions)

In [None]:
import torch
import torch.nn as nn

class Yolov1(nn.Module):
    """YOLOv1 detection head applied on top of the module-level `resnet` backbone.

    The original Darknet convolutional stack has been replaced by the frozen,
    pre-trained feature extractor defined in the configuration cell; this module
    only owns the fully connected head (still stored as `self.darknet` so the
    attribute name and state-dict keys stay unchanged).

    Parameters:
        in_channels (int): kept for interface compatibility; not used by the head
        split_size (int): grid size S (the image is divided into S x S cells)
        num_boxes (int): number of boxes B predicted per cell
        num_classes (int): number of classes C
        fc_in_features (int): flattened size of the backbone output for one
            image. The default is 2048 * 14 * 14, which is what ResNet-50
            without its last two layers produces for a 448x448 input (total
            stride 32). The previously hard-coded 802816 is twice that and makes
            the first Linear layer fail with a shape mismatch.
    """

    def __init__(
        self,
        in_channels=3,
        split_size=5,
        num_boxes=8,
        num_classes=20,
        fc_in_features=2048 * 14 * 14,
    ):
        super(Yolov1, self).__init__()
        self.darknet = nn.Sequential(
            # Fully connected layers
            nn.Flatten(),
            nn.Dropout(0.5),
            nn.Linear(fc_in_features, 496),
            nn.LeakyReLU(0.1),
            # One (C + B * 5)-vector per grid cell, flattened.
            nn.Linear(496, split_size * split_size * (num_classes + num_boxes * 5)),
        )

    def forward(self, x):
        """Return raw predictions of shape (batch, S * S * (C + B * 5)) for images `x`."""
        # `resnet` is the global backbone; it is not a sub-module of this model.
        return self.darknet(resnet(x))

# def test():
#     model = Yolov1(split_size=7, num_boxes=2, num_classes=20)
#     x = torch.randn((2, 3, 448, 448))
#     print(model(x).shape)

# test()

In [None]:

class YoloLoss(nn.Module):
    """
    Calculate the loss for yolo (v1) model

    The per-cell vector layout is
    [C class scores, conf_1, x_1, y_1, w_1, h_1, conf_2, x_2, y_2, w_2, h_2].
    The channel offsets are derived from C (they used to be hard-coded for
    C = 20). The implementation compares exactly two predicted boxes per cell,
    so it is only meaningful for B = 2.
    """

    def __init__(self, S=7, B=2, C=20):
        super(YoloLoss, self).__init__()
        self.mse = nn.MSELoss(reduction="sum")

        # S is split size of image (in paper 7),
        # B is number of boxes (in paper 2),
        # C is number of classes (in paper and VOC dataset is 20).
        self.S = S
        self.B = B
        self.C = C

        # These are from Yolo paper, signifying how much we should
        # pay loss for no object (noobj) and the box coordinates (coord)
        self.lambda_noobj = 0.5
        self.lambda_coord = 5

    def forward(self, predictions, target):
        """Return the summed YOLOv1 loss for a batch.

        Parameters:
            predictions (tensor): network output, reshaped here to
                (-1, S, S, C + B * 5)
            target (tensor): label grid of shape (batch, S, S, C + B * 5); only
                the first box slot (confidence + 4 coordinates) is read

        Returns:
            tensor: scalar loss (sum over the batch, not a mean)
        """
        C = self.C
        conf1, box1 = slice(C, C + 1), slice(C + 1, C + 5)
        conf2, box2 = slice(C + 5, C + 6), slice(C + 6, C + 10)

        # predictions are shaped (BATCH_SIZE, S*S(C+B*5) when inputted
        predictions = predictions.reshape(-1, self.S, self.S, C + self.B * 5)

        # Calculate IoU for the two predicted bounding boxes with target bbox
        iou_b1 = IOU(predictions[..., box1], target[..., box1])
        iou_b2 = IOU(predictions[..., box2], target[..., box1])
        ious = torch.cat([iou_b1.unsqueeze(0), iou_b2.unsqueeze(0)], dim=0)

        # bestbox holds 0 or 1 per cell: which predicted box has the higher IoU.
        _, bestbox = torch.max(ious, dim=0)
        exists_box = target[..., C].unsqueeze(3)  # in paper this is Iobj_i

        # ======================== #
        #   FOR BOX COORDINATES    #
        # ======================== #

        # Set boxes with no object in them to 0. We only take out one of the two
        # predictions, which is the one with highest Iou calculated previously.
        box_predictions = exists_box * (
            bestbox * predictions[..., box2]
            + (1 - bestbox) * predictions[..., box1]
        )
        box_targets = exists_box * target[..., box1]

        # Square-root width/height so errors on small boxes weigh more; the
        # sign/abs pair keeps the root defined for negative raw predictions.
        box_predictions[..., 2:4] = torch.sign(box_predictions[..., 2:4]) * torch.sqrt(
            torch.abs(box_predictions[..., 2:4] + 1e-6)
        )
        box_targets[..., 2:4] = torch.sqrt(box_targets[..., 2:4])

        box_loss = self.mse(
            torch.flatten(box_predictions, end_dim=-2),
            torch.flatten(box_targets, end_dim=-2),
        )

        # ==================== #
        #   FOR OBJECT LOSS    #
        # ==================== #

        # pred_box is the confidence score for the bbox with highest IoU
        pred_box = (
            bestbox * predictions[..., conf2] + (1 - bestbox) * predictions[..., conf1]
        )

        object_loss = self.mse(
            torch.flatten(exists_box * pred_box),
            torch.flatten(exists_box * target[..., conf1]),
        )

        # ======================= #
        #   FOR NO OBJECT LOSS    #
        # ======================= #

        # Both predicted confidences are pushed towards the target confidence
        # in cells that contain no object.
        no_object_loss = self.mse(
            torch.flatten((1 - exists_box) * predictions[..., conf1], start_dim=1),
            torch.flatten((1 - exists_box) * target[..., conf1], start_dim=1),
        )

        no_object_loss += self.mse(
            torch.flatten((1 - exists_box) * predictions[..., conf2], start_dim=1),
            torch.flatten((1 - exists_box) * target[..., conf1], start_dim=1),
        )

        # ================== #
        #   FOR CLASS LOSS   #
        # ================== #

        class_loss = self.mse(
            torch.flatten(exists_box * predictions[..., :C], end_dim=-2),
            torch.flatten(exists_box * target[..., :C], end_dim=-2),
        )

        loss = (
            self.lambda_coord * box_loss  # first two rows in paper
            + object_loss  # third row in paper
            + self.lambda_noobj * no_object_loss  # forth row
            + class_loss  # fifth row
        )

        return loss


In [None]:
class VOCDataset(torch.utils.data.Dataset):
    """Dataset yielding (image, label grid) pairs for YOLOv1 training.

    Parameters:
        csv_file (str): CSV whose first column is the image file name and whose
            second column is the label file name (read positionally)
        img_dir (str): directory containing the images
        label_dir (str): directory containing the label text files; each line
            is "class x y width height"
        S, B, C (int): grid size, boxes per cell and number of classes
        transform (callable): optional; called as transform(image, boxes) and
            must return (image, boxes)
    """

    def __init__(
        self, csv_file, img_dir, label_dir, S=7, B=2, C=20, transform=None,
    ):
        self.annotations = pd.read_csv(csv_file)
        self.img_dir = img_dir
        self.label_dir = label_dir
        self.transform = transform
        self.S = S
        self.B = B
        self.C = C

    def __len__(self):
        return len(self.annotations)

    @staticmethod
    def _parse_label_line(line):
        """Parse one "class x y width height" line into [int, float, float, float, float]."""
        class_label, x, y, width, height = line.split()
        # Go through float() first: the previous int(text) call raised
        # ValueError for integral values written as e.g. "1.0".
        return [int(float(class_label)), float(x), float(y), float(width), float(height)]

    def _encode_boxes(self, boxes):
        """Build the (S, S, C + 5 * B) label grid from rows of [class, x, y, w, h]."""
        label_matrix = torch.zeros((self.S, self.S, self.C + 5 * self.B))
        last_cell = self.S - 1
        for box in boxes:
            class_label, x, y, width, height = box.tolist()
            class_label = int(class_label)

            # i,j represents the cell row and cell column. Clamp so that a
            # coordinate of exactly 1.0 (or slightly outside [0, 1]) lands in
            # an edge cell instead of indexing outside the grid.
            i = min(max(int(self.S * y), 0), last_cell)
            j = min(max(int(self.S * x), 0), last_cell)
            x_cell, y_cell = self.S * x - j, self.S * y - i

            # Width/height expressed in units of one cell: the box width as a
            # fraction of the image multiplied by the number of cells.
            width_cell, height_cell = width * self.S, height * self.S

            # If no object already found for specific cell i,j
            # Note: This means we restrict to ONE object per cell!
            if label_matrix[i, j, self.C] == 0:
                # Set that there exists an object
                label_matrix[i, j, self.C] = 1
                # Box coordinates
                label_matrix[i, j, self.C + 1:self.C + 5] = torch.tensor(
                    [x_cell, y_cell, width_cell, height_cell]
                )
                # Set one hot encoding for class_label
                label_matrix[i, j, class_label] = 1

        return label_matrix

    def __getitem__(self, index):
        label_path = os.path.join(self.label_dir, self.annotations.iloc[index, 1])
        with open(label_path) as f:
            # Blank lines (e.g. a trailing newline) are skipped.
            boxes = [self._parse_label_line(line) for line in f if line.strip()]

        img_path = os.path.join(self.img_dir, self.annotations.iloc[index, 0])
        # Force three channels: grayscale / RGBA files would otherwise produce
        # tensors the RGB backbone cannot consume.
        image = Image.open(img_path).convert("RGB")
        boxes = torch.tensor(boxes)

        if self.transform:
            image, boxes = self.transform(image, boxes)

        # Convert To Cells
        return image, self._encode_boxes(boxes)


In [None]:

def convert_cellboxes(predictions, S=7, C=20):
    """
    Converts bounding boxes output from Yolo with an image split size of S
    into entire image ratios rather than relative to cell ratios.

    Parameters:
        predictions (tensor): model output or label grid; anything that can be
            reshaped to (batch, S, S, C + 10)
        S (int): grid size. It is now honoured everywhere; the reshape and the
            cell-index grid used to be hard-coded to 7.
        C (int): number of class scores at the start of each cell vector

    Returns:
        tensor: (batch, S, S, 6) holding
            [predicted class, confidence, x, y, w, h] per cell, on the CPU
    """

    predictions = predictions.to("cpu")
    batch_size = predictions.shape[0]
    predictions = predictions.reshape(batch_size, S, S, -1)
    bboxes1 = predictions[..., C + 1:C + 5]
    bboxes2 = predictions[..., C + 6:C + 10]
    scores = torch.cat(
        (predictions[..., C].unsqueeze(0), predictions[..., C + 5].unsqueeze(0)), dim=0
    )
    # 0 or 1 per cell: which of the two boxes has the higher confidence.
    best_box = scores.argmax(0).unsqueeze(-1)
    best_boxes = bboxes1 * (1 - best_box) + best_box * bboxes2
    # cell_indices[b, i, j, 0] == j (column index); permuting axes 1 and 2
    # below yields the row index i for the y coordinate.
    cell_indices = torch.arange(S).repeat(batch_size, S, 1).unsqueeze(-1)
    x = 1 / S * (best_boxes[..., :1] + cell_indices)
    y = 1 / S * (best_boxes[..., 1:2] + cell_indices.permute(0, 2, 1, 3))
    w_y = 1 / S * best_boxes[..., 2:4]
    converted_bboxes = torch.cat((x, y, w_y), dim=-1)
    # Cast the integer class index so every concatenated part shares one dtype.
    predicted_class = (
        predictions[..., :C].argmax(-1).unsqueeze(-1).to(predictions.dtype)
    )
    best_confidence = torch.max(
        predictions[..., C], predictions[..., C + 5]
    ).unsqueeze(-1)
    converted_preds = torch.cat(
        (predicted_class, best_confidence, converted_bboxes), dim=-1
    )

    return converted_preds


def cellboxes_to_boxes(out, S=7, C=20):
    """Return, per image, a list of S * S boxes [class, confidence, x, y, w, h].

    Parameters:
        out (tensor): model output or label grid for a batch
        S (int): grid size; now forwarded to convert_cellboxes (it used to be
            dropped, so any S other than 7 could not work)
        C (int): number of classes

    Returns:
        list: nested Python lists of floats with shape [batch][S * S][6]
    """
    converted_pred = convert_cellboxes(out, S=S, C=C).reshape(out.shape[0], S * S, -1)
    # tolist() yields the same nested lists of Python floats as calling
    # .item() on every element, without the triple Python loop.
    return converted_pred.tolist()

def save_checkpoint(state, filename="my_checkpoint.pth.tar"):
    """Announce the save on stdout, then serialize `state` to `filename` with torch.save."""
    print("=> Saving checkpoint")
    torch.save(obj=state, f=filename)


def load_checkpoint(checkpoint, model, optimizer):
    """Restore `model` and `optimizer` from a checkpoint dict.

    The dict must hold the model weights under "state_dict" and the optimizer
    state under "optimizer"; the model is restored first.
    """
    print("=> Loading checkpoint")
    for receiver, key in ((model, "state_dict"), (optimizer, "optimizer")):
        receiver.load_state_dict(checkpoint[key])

In [None]:
def plot_image(image, boxes,id):
    """Draw bounding boxes on the image and save the figure as "<id>.png".

    Parameters:
        image: array-like of shape (height, width, channels)
        boxes (list): boxes given as [class, score, x_mid, y_mid, w, h] with
            coordinates expressed as fractions of the image size
        id: value used to build the output file name
    """
    im = np.array(image)
    height, width, _ = im.shape

    # Create figure and axes
    fig, ax = plt.subplots(figsize=(2,3))
    try:
        # Display the image
        ax.imshow(im)

        # After dropping class and score:
        # box[0] is x midpoint, box[2] is width
        # box[1] is y midpoint, box[3] is height
        for box in boxes:
            box = box[2:]
            assert len(box) == 4, "Got more values than in x, y, w, h, in a box!"
            upper_left_x = box[0] - box[2] / 2
            upper_left_y = box[1] - box[3] / 2
            # Rectangle takes the upper-left corner plus width/height in pixels.
            rect = patches.Rectangle(
                (upper_left_x * width, upper_left_y * height),
                box[2] * width,
                box[3] * height,
                linewidth=1,
                edgecolor="r",
                facecolor="none",
            )
            # Add the patch to the Axes
            ax.add_patch(rect)

        fig.savefig(f"{id}.png")
    finally:
        # Figures stay registered with pyplot until closed; without this every
        # call leaked one figure.
        plt.close(fig)

def get_bboxes(
    loader,
    model,
    iou_threshold,
    threshold,
    pred_format="cells",
    box_format="midpoint",
    device=None,
):
    """Collect NMS-filtered predictions and ground-truth boxes over a loader.

    Parameters:
        loader: iterable yielding (images, label grids) batches
        model: detector; switched to eval mode for the pass and put back into
            train mode before returning
        iou_threshold (float): IoU threshold forwarded to NMS
        threshold (float): confidence threshold forwarded to NMS and also used
            to keep only the label cells that contain an object
        pred_format (str): unused; kept for interface compatibility
        box_format (str): "midpoint" or "corners", forwarded to NMS
        device: where to run the model. When None (default) the device of the
            model's first parameter is used (CPU if it has none); the former
            default of "cuda" failed on CPU-only machines.

    Returns:
        tuple: (all_pred_boxes, all_true_boxes), each a list of
        [image_idx, class, confidence, x, y, w, h]
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else "cpu"

    all_pred_boxes = []
    all_true_boxes = []

    # make sure model is in eval before get bboxes
    model.eval()
    train_idx = 0  # running image index across all batches

    for x, labels in loader:
        x = x.to(device)

        with torch.no_grad():
            predictions = model(x)

        # Labels are decoded on the CPU, so they need not visit `device`.
        true_bboxes = cellboxes_to_boxes(labels)
        bboxes = cellboxes_to_boxes(predictions)

        for idx in range(x.shape[0]):
            nms_boxes = NMS(
                bboxes[idx],
                iou_threshold=iou_threshold,
                threshold=threshold,
                box_format=box_format,
            )
            all_pred_boxes.extend([train_idx] + nms_box for nms_box in nms_boxes)

            # Most label cells are empty (confidence 0); keep only real objects.
            all_true_boxes.extend(
                [train_idx] + box for box in true_bboxes[idx] if box[1] > threshold
            )

            train_idx += 1

    model.train()
    return all_pred_boxes, all_true_boxes

In [None]:

# class Compose(object):
#     def __init__(self, transforms):
#         self.transforms = transforms

#     def __call__(self, img, bboxes):
#         for t in self.transforms:
#             img, bboxes = t(img), bboxes

#         return img, bboxes
class Compose(object):
    """Chain image-only transforms while carrying bounding boxes along untouched.

    Note that this class shadows the `Compose` imported from torchvision.
    """

    def __init__(self, transforms):
        self.transforms = transforms

    def __call__(self, img, bboxes):
        # Each transform receives and returns the image only; the boxes are
        # handed back exactly as they came in.
        result = img
        for step in self.transforms:
            result = step(result)
        return result, bboxes

# class Resize(torch.nn.Module):
#     def __init__(self, size):
#         super().__init__()
#         self.size = size

#     def forward(self, img, bboxes):
#         img = TF.resize(img, self.size)
#         img = TF.to_tensor(img)
#         return img, bboxes

# class RandomHorizontalFlip(torch.nn.Module):
#     def __init__(self, p):
#         super().__init__()
#         self.p = p
    
#     def forward(self, img, bboxes):
#         if random.random() <= self.p:
#             img = TF.hflip(img)
#             for bbox in bboxes:
#                 bbox[1] = 1 - bbox[1]
            
#         return img, bboxes


# class ColorJitter(torch.nn.Module):
#     def __init__(self, brightness, saturation):
#         super().__init__()
#         self.transform = transforms.ColorJitter(
#             brightness=brightness,
#             saturation=saturation
#         )
    
#     def forward(self, img, bboxes):
#         return self.transform(img), bboxes


# class RandomAffine(torch.nn.Module):
#     def __init__(self, translate=None, scale=None):
#         super().__init__()
#         self.translate = translate
#         self.scale = scale

#     def forward(self, img, bboxes):
#         # print(bboxes.shape)
#         img_size = TF._get_image_size(img)
#         if self.translate is not None:
#             max_dx = float(self.translate[0])*img_size[0]
#             max_dy = float(self.translate[1])*img_size[1]
#             tx = int(round(torch.empty(1).uniform_(-max_dx, max_dx).item()))
#             ty = int(round(torch.empty(1).uniform_(-max_dy, max_dy).item()))
#             translations = (tx, ty)

#         else:
#             translations = (0,0)
#         if self.scale is not None:
#             scale = float(torch.empty(1).uniform_(self.scale[0], self.scale[1]).item())
#         else:
#             scale = 1.0
        
#         img = TF.affine(
#             img, 
#             translate=translations, 
#             scale=scale,
#             angle=0,
#             shear=0
#         )

#         for b, box in enumerate(bboxes):
#             x = bboxes[b,1]
#             y = bboxes[b,2]
#             x_prime = x - 0.5
#             y_prime = y - 0.5
            
#             x_prime = scale*x_prime + translations[0]/img_size[0]
#             y_prime = scale*y_prime + translations[1]/img_size[1]

#             bboxes[b,1] = x_prime + 0.5
#             bboxes[b,2] = y_prime + 0.5
#             bboxes[b,3] = bboxes[b,3]*scale
#             bboxes[b,4] = bboxes[b,4]*scale
#         return img, bboxes

# class ToTensor(torch.nn.Module):
#     def __init__(self):
#         super().__init__()
#         self.transform = transforms.ToTensor()
    
#     def forward(self, img, bboxes):
#         if isinstance(img, torch.Tensor):
#             return img, bboxes
#         else:
#             return self.transform(img), bboxes
        
# transform1 = Compose([
#     Resize((448,448)),
#     RandomAffine(
#         translate=(0.2,0.2),
#         scale=(0.5,0.5)
#     ),
#     RandomHorizontalFlip(p=0.5),
#     ColorJitter(
#         brightness=0.5,
#         saturation=0.5
#     ),
#     ToTensor()
# ])

# Preprocessing applied to every sample: resize the image to 448x448 and convert
# it to a tensor. `Compose` here is the class defined earlier in this cell, so
# the bounding boxes are passed through unchanged.
transform = Compose([Resize((448, 448)), ToTensor()])

In [None]:
def train_fn(train_loader, model, optimizer, loss_fn):
    """Train `model` for one pass over `train_loader` and log the mean loss.

    Parameters:
        train_loader: iterable yielding (images, targets) batches
        model: network to optimise
        optimizer: optimizer already bound to the model's parameters
        loss_fn: callable taking (predictions, targets) and returning a scalar

    Side effects:
        Appends "Mean loss was <value>" to results_new.txt. When the loader
        yields no batches the value is nan (this used to raise
        ZeroDivisionError).
    """
    loop = tqdm(train_loader, leave=True)
    batch_losses = []

    for x, y in loop:
        x, y = x.to(DEVICE), y.to(DEVICE)
        loss = loss_fn(model(x), y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Read the scalar once; .item() forces a device synchronisation.
        loss_value = loss.item()
        batch_losses.append(loss_value)

        # update progress bar
        loop.set_postfix(loss=loss_value)

    mean_loss = sum(batch_losses) / len(batch_losses) if batch_losses else float("nan")
    with open('results_new.txt', 'a') as file:
        file.write(f"Mean loss was {mean_loss}\n")

In [None]:
# ---------------------------------------------------------------------------
# Model, optimizer and loss
# ---------------------------------------------------------------------------
model = Yolov1(split_size=7, num_boxes=2, num_classes=20).to(DEVICE)
# Only the detection head is optimised: the ResNet backbone is a frozen global
# and therefore not part of model.parameters().
optimizer = optim.Adam(
    model.parameters(),
    lr=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    betas=(MOMEMTUM, 0.999),
)
loss_fn = YoloLoss()

if LOAD_MODEL:
    # map_location lets a checkpoint written on a GPU be restored on a
    # CPU-only machine (and vice versa).
    load_checkpoint(
        torch.load(LOAD_MODEL_FILE, map_location=DEVICE), model, optimizer
    )

# ---------------------------------------------------------------------------
# Data
# ---------------------------------------------------------------------------
# NOTE(review): the doubled ".csv.csv" extension looks like a typo - confirm
# the file name on disk before changing it.
train_dataset = VOCDataset(
    "data/train.csv.csv",
    transform=transform,
    img_dir=IMG_DIR,
    label_dir=LABEL_DIR,
)

test_dataset = VOCDataset(
    "data/test.csv",
    transform=transform,
    img_dir=IMG_DIR,
    label_dir=LABEL_DIR,
)

train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    pin_memory=PIN_MEMORY,
    shuffle=True,
    drop_last=True,
)

# Evaluation must see every test sample in a fixed order: no shuffling and no
# dropped final batch (both were enabled before, which skewed the test mAP).
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    pin_memory=PIN_MEMORY,
    shuffle=False,
    drop_last=False,
)

# ---------------------------------------------------------------------------
# Training loop
# ---------------------------------------------------------------------------
# Each epoch first measures train mAP of the current weights, checkpoints them
# if they are the best seen so far, then trains for one epoch.
# NOTE(review): because evaluation precedes training, the weights produced by
# the final epoch are never evaluated or checkpointed here.
bestcheckpoint = 0  # best train mAP seen so far
for epoch in range(EPOCHS):
    with torch.no_grad():
        pred_boxes, target_boxes = get_bboxes(
            train_loader, model, iou_threshold=0.3, threshold=0.2, device=DEVICE
        )

        mean_avg_prec = mean_average_precision(
            pred_boxes, target_boxes, iou_threshold=0.3, box_format="midpoint"
        )

        if mean_avg_prec > bestcheckpoint:
            checkpoint = {
                "state_dict": model.state_dict(),
                "optimizer": optimizer.state_dict(),
            }
            save_checkpoint(checkpoint, filename=LOAD_MODEL_FILE)
            bestcheckpoint = mean_avg_prec

    train_fn(train_loader, model, optimizer, loss_fn)

In [None]:


# Human-readable class names (20 entries). The commented-out visualisation code
# below indexes this list with the predicted class id; presumably the order
# matches the class ids used in the label files - TODO confirm.
object_categories = [
    "aeroplane",
    "bicycle",
    "bird",
    "boat",
    "bottle",
    "bus",
    "car",
    "cat",
    "chair",
    "cow",
    "diningtable",
    "dog",
    "horse",
    "motorbike",
    "person",
    "pottedplant",
    "sheep",
    "sofa",
    "train",
    "tvmonitor"
]


# for x, y in train_loader:
#         x = x.to(DEVICE)
#         for idx in range(8):
#             bboxes = cellboxes_to_boxes(model(x))
#             bboxes = NMS(bboxes[idx], iou_threshold=0.5, threshold=0.4, box_format="midpoint")
#             plot_image(x[idx].permute(1,2,0).to("cpu"), bboxes,idx)
#             print(object_categories[int(bboxes[0][0])])
#         break

In [None]:
# LOAD_MODEL_FILE="yolov1-vgg16.pt"
# load_checkpoint(torch.load(LOAD_MODEL_FILE), model, optimizer)
# Final evaluation on the test split with the weights currently held by `model`.
# The device is passed explicitly so the run does not depend on get_bboxes'
# default device.
pred_boxes, target_boxes = get_bboxes(
    test_loader, model, iou_threshold=0.5, threshold=0.4, device=DEVICE
)

mean_avg_prec = mean_average_precision(
    pred_boxes, target_boxes, iou_threshold=0.5, box_format="midpoint"
)

# Append the result to the shared results log.
with open('results_new.txt', 'a') as file:
    file.write(f"Test mAP: {mean_avg_prec}\n")

# loop = tqdm(test_loader, leave=True)
# mean_loss = []

# with torch.no_grad():
#     for batch_idx, (x, y) in enumerate(loop):
#         x, y = x.to(DEVICE), y.to(DEVICE)
#         out = model(x)
#         loss = loss_fn(out, y)
#         mean_loss.append(loss.item())

#         # update progress bar
#         loop.set_postfix(loss=loss.item())
# with open('results_new.txt', 'a') as file:
#         file.write(f"Mean loss was {sum(mean_loss)/len(mean_loss)}\n")
# print(f"Mean loss was {sum(mean_loss)/len(mean_loss)}")

