In [77]:
import torch
import torch.nn as nn
import pandas as pd
import math
import xml.etree.ElementTree as ET
from PIL import Image

In [7]:
# Setting hyperparameters
if torch.backends.mps.is_available():
    device = 'mps'
elif torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

CHECKPOINT_PTH = 'checkpoint.pth'

In [6]:
def save_checkpoint(epoch, model, optimizer,loss):
    checkpoint = {
    'epoch': epoch,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'loss': loss,
    }
    
    torch.save(checkpoint, CHECKPOINT_PTH)

def load_checkpoint():
    checkpoint = torch.load(CHECKPOINT_PTH)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    epoch = checkpoint['epoch']
    loss = checkpoint['loss']

In [57]:
class Yolo(nn.Module):
    def __init__(self, in_channels = 3, split_size = 7, n_bboxes = 2, n_classes = 20):
        super().__init__()
        self.conv_network = nn.Sequential(
            self._conv_layer(in_channels, 64, 7, stride = 2),
            self._get_max_pool(),
            self._conv_layer(64, 192, 3),
            self._get_max_pool(),
            self._conv_layer(192, 128, 1),
            self._conv_layer(128, 256, 3),
            self._conv_layer(256, 256, 1),
            self._conv_layer(256, 512, 3),
            self._get_max_pool(),
            self._get_four_conv_block_with_512_out(),
            self._conv_layer(512, 512, 1),
            self._conv_layer(512, 1024, 3),
            self._get_max_pool(),
            self._get_four_conv_block_with_1024_out(),
            self._conv_layer(1024, 1024, 3),
            self._conv_layer(1024, 1024, 3, stride = 2),
            self._conv_layer(1024, 1024, 3),
            self._conv_layer(1024, 1024, 3),
        )
        self.fc_network = nn.Sequential(
            nn.Flatten(),
            nn.Linear(1024*split_size*split_size, 4906),
            nn.LeakyReLU(0.1),
            nn.Dropout(p = 0.5),
            nn.Linear(4096, split_size * split_size * (n_bboxes * 5 + n_classes))
        )

    def _conv_layer(self,in_channels, out_channels, kernel_size, stride = 1):
        return nn.Sequential(
            nn.Conv2d(in_channels = in_channels, out_channels= out_channels, kernel_size = kernel_size, stride = stride),
            nn.LeakyReLU(negative_slope=0.1)
        )
    
    def _get_max_pool(self):
        return nn.MaxPool2d(kernel_size = 2)

    def _get_four_conv_block_with_512_out(self):
        return nn.Sequential(
            self._get_one_and_three_conv_with_512_out(), 
            self._get_one_and_three_conv_with_512_out(),
            self._get_one_and_three_conv_with_512_out(),
            self._get_one_and_three_conv_with_512_out()
        )

    def _get_one_and_three_conv_with_512_out(self):
        return nn.Sequential(
            nn.Conv2d(512, 256, 1), 
            nn.Conv2d(256, 512, 3)
        )

    def _get_four_conv_block_with_1024_out(self):
        return nn.Sequential(
            self._get_one_and_three_with_1024_out(),
            self._get_one_and_three_with_1024_out()
        )
        
    def _get_one_and_three_with_1024_out(self):
        return nn.Sequential(
            nn.Conv2d(1024, 512, 1), 
            nn.Conv2d(512, 1024, 3)
        )

    def forward(self, inputs):
        """
        inputs: (batch_size, image_len, image_width, channels)
        returns: (batch_size, split_size, split_size, n_bboxes*5 + n_classes)
        """
        conv_output = self.conv_network(inputs)
        return self.fc_network(conv_output)


In [15]:
def lr_scheduler(epoch):
    if epoch < 10:
        return 1e-3 + (1e-2 - 1e-3) * (epoch / 10)  # Linearly increase from 1e-3 to 1e-2
    elif epoch < 85:
        return 1e-2  # Keep constant at 1e-2
    elif epoch < 115:
        return 1e-3  # Decrease to 1e-3
    else:
        return 1e-4  # Decrease to 1e-4


class CustomLRScheduler:
    def __init__(self, optimizer, lr_func):
        self.optimizer = optimizer
        self.lr_func = lr_func

    def step(self, epoch):
        lr = self.lr_func(epoch)
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = lr

In [56]:
def intersection_over_union(box1, box2):
    """
    box1: (batch_size_1, 5)
    box2: (batch_size_2, 5)

    return: (batch_size_1, Batch_size_2) float representing iou
    """
    # Unpack the coordinates and dimensions
    x1_center, y1_center, w1, h1 = box1[..., 0:4].T
    x2_center, y2_center, w2, h2 = box2[..., 0:4].T

    x1_min, y1_min = x1_center - w1 / 2, y1_center - h1 / 2
    x1_max, y1_max = x1_center + w1 / 2, y1_center + h1 / 2
    x2_min, y2_min = x2_center - w2 / 2, y2_center - h2 / 2
    x2_max, y2_max = x2_center + w2 / 2, y2_center + h2 / 2

    inter_xmin = torch.max(x1_min.unsqueeze(1), x2_min.unsqueeze(0))
    inter_ymin = torch.max(y1_min.unsqueeze(1), y2_min.unsqueeze(0))
    inter_xmax = torch.min(x1_max.unsqueeze(1), x2_max.unsqueeze(0))
    inter_ymax = torch.min(y1_max.unsqueeze(1), y2_max.unsqueeze(0))

    inter_width = torch.clamp(inter_xmax - inter_xmin, min=0)
    inter_height = torch.clamp(inter_ymax - inter_ymin, min=0)
    inter_area = inter_width * inter_height

    box1_area = w1 * h1
    box2_area = w2 * h2

    union_area = box1_area.unsqueeze(1) + box2_area - inter_area

    iou = inter_area / torch.clamp(union_area, min=1e-8)

    return iou

def non_max_suppression(boxes, scores, iou_threshold):
    """
    Perform Non-Maximum Suppression (NMS) on the bounding boxes.

    Parameters:
    boxes: shape (batch_size, 5)
           Each row contains [x, y, w, h, score] coordinates of a bounding box.
    iou_threshold: float
                   IoU threshold for suppressing boxes.

    Returns:
    indices: list of int
             Indices of the bounding boxes to keep.
    """
    if len(boxes) == 0:
        return []

    # Convert boxes to numpy arrays (if not already)
    boxes = torch.tensor(boxes)
    scores = torch.tensor(scores)

    # Compute the bottom-right coordinates and area of each box
    x1 = boxes[:, 0]
    y1 = boxes[:, 1]
    w = boxes[:, 2]
    h = boxes[:, 3]
    x2 = x1 + w
    y2 = y1 + h
    areas = w * h

    # Get the indices of the boxes sorted by scores in descending order
    order = boxes[:, 4].argsort()[::-1]

    keep = []
    while order.size > 0:
        # The index of the current box with the highest score
        i = order[0]
        keep.append(i)

        # Compute the IoU of the kept box with the remaining boxes
        xx1 = np.maximum(x1[i], x1[order[1:]])
        yy1 = np.maximum(y1[i], y1[order[1:]])
        xx2 = np.minimum(x2[i], x2[order[1:]])
        yy2 = np.minimum(y2[i], y2[order[1:]])

        w_inter = np.maximum(0, xx2 - xx1)
        h_inter = np.maximum(0, yy2 - yy1)
        inter_area = w_inter * h_inter

        iou = inter_area / (areas[i] + areas[order[1:]] - inter_area)

        # Keep boxes with IoU less than the threshold
        inds = np.where(iou <= iou_threshold)[0]
        order = order[inds + 1]

    return keep

        

In [27]:
example = torch.tensor([[[1], [2], [3]],[[7],[8],[9]]])
print(example.shape)
example[...,0].unsqueeze(-1).shape

torch.Size([2, 3, 1])


torch.Size([2, 3, 1])

In [63]:
a = torch.tensor([[1, 2, -1, 2], [1,2,3, 2], [3, 0, 4, 2]])
b = torch.tensor([[3, 0, 4, 2],[3, 0, 4, 2]])

intersection_over_union(a,b).shape

torch.Size([3, 2])

In [69]:
test = torch.tensor([[0,0,0,0]])
(test == 0).float()

tensor([[1., 1., 1., 1.]])

In [72]:
class YoloLoss(nn.Module):
    def __init__(self, split_size = 7, n_bboxes = 2, n_classes = 20, lambda_coord = 5, lambda_noobj = 0.5):
        super().__init__()
        self.lambda_coord = lambda_coord
        self.lambda_noobj = lambda_noobj
        self.split_size = split_size
        self.n_bboxes = n_bboxes
        self.n_classes = n_classes
        self.mse_loss = nn.MSELoss(reduction="sum")

    def assign_boxes(self, pred_boxes, gt_boxes):
        """
        pred_boxes: (n_bboxes, 5)
        gt_boxes: (n_gt_boxes, 5)
        """
        iou_matrix = intersection_over_union(pred_boxes, gt_boxes) #(n_gt_boxes, n_bboxes)
        
        gt_assigned = torch.full((gt_boxes.size(0),), -1, dtype=torch.long)  # -1 means not assigned
        pred_assigned = torch.full((pred_boxes.size(0),), -1, dtype=torch.long)

        while iou_matrix.numel() > 0 and (iou_matrix != -1).any():

            max_iou, max_iou_indices = iou_matrix.max(dim=1)  # Get max IoU for each gt_box
            best_pred_idx = max_iou.argmax()  # The best predicted box index for the current max IoU
            best_gt_idx = max_iou_indices[best_pred_idx]  # The ground truth box index for the best predicted box
    
            # Assign the best predicted box to the ground truth box
            gt_assigned[best_gt_idx] = best_pred_idx
            pred_assigned[best_pred_idx] = best_gt_idx
    
            # Invalidate the assigned row and column in the IoU matrix to prevent reassignment
            iou_matrix[best_gt_idx, :] = -1
            iou_matrix[:, best_pred_idx] = -1

        return gt_assigned, pred_assigned

    def get_gt_boxes(self, target):
        object_exists = target[..., 4] == 1
        gt_boxes = target[..., :4][object_exists]
        gt_classes = target[..., 5:][object_exists]
        return gt_boxes, gt_classes
        
    def forward(self, predictions, target):
        """
        predictions: (batch_size, split_size, split_size, n_bboxes*5 + n_classes)
        target: (batch_size, split_size, split_size, n_bboxes, (4+1+n_classes)) the last field in the target tells us if object exists or not
        """
        batch_size = predictions.shape[0]
        predictions = predictions.view(batch_size, self.split_size, self.split_size, self.n_bboxes*5 + self.n_classes)

        pred_boxes = predictions[..., :self.n_bboxes * 5].view(batch_size, self.split_size, self.split_size, batch_size, self.B, 5)
        pred_classes = predictions[..., self.n_bboxes * 5:]

        gt_boxes, gt_classes = self.get_gt_boxes(target)

        if gt_boxes.size(0) == 0:
            return torch.tensor(0.0)

        gt_assigned, pred_assigned = self.assign_boxes(pred_boxes.view(-1, 4), gt_boxes)

        pred_box_coords = pred_boxes[..., :4]
        pred_box_conf = pred_boxes[..., 4]
        
        gt_box_coords = gt_boxes[gt_assigned]
        correct_pred = pred_assigned != -1
        incorrect_pred = pred_assigned == -1

        coord_loss = self.lambda_coord * (
            self.mse_loss(pred_box_coords[..., :2], gt_box_coords[..., :2]) +
            self.mse_loss(torch.sqrt(pred_box_coords[..., 2:]), torch.sqrt(gt_box_coords[..., 2:]))
        )
        object_loss = self.mse_loss(pred_box_conf[correct_pred], 
                                    torch.ones_like(pred_box_conf[correct_pred]))
        
        no_object_mask = (pred_assigned == -1).float()
        no_object_loss = self.lambda_noobj * self.mse_loss(pred_box_conf[incorrect_pred], 
                                                           torch.zeros_like(pred_box_conf[incorrect_pred]))

        class_loss = self.mse_loss(pred_classes, gt_classes)

        total_loss = coord_loss + object_loss + no_object_loss + class_loss

        return total_loss
                
        
    

In [102]:
annotations = "./VOCdevkit/VOC2012/Annotations/"
layout = "./VOCdevkit/VOC2012/ImageSets/Main/"
images = "./VOCdevkit/VOC2012/JPEGImages/"
def extract_filenames(train_file_path):
    filenames = []
    
    with open(train_file_path, 'r') as file:
        for line in file:
            parts = line.split()
            if parts:
                # The image ID is the first part of each line
                filename = parts[0]
                filenames.append(filename)
    
    return filenames

def parse_xml(xml_file):
    tree = ET.parse(xml_file)
    root = tree.getroot()

    filename = root.find('filename').text
    objects_and_bboxes = []

    for obj in root.findall('object'):
        # Extract object class
        obj_class = obj.find('name').text

        # Extract bounding box coordinates
        bndbox = obj.find('bndbox')
        xmin = int(bndbox.find('xmin').text)
        xmax = int(bndbox.find('xmax').text)
        ymin = int(bndbox.find('ymin').text)
        ymax = int(bndbox.find('ymax').text)

        # Calculate x_center, y_center, width, height
        x_center = (xmin + xmax) / 2
        y_center = (ymin + ymax) / 2
        width = xmax - xmin
        height = ymax - ymin

        # Store object and bounding box information
        objects_and_bboxes.append({
            'class': obj_class,
            'bbox': (x_center, y_center, width, height)
        })

    return filename, objects_and_bboxes
def load_image(image_file):
    return Image.open(image_file)
    
train_files = extract_filenames(layout + "train.txt")
val_files = extract_filenames(layout + "val.txt")

classes = set()
for filename in train_files:
    _, objects = parse_xml(annotations + filename + '.xml')
    classes.update([c['class'] for c in objects])
print({j:i for i, j in enumerate(classes)})
        

{'pottedplant': 0, 'car': 1, 'chair': 2, 'cow': 3, 'bird': 4, 'aeroplane': 5, 'diningtable': 6, 'train': 7, 'bus': 8, 'sheep': 9, 'cat': 10, 'tvmonitor': 11, 'dog': 12, 'sofa': 13, 'horse': 14, 'person': 15, 'boat': 16, 'motorbike': 17, 'bicycle': 18, 'bottle': 19}


In [None]:
class VOCDataset(torch.utils.data.Dataset):
    def __init__(self, data = train_files, files_dir = files_dir, split_size  7, n_bboxes = 2, n_classes = 3, class_dictionary = None, transform=None):
        self.data = data
        self.files_dir = files_dir
        self.transform = transform
        self.split_size = split_size
        self.n_bboxes = n_bboxes
        self.n_classes = n_classes
        if class_dictionary == None:
            self.class_dictionary = {j:i for i, clas in enumerate(classes)}
        else:
            self.class_dictionary = class_dictionary

    def __len__(self):
        return len(self.data)

    def __getitem__(self):
        