In [1]:
import torch
import torch.nn as nn
import pandas as pd
import math
import xml.etree.ElementTree as ET
from PIL import Image
from torch.utils.data import DataLoader
from tqdm import tqdm
import torchvision.transforms as transforms

In [2]:
class Yolo(nn.Module):
    def __init__(self, in_channels = 3, split_size = 7, n_bboxes = 2, n_classes = 20):
        super().__init__()
        self.conv_network = nn.Sequential(
            self._conv_layer(in_channels, 64, 7, stride = 2, padding = 3),
            self._get_max_pool(),
            self._conv_layer(64, 192, 3, 1, 1),
            self._get_max_pool(),
            self._conv_layer(192, 128, 1, 1, 0),
            self._conv_layer(128, 256, 3, 1, 1),
            self._conv_layer(256, 256, 1, 1, 0),
            self._conv_layer(256, 512, 3, 1, 1),
            self._get_max_pool(),
            self._get_four_conv_block_with_512_out(),
            self._conv_layer(512, 512, 1, 1, 0),
            self._conv_layer(512, 1024, 3, 1, 1),
            self._get_max_pool(),
            self._get_four_conv_block_with_1024_out(),
            self._conv_layer(1024, 1024, 3, 1, 1),
            self._conv_layer(1024, 1024, 3, stride = 2, padding =1),
            self._conv_layer(1024, 1024, 3, 1, 1),
            self._conv_layer(1024, 1024, 3, 1, 1),
        )
        self.fc_network = nn.Sequential(
            nn.Flatten(),
            nn.Linear(1024*split_size*split_size, 4096),
            nn.LeakyReLU(0.1),
            nn.Dropout(p = 0.5),
            nn.Linear(4096, split_size * split_size * (n_bboxes * 5 + n_classes))
        )

    def _conv_layer(self,in_channels, out_channels, kernel_size, stride = 1, padding = 0):
        return nn.Sequential(
            nn.Conv2d(in_channels = in_channels, out_channels= out_channels, kernel_size = kernel_size, stride = stride, padding = padding),
            nn.LeakyReLU(negative_slope=0.1)
        )
    
    def _get_max_pool(self):
        return nn.MaxPool2d(kernel_size = 2)

    def _get_four_conv_block_with_512_out(self):
        return nn.Sequential(
            self._get_one_and_three_conv_with_512_out(), 
            self._get_one_and_three_conv_with_512_out(),
            self._get_one_and_three_conv_with_512_out(),
            self._get_one_and_three_conv_with_512_out()
        )

    def _get_one_and_three_conv_with_512_out(self):
        return nn.Sequential(
            nn.Conv2d(512, 256, 1, 1, 0), 
            nn.Conv2d(256, 512, 3, 1, 1)
        )

    def _get_four_conv_block_with_1024_out(self):
        return nn.Sequential(
            self._get_one_and_three_with_1024_out(),
            self._get_one_and_three_with_1024_out()
        )
        
    def _get_one_and_three_with_1024_out(self):
        return nn.Sequential(
            nn.Conv2d(1024, 512, 1, 1, 0), 
            nn.Conv2d(512, 1024, 3, 1, 1)
        )

    def forward(self, inputs):
        """
        inputs: (batch_size, image_len, image_width, channels)
        returns: (batch_size, split_size, split_size, n_bboxes*5 + n_classes)
        """
        conv_output = self.conv_network(inputs)
        # conv_output = inputs
        # for layer in self.conv_network:
        #     conv_output = layer(conv_output)
        #     print(f'Layer {layer}: output shape = {conv_output.shape}')
        return self.fc_network(conv_output)


In [3]:
def lr_scheduler(epoch):
    if epoch < 10:
        return 1e-3 + (1e-2 - 1e-3) * (epoch / 10)  # Linearly increase from 1e-3 to 1e-2
    elif epoch < 85:
        return 1e-2  # Keep constant at 1e-2
    elif epoch < 115:
        return 1e-3  # Decrease to 1e-3
    else:
        return 1e-4  # Decrease to 1e-4


class CustomLRScheduler:
    def __init__(self, optimizer, lr_func):
        self.optimizer = optimizer
        self.lr_func = lr_func

    def step(self, epoch):
        lr = self.lr_func(epoch)
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = lr

In [4]:
a = torch.randn(1, 2, 2)
b = torch.randn(1, 2, 2)
torch.max(a.unsqueeze(-1),b.unsqueeze(-2)).shape

torch.Size([1, 2, 2, 1])
torch.Size([1, 2, 1, 2])


torch.Size([1, 2, 2, 2])

In [5]:
def intersection_over_union(box1, box2):
    """
    box1: (batch_size, split_size, split_size, n_boxes, 5)
    box2: (batch_size, split_size, split_size, n_boxes, 5)

    return: (batch_size, split_size, split_size, n_boxes, n_boxes) float representing iou
    """
    # Unpack the coordinates and dimensions
    x1_center, y1_center, w1, h1 = box1[..., 0], box1[..., 1], box1[..., 2], box1[..., 3]
    x2_center, y2_center, w2, h2 = box2[..., 0], box2[..., 1], box2[..., 2], box2[..., 3]

    x1_min, y1_min = x1_center - w1 / 2, y1_center - h1 / 2
    x1_max, y1_max = x1_center + w1 / 2, y1_center + h1 / 2
    x2_min, y2_min = x2_center - w2 / 2, y2_center - h2 / 2
    x2_max, y2_max = x2_center + w2 / 2, y2_center + h2 / 2

    inter_xmin = torch.max(x1_min.unsqueeze(-1), x2_min.unsqueeze(-2))
    inter_ymin = torch.max(y1_min.unsqueeze(-1), y2_min.unsqueeze(-2))
    inter_xmax = torch.min(x1_max.unsqueeze(-1), x2_max.unsqueeze(-2))
    inter_ymax = torch.min(y1_max.unsqueeze(-1), y2_max.unsqueeze(-2))

    inter_width = torch.clamp(inter_xmax - inter_xmin, min=0)
    inter_height = torch.clamp(inter_ymax - inter_ymin, min=0)
    inter_area = inter_width * inter_height

    box1_area = w1 * h1
    box2_area = w2 * h2

    union_area = box1_area.unsqueeze(-1) + box2_area.unsqueeze(-2) - inter_area

    iou = inter_area / torch.clamp(union_area, min=1e-8)

    return iou

def non_max_suppression(boxes, scores, iou_threshold):
    """
    Perform Non-Maximum Suppression (NMS) on the bounding boxes.

    Parameters:
    boxes: shape (batch_size, 5)
           Each row contains [x, y, w, h, score] coordinates of a bounding box.
    iou_threshold: float
                   IoU threshold for suppressing boxes.

    Returns:
    indices: list of int
             Indices of the bounding boxes to keep.
    """
    if len(boxes) == 0:
        return []

    # Convert boxes to numpy arrays (if not already)
    boxes = torch.tensor(boxes)
    scores = torch.tensor(scores)

    # Compute the bottom-right coordinates and area of each box
    x1 = boxes[:, 0]
    y1 = boxes[:, 1]
    w = boxes[:, 2]
    h = boxes[:, 3]
    x2 = x1 + w
    y2 = y1 + h
    areas = w * h

    # Get the indices of the boxes sorted by scores in descending order
    order = boxes[:, 4].argsort()[::-1]

    keep = []
    while order.size > 0:
        # The index of the current box with the highest score
        i = order[0]
        keep.append(i)

        # Compute the IoU of the kept box with the remaining boxes
        xx1 = np.maximum(x1[i], x1[order[1:]])
        yy1 = np.maximum(y1[i], y1[order[1:]])
        xx2 = np.minimum(x2[i], x2[order[1:]])
        yy2 = np.minimum(y2[i], y2[order[1:]])

        w_inter = np.maximum(0, xx2 - xx1)
        h_inter = np.maximum(0, yy2 - yy1)
        inter_area = w_inter * h_inter

        iou = inter_area / (areas[i] + areas[order[1:]] - inter_area)

        # Keep boxes with IoU less than the threshold
        inds = np.where(iou <= iou_threshold)[0]
        order = order[inds + 1]

    return keep

        

In [6]:
test =  torch.randn(2,2,3)
iou_matrix = torch.rand(2,2,3,3) # batch, split, box, box

values, indices = torch.max(iou_matrix[..., 1, :] * test, dim = -1)

test.scatter_(-1, indices.unsqueeze(-1), -1)

tensor([[[-0.1923, -0.0761, -0.3437],
         [ 1.3099,  0.0792,  1.6286]],

        [[ 0.3423,  1.0475, -0.3291],
         [ 0.5795,  0.5726, -1.1467]]])
tensor([[[[0.2036, 0.1570, 0.2786],
          [0.8855, 0.2941, 0.7706],
          [0.6602, 0.2587, 0.5265]],

         [[0.4236, 0.5903, 0.1919],
          [0.4959, 0.5569, 0.1747],
          [0.2349, 0.2248, 0.6572]]],


        [[[0.8155, 0.0224, 0.3115],
          [0.2994, 0.4425, 0.2509],
          [0.6869, 0.0724, 0.8641]],

         [[0.5276, 0.1692, 0.9147],
          [0.5977, 0.8769, 0.9470],
          [0.8026, 0.5486, 0.6304]]]])
tensor([[[-0.1703, -0.0224, -0.2649],
         [ 0.6496,  0.0441,  0.2845]],

        [[ 0.1025,  0.4635, -0.0826],
         [ 0.3464,  0.5022, -1.0859]]])
tensor([[-0.0224,  0.6496],
        [ 0.4635,  0.5022]]) tensor([[1, 0],
        [1, 1]])
torch.Size([2, 2, 1])


tensor([[[-0.1923, -1.0000, -0.3437],
         [-1.0000,  0.0792,  1.6286]],

        [[ 0.3423, -1.0000, -0.3291],
         [ 0.5795, -1.0000, -1.1467]]])

In [7]:
class YoloLoss(nn.Module):
    def __init__(self, split_size = 7, n_bboxes = 2, n_classes = 20, lambda_coord = 5, lambda_noobj = 0.5):
        super().__init__()
        self.lambda_coord = lambda_coord
        self.lambda_noobj = lambda_noobj
        self.split_size = split_size
        self.n_bboxes = n_bboxes
        self.n_classes = n_classes
        self.mse_loss = nn.MSELoss(reduction="sum")

    def assign_boxes(self, pred_boxes, gt_boxes, batch_size):
        gt_boxes_reshaped = gt_boxes.view(batch_size, self.split_size, self.split_size, self.n_bboxes, 5)
        pred_boxes_reshaped = pred_boxes.view(batch_size, self.split_size, self.split_size, self.n_bboxes, 5)

        iou_matrix = intersection_over_union(pred_boxes_reshaped, gt_boxes_reshaped)

        gt_assigned = torch.full((batch_size, self.split_size, self.split_size, self.n_bboxes,), -1, dtype=torch.long)  # -1 means not assigned
        pred_assigned = torch.full((batch_size, self.split_size, self.split_size, self.n_bboxes,), -1, dtype=torch.long)
        
        gt_available = torch.ones((batch_size, self.split_size, self.split_size, self.n_bboxes), dtype=torch.bool)
        gt_available = gt_available.to(device)
        unavailable_gt_box = (gt_boxes_reshaped[..., 4] == 0)
        gt_available[unavailable_gt_box] = 0

        for i in range(self.n_bboxes):
            iou_max_values, gt_indices = torch.max(iou_matrix[..., i, :] * gt_available.float(), dim=-1)

            pred_assigned[..., i] = gt_indices
            
            gt_available.scatter_(-1, gt_indices.unsqueeze(-1), -1)
        
            gt_assigned[..., gt_indices] = i

        return gt_assigned, pred_assigned

    def get_gt_boxes(self, target):
        """
        target: (batch_size, split_size, split_size, n_bboxes, (4+1+n_classes))
        """
        gt_boxes = target[..., :5]
        gt_classes = target[..., 5:]
        gt_classes = gt_classes.any(dim = 3).float()
        return gt_boxes, gt_classes
        
    def forward(self, predictions, target):
        """
        predictions: (batch_size, split_size, split_size, n_bboxes*5 + n_classes)
        target: (batch_size, split_size, split_size, n_bboxes, (4+1+n_classes)) the second last field in the target tells us if object exists or not
        """
        batch_size = predictions.shape[0]
        predictions = predictions.view(batch_size, self.split_size, self.split_size, self.n_bboxes*5 + self.n_classes)
        
        pred_boxes = predictions[..., :self.n_bboxes * 5].view(batch_size, self.split_size, self.split_size, self.n_bboxes, 5)
        pred_classes = predictions[..., self.n_bboxes * 5:]

        gt_boxes, gt_classes = self.get_gt_boxes(target)
        if gt_boxes.size(0) == 0:
            return torch.tensor(0.0)

        gt_assigned, pred_assigned = self.assign_boxes(pred_boxes, gt_boxes, batch_size)

        assigned_pred = pred_assigned != -1
        unassigned_pred = pred_assigned == -1

        pred_box_coords = pred_boxes[..., :4][assigned_pred]
        pred_box_conf = pred_boxes[..., 4]

        indices = torch.nonzero(assigned_pred, as_tuple=False)
        if indices.shape[0] == 0:
            print("gt assigned", (gt_assigned != -1).any())
            print("gt", target[target[..., 4] == 1])
            print("predictions nan", torch.isnan(predictions).any())
            print("gt nan", torch.isnan(target).any())
            print("pred", pred_boxes[target[..., 4] == 1])
            print("assigned pred", assigned_pred.any())
            print("pred_box", pred_boxes.shape)
            print("object exists", (target[..., 4] == 1).any())
        gt_box_coords = torch.stack([gt_boxes[batch, split1, split2, pred_assigned[batch, split1, split2, box],:-1] for batch, split1, split2, box in indices])
        # gt_class_assigned = torch.stack([gt_classes[batch, split1, split2, pred_assigned[batch, split1, split2, box]] for batch, split1, split2, box in indices])
        
        # gt_box_coords = gt_boxes[..., :4][pred_assigned[assigned_pred]]

        coord_loss = self.lambda_coord * (
            self.mse_loss(pred_box_coords[..., :2], gt_box_coords[..., :2]) +
            self.mse_loss(torch.sqrt(pred_box_coords[..., 2:]), torch.sqrt(gt_box_coords[..., 2:]))
        )
        object_loss = self.mse_loss(pred_box_conf[assigned_pred], 
                                    torch.ones_like(pred_box_conf[assigned_pred]))
        
        no_object_mask = (pred_assigned == -1).float()
        no_object_loss = self.lambda_noobj * self.mse_loss(pred_box_conf[unassigned_pred], 
                                                           torch.zeros_like(pred_box_conf[unassigned_pred]))
        class_loss = self.mse_loss(pred_classes, gt_classes)

        total_loss = coord_loss + object_loss + no_object_loss + class_loss

        return total_loss

In [8]:
test_1 = torch.randn(2,2,2,4)
test_2 =torch.randint(3,(2,2,2))
cond = test_2 != 2
indices = torch.nonzero(cond, as_tuple=False)
values = torch.stack([test_1[i, j, test_2[i, j, k],:] for i, j, k in indices])


tensor([[[[ 0.0661, -0.9601,  0.2515,  2.4217],
          [ 1.3241, -0.3079, -1.5554, -0.4353]],

         [[ 0.8557,  0.1420, -0.9132, -0.9046],
          [ 2.1552, -0.0372, -0.6626,  0.0683]]],


        [[[ 1.4454,  0.6649, -1.6457,  3.0212],
          [-0.3633,  0.1150, -0.7448,  0.7338]],

         [[ 0.3736, -1.9806, -0.5421, -0.2067],
          [-0.2966, -0.3188, -0.8786, -0.5012]]]])
tensor([[[0, 2],
         [0, 0]],

        [[0, 2],
         [0, 1]]])
tensor([[[ True, False],
         [ True,  True]],

        [[ True, False],
         [ True,  True]]])
torch.Size([6])
tensor([0, 0, 0, 0, 0, 1])
tensor([[0, 0, 0],
        [0, 1, 0],
        [0, 1, 1],
        [1, 0, 0],
        [1, 1, 0],
        [1, 1, 1]])
torch.Size([6, 3])
(tensor([0, 0, 0, 1, 1, 1]), tensor([0, 1, 1, 0, 1, 1]), tensor([0, 0, 1, 0, 0, 1]))
torch.Size([6, 4])
tensor([[ 0.0661, -0.9601,  0.2515,  2.4217],
        [ 0.8557,  0.1420, -0.9132, -0.9046],
        [ 0.8557,  0.1420, -0.9132, -0.9046],
        [ 

In [9]:
annotations = "./VOCdevkit/VOC2012/Annotations/"
layout = "./VOCdevkit/VOC2012/ImageSets/Main/"
images = "./VOCdevkit/VOC2012/JPEGImages/"
def extract_filenames(train_file_path):
    filenames = []
    
    with open(train_file_path, 'r') as file:
        for line in file:
            parts = line.split()
            if parts:
                # The image ID is the first part of each line
                filename = parts[0]
                filenames.append(filename)
    
    return filenames

def parse_xml(xml_file):
    tree = ET.parse(xml_file)
    root = tree.getroot()

    filename = root.find('filename').text
    objects_and_bboxes = []

    size = root.find('size')
    img_width = float(size.find('width').text)   
    img_height = float(size.find('height').text)

    for obj in root.findall('object'):
        # Extract object class
        obj_class = obj.find('name').text

        # Extract bounding box coordinates
        bndbox = obj.find('bndbox')
        xmin = int(bndbox.find('xmin').text)
        xmax = int(bndbox.find('xmax').text)
        ymin = int(bndbox.find('ymin').text)
        ymax = int(bndbox.find('ymax').text)

        # Calculate x_center, y_center, width, height  
        x_center = ((xmin + xmax) / 2) / img_width
        y_center = ((ymin + ymax) / 2) / img_height
        width = (xmax - xmin) / img_width
        height = (ymax - ymin) / img_height

        # Store object and bounding box information
        objects_and_bboxes.append({
            'class': obj_class,
            'bbox': torch.tensor([x_center, y_center, width, height]),
        })

    return filename, objects_and_bboxes
def load_image(image_file):
    return Image.open(image_file)
    
train_files = extract_filenames(layout + "train.txt")
val_files = extract_filenames(layout + "val.txt")

classes = set()
for filename in train_files:
    _, objects = parse_xml(annotations + filename + '.xml')
    classes.update([c['class'] for c in objects])
        

In [10]:
for filename in train_files:
    _, objects = parse_xml(annotations + filename + '.xml')
    image = load_image(images + filename +'.jpg')
    image = image.convert("RGB")
    original_size = image.size
    print(original_size[0])
    break

500


In [11]:
class VOCDataset(torch.utils.data.Dataset):
    def __init__(self, data = train_files, split_size = 7, n_bboxes = 2, n_classes = 20, class_dictionary = None):
        self.data = data
        self.transformToTensor = transforms.ToTensor()
        self.transformToResize = transforms.Resize((448,448))
        self.split_size = split_size
        self.n_bboxes = n_bboxes
        self.n_classes = n_classes
        if class_dictionary == None:
            self.class_dictionary = {c:i for i, c in enumerate(classes)}
        else:
            self.class_dictionary = class_dictionary

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        file_name = train_files[index]
        _, objects = parse_xml(annotations + filename + '.xml')
        n_gt_boxes = len(objects)

        image = load_image(images + filename +'.jpg')
        image = image.convert("RGB")
        original_size = image.size
        image = self.transformToResize(image)
        new_size = image.size
        image = self.transformToTensor(image, )
        scale_x = new_size[0] / original_size[0]
        scale_y = new_size[1] / original_size[1]

        label_matrix = torch.zeros((self.split_size, self.split_size, self.n_bboxes, 5 + self.n_classes))
        labeled_map = {}
        for gt_box in range(n_gt_boxes):
            cl = objects[gt_box]['class']
            x, y, w, h = objects[gt_box]['bbox']
            x = x * scale_x
            y = y * scale_y
            w = w * scale_x
            h = h * scale_y

            i, j = int(self.split_size * y), int(self.split_size * x)
            x_cell, y_cell = self.split_size * x - j, self.split_size * y - i

            w_cell, h_cell = (
                w * self.split_size,
                h * self.split_size,
            )
            for n_box in range(self.n_bboxes):
                if label_matrix[i, j, n_box, 4] == 0:
                    label_matrix[i, j, n_box] = torch.cat([torch.tensor([x_cell, y_cell, w_cell, h_cell, 1]),
                                                           nn.functional.one_hot(torch.tensor(self.class_dictionary[cl]), num_classes=self.n_classes).float()])
        return image, label_matrix
        

In [13]:
LEARNING_RATE = 2e-5
BATCH_SIZE = 16 # 64 in original paper but resource exhausted error otherwise.
WEIGHT_DECAY = 0
EPOCHS = 20
NUM_WORKERS = 2
PIN_MEMORY = True
LOAD_MODEL = False
if torch.backends.mps.is_available():
    device = 'mps'
elif torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

device = 'cpu'

CHECKPOINT_PTH = 'checkpoint.pth'

SPLIT_SIZE = 7
N_BBOXES = 2
N_CLASSES = 20

In [14]:
def save_checkpoint(epoch, model, optimizer,loss):
    checkpoint = {
    'epoch': epoch,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'loss': loss,
    }
    
    torch.save(checkpoint, CHECKPOINT_PTH)

def load_checkpoint():
    checkpoint = torch.load(CHECKPOINT_PTH)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    epoch = checkpoint['epoch']
    loss = checkpoint['loss']

In [15]:
def train(train_loader, model, optimizer, loss_fn):
    loop = tqdm(train_loader, leave=True)
    mean_loss = []
    
    for batch_idx, (x, y) in enumerate(loop):
        x, y = x.to(device), y.to(device)
        out = model(x)
        loss = loss_fn(out, y)
        mean_loss.append(loss.item())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        loop.set_postfix(loss = loss.item())
        
    print(f"Mean loss was {sum(mean_loss) / len(mean_loss)}")

In [None]:
model = Yolo(split_size = SPLIT_SIZE, n_bboxes = N_BBOXES, n_classes = N_CLASSES)
model = model.to(device)
optimizer = torch.optim.Adam(
        model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY
    )
scheduler = CustomLRScheduler(optimizer= optimizer, lr_func= lr_scheduler)
loss_fn = YoloLoss(split_size = SPLIT_SIZE, n_bboxes = N_BBOXES, n_classes = N_CLASSES)

if LOAD_MODEL:
        load_checkpoint(torch.load(LOAD_MODEL_FILE), model, optimizer)
train_dataset = VOCDataset(data = train_files)
val_dataset = VOCDataset(data = val_files)
train_loader = DataLoader(
        dataset=train_dataset,
        batch_size=BATCH_SIZE,
        shuffle=True,
        drop_last=False,
    )
test_loader = DataLoader(
        dataset=val_dataset,
        batch_size=BATCH_SIZE,
        shuffle=True,
        drop_last=False,
    )
for epoch in range(EPOCHS):
    train(train_loader, model, optimizer, loss_fn)
    scheduler.step(mean_avg_prec)

checkpoint = {
        "state_dict": model.state_dict(),
        "optimizer": optimizer.state_dict(),
}
save_checkpoint(checkpoint, filename=LOAD_MODEL_FILE)


  7%|███▏                                          | 25/358 [02:56<44:40,  8.05s/it, loss=nan]

In [1]:
import gc
del model
gc.collect()
torch.mps.empty_cache()

NameError: name 'model' is not defined