# YOLO model

In [None]:
import torch
import torch.nn as nn

class ConvBlock(nn.Module):
    """Basic building block: bias-free Conv2d, then BatchNorm2d, then LeakyReLU(0.1)."""

    def __init__(self, in_channels, out_channels, kernel_size, stride, padding):
        super().__init__()
        # The convolution is created without a bias term; batch norm follows it directly.
        self.conv = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            bias=False,
        )
        self.bn = nn.BatchNorm2d(num_features=out_channels)
        self.leaky_relu = nn.LeakyReLU(negative_slope=0.1)

    def forward(self, x):
        """Apply convolution, normalization and activation to ``x`` in that order."""
        features = self.conv(x)
        normalized = self.bn(features)
        return self.leaky_relu(normalized)


class YOLOv3(nn.Module):
    """Heavily reduced YOLOv3-style detector.

    Three ``ConvBlock`` layers (the last two with stride 2) feed a 1x1
    convolution that emits ``num_anchors * (num_classes + 5)`` channels per
    spatial cell. This is a toy stand-in, not the full YOLOv3 backbone.

    Parameters:
    - num_classes: Number of object classes (C).
    - num_anchors: Number of boxes predicted per spatial cell (A). Defaults
      to 3, the value that was previously hard-coded.
    """

    def __init__(self, num_classes=20, num_anchors=3):
        super().__init__()
        self.num_classes = num_classes
        self.num_anchors = num_anchors

        # Feature extractor: 3 -> 32 -> 64 -> 128 channels.
        self.conv1 = ConvBlock(3, 32, 3, 1, 1)
        self.conv2 = ConvBlock(32, 64, 3, 2, 1)
        self.conv3 = ConvBlock(64, 128, 3, 2, 1)

        # Prediction head: A * (C + 5) channels per cell.
        self.conv_final = nn.Conv2d(128, (num_classes + 5) * num_anchors, 1, 1, 0)

    def forward(self, x):
        """Return raw predictions of shape (B, A*H*W, C+5).

        Row ``a*H*W + h*W + w`` of the output holds the C+5 values that the
        head produced for anchor ``a`` at spatial cell ``(h, w)``.
        """
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        out = self.conv_final(x)  # (B, A*(C+5), H, W)

        batch, _, height, width = out.shape
        attrs = self.num_classes + 5

        # The per-box attributes live on the channel axis. Flattening the
        # (B, A*(C+5), H, W) tensor directly would fill each output row with
        # consecutive spatial positions of a single channel, so the attribute
        # axis has to be moved last before flattening.
        out = out.reshape(batch, self.num_anchors, attrs, height, width)
        out = out.permute(0, 1, 3, 4, 2)  # (B, A, H, W, C+5)
        return out.reshape(batch, -1, attrs)

# Instantiate the model for 20 classes (e.g., Pascal VOC).
# This module-level `model` is the object passed to train() and test() further down.
model = YOLOv3(num_classes=20)

# Training

In [None]:
# Pascal VOC category names; a name's position in this list is its integer label.
VOC_CLASSES = [
    'aeroplane',
    'bicycle',
    'bird',
    'boat',
    'bottle',
    'bus',
    'car',
    'cat',
    'chair',
    'cow',
    'diningtable',
    'dog',
    'horse',
    'motorbike',
    'person',
    'pottedplant',
    'sheep',
    'sofa',
    'train',
    'tvmonitor',
]

# Lookup table from class name to its index in VOC_CLASSES.
class_to_idx = dict(zip(VOC_CLASSES, range(len(VOC_CLASSES))))

In [None]:
def yolo_loss(predictions, targets, num_classes=20, lambda_coord=5, lambda_noobj=0.5):
    """
    A simplified YOLO-style loss function.

    This is NOT the real YOLO loss: there is no grid-cell/anchor assignment.
    For every ground-truth object, *all* N predictions of the corresponding
    image are pushed towards that object's box, an objectness of 1 and the
    object's class.

    Parameters:
    - predictions: Tensor of shape (B, N, num_classes + 5) holding, per
      prediction, 4 box values, 1 objectness logit and num_classes class logits.
    - targets: One annotation dict per image. Each has an 'object' entry that
      is either a single object dict or a list of them; every object dict
      provides 'name' (a key of class_to_idx) and 'bndbox' with
      'xmin'/'ymin'/'xmax'/'ymax' values convertible to float.
    - num_classes: Number of classes; must match the width of the class slice.
    - lambda_coord: Weight for the localization (coordinate) loss.
    - lambda_noobj: Weight applied to the objectness term. The name is
      historical; no separate "no-object" term is computed here.

    Returns:
    - Scalar tensor: lambda_coord * localization + lambda_noobj * objectness
      + classification, summed over all objects in the batch. For a batch
      without any object the result is a zero that is still attached to the
      autograd graph, so ``.backward()`` keeps working.

    Raises:
    - ValueError: if the class slice of `predictions` is not num_classes wide.
    """
    # Unpack predictions
    pred_bboxes = predictions[..., :4]  # Raw box values
    pred_conf = predictions[..., 4]     # Objectness logit
    pred_class = predictions[..., 5:]   # Class logits

    if pred_class.size(-1) != num_classes:
        raise ValueError(
            f"predictions carry {pred_class.size(-1)} class scores, "
            f"expected num_classes={num_classes}"
        )

    mse_loss = nn.MSELoss()
    bce_loss = nn.BCEWithLogitsLoss()
    ce_loss = nn.CrossEntropyLoss()

    # Graph-attached zero: keeps the return value a differentiable tensor even
    # when no image in the batch contains an object.
    zero = predictions.sum() * 0
    coord_loss = zero
    conf_loss = zero
    class_loss = zero

    for i, target in enumerate(targets):  # Loop over each image in the batch
        objects = target['object']
        if isinstance(objects, dict):
            objects = [objects]  # A single object is not wrapped in a list

        image_boxes = pred_bboxes[i]   # (N, 4)
        image_conf = pred_conf[i]      # (N,)
        image_class = pred_class[i]    # (N, num_classes)

        for obj in objects:
            bbox = obj['bndbox']
            # NOTE(review): these are the annotation's corner coordinates taken
            # as-is; they are not rescaled if the image itself was resized
            # before being fed to the model. Confirm against the data pipeline.
            gt_bboxes = torch.tensor(
                [
                    float(bbox['xmin']),
                    float(bbox['ymin']),
                    float(bbox['xmax']),
                    float(bbox['ymax']),
                ],
                dtype=image_boxes.dtype,
                device=image_boxes.device,
            )

            # Use the class_to_idx mapping to get the integer label for the class name
            gt_class = class_to_idx[obj['name']]

            # Localization loss: expand the single box explicitly so MSELoss
            # sees matching shapes instead of broadcasting with a warning.
            coord_loss = coord_loss + lambda_coord * mse_loss(
                image_boxes, gt_bboxes.expand_as(image_boxes)
            )

            # Objectness loss: every prediction is treated as a positive.
            conf_loss = conf_loss + bce_loss(image_conf, torch.ones_like(image_conf))

            # Classification loss: CrossEntropyLoss expects (N, C) logits and
            # (N,) integer class indices, not a one-hot long tensor.
            target_class = torch.full(
                (image_class.size(0),),
                gt_class,
                dtype=torch.long,
                device=image_class.device,
            )
            class_loss = class_loss + ce_loss(image_class, target_class)

    # Total loss = localization loss + confidence loss + classification loss
    total_loss = coord_loss + lambda_noobj * conf_loss + class_loss
    return total_loss

In [None]:
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

# Define the training loop
def train(model, dataloader, optimizer, loss_fn, num_epochs=10, device="cuda"):
    """Run a plain supervised training loop.

    The model is moved to `device` once. Each epoch iterates `dataloader`,
    which must yield `(images, targets)` pairs; `loss_fn(outputs, targets)`
    must return a scalar tensor. After every 100th batch of an epoch the mean
    loss of the last 100 batches is printed. Nothing is returned.
    """
    model.to(device)

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0

        for i, (images, targets) in enumerate(dataloader):
            images = images.to(device)

            # Only tensor-valued entries of each target dict are transferred;
            # everything else is left as it is.
            for annotation in targets:
                annotation.update({
                    name: entry.to(device)
                    for name, entry in annotation.items()
                    if isinstance(entry, torch.Tensor)
                })

            # Forward pass
            loss = loss_fn(model(images), targets)

            # Backward pass and parameter update
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

            # Report and reset the accumulator once 100 batches have been seen
            if (i + 1) % 100 == 0:
                print(f"[{epoch+1}, {i+1}] loss: {running_loss / 100:.3f}")
                running_loss = 0.0

# Optimizer: AdamW over all parameters of the module-level `model`, learning rate 1e-3.
optimizer = optim.AdamW(model.parameters(), lr=0.001)

# Custom collate function to handle batches with varying annotation sizes
def collate_fn(batch):
    """Collate `(image, target)` samples into a stacked image batch and a list of annotations.

    Each sample's second element must be a mapping with an 'annotation' key.
    The annotations are returned as a plain list (one entry per image) because
    they are not stacked like the images.
    """
    targets = [sample[1]['annotation'] for sample in batch]
    # Stack the image tensors along a new leading batch dimension
    images = torch.stack([sample[0] for sample in batch], 0)
    return images, targets

# Image preprocessing: resize every image to a fixed 416x416 so a batch can be stacked.
# NOTE(review): presumably only the image passes through `transform`, while the
# annotation boxes keep their original pixel coordinates — confirm, since the loss
# compares predictions against those box values.
transform = transforms.Compose([
    transforms.Resize((416, 416)),  # Resize all images to 416x416
    transforms.ToTensor(),  # Convert images to PyTorch tensors
])

# Pascal VOC 2012 'train' split, downloaded into data/VOCdevkit/ if needed
train_dataset = datasets.VOCDetection('data/VOCdevkit/', year='2012', image_set='train',
                                      download=True, transform=transform)

# Batches of 8 shuffled samples; collate_fn keeps the annotations as a list
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)

# Train the model for 10 epochs with the simplified YOLO loss
train(model, train_loader, optimizer, yolo_loss, num_epochs=10)

Using downloaded and verified file: data/VOCdevkit/VOCtrainval_11-May-2012.tar
Extracting data/VOCdevkit/VOCtrainval_11-May-2012.tar to data/VOCdevkit/


  return F.mse_loss(input, target, reduction=self.reduction)


[1, 100] loss: 8057353.633
[1, 200] loss: 7179640.202
[1, 300] loss: 6347760.053
[1, 400] loss: 5512673.098
[1, 500] loss: 4753820.412
[1, 600] loss: 4167145.842
[1, 700] loss: 3480778.879
[2, 100] loss: 2905216.131
[2, 200] loss: 2607282.801
[2, 300] loss: 2364014.320
[2, 400] loss: 2317555.264
[2, 500] loss: 2256733.799
[2, 600] loss: 2060348.269
[2, 700] loss: 2243113.618
[3, 100] loss: 2261567.931
[3, 200] loss: 2237175.893
[3, 300] loss: 2143934.901
[3, 400] loss: 2183021.415
[3, 500] loss: 2198932.078
[3, 600] loss: 2219960.942
[3, 700] loss: 2174068.716
[4, 100] loss: 2281878.473
[4, 200] loss: 2180962.293
[4, 300] loss: 2251474.288
[4, 400] loss: 2161326.547
[4, 500] loss: 2146677.171
[4, 600] loss: 2111387.093
[4, 700] loss: 2234158.189
[5, 100] loss: 2220832.331
[5, 200] loss: 2187321.455
[5, 300] loss: 2230120.970
[5, 400] loss: 2197343.676
[5, 500] loss: 2226908.906
[5, 600] loss: 2234050.874
[5, 700] loss: 2103988.904
[6, 100] loss: 2156758.579
[6, 200] loss: 2359540.915
[

In [None]:
# Colab-only: mount Google Drive at /content/drive (requires the google.colab package).
from google.colab import drive
drive.mount('/content/drive')

# Testing

In [None]:
import matplotlib.pyplot as plt
import numpy as np

def visualize_predictions(image, predictions, conf_threshold=0.5):
    """Draw predicted boxes with class index and confidence on top of an image.

    Parameters:
    - image: Image tensor in (channels, height, width) layout.
    - predictions: Tensor of shape (N, 5 + num_classes) holding raw model
      output per row: 4 box values read as (x1, y1, x2, y2), 1 objectness
      logit, then the class logits.
    - conf_threshold: Rows whose sigmoid(objectness) is below this value are
      not drawn.

    The objectness and class values are logits (the loss in this file trains
    them with BCEWithLogitsLoss / CrossEntropyLoss), so they are converted
    with sigmoid / argmax here.
    """
    fig, ax = plt.subplots(1)
    ax.imshow(image.permute(1, 2, 0).cpu().numpy())

    # Work on a detached CPU copy so per-row scalar reads do not hit the GPU
    predictions = predictions.detach().cpu()
    scores = torch.sigmoid(predictions[:, 4])
    labels = predictions[:, 5:].argmax(dim=1)
    keep = (scores >= conf_threshold).nonzero(as_tuple=True)[0]

    for idx in keep.tolist():
        # NOTE(review): box values are plotted as pixel corner coordinates of
        # the displayed image — confirm this matches the training targets.
        x1, y1, x2, y2 = predictions[idx, :4].tolist()
        conf = scores[idx].item()
        cls = labels[idx].item()
        rect = plt.Rectangle((x1, y1), x2 - x1, y2 - y1, linewidth=2, edgecolor='r', facecolor='none')
        ax.add_patch(rect)
        ax.text(x1, y1 - 10, f'Class: {cls} Conf: {conf:.2f}', color='white', backgroundcolor='red')

    plt.show()

# Testing function
def test(model, test_loader, device="cuda"):
    """Run the model on every batch of `test_loader` and visualize each image's predictions.

    Parameters:
    - model: The network to evaluate; it is moved to `device` and put in eval mode.
    - test_loader: Iterable yielding `(images, targets)` pairs; the targets are ignored.
    - device: Device on which inference runs.
    """
    # Inputs are moved to `device`, so the model has to live there as well
    # (train() does the same).
    model.to(device)
    model.eval()
    with torch.no_grad():
        for images, _ in test_loader:
            images = images.to(device)
            outputs = model(images)
            # Visualize the raw predictions image by image
            for image, output in zip(images, outputs):
                visualize_predictions(image, output)

# Load the Pascal VOC 2012 validation split for qualitative testing.
# torchvision's VOCDetection only offers a 'test' image set for year='2007';
# for 2012 the held-out split shipped in the trainval archive is 'val'.
test_dataset = datasets.VOCDetection('data/VOCdevkit/', year='2012', image_set='val',
                                      download=True, transform=transform)

# Get a small sample (the first 10 images)
small_sample_dataset = torch.utils.data.Subset(test_dataset, indices=range(10))

# DataLoader with custom collate_fn
test_loader = DataLoader(small_sample_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)

test(model, test_loader)