# YOLO (You Only Look Once)
### Key Innovations
- Unified Detection Framework: Combines object localization and classification tasks into a single neural network.

- Grid-Based Prediction: Divides the image into an S x S grid. Each grid cell predicts B bounding boxes with associated confidence scores and C class probabilities.

- Real time processing: Achieves high speed inference suitable for real-world applications.

- End to End training: Simplifies the object detection pipeline compared to traditional multistage approaches.

### Architectural Overview
- Backbone: A CNN, such as Darknet, extracts spatial features from the input image.

- Fully connected layers predict bounding box coordinates, confidence scores, and class probabilities.

In [1]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torch.utils.data import DataLoader


In [4]:
class YOLO(nn.Module):
    """Minimal YOLO-style detector: convolutional backbone plus fully connected head.

    The network maps a batch of 3-channel images to one prediction vector per
    grid cell. Each vector has ``num_boxes * 5 + num_classes`` entries.

    Args:
        grid_size: Number of grid cells along each image side (S).
        num_boxes: Number of boxes predicted per cell (B); each box
            contributes 5 values to the per-cell vector.
        num_classes: Number of class scores predicted per cell (C).
    """

    def __init__(self, grid_size=7, num_boxes=2, num_classes=20):
        super().__init__()
        self.grid_size = grid_size
        self.num_boxes = num_boxes
        self.num_classes = num_classes

        # Width of the per-cell prediction vector; must agree with the
        # reshape performed in forward().
        cell_dim = num_boxes * 5 + num_classes

        # Backbone. This is a shortened stand-in for a full Darknet-style
        # stack; further convolutional stages would be inserted before the
        # final projection.
        self.feature_extractor = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(64, 192, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            # Project to the 512 channels the head expects.
            nn.Conv2d(192, 512, kernel_size=3, padding=1),
            nn.ReLU(),
            # Force an S x S feature map so the flattened size always equals
            # 512 * S * S, independent of the input resolution.
            nn.AdaptiveAvgPool2d((grid_size, grid_size)),
        )

        self.fc = nn.Sequential(
            nn.Flatten(),
            nn.Linear(512 * grid_size * grid_size, 4096),
            nn.ReLU(),
            # Output width is derived from the constructor arguments rather
            # than hard-coded, so non-default configurations reshape cleanly.
            nn.Linear(4096, grid_size * grid_size * cell_dim),
        )

    def forward(self, x):
        """Return predictions of shape ``(N, S, S, num_boxes * 5 + num_classes)``.

        Args:
            x: Image batch with 3 channels in dimension 1, as required by the
                first convolution.
        """
        x = self.feature_extractor(x)
        x = self.fc(x)
        return x.view(
            -1,
            self.grid_size,
            self.grid_size,
            self.num_boxes * 5 + self.num_classes,
        )

In [2]:
class YOLOLoss(nn.Module):
    """Sum-of-squares detection loss with object / no-object weighting.

    Both inputs are indexed along their last dimension as
    ``[..., :4]`` box coordinates, ``[..., 4]`` confidence and ``[..., 5:]``
    class scores, so they must share the same shape. Only this single
    box/confidence slot is scored; per-box responsibility assignment for
    multi-box layouts is not implemented here.

    Args:
        lambda_coord: Weight applied to the box-coordinate error.
        lambda_noobj: Weight applied to the confidence error of cells whose
            target confidence is not positive (cells without an object).
    """

    def __init__(self, lambda_coord=5, lambda_noobj=0.5):
        super().__init__()
        self.lambda_coord = lambda_coord
        self.lambda_noobj = lambda_noobj

    def forward(self, predictions, targets):
        """Return the scalar loss for a batch of predictions and targets."""
        # Parse predictions and targets
        pred_boxes = predictions[..., :4]
        pred_conf = predictions[..., 4]
        pred_classes = predictions[..., 5:]

        true_boxes = targets[..., :4]
        true_conf = targets[..., 4]
        true_classes = targets[..., 5:]

        # A cell counts as containing an object when its target confidence is
        # positive. Coordinate and class errors are only meaningful there.
        obj_mask = (true_conf > 0).to(predictions.dtype)
        noobj_mask = 1.0 - obj_mask
        obj_mask_per_channel = obj_mask.unsqueeze(-1)

        coord_loss = self.lambda_coord * torch.sum(
            obj_mask_per_channel * (pred_boxes - true_boxes) ** 2
        )

        conf_error = (pred_conf - true_conf) ** 2
        obj_conf_loss = torch.sum(obj_mask * conf_error)
        # Empty cells usually dominate the grid; down-weight their confidence
        # error so they do not swamp the cells that contain objects.
        noobj_conf_loss = self.lambda_noobj * torch.sum(noobj_mask * conf_error)

        class_loss = torch.sum(
            obj_mask_per_channel * (pred_classes - true_classes) ** 2
        )

        return coord_loss + obj_conf_loss + noobj_conf_loss + class_loss


In [None]:
# Pascal VOC object categories; the position in this tuple is the index of the
# class's one-hot slot in the encoded target.
VOC_CLASSES = (
    "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat",
    "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person",
    "pottedplant", "sheep", "sofa", "train", "tvmonitor",
)
GRID_SIZE = 7
NUM_BOXES = 2
NUM_CLASSES = len(VOC_CLASSES)


def encode_voc_target(annotation, grid_size=GRID_SIZE, num_boxes=NUM_BOXES):
    """Encode one parsed VOC annotation as a ``(S, S, B*5 + C)`` tensor.

    VOCDetection yields each target as a nested dict parsed from the XML
    annotation, which cannot be batched by the default collate function or
    compared against the model output. This converts it to a fixed-size grid:
    channels 0-3 hold (x offset in cell, y offset in cell, width, height),
    channel 4 holds the object indicator, and the last C channels hold the
    one-hot class. Slots for additional boxes are left at zero. All
    coordinates are normalised by the original image size, so the later
    resize of the image does not affect them.

    If several objects fall into the same cell, only the first is kept.
    """
    target = torch.zeros(grid_size, grid_size, num_boxes * 5 + NUM_CLASSES)
    info = annotation["annotation"]
    img_w = float(info["size"]["width"])
    img_h = float(info["size"]["height"])

    objects = info.get("object", [])
    if isinstance(objects, dict):  # tolerate a single un-listed object
        objects = [objects]

    for obj in objects:
        box = obj["bndbox"]
        xmin, ymin = float(box["xmin"]), float(box["ymin"])
        xmax, ymax = float(box["xmax"]), float(box["ymax"])

        # Box centre as a fraction of the image, then the cell it lands in.
        cx = (xmin + xmax) / 2.0 / img_w
        cy = (ymin + ymax) / 2.0 / img_h
        col = min(int(cx * grid_size), grid_size - 1)
        row = min(int(cy * grid_size), grid_size - 1)

        if target[row, col, 4] > 0:
            continue  # cell already claimed by an earlier object

        target[row, col, 0] = cx * grid_size - col
        target[row, col, 1] = cy * grid_size - row
        target[row, col, 2] = (xmax - xmin) / img_w
        target[row, col, 3] = (ymax - ymin) / img_h
        target[row, col, 4] = 1.0
        target[row, col, num_boxes * 5 + VOC_CLASSES.index(obj["name"])] = 1.0

    return target


# Load a dataset (e.g., Pascal VOC)
transform = transforms.Compose([
    transforms.Resize((448, 448)),
    transforms.ToTensor()
])
dataset = datasets.VOCDetection(
    root="path_to_data",
    year="2012",
    image_set="train",
    download=True,
    transform=transform,
    target_transform=encode_voc_target,
)
data_loader = DataLoader(dataset, batch_size=16, shuffle=True)

# Initialize model, loss function, and optimizer
model = YOLO(grid_size=GRID_SIZE, num_boxes=NUM_BOXES, num_classes=NUM_CLASSES)
criterion = YOLOLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

# Training loop
model.train()
for epoch in range(50):
    running_loss = 0.0
    num_batches = 0
    for images, targets in data_loader:
        predictions = model(images)
        loss = criterion(predictions, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        num_batches += 1

    # Report the mean over the epoch rather than whichever batch came last.
    epoch_loss = running_loss / max(num_batches, 1)
    print(f"Epoch {epoch + 1}, Loss: {epoch_loss}")
