In [None]:
import torch
from torchvision import transforms
from torchvision.models.detection import ssd300_vgg16, SSD300_VGG16_Weights
from torchvision.models.detection.ssd import SSDClassificationHead
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import kagglehub
import os
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
import torch.nn as nn

In [None]:
# Number of classes passed to the SSD classification head. UrbanDataset uses
# label 0 for background and labels 1..K for the dataset categories, and the
# label sanity check later in the notebook requires every label < NUM_CLASSES.
NUM_CLASSES = 38
# Number of passes over the training loader.
EPOCHS = 10
# Number of images per batch in the data loaders.
BATCH_SIZE = 4

1. Create a custom PyTorch Dataset for object detection using COCO-format annotations
- `__init__`: store paths, load annotations, prepare label mapping
    - use lazy loading (memory efficient): images are opened in `__getitem__`, not in `__init__`
    - `COCO()`: parse the JSON and build fast lookup tables (images, annotations, categories)
    - `self.reverse_label_map`: used for evaluation, visualization and mapping prediction back to COCO IDs


In [None]:
class UrbanDataset(torch.utils.data.Dataset):
    """Object-detection dataset backed by a COCO-format annotation file.

    Each split lives in ``<root>/Senior-Design-VIAD-4/<split>`` and contains
    the images plus an ``_annotations.coco.json`` file. Images are opened
    lazily in ``__getitem__``; only the annotation index is built up front.

    Attributes:
        label_map: COCO category id -> contiguous training label (starting at
            1, because label 0 is reserved for background).
        reverse_label_map: contiguous training label -> COCO category id.
    """

    def __init__(self, root, split, transforms=None):
        """Index the annotations of one split.

        Args:
            root: Directory that contains the ``Senior-Design-VIAD-4`` folder.
            split: Name of the split sub-directory (e.g. ``"train"``).
            transforms: Optional callable applied to the PIL image only.
        """
        self.root = root
        self.split = split
        self.transforms = transforms
        self.dir = os.path.join(root, "Senior-Design-VIAD-4", split)

        self.coco = COCO(os.path.join(self.dir, "_annotations.coco.json"))

        # Sorted image ids give a stable index -> image mapping.
        self.ids = sorted(self.coco.imgs.keys())

        # Labels start at 1: SSD reserves 0 for background, and contiguous
        # labels avoid out-of-range class indices (CUDA asserts) in training.
        self.label_map = {}
        self.reverse_label_map = {}
        for contiguous, category_id in enumerate(sorted(self.coco.getCatIds()), start=1):
            self.label_map[category_id] = contiguous
            self.reverse_label_map[contiguous] = category_id

    def __getitem__(self, index):
        """Return ``(image, target)`` for the sample at ``index``.

        ``target`` is a dict with ``"boxes"`` (float32, ``[N, 4]`` in
        ``x1, y1, x2, y2`` order), ``"labels"`` (int64, ``[N]``) and
        ``"image_id"`` (tensor holding the COCO image id).
        """
        image_id = self.ids[index]

        # All annotation records that belong to this image.
        annotations = self.coco.loadAnns(self.coco.getAnnIds(imgIds=image_id))

        # Open the image file referenced by the COCO image record.
        record = self.coco.loadImgs(image_id)[0]
        image_path = os.path.join(self.dir, record["file_name"])
        img = Image.open(image_path).convert("RGB")

        corner_boxes, class_labels = [], []
        for annotation in annotations:
            left, top, width, height = annotation["bbox"]
            # Skip boxes that are not larger than one pixel in both dimensions.
            if not (width > 1 and height > 1):
                continue
            # COCO stores (x, y, w, h); the detector expects corner coordinates.
            corner_boxes.append([left, top, left + width, top + height])
            # COCO category id -> contiguous training label.
            class_labels.append(self.label_map[annotation["category_id"]])

        # reshape keeps the (0, 4) shape for images without any usable box.
        target = {
            "boxes": torch.as_tensor(corner_boxes, dtype=torch.float32).reshape(-1, 4),
            "labels": torch.as_tensor(class_labels, dtype=torch.int64),
            "image_id": torch.tensor(image_id),
        }

        if self.transforms:
            img = self.transforms(img)

        return img, target

    def __len__(self):
        """Number of images in the split."""
        return len(self.ids)


2. Image Preprocessing
- `get_transform`:
    - convert PIL image to PyTorch Tensor
    - pixel range: `[0,255] -> [0.0,1.0]`
    - shape: `(H,W,C) -> (C,H,W)`
- `collate_fn`:
    - object detection data cannot be stacked normally
    - inputs: 
    ```
    [
        (img1, target1),
        (img2, target2)
    ]
    ```
    - outputs:
    ```
    (
        (img1, img2),
        (target1, target2)
    )
    ```
    - required for variable-length annotations in detection tasks



In [None]:
# Fetch the dataset through kagglehub; the returned location is used below as
# the `root` directory of every UrbanDataset split.
path = kagglehub.dataset_download("mohamedgobara/26-class-object-detection-dataset")

def get_transform():
    """Build the image preprocessing pipeline.

    The pipeline consists of a single ``ToTensor`` step, which turns a PIL
    image into a float tensor (pixel range ``[0, 255] -> [0.0, 1.0]``,
    layout ``(H, W, C) -> (C, H, W)``).
    """
    to_tensor = transforms.ToTensor()
    return transforms.Compose([to_tensor])

# One UrbanDataset per split, each with its own preprocessing pipeline.
train_dataset, valid_dataset, test_dataset = (
    UrbanDataset(path, split_name, get_transform())
    for split_name in ("train", "valid", "test")
)

def collate_fn(batch):
    """Regroup a list of samples into per-field tuples without stacking.

    Detection targets have a variable number of boxes per image, so the
    default tensor stacking cannot be used. A batch such as
    ``[(img1, target1), (img2, target2)]`` becomes
    ``((img1, img2), (target1, target2))``.
    """
    per_field = zip(*batch)
    return tuple(per_field)

# Used for fast debugging: keep only a small prefix of each split.
# The prefix length is clamped to the split size; an index past the end of the
# split would otherwise raise IndexError once the loader reaches it.
train_dataset = torch.utils.data.Subset(
    train_dataset, list(range(min(200, len(train_dataset))))
)
valid_dataset = torch.utils.data.Subset(
    valid_dataset, list(range(min(50, len(valid_dataset))))
)

train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    # prevent learning order bias
    shuffle=True,
    # keep images/targets as tuples (variable number of boxes per image)
    collate_fn=collate_fn,
    # load data in parallel, speed up training
    num_workers=4,
    # improve GPU transfer efficiency
    pin_memory=True
)

valid_loader = torch.utils.data.DataLoader(
    valid_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,  # validation should be deterministic and reproducible
    collate_fn=collate_fn,
    num_workers=2,
    pin_memory=True
)


3. Setup model
- COCO weights: the model already understands edges,shapes and generic objects
- `num_anchors_per_location()`: return a list of integers, one per feature map
- anchor count determines how many predictions are made per feature map location
- replace the original COCO classification head (91 classes) with a new head that predicts `NUM_CLASSES` classes (label 0 is background)
- total outputs per location = `num_anchors` * `NUM_CLASSES`
- only replacing the classification head
- backbone: extracts features from the images
- anchor: predefined box placed on the images
- anchor generator: decides anchor sizes, aspect ratios and how many anchors per location
- regression head: predict how to move and resize each anchor box

```
SSD has fixed anchors

Each anchor predicts a class

Number of classes must match dataset

Classification head defines that mapping
```
### End-to-end-flow
Image
 ↓
Backbone (VGG16)
 ↓
Feature maps
 ↓
Anchor Generator → anchors
 ↓
Regression Head → box offsets
 ↓
Classification Head → class scores



In [None]:
# Start from the COCO-pretrained SSD300-VGG16 detector.
weights = SSD300_VGG16_Weights.COCO_V1
model = ssd300_vgg16(weights=weights)

# Channel depth of the six feature maps SSD uses; they come from the VGG16
# backbone and the extra SSD convolution layers.
in_channels = [512, 1024, 512, 256, 256, 256]

# Number of anchors per spatial location, one entry per feature map.
num_anchors = model.anchor_generator.num_anchors_per_location()

# Only the classification head is swapped for a newly constructed one sized
# for NUM_CLASSES; every other part of the model is left as loaded.
model.head.classification_head = SSDClassificationHead(
    in_channels=in_channels,
    num_anchors=num_anchors,
    num_classes=NUM_CLASSES,
)

In [None]:
# Sanity check: every training label must be a valid index into the
# classification head, i.e. strictly smaller than NUM_CLASSES.
max_label = 0
for sample_index in range(len(train_dataset)):
    sample_labels = train_dataset[sample_index][1]["labels"]
    # images without any object carry no label to check
    if sample_labels.numel() == 0:
        continue
    max_label = max(max_label, sample_labels.max().item())

assert max_label < NUM_CLASSES, "❌ Label exceeds NUM_CLASSES"
print("✅ Dataset labels valid")

4. Training
```
For each epoch:
    Train:
        For each batch:
            Move data → device
            Forward pass → compute loss
            Backprop → update weights
            Accumulate loss
    Validate:
        Move data → device
        Forward pass only (no backprop)
        Accumulate loss
    Print average train & val loss

```

**Note:** BatchNorm normalizes the inputs of each layer across a batch to approximately zero mean and unit variance (followed by a learnable scale and shift).
It stabilizes training, allows higher learning rates and acts as a slight regularizer.

In [None]:
# Pick the compute device once and report which one is used.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print("✅ CUDA is available! Using GPU.")
    print("Device name:", torch.cuda.get_device_name(0))
else:
    print("❌ CUDA is not available. Using CPU.")
model.to(device)

# Stochastic Gradient Descent (SGD)
# momentum: speed up convergence
# weight_decay: L2 regularization
optimizer = torch.optim.SGD(
    model.parameters(), lr=0.005, momentum=0.9, weight_decay=0.0005
)

for epoch in range(EPOCHS):
    model.train()
    train_loss = 0.0

    for images, targets in train_loader:
        images = [img.to(device) for img in images]
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        # in training mode the detector returns a dict of loss terms
        loss_dict = model(images, targets)
        loss = sum(loss_dict.values())

        # backpropagation
        optimizer.zero_grad()  # clear previous gradients
        loss.backward()  # compute gradients w.r.t. parameters
        optimizer.step()  # update weights

        train_loss += loss.item()

    # validation
    # torchvision detection models return the loss dict only in training mode;
    # after model.eval() they return a list of detections, so calling
    # `.values()` on the result would raise AttributeError. The model therefore
    # stays in train mode here and torch.no_grad() guarantees that no gradients
    # are computed and no weights change.
    # NOTE(review): this relies on the network having no BatchNorm/Dropout
    # layers whose behaviour differs in train mode — believed true for
    # torchvision's SSD300-VGG16, confirm if the backbone is changed.
    val_loss = 0.0

    with torch.no_grad():  # save memory & speed up
        for images, targets in valid_loader:
            images = [img.to(device) for img in images]
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

            loss_dict = model(images, targets)
            val_loss += sum(loss_dict.values()).item()

    # max(1, ...) avoids ZeroDivisionError when a loader yields no batches
    print(
        f"Epoch [{epoch+1}/{EPOCHS}] "
        f"Train: {train_loss/max(1, len(train_loader)):.4f} "
        f"Val: {val_loss/max(1, len(valid_loader)):.4f}"
    )


5. Testing
```
Switch model to eval mode

Load a single image

Forward pass with no gradients

Filter predictions by confidence

Draw boxes and labels on image

Show the image
```

In [None]:
def predict(model, dataset, index=0, threshold=0.5):
    """Run the detector on one dataset sample and plot the confident detections.

    Args:
        model: Detection model; it is switched to eval mode.
        dataset: Indexable dataset returning ``(image_tensor, target)`` where
            the image is a ``(C, H, W)`` tensor.
        index: Position of the sample to visualise.
        threshold: Only detections with a score strictly above this value
            are drawn.
    """
    model.eval()
    # get the image
    img, _ = dataset[index]

    # Run on whatever device the model lives on instead of relying on a
    # notebook-global `device` variable.
    first_param = next(iter(model.parameters()), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    # forward pass without gradient
    with torch.no_grad():
        output = model([img.to(model_device)])[0]

    # prepare image for plotting: (C, H, W) tensor -> (H, W, C) array
    fig, ax = plt.subplots()
    ax.imshow(img.permute(1, 2, 0).cpu().numpy())

    # move the predictions to plain Python values once
    boxes = output["boxes"].cpu().tolist()
    labels = output["labels"].cpu().tolist()
    scores = output["scores"].cpu().tolist()

    # only show predictions the model is confident about
    for (x1, y1, x2, y2), label, score in zip(boxes, labels, scores):
        if score > threshold:
            ax.add_patch(
                plt.Rectangle((x1, y1), x2 - x1, y2 - y1,
                              fill=False, edgecolor="red", linewidth=2)
            )
            ax.text(x1, y1, f"{label} ({score:.2f})",
                    bbox=dict(facecolor="yellow", alpha=0.5))

    ax.axis("off")
    plt.show()

# Visualise the detections for the test sample at position 10 (default threshold).
predict(model, test_dataset, index=10)


6. Evaluate Metrics
```
For each batch:
    Move images to device
    Forward pass → predictions
    Convert boxes to COCO format
Aggregate predictions
Use pycocotools.COCOeval to compute mAP, AP50, AP75, AR
Print summary
```

**Note:** A ground-truth box (GT box) is a bounding box that encapsulates an object in the image and comes with a class label. GT boxes are used to:
- Compute loss
- Regression loss
```
Image → model predicts boxes
GT boxes → true answer
Compare → measure how well model predicted
```

In [None]:
def evaluate_model(model, loader, device):
    """Run the detector over ``loader`` and print COCO bbox metrics.

    Args:
        model: Detection model; it is switched to eval mode.
        loader: DataLoader yielding ``(images, targets)`` tuples. Its dataset,
            or the dataset wrapped by one or more ``Subset``-style wrappers
            exposing ``.dataset``, must provide a ``coco`` ground-truth object
            and may provide ``reverse_label_map`` (training label -> COCO
            category id).
        device: Device on which the forward pass is executed.

    Prints the COCOeval summary (mAP, AP@0.50, AP@0.75, AR). If the model
    produces no detections, a message is printed and evaluation is skipped.
    """
    model.eval()

    # A debugging Subset has no `coco` attribute; walk down the `.dataset`
    # chain until the dataset that owns the annotations is found.
    base_dataset = loader.dataset
    while not hasattr(base_dataset, "coco") and hasattr(base_dataset, "dataset"):
        base_dataset = base_dataset.dataset
    coco_gt = base_dataset.coco
    reverse_label_map = getattr(base_dataset, "reverse_label_map", {})

    results = []
    evaluated_img_ids = []

    # loop over batches
    for images, targets in loader:
        images = [img.to(device) for img in images]

        with torch.no_grad():
            outputs = model(images)

        # loop through predictions per image
        for target, output in zip(targets, outputs):
            img_id = int(target["image_id"])
            evaluated_img_ids.append(img_id)

            # plain Python numbers keep the result records JSON-friendly
            boxes = output["boxes"].cpu().tolist()
            scores = output["scores"].cpu().tolist()
            labels = output["labels"].cpu().tolist()

            for (x1, y1, x2, y2), score, label in zip(boxes, scores, labels):
                # convert predictions to COCO JSON format
                results.append({
                    "image_id": img_id,
                    # the model predicts contiguous training labels; the
                    # ground truth uses the original COCO category ids
                    "category_id": reverse_label_map.get(label, label),
                    # corner format -> COCO (x, y, width, height)
                    "bbox": [x1, y1, x2 - x1, y2 - y1],
                    "score": score,
                })

    # loadRes() indexes the first result and fails on an empty list
    if not results:
        print("No detections were produced; skipping COCO evaluation.")
        return

    # create the COCO detection object and the evaluator
    coco_dt = coco_gt.loadRes(results)
    coco_eval = COCOeval(coco_gt, coco_dt, "bbox")

    # Score only the images that were actually run through the model;
    # otherwise unvisited ground-truth images count as missed detections.
    coco_eval.params.imgIds = sorted(set(evaluated_img_ids))

    # matches predictions to GT boxes
    coco_eval.evaluate()

    # computes precision & recall across IoU thresholds
    coco_eval.accumulate()

    # print metrics (mean average precision (mAP), AP@0.50, AP@0.75, average recall (AR))
    coco_eval.summarize()

# Evaluate on a small prefix of the test split for speed. The prefix length is
# clamped to the split size so a short split cannot cause an IndexError.
test_dataset = torch.utils.data.Subset(
    test_dataset, list(range(min(50, len(test_dataset))))
)

test_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,  # keep evaluation order deterministic
    collate_fn=collate_fn,
    num_workers=2,
    pin_memory=True
)

# Prints the COCO bbox metrics for the test subset.
evaluate_model(model, test_loader, device)
