# 2025 DL Lab5: Object Detection on Pascal VOC

Before we start, please put **your name** and **SID** in the following format: <br>
Hi I'm 陸仁賈, 314831000.

**Your Answer:**    
Hi I'm 吳禎哲, 313833003

## Overview

This project focuses on object detection using the Pascal VOC dataset. 

The goal is to identify and locate various objects within images by training and evaluating detection models.
 
The dataset provides annotated images across multiple categories, making it a standard benchmark for evaluating object detection performance.


## Kaggle Competition
Kaggle is an online community of data scientists and machine learning practitioners. Kaggle allows users to find and publish datasets, explore and build models in a web-based data-science environment, work with other data scientists and machine learning engineers, and enter competitions to solve data science challenges.

This assignment uses Kaggle to calculate your grade.  
Please use this [**LINK**](https://www.kaggle.com/t/e86ea95cb007416a85a07d8729ac838e) to join the competition.

## Unzip Data

Unzip `dataset.zip` 

+ `vocall_train.txt` : list for the training set
+ `vocall_val.txt` : list for the validation set
+ `vocall_test.txt` : list for the test set
+ `image/` : contains all images.


The train set contains 8,218 images, the val set contains 3,823 images, and the test set contains 8,920 images.


#### You are allowed to use a **backbone model**, but only those available from the **timm package** (https://huggingface.co/timm/models).

# Import package

In [None]:
import os
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import numpy as np
from torch.amp import autocast, GradScaler
from src.yolo import getODmodel
from yolo_loss import YOLOv3Loss
from src.dataset import VocDetectorDataset, train_data_pipelines, test_data_pipelines, collate_fn
from src.eval_voc import evaluate
from src.config import GRID_SIZES, ANCHORS
from torch.optim.lr_scheduler import CosineAnnealingLR
import weave
import wandb
# SECURITY: never hard-code a W&B API key in a notebook -- anything committed
# here is readable by everyone who can see the file. The key that used to be
# assigned on this line must be treated as leaked and revoked in the W&B
# account settings.
# wandb.login() reads the key from the WANDB_API_KEY environment variable or
# from a previous `wandb login`, and prompts for it otherwise.
wandb.login()
weave.init('LAB5-YOLOv3-Object-Detection')

In [None]:
# ---- Hyperparameters ----
# Use the first CUDA device when one is available, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Training schedule.
num_epochs, batch_size = 50, 64
learning_rate = 1e-3

# Weights of the individual loss terms (handed to the loss criterion below).
lambda_coord, lambda_obj, lambda_noobj, lambda_class = 5.0, 1.0, 0.5, 1.0

In [None]:
# Start Weights & Biases tracking and record the hyperparameters with the run
project_name = 'LAB5-YOLOv3-Object-Detection'
run_config = dict(
    device=str(device),
    num_epochs=num_epochs,
    batch_size=batch_size,
    learning_rate=learning_rate,
    lambda_coord=lambda_coord,
    lambda_obj=lambda_obj,
    lambda_noobj=lambda_noobj,
    lambda_class=lambda_class,
)
run = wandb.init(project=project_name, config=run_config)

# Declare 'epoch' as a metric and use it as the step axis for every
# train/* and val/* series
wandb.define_metric('epoch')
for metric_pattern in ('train/*', 'val/*'):
    wandb.define_metric(metric_pattern, step_metric='epoch')


In [None]:
# Data paths. Train and val images come from the same folder; only the
# annotation list differs.
file_root_train = './dataset/image/'
annotation_file_train = './dataset/vocall_train.txt'
file_root_val = './dataset/image/'
annotation_file_val = './dataset/vocall_val.txt'


def _make_loader(dataset, shuffle):
    """Wrap *dataset* in a DataLoader using the notebook-wide batch settings."""
    return DataLoader(
        dataset,
        batch_size=batch_size,
        collate_fn=collate_fn,
        shuffle=shuffle,
        num_workers=4,
    )


# Create datasets
print('Loading datasets...')
train_dataset = VocDetectorDataset(
    root_img_dir=file_root_train,
    dataset_file=annotation_file_train,
    train=True,
    transform=train_data_pipelines,
    grid_sizes=GRID_SIZES,
    encode_target=True,
)
train_loader = _make_loader(train_dataset, shuffle=True)
print(f'Loaded {len(train_dataset)} train images')

# Validation data with encoded targets: fed to the loss criterion in the
# training loop to compute the validation loss.
val_dataset = VocDetectorDataset(
    root_img_dir=file_root_val,
    dataset_file=annotation_file_val,
    train=False,
    transform=test_data_pipelines,
    grid_sizes=GRID_SIZES,
    encode_target=True,
)
val_loader = _make_loader(val_dataset, shuffle=False)

# Same validation list with encode_target=False: passed to evaluate() to
# compute the validation mAP.
eval_dataset = VocDetectorDataset(
    root_img_dir=file_root_val,
    dataset_file=annotation_file_val,
    train=False,
    transform=test_data_pipelines,
    grid_sizes=GRID_SIZES,
    encode_target=False,
)
eval_loader = _make_loader(eval_dataset, shuffle=False)
print(f'Loaded {len(val_dataset)} val images')

## Initialization

### Only backbone models available in timm are allowed (https://huggingface.co/timm/models).
### You can change the model name in the yolo class.

In [None]:
# Set load_network_path to a checkpoint file (e.g. 'checkpoints/best_detector.pth')
# to continue from saved weights; leave it as None to start from the model
# returned by getODmodel(pretrained=...).
load_network_path = None #'checkpoints/best_detector.pth' 
pretrained = True
model = getODmodel(pretrained=pretrained).to(device)
if load_network_path is not None:
    print(f'Loading saved network from {load_network_path}')
    # The training loop saves model.state_dict(), so load it the same way.
    # map_location lets a checkpoint written on GPU be restored on `device`.
    model.load_state_dict(torch.load(load_network_path, map_location=device))

# Have W&B track the model's gradients (log='gradients', log_freq=100)
wandb.watch(model, log='gradients', log_freq=100)


### Some training utilities; mixed precision is used when CUDA is available

In [None]:
# Create loss and optimizer
criterion = YOLOv3Loss(lambda_coord, lambda_obj, lambda_noobj, lambda_class, ANCHORS).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=5e-4)
lr_scheduler = CosineAnnealingLR(optimizer, T_max=num_epochs, eta_min=1e-6)
use_amp = torch.cuda.is_available()
scaler = GradScaler(enabled=use_amp)

### Training Loop

In [None]:
# Per-epoch record: one value is appended to each list at the end of an epoch
import numpy as np
history = {
    series: []
    for series in (
        'train_total',
        'val_total',
        'map',
        *(f'train_{part}' for part in ('box', 'obj', 'noobj', 'cls')),
        *(f'val_{part}' for part in ('box', 'obj', 'noobj', 'cls')),
    )
}


In [None]:
# Training loop: one pass over train_loader per epoch, followed by a
# validation pass, checkpointing, W&B logging and (every 5 epochs) mAP.
print('\nStarting training...')
torch.cuda.empty_cache()
best_val_loss = np.inf
loss_keys = ('total', 'box', 'obj', 'noobj', 'cls')

# Create the checkpoint directory up front: the unconditional torch.save calls
# below must not depend on the "best model" branch having run first (it never
# runs if the validation loss is NaN, because `inf > nan` is False).
os.makedirs('checkpoints', exist_ok=True)

for epoch in range(num_epochs):
    model.train()
    print(f'\n\nStarting epoch {epoch + 1} / {num_epochs}')

    # Learning rate in effect for this epoch. Read it before lr_scheduler.step()
    # so the printed/logged value matches the epoch it was used in, and keep it
    # in its own name so the `learning_rate` hyperparameter is not overwritten.
    current_lr = optimizer.param_groups[0]['lr']
    print(f'Learning Rate for this epoch: {current_lr}')

    # Running sums of each training loss term (averaged at the end of the epoch)
    train_sum = {k: 0.0 for k in loss_keys}
    train_batches = 0

    for i, (images, target) in enumerate(train_loader):
        # Move to device
        images = images.to(device)
        target = [t.to(device) for t in target]
        # Forward pass
        optimizer.zero_grad()
        with autocast("cuda", enabled=use_amp):
            pred = model(images)
            # pred and target are lists with one entry per scale
            loss_dict = criterion(pred, target)
        # Backward pass with mixed precision support
        scaler.scale(loss_dict['total']).backward()
        scaler.step(optimizer)
        scaler.update()

        # Accumulate training statistics
        for k in train_sum:
            train_sum[k] += loss_dict[k].item()
        train_batches += 1

        # Print progress
        if i % 50 == 0:
            outstring = f'Epoch [{epoch+1}/{num_epochs}], Iter [{i+1}/{len(train_loader)}], Loss: '
            outstring += ', '.join(f"{name}={value:.3f}" for name, value in loss_dict.items())
            print(outstring)

    # Epoch end: advance the schedule and store the average training losses
    lr_scheduler.step()

    train_avg = {k: train_sum[k] / max(train_batches, 1) for k in train_sum}
    for k in ('total', 'box', 'obj', 'noobj', 'cls'):
        history[f'train_{k}'].append(train_avg[k])

    # Validation: average each loss term over the validation batches
    model.eval()
    val_sum = {k: 0.0 for k in loss_keys}
    with torch.no_grad():
        for images, target in val_loader:
            # Move to device
            images = images.to(device)
            target = [t.to(device) for t in target]
            # Forward pass
            pred = model(images)
            loss_dict = criterion(pred, target)
            for k in val_sum:
                val_sum[k] += loss_dict[k].item()

    val_avg = {k: val_sum[k] / max(len(val_loader), 1) for k in val_sum}
    print(f'Validation Loss: {val_avg["total"]:.4f}')

    # Save best model (lowest average validation loss so far)
    if best_val_loss > val_avg['total']:
        best_val_loss = val_avg['total']
        print(f'Updating best val loss: {best_val_loss:.5f}')
        torch.save(model.state_dict(), 'checkpoints/best_detector.pth')

    # Save milestone checkpoints
    if (epoch + 1) in (5, 10, 20, 30, 40):
        torch.save(model.state_dict(), f'checkpoints/detector_epoch_{epoch+1}.pth')

    # Always keep the latest weights
    torch.save(model.state_dict(), 'checkpoints/detector.pth')

    # Store the average validation losses
    for k in ('total', 'box', 'obj', 'noobj', 'cls'):
        history[f'val_{k}'].append(val_avg[k])

    # W&B: log the per-epoch statistics
    wandb.log({
        'epoch': epoch + 1,
        'lr': current_lr,
        'train/total': train_avg['total'],
        'train/box': train_avg['box'],
        'train/obj': train_avg['obj'],
        'train/noobj': train_avg['noobj'],
        'train/cls': train_avg['cls'],
        'val/total': val_avg['total'],
        'val/box': val_avg['box'],
        'val/obj': val_avg['obj'],
        'val/noobj': val_avg['noobj'],
        'val/cls': val_avg['cls'],
    }, step=epoch+1)

    # Evaluate on the val set (mAP is computed once every 5 epochs)
    if (epoch + 1) % 5 == 0:
        print('\nEvaluating on validation set...')
        val_aps = evaluate(model, eval_loader)
        cur_map = float(np.mean(val_aps)) if len(val_aps) else 0.0
        print(f'Epoch {epoch+1}, mAP: {cur_map:.4f}')
        history['map'].append(cur_map)
        wandb.log({'epoch': epoch + 1, 'val/mAP': cur_map}, step=epoch+1)
    else:
        # NaN placeholder so history['map'] stays the same length as the losses
        history['map'].append(float('nan'))


In [None]:
# Visualisation: total loss and mAP per epoch
import matplotlib.pyplot as plt

epochs = list(range(1, len(history['train_total']) + 1))

plt.figure(figsize=(12,4))
plt.subplot(1,2,1)
plt.plot(epochs, history['train_total'], label='train')
plt.plot(epochs, history['val_total'], label='val')
plt.title('Loss (total)')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.grid(True, alpha=0.3)
plt.legend()

plt.subplot(1,2,2)
# history['map'] holds NaN on epochs where mAP was not computed. A markerless
# line draws nothing for points isolated between NaNs, so the panel would be
# empty; keep only the evaluated epochs and mark each one.
map_values = np.asarray(history['map'], dtype=float)
evaluated = ~np.isnan(map_values)
plt.plot(
    np.asarray(epochs)[evaluated],
    map_values[evaluated],
    marker='o',
    label='mAP',
    color='tab:green',
)
plt.title('ACC (mAP)')
plt.xlabel('Epoch')
plt.ylabel('mAP')
plt.ylim(0, 1)
plt.grid(True, alpha=0.3)
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
# Loss curves (total and per component) plus validation mAP; both figures are
# also sent to Weights & Biases on a best-effort basis
import matplotlib.pyplot as plt
import numpy as np

epochs = np.arange(1, len(history['train_total']) + 1)

# === Figure 1: Total Loss + mAP ===
fig_total_map, (loss_ax, map_ax) = plt.subplots(1, 2, figsize=(12, 4))

loss_ax.plot(epochs, history['train_total'], label='train_total')
loss_ax.plot(epochs, history['val_total'], label='val_total')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.set_title('Total Loss')
loss_ax.legend()
loss_ax.grid(True, alpha=0.3)

# Entries for non-evaluated epochs may be None or NaN; normalise both to NaN
map_vals = np.array(
    [entry if entry is not None else np.nan for entry in history['map']],
    dtype=float,
)
map_ax.plot(epochs, map_vals, marker='o', label='mAP')
map_ax.set_xlabel('Epoch')
map_ax.set_ylabel('mAP')
map_ax.set_title('Validation mAP (ACC)')
map_ax.grid(True, alpha=0.3)
map_ax.legend()

fig_total_map.tight_layout()
plt.show()

# === Figure 2: component-wise loss trends (2x2 grid) ===
comp_keys = [('box','Box'), ('obj','Obj'), ('noobj','NoObj'), ('cls','Cls')]
fig_components, component_axes = plt.subplots(2, 2, figsize=(12, 8))
for panel, (k, name) in zip(component_axes.flat, comp_keys):
    for split in ('train', 'val'):
        panel.plot(epochs, history[f'{split}_{k}'], label=f'{split}_{name.lower()}')
    panel.set_xlabel('Epoch')
    panel.set_ylabel('Loss')
    panel.set_title(f'{name} Loss')
    panel.grid(True, alpha=0.3)
    panel.legend()
fig_components.tight_layout()
plt.show()

# === Log figures to Weights & Biases ===
figures_to_log = {
    'fig/loss_total_map': fig_total_map,
    'fig/loss_components': fig_components,
}
try:
    wandb.log(figures_to_log)
except Exception as e:
    # Logging the figures is optional; report the failure and carry on
    print('wandb.log(fig) skipped:', e)


In [None]:
# Close the W&B run (optional). A failure here is only reported, not raised.
try:
    wandb.finish()
except Exception as finish_error:
    print('wandb.finish() skipped:', finish_error)


# Kaggle submission

### Predict Result

Predict the results on the test set, then upload them to [Kaggle](https://www.kaggle.com/t/e86ea95cb007416a85a07d8729ac838e).

**How to upload**

1. Click the folder icon in the left hand side of Colab.
2. Right click "result.csv". Select "Download"
3. Go to Kaggle and click "Submit Predictions".
4. Upload the result.csv
5. The system will automatically calculate the accuracy on 50% of the dataset and publish the result to the leaderboard.


In [None]:
# IPython shell escape: runs predict_test.py with the `python` found on PATH.
# NOTE(review): the upload steps above expect this to produce result.csv -- confirm in predict_test.py.
!python predict_test.py