In [1]:
import torch
from torch import nn
import numpy as np
import pandas as pd
import os, cv2
import xml.etree.ElementTree as ET
import matplotlib.pyplot as plt

import torch
from torch.utils.data import Dataset
from PIL import Image
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
import torchvision.models as models

In [2]:
image_folder = "data/train/images"
annotation_folder = "data/train/labels"

In [3]:
class YOLOdataset(Dataset):
    def __init__(self, image_dir, annot_dir, S=4, B=2, C=20, transform=None):
        self.image_dir = image_dir
        self.annot_dir = annot_dir
        self.image_files = [f for f in os.listdir(image_dir) if f.endswith(".jpg")]
        self.S, self.B, self.C = S, B, C
        self.transform = transform

        self.classes = [
            "aeroplane", "bicycle", "bird", "boat", "bottle",
            "bus", "car", "cat", "chair", "cow", "diningtable",
            "dog", "horse", "motorbike", "person", "pottedplant",
            "sheep", "sofa", "train", "tvmonitor"
        ]

    def __len__(self):
        return len(self.image_files)

    def __getitem__(self, idx):
        img_filename = self.image_files[idx]
        image_path = os.path.join(self.image_dir, img_filename)
        annot_path = os.path.join(self.annot_dir, img_filename.replace(".jpg", ".xml"))

        image = Image.open(image_path).convert("RGB")
        boxes, labels = self.parse_voc_xml(annot_path, image.size)
        target = self.encode_target(boxes, labels)

        if self.transform:
            image = self.transform(image)
        return image, target

    def parse_voc_xml(self, xml_path, image_size):
        boxes, labels = [], []
        tree = ET.parse(xml_path)
        root = tree.getroot()
        w, h = image_size

        for obj in root.findall("object"):
            label = obj.find("name").text
            xml_box = obj.find("bndbox")
            xmin = float(xml_box.find("xmin").text)
            ymin = float(xml_box.find("ymin").text)
            xmax = float(xml_box.find("xmax").text)
            ymax = float(xml_box.find("ymax").text)

            x_center = ((xmin + xmax) / 2) / w
            y_center = ((ymin + ymax) / 2) / h
            box_w = (xmax - xmin) / w
            box_h = (ymax - ymin) / h

            boxes.append([x_center, y_center, box_w, box_h])
            labels.append(self.classes.index(label))
        return boxes, labels

    def encode_target(self, boxes, labels):
        S, B, C = self.S, self.B, self.C
        target = torch.zeros((S, S, C + 5*B))

        for box, label in zip(boxes, labels):
            x, y, w, h = box
            grid_x = min(int(S * x), S - 1)
            grid_y = min(int(S * y), S - 1)
            x_cell = S * x - grid_x
            y_cell = S * y - grid_y

            # fill first empty box in cell
            for b in range(B):
                if target[grid_y, grid_x, b*5 + 4] == 0:  # conf=0 → empty
                    target[grid_y, grid_x, b*5:b*5+5] = torch.tensor([x_cell, y_cell, w, h, 1])
                    break
        
            # class one-hot
            target[grid_y, grid_x, 5*B + label] = 1

        return target

In [4]:
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

In [5]:
train_dataset = YOLOdataset(
    image_dir="data/train/images",
    annot_dir="data/train/labels",
    transform=transform
)

test_dataset = YOLOdataset(
    image_dir="data/test/images",
    annot_dir="data/test/labels",
    transform=transform
)

image, target = train_dataset[0]
print(image.shape)  # [3, 224, 224]
print(target.shape) # [4, 4, 30]

torch.Size([3, 224, 224])
torch.Size([4, 4, 30])


In [6]:
train_loader = DataLoader(train_dataset,batch_size=4,shuffle=True,num_workers=0)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)

In [7]:
for imgs, targets in train_loader:
    print("Batch image shape:", imgs.shape)    # [batch_size, 3, 448, 448]
    print("Batch target shape:", targets.shape) # [batch_size, 7, 7, 30]
    break

Batch image shape: torch.Size([4, 3, 224, 224])
Batch target shape: torch.Size([4, 4, 4, 30])


In [8]:
def intersection_over_union(boxes_preds, boxes_labels):
    """
    boxes_preds: [N, S, S, 4] (x, y, w, h)
    boxes_labels: same
    """
    box1_x1 = boxes_preds[..., 0:1] - boxes_preds[..., 2:3] / 2
    box1_y1 = boxes_preds[..., 1:2] - boxes_preds[..., 3:4] / 2
    box1_x2 = boxes_preds[..., 0:1] + boxes_preds[..., 2:3] / 2
    box1_y2 = boxes_preds[..., 1:2] + boxes_preds[..., 3:4] / 2

    box2_x1 = boxes_labels[..., 0:1] - boxes_labels[..., 2:3] / 2
    box2_y1 = boxes_labels[..., 1:2] - boxes_labels[..., 3:4] / 2
    box2_x2 = boxes_labels[..., 0:1] + boxes_labels[..., 2:3] / 2
    box2_y2 = boxes_labels[..., 1:2] + boxes_labels[..., 3:4] / 2

    x1 = torch.max(box1_x1, box2_x1)
    y1 = torch.max(box1_y1, box2_y1)
    x2 = torch.min(box1_x2, box2_x2)
    y2 = torch.min(box1_y2, box2_y2)

    intersection = (x2 - x1).clamp(0) * (y2 - y1).clamp(0)
    box1_area = torch.abs((box1_x2 - box1_x1) * (box1_y2 - box1_y1))
    box2_area = torch.abs((box2_x2 - box2_x1) * (box2_y2 - box2_y1))

    return intersection / (box1_area + box2_area - intersection + 1e-6)

In [9]:
class YoloLoss(nn.Module):
    def __init__(self, S=4, B=2, C=20, λ_coord=5, λ_noobj=0.5):
        super(YoloLoss, self).__init__()
        self.mse = nn.MSELoss(reduction="sum")
        self.S = S
        self.B = B
        self.C = C
        self.lambda_coord = λ_coord
        self.lambda_noobj = λ_noobj

    def forward(self, predictions, target):
        # [N, S, S, C + 5B]
        predictions = predictions.reshape(-1, self.S, self.S, self.C + 5 * self.B)

        # --- CLASS LOSS ---
        class_pred = predictions[..., :self.C]
        class_target = target[..., 5 * self.B:]
        obj_mask = target[..., 4].unsqueeze(-1)
        class_loss = self.mse(obj_mask * class_pred, obj_mask * class_target)

        # --- BOX PREDICTIONS ---
        box_pred1 = predictions[..., self.C:self.C + 5]
        box_pred2 = predictions[..., self.C + 5:self.C + 10]
        box_target = target[..., 0:5]

        # --- IOU selection ---
        iou1 = intersection_over_union(box_pred1[..., :4], box_target[..., :4])
        iou2 = intersection_over_union(box_pred2[..., :4], box_target[..., :4])

        ious = torch.stack([iou1, iou2], dim=0)
        best_box = ious.argmax(0)  # shape [N, S, S, 1]
        best_box = best_box.unsqueeze(-1)

        box_pred = box_pred1 * (best_box == 0) + box_pred2 * (best_box == 1)

        # --- COORDINATE LOSS ---
        box_pred_xy = obj_mask * box_pred[..., 0:2]
        box_target_xy = obj_mask * box_target[..., 0:2]
        coord_loss_xy = self.mse(box_pred_xy, box_target_xy)

        box_pred_wh = torch.sign(box_pred[..., 2:4]) * torch.sqrt(torch.abs(box_pred[..., 2:4]) + 1e-6)
        box_target_wh = torch.sqrt(box_target[..., 2:4])
        coord_loss_wh = self.mse(obj_mask * box_pred_wh, obj_mask * box_target_wh)

        coord_loss = self.lambda_coord * (coord_loss_xy + coord_loss_wh)

        # --- CONFIDENCE LOSS ---
        conf_pred = box_pred[..., 4]
        conf_target = iou1 * (best_box.squeeze(-1) == 0) + iou2 * (best_box.squeeze(-1) == 1)
        conf_target = conf_target * target[..., 4]  # only for object cells

        obj_conf_loss = self.mse(obj_mask.squeeze(-1) * conf_pred, obj_mask.squeeze(-1) * conf_target)

        # --- NO OBJECT LOSS ---
        noobj_mask = 1 - obj_mask
        noobj_conf_loss1 = self.mse(noobj_mask.squeeze(-1) * box_pred1[..., 4], torch.zeros_like(conf_pred))
        noobj_conf_loss2 = self.mse(noobj_mask.squeeze(-1) * box_pred2[..., 4], torch.zeros_like(conf_pred))
        noobj_conf_loss = self.lambda_noobj * (noobj_conf_loss1 + noobj_conf_loss2)

        # --- TOTAL LOSS ---
        total_loss = coord_loss + obj_conf_loss + noobj_conf_loss + class_loss
        return total_loss

In [10]:
loss_fn = YoloLoss(S=4, B=2, C=20)

In [11]:
class YOLOv1(nn.Module):
    def __init__(self, S=4, B=2, C=20, backbone='resnet18', pretrained=True):
        super(YOLOv1, self).__init__()
        self.S = S
        self.B = B
        self.C = C

        base_model = models.resnet18(weights='IMAGENET1K_V1' if pretrained else None)
        layers = list(base_model.children())[:-2]  # remove avgpool & fc layer
        self.backbone = nn.Sequential(*layers)
        out_channels = 512  # resnet18 last conv output

        # ===== 2️⃣ DETECTION HEAD =====
        # The head converts [Batch, out_channels, feature_h, feature_w]
        # to [Batch, S, S, C + 5B]

        self.conv_head = nn.Sequential(
            nn.Conv2d(out_channels, 1024, kernel_size=3, padding=1),
            nn.BatchNorm2d(1024),
            nn.LeakyReLU(0.1, inplace=True),

            nn.Conv2d(1024, (C + 5 * B), kernel_size=1)  # final prediction map
        )

    def forward(self, x):
        x = self.backbone(x)
        x = self.conv_head(x)

        # Reshape output -> [N, S, S, C + 5B]
        # (feature map size should match SxS)
        x = nn.functional.adaptive_avg_pool2d(x, (self.S, self.S))
        x = x.permute(0, 2, 3, 1)  # [N, S, S, C + 5B]
        return x

In [12]:
device = "cuda"  if torch.cuda.is_available() else "cpu"
device

'cuda'

In [13]:
model = YOLOv1(S=4, B=2, C=20)
model.to(device)

YOLOv1(
  (backbone): Sequential(
    (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (4): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)

In [14]:
import logging
import os
from datetime import datetime
from tqdm import tqdm
import matplotlib.pyplot as plt

# ==== CONFIG ====
epochs = 200
learning_rate = 1e-4
save_checkpoint = True
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
img_scale = 224
amp = False

# ==== LOGGING SETUP ====
os.makedirs("logs1", exist_ok=True)
os.makedirs("checkpoints1", exist_ok=True)
os.makedirs("detections1", exist_ok=True)

timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
log_file = f"logs1/train_{timestamp}.log"

logging.basicConfig(
    filename=log_file,
    filemode='w',
    level=logging.INFO,
    format='%(asctime)s | %(levelname)s | %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)

logging.info(f"Starting training at {timestamp}")
logging.info(f"Device: {device}")

# ==== MODEL / LOSS / OPTIMIZER ====
model = YOLOv1(S=4, B=2, C=20).to(device)
loss_fn = YoloLoss(S=4, B=2, C=20)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=5e-4)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.1)

# ==== CHECKPOINT HANDLING ====
# checkpoint_path = "checkpoints/best_model.pth"
checkpoint_path = "checkpoints1/last_checkpoint.pth"
best_loss = float('inf')
start_epoch = 0

if os.path.exists(checkpoint_path):
    checkpoint = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
    start_epoch = checkpoint['epoch'] + 1
    best_loss = checkpoint.get('best_loss', float('inf'))
    logging.info(f"✅ Resumed from epoch {start_epoch}, best loss so far: {best_loss:.4f}")

# ==== TRAINING LOOP ====
for epoch in range(start_epoch, epochs):
    model.train()
    running_loss = 0.0

    loop = tqdm(train_loader, total=len(train_loader), desc=f"Epoch [{epoch+1}/{epochs}]")
    for imgs, targets in loop:
        imgs, targets = imgs.to(device), targets.to(device)

        # forward
        preds = model(imgs)
        loss = loss_fn(preds, targets)

        # backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        loop.set_postfix(loss=loss.item())

    avg_loss = running_loss / len(train_loader)
    scheduler.step()

    # ==== LOGGING ====
    logging.info(f"Epoch [{epoch+1}/{epochs}] | Avg Loss: {avg_loss:.4f}")
    print(f"✅ Epoch [{epoch+1}/{epochs}] | Avg Loss: {avg_loss:.4f}")

    # ==== SAVE CHECKPOINT ====
    if save_checkpoint:
        state = {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            'best_loss': best_loss
        }
        torch.save(state, "checkpoints1/last_checkpoint.pth")

    # ==== SAVE BEST MODEL ====
    if avg_loss < best_loss:
        best_loss = avg_loss
        torch.save(state, checkpoint_path)
        logging.info(f"💾 Saved Best Model at epoch {epoch+1} (Loss: {best_loss:.4f})")

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
Epoch [50/200]: 100%|███████████████████████████████████████████████████| 5534/5534 [13:13<00:00,  6.98it/s, loss=2.72]


✅ Epoch [50/200] | Avg Loss: 2.8309


Epoch [51/200]: 100%|███████████████████████████████████████████████████| 5534/5534 [11:26<00:00,  8.06it/s, loss=2.62]


✅ Epoch [51/200] | Avg Loss: 2.8415


Epoch [52/200]: 100%|███████████████████████████████████████████████████| 5534/5534 [11:14<00:00,  8.21it/s, loss=1.42]


✅ Epoch [52/200] | Avg Loss: 2.8355


Epoch [53/200]: 100%|███████████████████████████████████████████████████| 5534/5534 [11:35<00:00,  7.95it/s, loss=1.44]


✅ Epoch [53/200] | Avg Loss: 2.8205


Epoch [54/200]: 100%|███████████████████████████████████████████████████| 5534/5534 [11:50<00:00,  7.78it/s, loss=4.13]


✅ Epoch [54/200] | Avg Loss: 2.7972


Epoch [55/200]: 100%|███████████████████████████████████████████████████| 5534/5534 [12:04<00:00,  7.64it/s, loss=1.84]


✅ Epoch [55/200] | Avg Loss: 2.8061


Epoch [56/200]: 100%|███████████████████████████████████████████████████| 5534/5534 [13:22<00:00,  6.89it/s, loss=2.72]


✅ Epoch [56/200] | Avg Loss: 2.8129


Epoch [57/200]: 100%|███████████████████████████████████████████████████| 5534/5534 [12:40<00:00,  7.27it/s, loss=2.02]


✅ Epoch [57/200] | Avg Loss: 2.7829


Epoch [58/200]: 100%|███████████████████████████████████████████████████| 5534/5534 [12:07<00:00,  7.60it/s, loss=1.75]


✅ Epoch [58/200] | Avg Loss: 2.7842


Epoch [59/200]: 100%|███████████████████████████████████████████████████| 5534/5534 [11:55<00:00,  7.74it/s, loss=2.01]


✅ Epoch [59/200] | Avg Loss: 2.7830


Epoch [60/200]: 100%|███████████████████████████████████████████████████| 5534/5534 [12:22<00:00,  7.45it/s, loss=4.45]


✅ Epoch [60/200] | Avg Loss: 2.7950


Epoch [61/200]: 100%|███████████████████████████████████████████████████| 5534/5534 [12:16<00:00,  7.52it/s, loss=1.07]


✅ Epoch [61/200] | Avg Loss: 2.7760


Epoch [62/200]: 100%|███████████████████████████████████████████████████| 5534/5534 [12:53<00:00,  7.15it/s, loss=2.09]


✅ Epoch [62/200] | Avg Loss: 2.7655


Epoch [63/200]:   1%|▎                                                    | 39/5534 [00:05<12:57,  7.07it/s, loss=2.45]


KeyboardInterrupt: 