# Task 3 â€” Fine-Tuning CSRNet on Mall CCTV Frames

In this task, the pre-trained CSRNet model is adapted to a new indoor CCTV
environment (shopping mall scenes). Since the dataset contains only images
without ground-truth annotations, a weakly supervised fine-tuning strategy
is adopted.

YOLO is used to generate pseudo head-point annotations, which are converted
into density maps. CSRNet is then fine-tuned on these pseudo-labels to improve
performance in real-world indoor surveillance scenarios.

In [1]:
# ======================================================
# Imports & Global Setup
# ======================================================

import os
import glob
import cv2
import numpy as np
from scipy.ndimage import gaussian_filter

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

print("PyTorch version:", torch.__version__)

PyTorch version: 2.9.0+cpu


In [3]:
# ======================================================
# Device Configuration
# ======================================================

DEVICE = torch.device("cpu")
print("Using device:", DEVICE)

Using device: cpu


In [None]:
# ====================
# CSRNet Architecture 
# ====================

from torchvision import models

class CSRNet(nn.Module):
    def __init__(self, load_pretrained_vgg=False):
        super().__init__()

        if load_pretrained_vgg:
            from torchvision.models import VGG16_Weights
            vgg = models.vgg16(weights=VGG16_Weights.IMAGENET1K_V1)
        else:
            vgg = models.vgg16(weights=None)

        # Frontend: VGG16 conv layers up to conv4_3
        self.frontend = nn.Sequential(*list(vgg.features.children())[:23])

        # Backend: dilated convolutions (ENDS AT 64 CHANNELS)
        self.backend = nn.Sequential(
            nn.Conv2d(512, 512, 3, padding=2, dilation=2),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, 3, padding=2, dilation=2),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, 3, padding=2, dilation=2),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 256, 3, padding=2, dilation=2),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 128, 3, padding=2, dilation=2),
            nn.ReLU(inplace=True),
            nn.Conv2d(128, 64, 3, padding=2, dilation=2),
            nn.ReLU(inplace=True),
        )

        self.output_layer = nn.Conv2d(64, 1, 1)

    def forward(self, x):
        x = self.frontend(x)
        x = self.backend(x)
        x = self.output_layer(x)
        return x


In [None]:
# ======================================================
# Load Pre-Trained CSRNet Weights
# ======================================================

PRETRAINED_PATH = "csrnet_weights.pth"   # from Task 1

assert os.path.exists(PRETRAINED_PATH), "Pre-trained CSRNet weights not found"

model = CSRNet().to(DEVICE)
state = torch.load(PRETRAINED_PATH, map_location=DEVICE)
model.load_state_dict(state)

print("Loaded pre-trained CSRNet weights.")

Loaded pre-trained CSRNet weights.


In [7]:
# ======================================================
# Freeze Frontend Layers
# ======================================================

for name, param in model.named_parameters():
    if "frontend" in name:
        param.requires_grad = False

print("Frozen CSRNet frontend layers.")


Frozen CSRNet frontend layers.


In [11]:
# ======================================================
# Inspect Mall Dataset
# ======================================================

MALL_DATA_ROOT = "C:\\Users\\ARTI\\OneDrive\\Desktop\\projects\\cd\\archive (1)" 

image_paths = []
for ext in ("*.jpg", "*.png", "*.jpeg"):
    image_paths.extend(
        glob.glob(os.path.join(MALL_DATA_ROOT, "**", ext), recursive=True)
    )

assert len(image_paths) > 0, "No images found in Mall dataset"

print(f"Total Mall frames found: {len(image_paths)}")
print("Sample image:", image_paths[0])

Total Mall frames found: 2000
Sample image: C:\Users\ARTI\OneDrive\Desktop\projects\cd\archive (1)\frames\frames\seq_000001.jpg


In [None]:
# ======================================================
# Generate Density Map from YOLO Detections
# ======================================================

def generate_density_from_boxes(boxes, shape, sigma=6):
    density = np.zeros(shape, dtype=np.float32)

    for box in boxes:
        x1, y1, x2, y2 = map(int, box)
        cx = int((x1 + x2) / 2)
        cy = int((y1 + y2) / 2)

        if 0 <= cy < shape[0] and 0 <= cx < shape[1]:
            density[cy, cx] += 1

    density = gaussian_filter(density, sigma=sigma)
    return density

In [17]:
# ======================================================
# YOLO Initialization (for pseudo-label generation)
# ======================================================

try:
    from ultralytics import YOLO
except ImportError as e:
    raise ImportError(
        "Ultralytics YOLO not installed.\n"
        "Install with: pip install ultralytics"
    )

# Load lightweight YOLOv8 model (CPU-safe)
yolo = YOLO("yolov8n.pt")
yolo.fuse()

print("YOLO loaded successfully for pseudo-labeling.")

YOLOv8n summary (fused): 72 layers, 3,151,904 parameters, 0 gradients, 8.7 GFLOPs
YOLO loaded successfully for pseudo-labeling.


In [18]:
# ======================================================
# Mall CCTV Dataset (Pseudo-Labeled)
# ======================================================

class MallPseudoDataset(Dataset):
    def __init__(self, image_paths):
        self.image_paths = image_paths

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        img_path = self.image_paths[idx]

        image = cv2.imread(img_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # YOLO pseudo-labels
        results = yolo(image, conf=0.4, classes=[0])[0]
        boxes = results.boxes.xyxy.cpu().numpy() if results.boxes is not None else []

        density = generate_density_from_boxes(
            boxes,
            shape=image.shape[:2]
        )

        image = image.astype(np.float32) / 255.0
        image = torch.from_numpy(image).permute(2, 0, 1)
        density = torch.from_numpy(density).unsqueeze(0)

        return image, density


In [19]:
# ======================================================
# Fine-Tuning Configuration
# ======================================================

FT_EPOCHS = 3
FT_LR = 1e-6
BATCH_SIZE = 1

In [20]:
# ======================================================
# DataLoader & Optimizer
# ======================================================

dataset = MallPseudoDataset(image_paths)
loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(
    filter(lambda p: p.requires_grad, model.parameters()),
    lr=FT_LR
)

In [21]:
# ======================================================
# Fine-Tuning Loop
# ======================================================

model.train()

for epoch in range(1, FT_EPOCHS + 1):
    running_loss = 0.0

    for img, gt in loader:
        img = img.to(DEVICE)
        gt = gt.to(DEVICE)

        pred = model(img)

        if pred.shape != gt.shape:
            gt = nn.functional.interpolate(gt, size=pred.shape[2:])

        loss = criterion(pred, gt)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    avg_loss = running_loss / len(loader)
    print(f"[Task 3] Epoch {epoch}/{FT_EPOCHS} - Loss: {avg_loss:.6f}")



0: 480x640 10 persons, 372.5ms
Speed: 19.7ms preprocess, 372.5ms inference, 42.6ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 20 persons, 180.5ms
Speed: 4.1ms preprocess, 180.5ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 9 persons, 224.8ms
Speed: 9.0ms preprocess, 224.8ms inference, 2.1ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 7 persons, 181.8ms
Speed: 2.9ms preprocess, 181.8ms inference, 1.8ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 14 persons, 236.3ms
Speed: 6.2ms preprocess, 236.3ms inference, 9.6ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 14 persons, 447.1ms
Speed: 12.5ms preprocess, 447.1ms inference, 2.1ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 15 persons, 495.3ms
Speed: 25.1ms preprocess, 495.3ms inference, 4.8ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 6 persons, 517.7ms
Speed: 12.6ms preprocess, 517.7ms inference, 5.0ms postpr

In [22]:
# ======================================================
# Save Mall-Adapted CSRNet Model
# ======================================================

SAVE_PATH = "csrnet_mall_adapted.pth"
torch.save(model.state_dict(), SAVE_PATH)

print(f"Saved fine-tuned model to: {SAVE_PATH}")

Saved fine-tuned model to: csrnet_mall_adapted.pth
