In [1]:
!pip install motmetrics
!pip install deep-sort-realtime
!git clone https://github.com/abewley/sort.git
!pip install filterpy


Collecting motmetrics
  Downloading motmetrics-1.4.0-py3-none-any.whl.metadata (20 kB)
Collecting xmltodict>=0.12.0 (from motmetrics)
  Downloading xmltodict-0.14.2-py2.py3-none-any.whl.metadata (8.0 kB)
Downloading motmetrics-1.4.0-py3-none-any.whl (161 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m161.5/161.5 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading xmltodict-0.14.2-py2.py3-none-any.whl (10.0 kB)
Installing collected packages: xmltodict, motmetrics
Successfully installed motmetrics-1.4.0 xmltodict-0.14.2
Collecting deep-sort-realtime
  Downloading deep_sort_realtime-1.3.2-py3-none-any.whl.metadata (12 kB)
Downloading deep_sort_realtime-1.3.2-py3-none-any.whl (8.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m71.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: deep-sort-realtime
Successfully installed deep-sort-realtime-1.3.2
Cloning into 'sort'...
remote: Enumerating objects: 208, d

<h3>Imports</h3>

In [2]:
import matplotlib
matplotlib.use('Agg')  # non-interactive backend suitable for headless environments

import os
import zipfile
import gdown
import torch
import torchvision
import numpy as np
import pandas as pd
import motmetrics as mm
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from deep_sort_realtime.deepsort_tracker import DeepSort
from tqdm import tqdm
from PIL import Image
import sys

# Fix sort.py backend issue
with open('/content/sort/sort.py', 'r') as file:
    code = file.read()

# Replace TkAgg with Agg
fixed_code = code.replace('TkAgg', 'Agg')

with open('/content/sort/sort.py', 'w') as file:
    file.write(fixed_code)

print("Backend issue fixed!")

sys.path.append('/content/sort')


Backend issue fixed!


<h3>Configuration & Device Setup</h3>

In [3]:
# CONFIGURATION
CONFIG = {
    "dataset_url": "https://drive.google.com/uc?id=1yvOwbPks7dFzMX2z4JoUQlwdEfNYQd7-",
    "dataset_zip": "/content/MOT15.zip",
    "dataset_path": "/content/MOT15",
    "tracking": {"iou_threshold": 0.3, "max_age": 30},
    "training": {"epochs": 1, "batch_size": 8, "learning_rate": 0.0001},
}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


Using device: cuda


<h3>Dataset Download & Extraction Functions</h3>

In [4]:
# DOWNLOAD & EXTRACT DATASET
def download_dataset():
    if not os.path.exists(CONFIG["dataset_zip"]):
        print("Downloading MOT15 dataset from Google Drive...")
        gdown.download(CONFIG["dataset_url"], CONFIG["dataset_zip"], quiet=False)
    else:
        print("Dataset already downloaded.")

def extract_dataset():
    if not os.path.exists(CONFIG["dataset_path"]):
        print("Extracting dataset...")
        with zipfile.ZipFile(CONFIG["dataset_zip"], 'r') as zip_ref:
            zip_ref.extractall("/content/")
        print(f"Dataset extracted to {CONFIG['dataset_path']}")
    else:
        print("Dataset already extracted.")


<h3>Data Augmentation Function</h3>

In [5]:
# DATA AUGMENTATION
def apply_augmentations(image):
    transform = transforms.Compose([
        transforms.Resize((640, 640)),
        transforms.RandomCrop(600),
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
        transforms.GaussianBlur(kernel_size=(5, 9), sigma=(0.1, 2.0)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])
    return transform(image)


<h3>MOT15Dataset Class</h3>

In [6]:
# MOT15 DATASET CLASS
class MOT15Dataset(Dataset):
    def __init__(self, root_dir, mode="train", transform=None):
        self.root_dir = os.path.join(root_dir, mode)
        self.transform = transform
        self.data = []
        for seq in os.listdir(self.root_dir):
            img_dir = os.path.join(self.root_dir, seq, "img1")
            gt_path = os.path.join(self.root_dir, seq, "gt/gt.txt")
            if os.path.exists(gt_path):
                gt_df = pd.read_csv(gt_path, header=None)
                gt_df.columns = ["frame", "track_id", "x", "y", "w", "h", "conf", "class", "visibility", "unused"]
                for img_name in sorted(os.listdir(img_dir)):
                    frame_id = int(img_name.split('.')[0])
                    frame_gt = gt_df[gt_df["frame"] == frame_id]
                    boxes_df = frame_gt[["x", "y", "w", "h"]].copy()
                    boxes_df = pd.DataFrame({
                        'x1': boxes_df['x'],
                        'y1': boxes_df['y'],
                        'x2': boxes_df['x'] + boxes_df['w'],
                        'y2': boxes_df['y'] + boxes_df['h']
                    })
                    boxes = boxes_df[['x1', 'y1', 'x2', 'y2']].values
                    labels = np.ones(len(boxes))
                    self.data.append((os.path.join(img_dir, img_name), boxes, labels))
            else:
              # If no ground truth is available, load the image with empty boxes and labels.
              for img_name in sorted(os.listdir(img_dir)):
                  self.data.append((os.path.join(img_dir, img_name), np.empty((0, 4)), np.empty((0,))))


    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img_path, boxes, labels = self.data[idx]
        image = Image.open(img_path).convert("RGB")
        target = {"boxes": torch.tensor(boxes, dtype=torch.float32), "labels": torch.tensor(labels, dtype=torch.int64)}
        if self.transform:
            image = self.transform(image)
        return image, target


<h3>Object Detector Class</h3>

In [7]:
# OBJECT DETECTOR CLASS
class ObjectDetector:
    def __init__(self, num_classes=2):
        self.model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights="DEFAULT")
        in_features = self.model.roi_heads.box_predictor.cls_score.in_features
        self.model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
        self.model.to(device)
        self.model.train()

    def detect_objects(self, images):
        img_tensors = [
            img.to(device) if isinstance(img, torch.Tensor) else apply_augmentations(img).to(device)
            for img in images
        ]
        with torch.no_grad():
            predictions = self.model(img_tensors)
        return predictions



<h3>Adaptive Tracker Class</h3>

In [8]:
# ADAPTIVE TRACKER CLASS
# Make sure that Sort is imported from your fixed sort.py file.
from sort import Sort

class AdaptiveTracker:
    def __init__(self):
        self.sort_tracker = Sort()
        # self.deep_sort = DeepSort(max_age=30, n_init=3, max_cosine_distance=0.2)
        self.previous_tracks = {}

    def track_objects(self, raw_detections, frame):
        if len(raw_detections) > 0:
            sort_dets = np.array([d[0] + [d[1]] for d in raw_detections])
            sort_tracked = self.sort_tracker.update(sort_dets)
        else:
            sort_tracked = np.empty((0, 5))

        # For a simpler pipeline, use only SORT results:
        consistent_tracks = []
        for track in sort_tracked:
            track_id = int(track[4])
            bbox = track[:4].tolist()
            # Here you can add consistency checks if needed
            consistent_tracks.append({'track_id': track_id, 'bbox': bbox})

        return sort_tracked, consistent_tracks





<h3>Training & Evaluation Functions</h3>

In [9]:
# TRAINING FUNCTION
def train_faster_rcnn(model, train_loader, epochs=10, lr=0.0001):
    optimizer = torch.optim.Adam(model.model.parameters(), lr=lr)
    model.model.train()
    for epoch in range(epochs):
        epoch_loss = 0
        for images, targets in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
            images = [img.to(device) for img in images]
            targets = [{"boxes": t["boxes"].to(device), "labels": t["labels"].to(device)} for t in targets]
            optimizer.zero_grad()
            loss_dict = model.model(images, targets)
            loss = sum(loss for loss in loss_dict.values())
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss: {epoch_loss:.4f}")

# PERFORMANCE EVALUATION FUNCTION
def evaluate_performance(detections, dataset):
    acc = mm.MOTAccumulator(auto_id=True)
    for idx, det in enumerate(detections):
        print(f"Frame {idx}: Detected track IDs: {det['track_id']}, Bounding boxes: {det['bboxes']}")
        print("Detected track IDs:", det["track_id"])
        print("Detected bboxes:", det["bboxes"])
        gt_boxes = dataset[idx][1]["boxes"].numpy()
        gt_ids = np.arange(len(gt_boxes))
        det_boxes = np.array(det["bboxes"])
        det_ids = det["track_id"]
        distances = mm.distances.iou_matrix(gt_boxes, det_boxes, max_iou=0.3)
        acc.update(gt_ids, det_ids, distances)

    mh = mm.metrics.create()
    summary = mh.compute(acc, metrics=['mota', 'motp', 'idf1', 'num_switches'], name='Overall')
    print(summary)

def collate_fn(batch):
    return tuple(zip(*batch))


<h3>Training Execution</h3>

In [10]:
# ------------------------------
# Training Cell
# ------------------------------

# Download and extract dataset
download_dataset()
extract_dataset()

# Create datasets and dataloaders (using training augmentation)
train_dataset = MOT15Dataset(CONFIG["dataset_path"], mode="train", transform=apply_augmentations)
test_dataset = MOT15Dataset(CONFIG["dataset_path"], mode="test", transform=apply_augmentations)

train_loader = DataLoader(train_dataset, batch_size=CONFIG["training"]["batch_size"],
                          shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=CONFIG["training"]["batch_size"],
                         shuffle=False, collate_fn=collate_fn)

# Initialize and train detector
detector = ObjectDetector(num_classes=2)
train_faster_rcnn(detector, train_loader, epochs=CONFIG["training"]["epochs"],
                  lr=CONFIG["training"]["learning_rate"])

# Save the trained model checkpoint
torch.save(detector.model.state_dict(), "/content/fasterrcnn_checkpoint.pth")
print("Model checkpoint saved!")



Downloading MOT15 dataset from Google Drive...


Downloading...
From (original): https://drive.google.com/uc?id=1yvOwbPks7dFzMX2z4JoUQlwdEfNYQd7-
From (redirected): https://drive.google.com/uc?id=1yvOwbPks7dFzMX2z4JoUQlwdEfNYQd7-&confirm=t&uuid=20a72d8b-d7d6-42fb-b5cd-a72982a9c606
To: /content/MOT15.zip
100%|██████████| 1.31G/1.31G [00:23<00:00, 56.0MB/s]


Extracting dataset...
Dataset extracted to /content/MOT15


Downloading: "https://download.pytorch.org/models/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth" to /root/.cache/torch/hub/checkpoints/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth
100%|██████████| 160M/160M [00:01<00:00, 158MB/s]
Epoch 1/1: 100%|██████████| 688/688 [25:46<00:00,  2.25s/it]


Epoch 1, Loss: 4398.8666
Model checkpoint saved!


In [None]:
import numpy as np
import torch
from torch.utils.data import random_split, DataLoader
from tqdm import tqdm
import time

#########################################
# Utility Functions
#########################################

from tabulate import tabulate
def compute_iou(box1, box2):
    """
    Compute Intersection over Union (IoU) between two boxes.
    Boxes are in [x1, y1, x2, y2] format.
    """
    x1 = max(box1[0], box2[0])
    y1 = max(box1[1], box2[1])
    x2 = min(box1[2], box2[2])
    y2 = min(box1[3], box2[3])

    inter_area = max(0, x2 - x1) * max(0, y2 - y1)
    area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
    area2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
    union_area = area1 + area2 - inter_area

    if union_area == 0:
        return 0
    return inter_area / union_area

def evaluate_detector(data_loader, detector, iou_threshold=0.5, compute_metrics=True):
    """
    Evaluate the detector over a given DataLoader.

    If compute_metrics is True, this function computes precision, recall,
    and F1-score based on the IoU between predicted boxes and ground-truth boxes.

    It also measures the average inference time per frame.

    Returns:
        precision, recall, f1_score, avg_inference_time
    """
    true_positives = 0
    false_positives = 0
    false_negatives = 0
    total_frames = 0

    start_time = time.time()

    for images, targets in tqdm(data_loader, desc="Evaluating Detector"):
        # Get predictions for the current batch
        predictions = detector.detect_objects(images)

        # For each image in the batch:
        for i, pred in enumerate(predictions):
            if compute_metrics and targets is not None and "boxes" in targets[i] and targets[i]["boxes"].numel() > 0:
                gt_boxes = targets[i]["boxes"].cpu().numpy()
                if len(pred["boxes"]) > 0:
                    pred_boxes = pred["boxes"].cpu().numpy()
                else:
                    pred_boxes = np.empty((0, 4))

                matched_gt = set()
                for pred_box in pred_boxes:
                    match_found = False
                    for j, gt_box in enumerate(gt_boxes):
                        if j not in matched_gt and compute_iou(pred_box.tolist(), gt_box.tolist()) >= iou_threshold:
                            true_positives += 1
                            matched_gt.add(j)
                            match_found = True
                            break
                    if not match_found:
                        false_positives += 1

                false_negatives += (len(gt_boxes) - len(matched_gt))
            total_frames += 1

    elapsed_time = time.time() - start_time
    avg_inference_time = elapsed_time / total_frames if total_frames > 0 else 0

    if compute_metrics:
        precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
        recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
        f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
    else:
        precision = recall = f1_score = 0

    return precision, recall, f1_score, avg_inference_time

#########################################
# 1. Split Training Dataset into Validation Subset
#########################################
# Assume train_dataset, CONFIG, and collate_fn are defined in your notebook.
train_size = int(0.8 * len(train_dataset))
val_size = len(train_dataset) - train_size
train_subset, val_subset = random_split(train_dataset, [train_size, val_size])

print(f"Train subset size: {len(train_subset)}")
print(f"Validation subset size: {len(val_subset)}")

# Create DataLoader for the validation subset
val_loader = DataLoader(val_subset, batch_size=CONFIG["training"]["batch_size"],
                        shuffle=False, collate_fn=collate_fn)

#########################################
# 2. Set Detector to Evaluation Mode
#########################################
detector.model.eval()

#########################################
# 3. Evaluate on Validation Subset
#########################################
print("\n--- Evaluating on Validation Subset ---")
precision, recall, f1_score, avg_time = evaluate_detector(val_loader, detector, iou_threshold=0.5, compute_metrics=True)
metrics_table = [
    ["Precision", f"{precision:.4f}"],
    ["Recall", f"{recall:.4f}"],
    ["F1 Score", f"{f1_score:.4f}"],
    ["Average Inference Time (s)", f"{avg_time:.4f}"]
]

print(tabulate(metrics_table, headers=["Metric", "Value"], tablefmt="github"))


#########################################
# 4. (Optional) Evaluate on Test Dataset
#########################################
# Since the test dataset does not include ground-truth annotations,
# we disable metric computation (compute_metrics=False) and only measure inference speed.
print("\n--- Evaluating on Test Dataset (Qualitative/Speed Only) ---")
test_dataset = MOT15Dataset(CONFIG["dataset_path"], mode="test", transform=apply_augmentations)
print("Test dataset size:", len(test_dataset))
test_loader = DataLoader(test_dataset, batch_size=CONFIG["training"]["batch_size"],
                         shuffle=False, collate_fn=collate_fn)

# For test evaluation, we set compute_metrics to False because ground truth is not available.
precision_test, recall_test, f1_test, avg_time_test = evaluate_detector(test_loader, detector, iou_threshold=0.5, compute_metrics=False)
print("Test Dataset Evaluation:")
print(f"Average Inference Time per Frame: {avg_time_test:.4f} seconds")


Train subset size: 4400
Validation subset size: 1100

--- Evaluating on Validation Subset ---


Evaluating Detector: 100%|██████████| 138/138 [03:13<00:00,  1.40s/it]


| Metric                     |   Value |
|----------------------------|---------|
| Precision                  |  0.0327 |
| Recall                     |  0.2745 |
| F1 Score                   |  0.0585 |
| Average Inference Time (s) |  0.1755 |

--- Evaluating on Test Dataset (Qualitative/Speed Only) ---
Test dataset size: 5783


Evaluating Detector:  93%|█████████▎| 670/723 [15:37<01:11,  1.36s/it]