# ECSE 415 - Assignment 5: Video Analysis

## Authors

| *Name*                         |*Student ID*|
|--------------------------------|------------|
| Ana Gordon                     | 261113440 |
# --- Quick CUDA / device diagnostics (run this cell to print details)
# Prints the installed torch build, the CUDA version it reports, and, when a
# CUDA device is visible, the properties of device 0.
import torch

print('torch.__version__:', getattr(torch, '__version__', 'n/a'))
print('torch.version.cuda:', getattr(torch.version, 'cuda', None))
print('cuda available:', torch.cuda.is_available())
print('cuda device count:', torch.cuda.device_count())

if torch.cuda.is_available() and torch.cuda.device_count() > 0:
    dev = torch.cuda.get_device_properties(0)
    print('device name:', dev.name)
    print('major/minor:', dev.major, dev.minor)
    # Each optional query is best-effort: report a failure instead of hiding
    # it, so a broken install is visible in the diagnostics output.
    try:
        print('device total memory (GB):', round(dev.total_memory / (1024**3), 2))
    except Exception as e:
        print('device total memory error:', e)
    try:
        print('torch.cuda.get_arch_list():', torch.cuda.get_arch_list())
    except Exception as e:
        print('get_arch_list() error:', e)

| Mathias Nahuel Pacheco Lemina  | 261116679  |

## Introduction
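The goal of this assignment is to detect pedestrians in a video with YOLOv8 and track them across frames with DeepSORT, producing an annotated output video and a text file of per-frame track bounding boxes.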

https://www.freecodecamp.org/news/how-to-detect-objects-in-images-using-yolov8/#heading-how-to-get-started-with-yolov8

https://www.geeksforgeeks.org/python/saving-a-video-using-opencv/ --> for 1 (data preparation)

https://www.kaggle.com/code/nityampareek/using-deepsort-object-tracker-with-yolov5#DeepSORT --> for 2.1

## `0` Preparation

Run this in a terminal first:

`python3 -m venv venv && source venv/bin/activate && python -m pip install --upgrade pip && python -m pip install -r requirements.txt`

In [1]:
!pip install ultralytics
!pip install opencv-python
!pip install deep-sort-realtime
!pip install datetime
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

Looking in indexes: https://download.pytorch.org/whl/cu121
Looking in indexes: https://download.pytorch.org/whl/cu121


### `Imports`

In [7]:
# Environment variables must be in place BEFORE torch is imported and CUDA is
# initialised, so this cell has to run ahead of the imports cell.
# NOTE(review): PyTorch's error text describes TORCH_USE_CUDA_DSA as a
# compile-time option - confirm that setting it at runtime has any effect.
import os

os.environ.update({"TORCH_USE_CUDA_DSA": "1"})

In [8]:
# imports

import numpy as np
import cv2
import time
import subprocess
import shutil
from pathlib import Path
import sys
import glob
import torchvision
import torch
from ultralytics import YOLO
from deep_sort_realtime.deepsort_tracker import DeepSort
import datetime
import os


### `Paths`

In [9]:
# Root folder of the dataset, relative to the notebook's working directory.
data_path = './Object_Tracking/'

# Folder that holds the Task 1 frame images (read by the data-preparation cell).
task1_images_path = os.path.join(data_path, 'Task1/Images/')

## `1` Data Preparation

In [13]:
#1.1 convert images to video and 1.2 save output video
input_video = 'task1_input.mp4'

# Frames are ordered by a plain lexicographic sort of the file names.
# NOTE(review): this assumes zero-padded frame numbers in the names - confirm.
image_files = sorted(img for img in os.listdir(task1_images_path) if img.endswith('.jpg'))
if not image_files:
    raise FileNotFoundError(f"No .jpg images found in: {task1_images_path}")

# The first frame defines the video geometry.
sample_image = cv2.imread(os.path.join(task1_images_path, image_files[0]))
if sample_image is None:
    # cv2.imread returns None instead of raising when a file cannot be decoded.
    raise OSError(f"Could not read image: {os.path.join(task1_images_path, image_files[0])}")
height, width = sample_image.shape[:2]
fps = 14
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
video_writer = cv2.VideoWriter(input_video, fourcc, fps, (width, height))
if not video_writer.isOpened():
    raise OSError(f"Could not open video writer for: {input_video}")

try:
    for image_file in image_files:
        image_path = os.path.join(task1_images_path, image_file)
        frame = cv2.imread(image_path)
        # Fail loudly rather than skip: a missing frame would shift every
        # later frame index in the generated video.
        if frame is None:
            raise OSError(f"Could not read image: {image_path}")
        if frame.shape[:2] != (height, width):
            raise ValueError(
                f"Image {image_path} has size {frame.shape[1]}x{frame.shape[0]}, "
                f"expected {width}x{height}"
            )
        video_writer.write(frame)
finally:
    # Always finalise the container, even when a frame fails to load.
    video_writer.release()
print(f"Video saved to: {input_video}")

def output_writer(video_cap, output_filename):
    """Create a video writer that mirrors the geometry and frame rate of a capture.

    Args:
        video_cap: An opened ``cv2.VideoCapture`` whose frame width, frame
            height and FPS are copied.
        output_filename: Path of the video file to create.

    Returns:
        A ``cv2.VideoWriter`` using the 'mp4v' codec tag.

    Raises:
        ValueError: If the capture reports a non-positive frame size or frame
            rate, which is what an unopened capture returns.
    """
    frame_width = int(video_cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(video_cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    # Keep the frame rate as a float: truncating a fractional rate (e.g. 29.97)
    # would change the playback speed of the output.
    fps = float(video_cap.get(cv2.CAP_PROP_FPS))
    if frame_width <= 0 or frame_height <= 0 or fps <= 0:
        raise ValueError(
            f"Invalid capture properties (width={frame_width}, "
            f"height={frame_height}, fps={fps}); is the input video opened?"
        )
    # Lower-case tag, matching the writer used when the input video is built.
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    return cv2.VideoWriter(output_filename, fourcc, fps, (frame_width, frame_height))

Video saved to: task1_input.mp4


## `2` Model Implementation

In [None]:
# Detect pedestrians with YOLO on every frame, track them with DeepSORT, and
# write (a) an annotated video and (b) a text file with one line per confirmed
# track per frame.

# Reuse the name set by the data-preparation cell when it has been run.
input_video = globals().get('input_video', 'task1_input.mp4')
output_video = 'task1.mp4'
output_txt = 'task1.txt'

# load model
model = YOLO('yolov8n.pt')
device = 'cpu'
print('Using device:', device)

# CPU embedder for compatibility
tracker = DeepSort(max_age=30, embedder_gpu=False)

cap = cv2.VideoCapture(input_video)
writer = None
frame_idx = 0

try:
    if not cap.isOpened():
        raise OSError(f'Could not open input video: {input_video}')
    writer = output_writer(cap, output_video)

    # tracking results
    with open(output_txt, 'w') as f:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame_idx += 1  # frame numbers in the results file start at 1

            results = model(frame, device=device)
            res = results[0]

            detections_for_tracker = []
            boxes = getattr(res, 'boxes', None)
            if boxes is not None and boxes.data is not None:
                data = boxes.data
                rows = data.cpu().numpy() if hasattr(data, 'cpu') else data.numpy()

                # each row: [x1, y1, x2, y2, conf, class]
                for x1, y1, x2, y2, conf, cls in rows.tolist():
                    cls = int(cls)

                    # keep pedestrians only (COCO class 0)
                    if cls != 0:
                        continue
                    xmin = int(max(0, round(x1)))
                    ymin = int(max(0, round(y1)))
                    w = int(round(x2 - x1))
                    h = int(round(y2 - y1))
                    # the tracker is fed [left, top, width, height] boxes
                    detections_for_tracker.append([[xmin, ymin, w, h], float(conf), cls])

            # Update tracker
            tracks = tracker.update_tracks(detections_for_tracker, frame=frame)

            # 2.2 - draw bounding box and ID
            for tr in tracks:
                if not tr.is_confirmed():
                    continue
                track_id = tr.track_id
                # Cast to int before handing coordinates to the cv2 drawing calls.
                left, top, right, bottom = (int(v) for v in tr.to_ltrb())
                # Local names, so the frame width/height set by the
                # data-preparation cell are not overwritten.
                box_w = right - left
                box_h = bottom - top

                # draw bbox and id
                cv2.rectangle(frame, (left, top), (right, bottom), (0, 255, 0), 2)
                cv2.rectangle(frame, (left, top - 20), (left + 60, top), (0, 255, 0), -1)
                cv2.putText(frame, str(track_id), (left + 5, top - 5),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 2)

                # results.txt: <frame>, <id>, <bb_left>, <bb_top>, <bb_width>, <bb_height>
                f.write(f'{frame_idx},{track_id},{left},{top},{box_w},{box_h}\n')

            writer.write(frame)
finally:
    # Cleanup runs even when a frame fails mid-loop, so the capture and the
    # output video are always released.
    cap.release()
    if writer is not None:
        writer.release()
    cv2.destroyAllWindows()

Using device: cpu

0: 384x640 16 persons, 1 umbrella, 18.9ms
Speed: 1.5ms preprocess, 18.9ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)
0: 384x640 16 persons, 1 umbrella, 18.9ms
Speed: 1.5ms preprocess, 18.9ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)


0: 384x640 18 persons, 1 umbrella, 19.5ms
Speed: 1.1ms preprocess, 19.5ms inference, 0.6ms postprocess per image at shape (1, 3, 384, 640)
0: 384x640 18 persons, 1 umbrella, 19.5ms
Speed: 1.1ms preprocess, 19.5ms inference, 0.6ms postprocess per image at shape (1, 3, 384, 640)


0: 384x640 15 persons, 1 umbrella, 19.7ms
Speed: 1.4ms preprocess, 19.7ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)
0: 384x640 15 persons, 1 umbrella, 19.7ms
Speed: 1.4ms preprocess, 19.7ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)


error: OpenCV(4.12.0) :-1: error: (-5:Bad argument) in function 'rectangle'
> Overload resolution failed:
>  - Can't parse 'pt2'. Sequence item with index 1 has a wrong type
>  - Can't parse 'rec'. Expected sequence length 4, got 2
>  - Can't parse 'pt2'. Sequence item with index 1 has a wrong type
>  - Can't parse 'rec'. Expected sequence length 4, got 2


## `3` Model Evaluation

## `4` Prediction & Kaggle Competition