## **CV HW4: Multi-object Tracking (MOT) with Detection**
**Detection**: YOLOv5, 
**Tracking**: Simple Online Realtime Tracking (SORT)

---



## **1. Unzip data folder**

In [9]:
# Change the path according to your setup 
from google.colab import drive
drive.mount("/content/gdrive")
!unzip "/content/gdrive/MyDrive/HW4_Resources/sort-master"
!unzip "/content/gdrive/MyDrive/HW4_Resources/KITTI_17_images"

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
Archive:  /content/gdrive/MyDrive/HW4_Resources/sort-master.zip
replace sort-master/.gitignore? [y]es, [n]o, [A]ll, [N]one, [r]ename: N
Archive:  /content/gdrive/MyDrive/HW4_Resources/KITTI_17_images.zip
replace KITTI_17_images/000001.jpg? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


# **2. Install requirements**

In [10]:
%cd pytorch_objectdetecttrack
!pip install filterpy==1.1.0

[Errno 2] No such file or directory: 'pytorch_objectdetecttrack'
/content
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# **3. Import libraries**

In [11]:
import torch
import torchvision
import cv2
import sys
import os
sys.path.insert(0,'./sort-master/')
import matplotlib
from google.colab.patches import cv2_imshow
from collections import namedtuple, OrderedDict

# **4. Load YOLOv5 detector from torch hub**

In [12]:
yolov5_detector = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained = True)
yolov5_detector.float()
yolov5_detector.eval()

Using cache found in /root/.cache/torch/hub/ultralytics_yolov5_master
YOLOv5 🚀 2023-5-8 Python-3.10.11 torch-2.0.0+cu118 CPU

Fusing layers... 


[31m[1mrequirements:[0m /root/.cache/torch/hub/requirements.txt not found, check failed.


YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients
Adding AutoShape... 


AutoShape(
  (model): DetectMultiBackend(
    (model): DetectionModel(
      (model): Sequential(
        (0): Conv(
          (conv): Conv2d(3, 32, kernel_size=(6, 6), stride=(2, 2), padding=(2, 2))
          (act): SiLU(inplace=True)
        )
        (1): Conv(
          (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
          (act): SiLU(inplace=True)
        )
        (2): C3(
          (cv1): Conv(
            (conv): Conv2d(64, 32, kernel_size=(1, 1), stride=(1, 1))
            (act): SiLU(inplace=True)
          )
          (cv2): Conv(
            (conv): Conv2d(64, 32, kernel_size=(1, 1), stride=(1, 1))
            (act): SiLU(inplace=True)
          )
          (cv3): Conv(
            (conv): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1))
            (act): SiLU(inplace=True)
          )
          (m): Sequential(
            (0): Bottleneck(
              (cv1): Conv(
                (conv): Conv2d(32, 32, kernel_size=(1, 1), stride=(1, 1))
  

# **5. Import SORT library**

In [13]:
from sort import *

#**6. Perform tracking with detection**

In [14]:
# Write your code here to perform tracking with detection using the provided YOLOv5 model and the SORT implementation
model = torch.hub.load("ultralytics/yolov5", "yolov5s", pretrained = True)
model.float()
model.eval()

Using cache found in /root/.cache/torch/hub/ultralytics_yolov5_master
YOLOv5 🚀 2023-5-8 Python-3.10.11 torch-2.0.0+cu118 CPU

Fusing layers... 


[31m[1mrequirements:[0m /root/.cache/torch/hub/requirements.txt not found, check failed.


YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients
Adding AutoShape... 


AutoShape(
  (model): DetectMultiBackend(
    (model): DetectionModel(
      (model): Sequential(
        (0): Conv(
          (conv): Conv2d(3, 32, kernel_size=(6, 6), stride=(2, 2), padding=(2, 2))
          (act): SiLU(inplace=True)
        )
        (1): Conv(
          (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
          (act): SiLU(inplace=True)
        )
        (2): C3(
          (cv1): Conv(
            (conv): Conv2d(64, 32, kernel_size=(1, 1), stride=(1, 1))
            (act): SiLU(inplace=True)
          )
          (cv2): Conv(
            (conv): Conv2d(64, 32, kernel_size=(1, 1), stride=(1, 1))
            (act): SiLU(inplace=True)
          )
          (cv3): Conv(
            (conv): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1))
            (act): SiLU(inplace=True)
          )
          (m): Sequential(
            (0): Bottleneck(
              (cv1): Conv(
                (conv): Conv2d(32, 32, kernel_size=(1, 1), stride=(1, 1))
  

# **6.1. Generating Video from the KITTI_17_images dataset**


In [19]:
data = "KITTI_17_images"
video = "ImageToVideo.mp4"

img_files = sorted(os.listdir(data))

fps = 30.0
frame_size = cv2.imread(os.path.join(data, img_files[0])).shape[:2][::-1]
fourcc = cv2.VideoWriter_fourcc(*'mp4v')

out = cv2.VideoWriter(video, fourcc, fps, frame_size)

for img_file in img_files:
    img_path = os.path.join(data, img_file)
    img = cv2.imread(img_path)
    out.write(img)

out.release()
cv2.destroyAllWindows()

# **6.2. Implementing object tracking on the generated video**

In [20]:
vid = cv2.VideoCapture(video)
mot_tracker = Sort()
colours = [(0, 255, 0), (255, 255, 0), (255, 0, 0), (0, 0, 255)]

# Initialize video writer
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
fps = 30.0
frame_size = (int(vid.get(cv2.CAP_PROP_FRAME_WIDTH)), int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT)))
out = cv2.VideoWriter('outputVideoFile.mp4', fourcc, fps, frame_size)

# Reading the first frame of input video
ret, frame = vid.read()
frame_counter = 1
with open('gt.txt', 'w') as file: 
    while True:
        if not ret:  # No more frames to read
            break
        # Generating predictions
        predictions = model(frame)
        # Loading detections
        detections = predictions.pred[0].numpy()
        # Filtering only person object detection
        person_detections = detections[detections[:, 5] == 0]

        retMotTracker = mot_tracker.update(person_detections).tolist()

        # Generating the ground truth value for this frame
        for id in range(len(retMotTracker)):
            x1, y1, w, h, score = retMotTracker[id]
            id = id + 1  
            file.write(f"{frame_counter},{id},{x1},{y1},{w},{h},{score},-1,-1,-1\n")

        # Making rectangles on the detected persons
        for j in range(len(retMotTracker)):
            idName = "ID : " + str(int(retMotTracker[j][4]))
            cv2.rectangle(frame, (int(retMotTracker[j][0]), int(retMotTracker[j][1])), (int(retMotTracker[j][2]), int(retMotTracker[j][3])), colours[int(retMotTracker[j][4])%4], 2)
            cv2.putText(frame, idName, (int(retMotTracker[j][0]), int(retMotTracker[j][1])-10), cv2.FONT_HERSHEY_SCRIPT_SIMPLEX, 0.9, colours[int(retMotTracker[j][4])%4], 2)

        # Write image frame to output video
        out.write(frame)
        frame_counter += 1

        # Read next frame of input video
        ret, frame = vid.read()

# Release video writer
out.release()
vid.release()

# **7. Report Evaluation Metrics**

In [None]:
# Use the Track-Eval kit to report the complete set of performance and accuracy metrics
# Comment on and interpret MOTA and MOTP values