# Object Tracking using SORT and YOLOv5

## Detection

### Extracting Frames from videos

In [1]:
import os
import cv2
import gc

In [2]:
import numpy as np

import torch
# Start from a clean GPU state, but only when CUDA is actually usable:
# torch.cuda.synchronize() raises on a CPU-only machine, and this notebook
# otherwise supports a CPU fallback when it picks DEVICE.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.synchronize()

In [3]:
# Base name (without extension) shared by the input video and the output folders.
FILE_NAME = 'test1'

# The video itself is read from f'{VID_PATH}.mp4'; this call only makes sure the
# same-named folder exists. exist_ok=True makes the cell safe to re-run and avoids
# the check-then-create race of `if not os.path.exists(...)`.
VID_PATH = os.path.join('./Data', 'videos', f'{FILE_NAME}')
os.makedirs(VID_PATH, exist_ok=True)

# Optional folder for dumping the decoded frames as images (currently disabled).
# IMG_PATH = os.path.join('./Data', 'images', f'{FILE_NAME}')
# os.makedirs(IMG_PATH, exist_ok=True)

# One text file of detections per processed frame is written into this folder.
RESULT_PATH = os.path.join('./Results', 'coordinates', f'{FILE_NAME}')
os.makedirs(RESULT_PATH, exist_ok=True)

In [4]:
# Run on the first CUDA GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device('cuda:0')
else:
    DEVICE = torch.device('cpu')

# Checkpoint file handed to torch.hub together with the 'custom' entry point.
WEIGHT_PATH = './pretrained_weights/yolov5m6.pt'

# Hub entry point name. Alternative kept for reference:
# PRETRAIN_WEIGHT = 'yolov5m6'
PRETRAIN_WEIGHT = 'custom'

In [5]:
# Load the detector through torch.hub, passing the checkpoint at WEIGHT_PATH and
# the target DEVICE (CUDA/CPU).
#
# Earlier variants, kept for reference:
# model = torch.hub.load('ultralytics/yolov5', 'yolov5s', classes=1, trust_repo=True) # only predict persons class
# model = torch.hub.load('ultralytics/yolov5', PRETRAIN_WEIGHT, device=DEVICE, trust_repo=True)  # load on DEVICE = CUDA/CPU
# model.load_state_dict(torch.load('yolov5s_10cls.pt')['model'].state_dict())
model = torch.hub.load(
    'ultralytics/yolov5',
    PRETRAIN_WEIGHT,
    path=WEIGHT_PATH,
    device=DEVICE,
    trust_repo=True,
)

model.to(DEVICE)  # make sure the model sits on DEVICE
print()  # ends the cell with a statement, so the value of model.to(...) is not echoed

Using cache found in /home/adhiraj/.cache/torch/hub/ultralytics_yolov5_master
YOLOv5 🚀 2022-12-8 Python-3.10.8 torch-1.13.0 CUDA:0 (NVIDIA GeForce GTX 1650, 3912MiB)

Fusing layers... 
YOLOv5m6 summary: 378 layers, 35704908 parameters, 0 gradients
Adding AutoShape... 





In [6]:
## Evaluation Mode
model.eval()

# Class filter (optional list). None would keep every class; [0, 15, 16] would be
# COCO persons, cats and dogs. Here only class 0 is kept.
# model.classes = None
model.classes = [0]

# NMS thresholds, both lowered from the commented-out values.
# model.conf = 0.25
model.conf = 0.10  # NMS confidence threshold
# model.iou = 0.45
model.iou = 0.30  # NMS IoU threshold

# NMS behaviour switches.
# NOTE(review): "class-agnostic" presumably means boxes suppress each other
# regardless of class when True; confirm against the YOLOv5 documentation.
model.agnostic = False  # NMS class-agnostic
model.multi_label = False  # NMS multiple labels per box

# Remaining inference options.
model.max_det = 1000  # maximum number of detections per image
model.amp = False  # Automatic Mixed Precision (AMP) inference

In [7]:
# Run the detector on the first MAX_FRAMES frames of the video and write one
# comma-separated text file of boxes per frame into RESULT_PATH.

# Upper bound on the number of frames processed. The tracking stage replays the
# same number of frames, so keep the two limits in step.
MAX_FRAMES = 100

vidcap = cv2.VideoCapture(f'{VID_PATH}.mp4')
if not vidcap.isOpened():
    # Without this check a missing/unreadable video silently produces no results.
    raise OSError(f'Could not open video file: {VID_PATH}.mp4')

count = 0
try:
    while count < MAX_FRAMES:
        success, image = vidcap.read()
        if not success:  # end of stream (or a frame that could not be decoded)
            break

        # cv2.imwrite(f"{IMG_PATH}/frame_{int(count)}.jpg", image) # save frame as JPG file

        # OpenCV decodes frames as BGR, while the YOLOv5 hub model expects RGB for
        # numpy input; feeding BGR degrades detection quality.
        image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        with torch.no_grad():
            results = model(image_rgb, size=1280)  # inference size in pixels

        # Detections of the single input image; keep the first five columns
        # (x1, y1, x2, y2, confidence in YOLOv5's xyxy layout) and drop the rest.
        coordinates = results.xyxy[0].detach().cpu().numpy()[..., :5]

        np.savetxt(f'{RESULT_PATH}/frame_{count}.txt', coordinates, fmt='%0.2f', delimiter=',', newline='\n')

        ## Clearing Memory
        # Drop the references first so empty_cache() can actually hand the GPU
        # blocks back; `image` is simply rebound by the next read.
        del results, coordinates, image_rgb
        torch.cuda.empty_cache()
        gc.collect()

        # print('Read a new frame: ', success)
        count += 1
finally:
    # Always close the video file, even when inference fails midway.
    vidcap.release()

In [8]:
## Clearing Memory

# Close the video file explicitly instead of relying on the object being finalized
# when the name is dropped. release() on an already released capture is harmless.
vidcap.release()
del vidcap
gc.collect()

0

In [9]:
## Reset Kernel
# `%reset -f` is an IPython magic that clears the interactive namespace without
# asking for confirmation (it is not a full kernel restart). Everything the
# tracking stage needs -- imports, FILE_NAME, VID_PATH, RESULT_PATH -- is defined
# again in the cells that follow.

%reset -f

In [10]:
# from __future__ import print_function

import os
import numpy as np
import cv2
import gc

import matplotlib
# Select the TkAgg backend for the figure that the tracking loop redraws frame by
# frame (plt.ion / flush_events further down).
# NOTE(review): presumably this needs a display and a Tk installation -- confirm
# before running on a headless machine.
matplotlib.use('TkAgg')

import matplotlib.pyplot as plt
import matplotlib.patches as patches

# from skimage import io

import glob
import time

In [11]:
from sort import Sort

In [12]:
# Base name (without extension) of the video to track; must match the name used
# when the detections were produced.
FILE_NAME = 'test1'

VID_PATH = os.path.join('./Data', 'videos', f'{FILE_NAME}')

# The video is opened as f'{VID_PATH}.mp4', so that file -- not a folder named
# VID_PATH -- is what has to exist. An exception is raised instead of calling
# exit(), which would end the interpreter session rather than report a normal,
# readable error for this cell.
if not os.path.isfile(f'{VID_PATH}.mp4'):
    raise FileNotFoundError(f'ERROR: {VID_PATH}.mp4 Video not found!')

# IMG_PATH = os.path.join('./Data', 'images', f'{FILE_NAME}')
# if not os.path.exists(IMG_PATH):
#     os.makedirs(IMG_PATH)

# Folder holding the per-frame detection files (frame_<n>.txt) read by the tracker.
RESULT_PATH = os.path.join('./Results', 'coordinates', f'{FILE_NAME}')

if not os.path.isdir(RESULT_PATH):
    raise FileNotFoundError(f'ERROR: {RESULT_PATH} Folder not found!')

In [13]:
# Unused command-line / timing bookkeeping, kept for reference:
# args = parse_args()
# display = args.display
# phase = args.phase
# total_time = 0.0
# total_frames = 0

# Display-only palette: 32 random RGB triples with components in [0, 1).
# The drawing loop picks a row from it by track id modulo 32.
colours = np.random.random_sample((32, 3))

In [14]:
# Switch matplotlib to interactive mode so the figure can be refreshed from inside
# the tracking loop.
plt.ion()

# Single axes with an equal aspect ratio; each video frame and its boxes are drawn
# onto ax1.
fig = plt.figure()
ax1 = fig.add_subplot(1, 1, 1, aspect='equal')

In [15]:
# SORT tracker settings, passed straight through to the Sort constructor.
# NOTE(review): presumably max_age is how many frames a track may go unmatched,
# min_hits how many matches a track needs before it is reported, and
# iou_threshold the minimum overlap for a match -- confirm against sort.py.
max_age, min_hits, iou_threshold = 1, 3, 0.3

mot_tracker = Sort(max_age=max_age, min_hits=min_hits, iou_threshold=iou_threshold)

In [16]:
# Replay the video, feed the stored per-frame detections to SORT and draw the
# resulting tracks on top of each frame.

# Upper bound on the number of frames replayed. The detection stage writes one
# frame_<n>.txt per processed frame, so keep the two limits in step.
MAX_FRAMES = 100

vidcap = cv2.VideoCapture(f'{VID_PATH}.mp4')
if not vidcap.isOpened():
    raise OSError(f'Could not open video file: {VID_PATH}.mp4')

count = 0
try:
    while count < MAX_FRAMES:
        success, image = vidcap.read()
        if not success:  # end of stream (or a frame that could not be decoded)
            break

        frame = os.path.join(RESULT_PATH, f'frame_{count}.txt')
        if os.path.getsize(frame) == 0:
            # A frame without detections is stored as an empty file; give the
            # tracker an empty (0, 5) array instead of what loadtxt would return.
            dets = np.empty((0, 5))
        else:
            # ndmin=2 keeps a single detection as shape (1, 5); plain loadtxt would
            # collapse it to a 1-D array of length 5.
            dets = np.loadtxt(frame, delimiter=',', ndmin=2)

        # dets[:, 2:4] += dets[:, 0:2] # don't need to do this

        # OpenCV frames are BGR, matplotlib expects RGB; without the conversion the
        # red and blue channels are swapped on screen.
        ax1.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
        plt.title(FILE_NAME + ' Tracked Targets')

        # start_time = time.time()

        trackers = mot_tracker.update(dets)

        # cycle_time = time.time() - start_time
        # total_time += cycle_time

        for d in trackers:
            # print('%d,%d,%.2f,%.2f,%.2f,%.2f,1,-1,-1,-1'%(frame,d[4],d[0],d[1],d[2]-d[0],d[3]-d[1]),file=out_file)

            # Row layout used below: d[0..3] = x1, y1, x2, y2 and d[4] = track id.
            d = d.astype(np.int32)
            ax1.add_patch(patches.Rectangle((d[0], d[1]), d[2] - d[0], d[3] - d[1], fill=False, lw=1,
                                            ec=colours[d[4] % len(colours), :]))

        # Request the redraw first, then let the GUI process it, so the frame shown
        # is the one just composed; only afterwards clear the axes for the next frame.
        plt.draw()
        fig.canvas.flush_events()
        ax1.cla()

        # The loop variables are rebound on the next pass, so explicit `del`s are
        # not needed; the collection pass is kept from the original loop.
        gc.collect()

        # print('Read a new frame: ', success)
        count += 1
finally:
    # Always close the video file, even when tracking or drawing fails midway.
    vidcap.release()

In [17]:
# Close the tracking figure, then turn matplotlib's interactive mode back off.
# plt.ioff() is the last expression of the cell, so its return value is what the
# cell displays.
plt.close(fig)
plt.ioff()

<contextlib.ExitStack at 0x7fbefc2339d0>

In [18]:
# Close the video file explicitly before dropping the handle, instead of relying
# on the object being finalized. release() on an already released capture is harmless.
vidcap.release()

# Drop the remaining large objects and run a collection pass.
del vidcap, ax1, fig
gc.collect()

0

**Command to convert Video to Image Frames:**

Command: `ffmpeg -i test1.mp4 -f image2 test_frames/img_%04d.jpg`

More info: https://ffmpeg.org/ffmpeg.html