In [134]:
!pip install torch numpy opencv-python



In [135]:
import random
from itertools import islice

import cv2
import IPython.display as ipd
import numpy as np
import pafy  # NOTE(review): unused in the visible cells and not installed by the pip cell; confirm it is still needed
import torch

In [136]:
# Name of the compute device: "cuda" when a CUDA GPU is usable, otherwise "cpu".
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [137]:
# Load the small pretrained YOLOv5 detector through torch.hub
# (fetched from the ultralytics/yolov5 repository, or the local hub cache).
model = torch.hub.load(
    repo_or_dir="ultralytics/yolov5",
    model="yolov5s",
    pretrained=True,
)

Using cache found in /Users/brianbianchi/.cache/torch/hub/ultralytics_yolov5_master
YOLOv5 🚀 2025-3-14 Python-3.12.9 torch-2.6.0 CPU

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients, 16.4 GFLOPs
Adding AutoShape... 


In [138]:
# Mapping of integer class id -> class name, as exposed by the loaded model.
classes = model.names
# Preview the first five entries instead of displaying the whole mapping.
dict(islice(classes.items(), 5))

{0: 'person', 1: 'bicycle', 2: 'car', 3: 'motorcycle', 4: 'airplane'}

In [139]:
# One random colour triple (each channel in 0-255) per class id.
# Iterating the ``classes`` mapping yields its keys, i.e. the integer class ids.
class_colors = {
    class_id: tuple(random.randint(0, 255) for _channel in range(3))
    for class_id in classes
}
# Preview the first five entries.
dict(islice(class_colors.items(), 5))

{0: (159, 5, 213),
 1: (145, 141, 123),
 2: (215, 1, 124),
 3: (196, 43, 251),
 4: (146, 72, 58)}

In [140]:
def eval_frame(frame):
    """
    Run the YOLOv5 model on a single frame and return its detections as NumPy arrays.

    NOTE(review): the YOLOv5 hub examples convert OpenCV (BGR) images to RGB before
    inference -- confirm callers pass frames in the channel order the model expects.

    :param frame: input frame in numpy/list/tuple format.
    :return: ``(labels, cord)``. ``labels`` is the last column of the model's normalised
        ``xyxyn`` output for this frame; ``cord`` holds the remaining columns
        (read downstream as x1, y1, x2, y2, confidence).
    """
    # The model takes a batch, so wrap the single frame in a list.
    results = model([frame])
    # .cpu() is a no-op for CPU tensors but is required before .numpy() when the
    # model, and therefore its output, lives on a CUDA device.
    detections = results.xyxyn[0].cpu().numpy()
    return detections[:, -1], detections[:, :-1]

In [141]:
def plot_boxes(results, frame, min_confidence=0.2):
    """
    Draw a labelled bounding box on the frame for every sufficiently confident detection.

    :param results: ``(labels, cord)`` pair: ``labels[i]`` is the class id of detection ``i``
        and ``cord[i]`` holds ``x1, y1, x2, y2, confidence``, with the corners expressed as
        fractions of the frame width/height.
    :param frame: Frame which has been scored, indexed as (height, width, ...); it is drawn
        on in place.
    :param min_confidence: Detections whose confidence is below this value are skipped.
        Defaults to 0.2.
    :return: The same frame object with bounding boxes and labels plotted on it.
    """
    labels, cord = results
    height, width = frame.shape[:2]
    for label, row in zip(labels, cord):
        if row[4] >= min_confidence:
            # Scale the fractional corners to pixel coordinates.
            x1, y1 = int(row[0] * width), int(row[1] * height)
            x2, y2 = int(row[2] * width), int(row[3] * height)
            # Use a single integer id for both lookups so the colour table and the
            # name table are indexed the same way, whatever numeric type the label has.
            class_id = int(label)
            bgr = class_colors[class_id]
            cv2.rectangle(frame, (x1, y1), (x2, y2), bgr, 2)
            cv2.putText(
                frame,
                classes[class_id],
                (x1, y1),
                cv2.FONT_HERSHEY_SIMPLEX,
                0.9,
                bgr,
                2,
            )

    return frame

In [None]:
# Annotate every frame of a local video and write the result to "out.mp4".
# First download an mp4 video and save it as "in.mp4".
# tool: https://cobalt.tools/
# example: https://www.youtube.com/watch?v=PwUEo7zEtzg

player = cv2.VideoCapture("in.mp4")
assert player.isOpened(), 'could not open "in.mp4"'
x_shape = int(player.get(cv2.CAP_PROP_FRAME_WIDTH))
y_shape = int(player.get(cv2.CAP_PROP_FRAME_HEIGHT))
# Reuse the source frame rate so the output plays at the original speed;
# fall back to 20 FPS when the container does not report a usable value.
fps = player.get(cv2.CAP_PROP_FPS)
fps = fps if fps > 0 else 20
out = cv2.VideoWriter("out.mp4", cv2.VideoWriter_fourcc(*"mp4v"), fps, (x_shape, y_shape))
try:
    while True:
        ret, frame = player.read()
        if not ret:
            break
        # OpenCV decodes frames as BGR; the YOLOv5 hub model expects RGB arrays.
        results = eval_frame(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        # Boxes are drawn on the original BGR frame, which is what VideoWriter expects.
        frame = plot_boxes(results, frame)
        out.write(frame)
finally:
    # Releasing the writer flushes and finalizes out.mp4, and must happen
    # before the file is displayed; the capture handle is closed as well.
    player.release()
    out.release()

In [143]:
# Show the original input video inline.
ipd.Video(data="in.mp4", width=700)

In [144]:
# Show the annotated output video inline.
ipd.Video(data="out.mp4", width=700)