In [91]:
!pip install torch numpy opencv-python



In [92]:
import torch
import numpy as np
import cv2
import pafy
import random
import IPython.display as ipd

In [93]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [94]:
model = torch.hub.load("ultralytics/yolov5", "yolov5s", pretrained=True)
model.eval()

Using cache found in /Users/brianbianchi/.cache/torch/hub/ultralytics_yolov5_master
YOLOv5 🚀 2025-3-14 Python-3.12.9 torch-2.6.0 CPU

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients, 16.4 GFLOPs
Adding AutoShape... 


AutoShape(
  (model): DetectMultiBackend(
    (model): DetectionModel(
      (model): Sequential(
        (0): Conv(
          (conv): Conv2d(3, 32, kernel_size=(6, 6), stride=(2, 2), padding=(2, 2))
          (act): SiLU(inplace=True)
        )
        (1): Conv(
          (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
          (act): SiLU(inplace=True)
        )
        (2): C3(
          (cv1): Conv(
            (conv): Conv2d(64, 32, kernel_size=(1, 1), stride=(1, 1))
            (act): SiLU(inplace=True)
          )
          (cv2): Conv(
            (conv): Conv2d(64, 32, kernel_size=(1, 1), stride=(1, 1))
            (act): SiLU(inplace=True)
          )
          (cv3): Conv(
            (conv): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1))
            (act): SiLU(inplace=True)
          )
          (m): Sequential(
            (0): Bottleneck(
              (cv1): Conv(
                (conv): Conv2d(32, 32, kernel_size=(1, 1), stride=(1, 1))
  

In [95]:
classes = model.names
from itertools import islice
dict(islice(classes.items(), 5))

{0: 'person', 1: 'bicycle', 2: 'car', 3: 'motorcycle', 4: 'airplane'}

In [96]:
class_colors = {
    name: (
        random.randint(0, 255),
        random.randint(0, 255),
        random.randint(0, 255),
    )
    for name in classes
}
dict(islice(class_colors.items(), 5))

{0: (226, 155, 64),
 1: (154, 56, 147),
 2: (74, 84, 83),
 3: (18, 55, 239),
 4: (133, 59, 251)}

In [97]:
def eval_frame(frame):
    """
    Takes a single frame as input, and evaluates the frame using yolo5 model.
    :param frame: input frame in numpy/list/tuple format.
    :return: Labels and Coordinates of objects detected by model in the frame.
    """
    model.to(device)
    results = model([frame])
    #      xmin    ymin    xmax   ymax  confidence  class    name
    # 0  749.50   43.50  1148.0  704.5    0.874023      0  person
    # 1  433.50  433.50   517.5  714.5    0.687988     27     tie
    # 2  114.75  195.75  1095.0  708.0    0.624512      0  person
    # 3  986.00  304.00  1028.0  420.0    0.286865     27     tie
    labels, cord = results.xyxyn[0][:, -1].numpy(), results.xyxyn[0][:, :-1].numpy()
    return labels, cord

In [98]:
def plot_boxes(results, frame):
    """
    Takes a frame and its results as input, and plots the bounding boxes and label on to the frame.
    :param results: contains labels and coordinates predicted by model on the given frame.
    :param frame: Frame which has been scored.
    :return: Frame with bounding boxes and labels ploted on it.
    """
    labels, cord = results
    n = len(labels)
    x_shape, y_shape = frame.shape[1], frame.shape[0]
    for i in range(n):
        row = cord[i]
        x1, y1, x2, y2 = (
            int(row[0] * x_shape),
            int(row[1] * y_shape),
            int(row[2] * x_shape),
            int(row[3] * y_shape),
        )
        bgr = class_colors[labels[i]]
        cv2.rectangle(frame, (x1, y1), (x2, y2), bgr, 2)
        cv2.putText(
            frame,
            classes[int(labels[i])],
            (x1, y1),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.9,
            bgr,
            2,
        )

    return frame

In [99]:
# download a mp4 video
# tool: https://cobalt.tools/
# example: https://www.youtube.com/watch?v=PwUEo7zEtzg
# torch.set_warn_always(False)
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

player = cv2.VideoCapture("in.mp4")
assert player.isOpened()
n_frames = int(player.get(cv2.CAP_PROP_FRAME_COUNT))
four_cc = cv2.VideoWriter_fourcc(*'mp4v')
fps = round(player.get(cv2.CAP_PROP_FPS), 2)
x_shape = int(player.get(cv2.CAP_PROP_FRAME_WIDTH))
y_shape = int(player.get(cv2.CAP_PROP_FRAME_HEIGHT))
out = cv2.VideoWriter("out.mp4", four_cc, fps, (x_shape, y_shape))
print(f"Number of frames: {n_frames}")
print(f"FPS: {fps}")
while True:
    ret, frame = player.read()
    if ret == False:
        break
    results = eval_frame(frame)
    frame = plot_boxes(results, frame)
    out.write(frame)
out.release()
player.release()

Number of frames: 540
FPS: 30.0


In [100]:
ipd.Video('in.mp4', width=700)

In [101]:
ipd.Video('out.mp4', width=700)