In [2]:
import os
import cv2
import numpy as np
from ultralytics import YOLO
from deep_sort_realtime.deepsort_tracker import DeepSort

In [3]:
# Load the latest YOLOv8 model 
model = YOLO("yolov8n.pt")

In [4]:
# Initialize DeepSORT tracker
tracker = DeepSort(max_age=30, n_init=3, nms_max_overlap=1.0, max_cosine_distance=0.2, nn_budget=None)


In [5]:
# Define paths for input frames and output annotated frames
image_folder = "./TP3_data/frames"  # Folder with your image frames
output_folder = "output_frames"
os.makedirs(output_folder, exist_ok=True)

In [6]:
# Get a sorted list of image file paths (ensure files are named sequentially)
image_paths = sorted([
    os.path.join(image_folder, fname)
    for fname in os.listdir(image_folder)
    if fname.lower().endswith(('.png', '.jpg', '.jpeg'))
])

In [8]:
for image_path in image_paths:
    frame = cv2.imread(image_path)
    if frame is None:
        continue

    # Run YOLOv8 inference on the frame
    results = model(frame)
    detections = results[0].boxes

    detections_filtered = []
    # Process each YOLOv8 detection and filter to only include "cup" detections
    for box in detections:
        # Extract bounding box coordinates and confidence
        x1, y1, x2, y2 = box.xyxy.cpu().numpy().flatten().astype(int)
        conf = float(box.conf.cpu().numpy().flatten()[0])
        # Get class id and corresponding label from model names
        class_id = int(box.cls.cpu().numpy()[0])
        label = model.names[class_id]

        # Only keep detections that are cups and exceed the confidence threshold
        if label == "cup" and conf > 0.5:
            # DeepSORT expects a tuple: ([x1, y1, x2, y2], confidence, label)
            detections_filtered.append(([x1, y1, x2, y2], conf, label))

    # Update DeepSORT tracker with current frame detections
    tracks = tracker.update_tracks(detections_filtered, frame=frame)

    # Draw tracked cups on the frame
    for track in tracks:
        if not track.is_confirmed():
            continue
        track_id = track.track_id
        # Get bounding box coordinates [left, top, right, bottom]
        ltrb = track.to_ltrb()
        x1, y1, x2, y2 = [int(i) for i in ltrb]
        cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
        cv2.putText(frame, f"ID: {track_id}", (x1, y1 - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 2)

    # Save the annotated frame to the output folder
    output_path = os.path.join(output_folder, os.path.basename(image_path))
    cv2.imwrite(output_path, frame)

    # Optionally, display the frame (press 'q' to exit early)
    cv2.imshow("Frame", frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cv2.destroyAllWindows()


0: 384x640 1 cup, 1 tv, 1 keyboard, 45.4ms
Speed: 2.3ms preprocess, 45.4ms inference, 0.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 cup, 1 dining table, 1 tv, 1 keyboard, 41.2ms
Speed: 1.6ms preprocess, 41.2ms inference, 0.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 cups, 1 dining table, 1 tv, 1 keyboard, 38.2ms
Speed: 1.9ms preprocess, 38.2ms inference, 0.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 mouse, 50.8ms
Speed: 2.0ms preprocess, 50.8ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 cups, 1 vase, 45.1ms
Speed: 2.0ms preprocess, 45.1ms inference, 0.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 9 cups, 44.0ms
Speed: 2.0ms preprocess, 44.0ms inference, 0.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 8 cups, 46.0ms
Speed: 2.2ms preprocess, 46.0ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 8 cups, 1 spoon, 

KeyboardInterrupt: 