In [1]:
from collections import defaultdict

import cv2
import numpy as np

from ultralytics import YOLO

In [3]:
# Track objects through a video with YOLOv8, draw each object's recent
# trajectory on the annotated frame, and log the per-frame boxes and track IDs.
model = YOLO('yolov8n.pt')

video_path = "../data/people_walk.mp4"
cap = cv2.VideoCapture(video_path)

# Maximum number of recent centre points remembered (and drawn) per track.
max_track_length = 30

# track_id -> list of (x, y) box centres, oldest first.
track_history = defaultdict(list)

try:
    # The context manager closes the log even if tracking raises or the
    # kernel is interrupted part-way through the video.
    with open("./segment_log.txt", "w+") as f:
        while cap.isOpened():
            success, frame = cap.read()

            # End of stream (or a read error): stop processing.
            if not success:
                break

            # persist=True tells the tracker that this frame continues the
            # sequence from the previous call.
            results = model.track(frame, persist=True)

            boxes = results[0].boxes.xywh.cpu()

            # `boxes.id` is None on frames where the tracker has not assigned
            # any IDs (e.g. nothing detected); treat that as "no tracks"
            # rather than crashing on `None.int()`.
            ids = results[0].boxes.id
            track_ids = ids.int().cpu().tolist() if ids is not None else []

            # One log entry per line so frames can be told apart.
            f.write("boxes :" + str(boxes) + "\n")
            f.write("track_ids :" + str(track_ids) + "\n")

            annotated_frame = results[0].plot()

            for box, track_id in zip(boxes, track_ids):
                x, y, w, h = box
                track = track_history[track_id]
                track.append((float(x), float(y)))

                # Keep only the most recent points so the trail stays short.
                if len(track) > max_track_length:
                    track.pop(0)

                # cv2.polylines expects int32 points shaped (N, 1, 2).
                points = np.array(track).astype(np.int32).reshape((-1, 1, 2))
                cv2.polylines(annotated_frame, [points], isClosed=False, color=(0, 0, 255), thickness=10)

            cv2.imshow("YOLOv8 Tracking", annotated_frame)

            # Let the user quit early with "q".
            if cv2.waitKey(1) & 0xFF == ord("q"):
                break
finally:
    # Always free the capture device and close display windows, including
    # on errors and interrupts.
    cap.release()
    cv2.destroyAllWindows()


0: 384x640 12 persons, 2 birds, 5.6ms
Speed: 1.0ms preprocess, 5.6ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 12 persons, 2 birds, 5.5ms
Speed: 2.6ms preprocess, 5.5ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 13 persons, 2 birds, 6.4ms
Speed: 2.3ms preprocess, 6.4ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 13 persons, 2 birds, 5.5ms
Speed: 1.8ms preprocess, 5.5ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 15 persons, 2 birds, 5.1ms
Speed: 1.5ms preprocess, 5.1ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 15 persons, 2 birds, 21.0ms
Speed: 5.0ms preprocess, 21.0ms inference, 2.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 15 persons, 2 birds, 5.0ms
Speed: 1.8ms preprocess, 5.0ms inference, 0.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 15 persons, 2 birds, 5.8ms
Speed: 1.8ms p