In [4]:
from ultralytics import YOLO
import cv2

In [5]:
model = YOLO('yolov8s.pt')

In [6]:
def render_boxes(res, frame):
    
    for result in res:
     
        for box in result.boxes:
            if box.cls in [24,28]: # If the object is a bag
    
                x1, y1, x2, y2 = box.xyxy[0].tolist()
            
                if box.conf[0] > 0.6:
                    cv2.rectangle(frame, (int(x1), int(y1)), (int(x2), int(y2)), (0,0,255), 2)
                    # cv2.rectangle(image, start_point, end_point, color, thickness)

In [7]:
def detect_bags(video_path, output_path):

    cap = cv2.VideoCapture(video_path)
    fourcc  = cv2.VideoWriter_fourcc(*'mp4v')
    
    # Fetch fps, height, width to match the original video
    fps     = cap.get(cv2.CAP_PROP_FPS)
    width   = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height  = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    
    # Create a VideoWriter object with specified parameters
    out     = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    while cap.isOpened():

        ret, frame = cap.read()

        if not ret:  # There are no more frames left
            break 
            
        res = model(frame)
        
        render_boxes(res, frame)
        out.write(frame)

    cap.release()
    out.release()
    cv2.destroyAllWindows()

In [19]:
video_path = "img/carousel.mp4"
output_path = "img/carousel_out.mp4"

detect_bags(video_path, output_path)


0: 384x640 4 persons, 2 suitcases, 1 tv, 99.2ms
Speed: 1024.1ms preprocess, 99.2ms inference, 3.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 persons, 2 suitcases, 1 tv, 8.2ms
Speed: 2.1ms preprocess, 8.2ms inference, 2.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 suitcases, 1 tv, 14.2ms
Speed: 2.4ms preprocess, 14.2ms inference, 2.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 suitcases, 1 tv, 11.7ms
Speed: 2.7ms preprocess, 11.7ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 persons, 2 suitcases, 1 tv, 8.0ms
Speed: 1.7ms preprocess, 8.0ms inference, 2.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 5 persons, 2 suitcases, 1 tv, 15.1ms
Speed: 1.9ms preprocess, 15.1ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 5 persons, 2 suitcases, 1 tv, 8.1ms
Speed: 2.4ms preprocess, 8.1ms inference, 3.3ms postprocess per image at shap

In [9]:
import supervision as sv
import numpy as np
import cv2

# Settings
LINE_START = sv.Point(0 + 50, 1500)
LINE_END = sv.Point(3840 - 50, 1500)
TARGET_VIDEO_PATH = "img/result_bytetracker.mp4"
SELECTED_CLASS_IDS = [24, 28]  # Select class IDs for suitcase and backpack
SOURCE_VIDEO_PATH = "img/carousel.mp4"  # Replace with your source video path

# Create BYTETracker instance
byte_tracker = sv.ByteTrack(
    track_activation_threshold=0.4,
    lost_track_buffer=50,
    minimum_matching_threshold=0.8,
    frame_rate=30,
    minimum_consecutive_frames=3
)

byte_tracker.reset()

# Create VideoInfo instance
video_info = sv.VideoInfo.from_video_path(SOURCE_VIDEO_PATH)

# Create frame generator
generator = sv.get_video_frames_generator(SOURCE_VIDEO_PATH)

# Create LineZone instance
line_zone = sv.LineZone(start=LINE_START, end=LINE_END)

# Create instance of BoxAnnotator, LabelAnnotator, and TraceAnnotator
box_annotator = sv.BoxAnnotator(thickness=4)
label_annotator = sv.LabelAnnotator(text_thickness=2, text_scale=1.5, text_color=sv.Color.BLACK)
trace_annotator = sv.TraceAnnotator(thickness=4, trace_length=50)

# Create LineZoneAnnotator instance
line_zone_annotator = sv.LineZoneAnnotator(thickness=4, text_thickness=4, text_scale=2)

# Define the callback function to be used in video processing
def callback(frame: np.ndarray, index: int) -> np.ndarray:
    # Model prediction on single frame and conversion to supervision Detections
    results = model(frame, verbose=False)[0]
    detections = sv.Detections.from_ultralytics(results)
    
    # Only consider selected class IDs (suitcase, backpack)
    detections = detections[np.isin(detections.class_id, SELECTED_CLASS_IDS)]
    
    # Update ByteTrack with detections
    detections = byte_tracker.update_with_detections(detections)
    
    # Create labels that only show the track ID and confidence
    labels = [
        f"#{tracker_id} {confidence:0.2f}"
        for confidence, tracker_id in zip(detections.confidence, detections.tracker_id)
    ]
    
    # Annotate the frame
    annotated_frame = frame.copy()
    annotated_frame = trace_annotator.annotate(scene=annotated_frame, detections=detections)
    annotated_frame = box_annotator.annotate(scene=annotated_frame, detections=detections)
    annotated_frame = label_annotator.annotate(scene=annotated_frame, detections=detections, labels=labels)

    # Update line counter
    line_zone.trigger(detections)
    
    # Return frame with box and line annotations
    return line_zone_annotator.annotate(annotated_frame, line_counter=line_zone)

# Process the whole video
sv.process_video(
    source_path=SOURCE_VIDEO_PATH,
    target_path=TARGET_VIDEO_PATH,
    callback=callback
)
