In [1]:
import os
from pathlib import Path

In [2]:
# Input: folder holding the raw footage, and the specific clip processed below.
SOURCE_VIDEO_DIRECTORY = Path("./videos/Game-1/")
SOURCE_VIDEO_PATH = SOURCE_VIDEO_DIRECTORY.joinpath("cam0_2025-11-14_19-48-45.mp4")

# Output: folder that receives the rendered videos.
TARGET_VIDEO_DIRECTORY = Path("./output_videos/")

In [3]:
from IPython.display import Video
from typing import Dict, List, Optional, Union, Iterable, Tuple
from operator import itemgetter

import cv2
import numpy as np
import torch
from tqdm import tqdm
from pathlib import Path

import supervision as sv
from rfdetr import RFDETRBase

from inference import get_model


ModelDependencyMissing: Your `inference` configuration does not support SAM3 model. Install SAM3 dependencies and set CORE_MODEL_SAM3_ENABLED to True.


In [4]:
# Default annotators: 2px detection boxes and black label text.
box_annotator = sv.BoxAnnotator(
    thickness=2,
)
label_annotator = sv.LabelAnnotator(
    text_color=sv.Color.BLACK,
)

# RF-DETR base model loaded through the `inference` package, requested on
# device "cuda:0".
infer_model = get_model("rfdetr-base", device="cuda:0")



In [5]:
from sam2.build_sam import build_sam2_camera_predictor

# Local checkout of the real-time SAM2 fork and the tiny Hiera checkpoint in it.
SAM2_HOME = Path("../segment-anything-2-real-time")
SAM2_CHECKPOINT = SAM2_HOME.joinpath("checkpoints", "sam2.1_hiera_tiny.pt")

# Config name handed to the builder as-is (it is already a plain string).
# NOTE(review): presumably resolved against the sam2 package's config search
# path rather than SAM2_HOME - confirm.
SAM2_CONFIG = "configs/sam2.1/sam2.1_hiera_t.yaml"

# Streaming ("camera") predictor that SAM2Tracker wraps.
predictor = build_sam2_camera_predictor(SAM2_CONFIG, str(SAM2_CHECKPOINT))

In [6]:

class SAM2Tracker:
    def __init__(self, predictor) -> None:
        self.predictor = predictor
        self._prompted = False

    def prompt_first_frame(self, frame: np.ndarray, detections: sv.Detections) -> None:
        if len(detections) == 0:
            raise ValueError("detections must contain at least one box")

        if detections.tracker_id is None:
            detections.tracker_id = list(range(1, len(detections) + 1))

        with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
            self.predictor.load_first_frame(frame)
            for xyxy, obj_id in zip(detections.xyxy, detections.tracker_id):
                bbox = np.asarray([xyxy], dtype=np.float32)
                self.predictor.add_new_prompt(
                    frame_idx=0,
                    obj_id=int(obj_id),
                    bbox=bbox,
                )

        self._prompted = True

    def propagate(self, frame: np.ndarray) -> sv.Detections:
        if not self._prompted:
            raise RuntimeError("Call prompt_first_frame before propagate")

        with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
            tracker_ids, mask_logits = self.predictor.track(frame)

        tracker_ids = np.asarray(tracker_ids, dtype=np.int32)
        masks = (mask_logits > 0.0).cpu().numpy()
        masks = np.squeeze(masks).astype(bool)

        if masks.ndim == 2:
            masks = masks[None, ...]

        masks = np.array([
            sv.filter_segments_by_distance(mask, relative_distance=0.03, mode="edge")
            for mask in masks
        ])

        xyxy = sv.mask_to_xyxy(masks=masks)
        detections = sv.Detections(xyxy=xyxy, mask=masks, tracker_id=tracker_ids)
        return detections

    def reset(self) -> None:
        self._prompted = False

In [7]:
# Create the output folder up front: none of the visible cells creates it, and
# the video writer / ffmpeg steps that follow are not expected to do so either.
# Safe to re-run thanks to exist_ok=True.
TARGET_VIDEO_DIRECTORY.mkdir(parents=True, exist_ok=True)

# Annotated video: source file name with a "-mask" suffix, same extension.
TARGET_VIDEO_PATH = TARGET_VIDEO_DIRECTORY / f"{SOURCE_VIDEO_PATH.stem}-mask{SOURCE_VIDEO_PATH.suffix}"
# Re-encoded copy of the annotated video, with an extra "-compressed" suffix.
TARGET_VIDEO_COMPRESSED_PATH = TARGET_VIDEO_DIRECTORY / f"{TARGET_VIDEO_PATH.stem}-compressed{TARGET_VIDEO_PATH.suffix}"


In [8]:
# Annotators used for the tracked output; colours are looked up by tracker id
# (ColorLookup.TRACK). This rebinds `box_annotator` from the earlier cell.
mask_annotator = sv.MaskAnnotator(opacity=0.5, color_lookup=sv.ColorLookup.TRACK)
box_annotator = sv.BoxAnnotator(thickness=2, color_lookup=sv.ColorLookup.TRACK)

# Read only the first frame of the source clip; it is used to detect the
# objects and to prompt the tracker.
frame_generator = sv.get_video_frames_generator(SOURCE_VIDEO_PATH)
frame = next(frame_generator)

# RF-DETR detector that is run on that first frame.
model = RFDETRBase()



Loading pretrain weights


In [9]:
# Detect objects on the first frame with RF-DETR.
# `frame` comes from supervision's OpenCV-based reader (BGR channel order),
# while RF-DETR documents RGB for numpy input, so the channel axis is reversed.
# The copy makes the reversed view contiguous; `frame` itself is left untouched.
detections = model.predict(frame[:, :, ::-1].copy())

def filter_detections(
    detections: sv.Detections,
    class_ids: Iterable[int] = (1, 37),
) -> sv.Detections:
    """Keep only the detections whose class id is in `class_ids`.

    Args:
        detections: Detections to filter; `class_id` must be populated.
        class_ids: Class ids to keep. The default (1, 37) keeps people and
            balls - presumably COCO ids 1 = person and 37 = sports ball as
            emitted by RF-DETR; verify against the model's class map.

    Returns:
        A new `sv.Detections` holding only the matching entries, in their
        original order. The input object is not modified.
    """
    # Materialise the ids so any iterable (set, generator, ...) is accepted.
    wanted = np.asarray(list(class_ids))
    keep = np.isin(detections.class_id, wanted)
    return detections[keep]

# Drop every detection that is not one of the classes we want to follow.
detections = filter_detections(detections)

# Seed SAM2 with the remaining RF-DETR boxes found on the first frame.
tracker = SAM2Tracker(predictor=predictor)
tracker.prompt_first_frame(frame=frame, detections=detections)

# we propagate tracks across all video frames

def callback(frame: np.ndarray, index: int) -> np.ndarray:
    """Track the prompted objects on one frame and draw them.

    Args:
        frame: Current video frame; it is copied before drawing.
        index: Frame index supplied by the video processor (unused here).

    Returns:
        A copy of `frame` with the tracked masks and boxes drawn on it.
    """
    tracked = tracker.propagate(frame)
    # Masks first, then boxes on top of them.
    canvas = mask_annotator.annotate(scene=frame.copy(), detections=tracked)
    return box_annotator.annotate(scene=canvas, detections=tracked)

sv.process_video(
    source_path=SOURCE_VIDEO_PATH,
    target_path=TARGET_VIDEO_PATH,
    callback=callback,
    max_frames = 3000,
    show_progress=True
)

!ffmpeg -y -loglevel error -i {TARGET_VIDEO_PATH} -vcodec libx264 -crf 28 {TARGET_VIDEO_COMPRESSED_PATH}


Skipping the post-processing step due to the error above. You can still use SAM 2 and it's OK to ignore the error above, although some post-processing functionality may be limited (which doesn't affect the results in most cases; see https://github.com/facebookresearch/sam2/blob/main/INSTALL.md).


Processing video:   0%|          | 0/3000 [00:00<?, ?it/s]


Skipping the post-processing step due to the error above. You can still use SAM 2 and it's OK to ignore the error above, although some post-processing functionality may be limited (which doesn't affect the results in most cases; see https://github.com/facebookresearch/sam2/blob/main/INSTALL.md).


[1;35m[vost#0:0 @ 0x55fb5348a740] [0m[4;31mUnknown encoder 'libx264'
[0m[1;35m[vost#0:0 @ 0x55fb5348a740] [0m[4;31mError selecting an encoder
[0m[1;31mError opening output file output_videos/cam0_2025-11-14_19-48-45-mask-compressed.mp4.
[0m[4;31mError opening output files: Encoder not found
[0m