In [1]:
import os

import cv2
import numpy as np
import torch
from detectron2.config import get_cfg
from detectron2.engine import DefaultPredictor
from detectron2 import model_zoo
from detectron2.utils.visualizer import Visualizer
from detectron2.data import MetadataCatalog
from detectron2.structures import Instances

In [2]:
# Step 1: Setup Detectron2
def setup_detectron2():
    """Build a COCO keypoint R-CNN predictor and look up its dataset metadata.

    Merges the ``keypoint_rcnn_R_50_FPN_3x`` model-zoo config, points the
    weights at the matching model-zoo checkpoint URL, sets the test-time ROI
    score threshold to 0.5, and selects ``"cuda"`` when
    ``torch.cuda.is_available()`` is true, ``"cpu"`` otherwise.

    Returns:
        tuple: ``(predictor, metadata)`` where ``predictor`` is a
        ``DefaultPredictor`` built from the config and ``metadata`` is the
        ``MetadataCatalog`` entry of the config's first training dataset.
    """
    zoo_entry = "COCO-Keypoints/keypoint_rcnn_R_50_FPN_3x.yaml"

    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"

    cfg = get_cfg()
    cfg.merge_from_file(model_zoo.get_config_file(zoo_entry))
    cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.5
    cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url(zoo_entry)
    cfg.MODEL.DEVICE = device

    # Tuple elements evaluate left to right: predictor first, then metadata.
    return DefaultPredictor(cfg), MetadataCatalog.get(cfg.DATASETS.TRAIN[0])

In [3]:
# Step 2: Manual ROI selection
def select_roi(video_path):
    """Let the user draw a rectangle on the first frame of a video.

    Opens ``video_path``, reads a single frame, and shows it in an OpenCV
    window titled "Select Player Area" for interactive selection.

    Args:
        video_path: Source passed to ``cv2.VideoCapture``.

    Returns:
        The ``(x, y, w, h)`` rectangle returned by ``cv2.selectROI``.

    Raises:
        Exception: If the first frame cannot be read.
        ValueError: If the selection was cancelled or has zero width/height.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        ret, frame = cap.read()
    finally:
        # Only the first frame is needed; free the capture even if read() raises.
        cap.release()
    if not ret:
        raise Exception("Failed to read video")

    window_name = "Select Player Area"
    bbox = cv2.selectROI(window_name, frame, fromCenter=False, showCrosshair=True)
    cv2.destroyWindow(window_name)

    # A cancelled selection comes back as an all-zero rectangle. Passing it on
    # would silently anchor tracking to the image origin, so fail loudly here.
    _, _, roi_w, roi_h = bbox
    if roi_w <= 0 or roi_h <= 0:
        raise ValueError("ROI selection was cancelled or empty; draw a box and confirm it")
    return bbox

In [4]:
# Step 3: Score function based on proximity to selected ROI
def score_by_proximity(box, selected_box):
    """Score a detection box by how close its centre is to the ROI centre.

    Args:
        box: Four values unpacked as ``(x1, y1, x2, y2)`` corner coordinates.
        selected_box: Four values unpacked as ``(x, y, w, h)``.

    Returns:
        The negated Euclidean distance between the two centres, so a higher
        (less negative) score means a closer box.
    """
    roi_x, roi_y, roi_w, roi_h = selected_box
    roi_centre = (roi_x + roi_w / 2, roi_y + roi_h / 2)

    left, top, right, bottom = box
    box_centre = ((left + right) / 2, (top + bottom) / 2)

    # Per-axis offset from the ROI centre to the box centre.
    dx = box_centre[0] - roi_centre[0]
    dy = box_centre[1] - roi_centre[1]
    return -np.sqrt(dx ** 2 + dy ** 2)

In [5]:
# Step 4: Main pipeline
def extract_keypoints_from_roi(input_video_path, output_npz_path, output_video_path):
    """Follow the person nearest a user-selected ROI and export their keypoints.

    Builds the predictor, asks the user to select an ROI on the first frame,
    then runs the predictor on every frame of the video. In each frame that
    has detections, the instance whose box centre is closest to the ROI centre
    is kept: the first two columns of its keypoints are collected and only
    that instance is drawn onto the output frame. Frames without detections
    are written to the output video unchanged.

    Args:
        input_video_path: Video to read.
        output_npz_path: Destination for ``np.savez_compressed``; the collected
            array is stored under the key ``keypoints``.
        output_video_path: Destination of the annotated video (``mp4v`` fourcc).

    Raises:
        RuntimeError: If the output video writer cannot be opened.

    Note:
        Frames with no detections add no entry to the saved array, so its
        first axis can be shorter than the number of frames in the video.
    """
    predictor, metadata = setup_detectron2()
    selected_box = select_roi(input_video_path)

    # Create output directories up front so a missing folder does not surface
    # only after the whole video has been processed.
    for out_path in (output_npz_path, output_video_path):
        if isinstance(out_path, (str, os.PathLike)):
            parent_dir = os.path.dirname(os.fspath(out_path))
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)

    all_keypoints = []
    cap = cv2.VideoCapture(input_video_path)
    out_vid = None
    try:
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fps = cap.get(cv2.CAP_PROP_FPS)
        out_vid = cv2.VideoWriter(
            output_video_path, cv2.VideoWriter_fourcc(*"mp4v"), fps, (width, height)
        )
        # An unopened writer would otherwise drop every frame without complaint.
        if not out_vid.isOpened():
            raise RuntimeError(f"Could not open video writer for: {output_video_path}")

        while True:
            ret, frame = cap.read()
            if not ret:
                break

            outputs = predictor(frame)
            instances = outputs["instances"].to("cpu")

            if len(instances) == 0:
                # Nothing detected: pass the frame through, record no keypoints.
                out_vid.write(frame)
                continue

            boxes = instances.pred_boxes.tensor.numpy()
            keypoints = instances.pred_keypoints.numpy()

            # Scores are negated distances, so argmax picks the closest box.
            scores = [score_by_proximity(box, selected_box) for box in boxes]
            best_idx = int(np.argmax(scores))
            selected_kpts = keypoints[best_idx][:, :2]  # drop confidence

            all_keypoints.append(selected_kpts)

            # Draw only the selected person
            single_instance = Instances(image_size=frame.shape[:2])
            single_instance.pred_boxes = instances.pred_boxes[[best_idx]]
            single_instance.pred_keypoints = instances.pred_keypoints[[best_idx]]
            single_instance.scores = instances.scores[[best_idx]]
            single_instance.pred_classes = instances.pred_classes[[best_idx]]

            # Channel order is reversed going into the Visualizer and back again.
            vis = Visualizer(frame[:, :, ::-1], metadata=metadata, scale=1.0)
            vis_frame = vis.draw_instance_predictions(single_instance)
            # The reversed slice is a negatively-strided view; give the writer
            # a contiguous buffer instead.
            result = np.ascontiguousarray(vis_frame.get_image()[:, :, ::-1])

            out_vid.write(result)
    finally:
        # Release both handles even when inference or drawing raises mid-video.
        cap.release()
        if out_vid is not None:
            out_vid.release()

    np.savez_compressed(output_npz_path, keypoints=np.array(all_keypoints))
    print(f"Saved keypoints to: {output_npz_path}")
    print(f"Saved video to: {output_video_path}")


In [8]:
# Clip name (no extension); the usage cell builds input and output paths from it.
video = "Clip14Miss"

In [9]:
# Usage: run the full pipeline for the clip named in `video`.
extract_keypoints_from_roi(
    input_video_path=f'inputdir/{video}.mp4',
    output_npz_path=f'data/custom/{video}.npz',
    output_video_path=f'outputdir/New{video}.mp4',
)

Saved keypoints to: data/custom/Clip14Miss.npz
Saved video to: outputdir/NewClip14Miss.mp4
