In [1]:
!pip uninstall opencv-python opencv-python-headless opencv-contrib-python -y

Found existing installation: opencv-python 4.12.0.88
Uninstalling opencv-python-4.12.0.88:
  Successfully uninstalled opencv-python-4.12.0.88
Found existing installation: opencv-python-headless 4.12.0.88
Uninstalling opencv-python-headless-4.12.0.88:
  Successfully uninstalled opencv-python-headless-4.12.0.88
Found existing installation: opencv-contrib-python 4.12.0.88
Uninstalling opencv-contrib-python-4.12.0.88:
  Successfully uninstalled opencv-contrib-python-4.12.0.88


In [2]:
!pip -q install ultralytics fvcore

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/50.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.2/50.2 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m56.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.0/67.0 MB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for fvcore (setup.py) ... [?25l[?25hdone
  Building wheel for iopath (setup.py) ... [?25l[?25hdone


In [3]:
import os, csv, json, cv2, math, time
import numpy as np
import torch
import torch.nn as nn
from dataclasses import dataclass
from typing import Dict, List, Tuple, Optional
from collections import defaultdict
from tqdm.auto import tqdm
from ultralytics import YOLO
from IPython.display import Video

Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.


In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **Utilities**

In [5]:
def _device():
    return torch.device("cuda" if torch.cuda.is_available() else "cpu")

def _clamp(v, lo, hi):
    return max(lo, min(hi, v))

def _expand_bbox(b, margin, W, H):
    """Expand bbox by `margin` while staying in-frame."""
    x1,y1,x2,y2 = map(float, b)
    w, h = x2-x1, y2-y1
    cx, cy = (x1+x2)/2., (y1+y2)/2.
    w2, h2 = w*margin/2., h*margin/2.
    nx1, ny1 = _clamp(int(round(cx - w2)), 0, W-1), _clamp(int(round(cy - h2)), 0, H-1)
    nx2, ny2 = _clamp(int(round(cx + w2)), 0, W-1), _clamp(int(round(cy + h2)), 0, H-1)
    if nx2 <= nx1 or ny2 <= ny1: return 0,0,W-1,H-1
    return nx1, ny1, nx2, ny2

def _expand_to_square(x1, y1, x2, y2, W, H, factor=1.25):
    w = max(1.0, x2 - x1); h = max(1.0, y2 - y1)
    side = max(w, h) * float(factor)
    cx = (x1 + x2) * 0.5; cy = (y1 + y2) * 0.5
    nx1 = _clamp(int(round(cx - side * 0.5)), 0, W - 1)
    ny1 = _clamp(int(round(cy - side * 0.5)), 0, H - 1)
    nx2 = _clamp(int(round(cx + side * 0.5)), 0, W - 1)
    ny2 = _clamp(int(round(cy + side * 0.5)), 0, H - 1)
    if nx2 <= nx1 or ny2 <= ny1:
        return 0, 0, W - 1, H - 1
    return nx1, ny1, nx2, ny2

def _center_resize_crop(img, side_size, crop_size):
    """Short-side resize to `side_size`, then center crop `crop_size`."""
    h, w = img.shape[:2]
    if h < w:
        new_h, new_w = side_size, int(round(w*side_size/h))
    else:
        new_w, new_h = side_size, int(round(h*side_size/w))
    rs = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_AREA)
    y0 = _clamp((new_h - crop_size)//2, 0, max(0, new_h - crop_size))
    x0 = _clamp((new_w - crop_size)//2, 0, max(0, new_w - crop_size))
    return rs[y0:y0+crop_size, x0:x0+crop_size]

def _square_resize_no_crop(img, side):
    # img BGR -> resize to (side, side) with no further crop
    return cv2.resize(img, (side, side), interpolation=cv2.INTER_AREA)

def _short_side_resize_then_letterbox(img_bgr, out_size):
    """Resize keeping aspect to make short side == out_size, then pad to square.
    This is for pose only, not for SlowFast."""
    h, w = img_bgr.shape[:2]
    if h == 0 or w == 0:
        return np.zeros((out_size, out_size, 3), dtype=img_bgr.dtype)
    if h < w:
        new_h, new_w = out_size, int(round(w * out_size / h))
    else:
        new_w, new_h = out_size, int(round(h * out_size / w))
    rs = cv2.resize(img_bgr, (new_w, new_h), interpolation=cv2.INTER_AREA)
    canvas = np.zeros((out_size, out_size, 3), dtype=img_bgr.dtype)
    top  = max(0, (out_size - new_h) // 2)
    left = max(0, (out_size - new_w) // 2)
    canvas[top:top+new_h, left:left+new_w] = rs[:out_size, :out_size]  # safe slice
    return canvas

def _linspace_idx(a, b, n):
    """n indices uniformly from [a..b] inclusive, as ints (repeats if n>L)."""
    if n <= 1: return [int(round((a+b)/2))]
    return list(np.round(np.linspace(a, b, n)).astype(int))

def _color_for_id(tid: int):
    # stable-ish distinct color per track id
    return (37*tid % 256, 17*tid % 256, 93*tid % 256)

In [6]:
def _unwrap_phase(a: np.ndarray) -> np.ndarray:
    a = np.asarray(a, np.float32)
    return np.unwrap(a.astype(np.float64)).astype(np.float32)

def _unwrap_angle_series(theta):
    # theta in radians; unwrap to avoid 2π jumps
    theta = np.asarray(theta, np.float32)
    return np.unwrap(theta)

def _rot_to_align_shoulder(L_sh, R_sh):
    # rotate so vector (L_sh -> R_sh) lies on +x
    v = np.array(R_sh) - np.array(L_sh)
    theta = np.arctan2(v[1], v[0])  # angle to +x
    c, s = np.cos(-theta), np.sin(-theta)
    R = np.array([[c, -s], [s, c]], dtype=np.float32)  # rotate by -theta
    return R

def _to_torso_frame(pt, L_sh, R_sh, dominant='R'):
    """Translate so dominant shoulder at origin; rotate so shoulders horizontal;
       flip if dominant ends up on the left (ensure dominant shoulder at x>=0)."""
    if any(np.isnan(v).any() for v in [L_sh, R_sh, pt]):
        return np.array([np.nan, np.nan], np.float32)
    L_sh = np.array(L_sh, np.float32); R_sh = np.array(R_sh, np.float32); pt = np.array(pt, np.float32)
    R = _rot_to_align_shoulder(L_sh, R_sh)
    # pick which shoulder is "origin"
    S = R_sh if dominant == 'R' else L_sh
    # translate then rotate
    p = (pt - S) @ R.T
    # ensure dominant shoulder lies to the right of the other shoulder
    other_S = L_sh if dominant == 'R' else R_sh
    other_in = (other_S - S) @ R.T
    if other_in[0] > 0:  # dominant ended up at left; flip x to make it canonical
        p[0] = -p[0]
    return p  # (x,y) in torso frame

# **Load tracking CSV**

In [7]:
def load_tracks_csv(csv_path: str) -> Dict[int, List[Tuple[int, Tuple[int,int,int,int]]]]:
    """
    Returns {track_id: [(frame_idx, (x1,y1,x2,y2)), ...]} sorted by frame.
    """
    print(f"🎬 Loading track data from: {csv_path}")
    tracks = defaultdict(list)
    with open(csv_path, "r") as f:
        reader = csv.DictReader(f)
        # Assuming the CSV isn't too large for an in-memory load
        data = list(reader)
        for r in tqdm(data, desc="   -> Reading CSV lines"):
            t   = int(r["frame"])
            tid = int(r["id"])
            x1,y1,x2,y2 = int(r["x1"]), int(r["y1"]), int(r["x2"]), int(r["y2"])
            tracks[tid].append((t, (x1,y1,x2,y2)))

    print(f"   -> Found {len(tracks)} unique tracks. Sorting by frame...")
    for tid in tracks:
        tracks[tid].sort(key=lambda x: x[0])
    return tracks

def interpolate_track(timeline: List[Tuple[int, Tuple[int,int,int,int]]]):
    """
    Linear interpolation of bboxes across missing frames.
    Returns (t0, t1, dense_boxes: List[(x1,y1,x2,y2)]) covering every frame in [t0..t1].
    """
    ts  = np.array([t for t,_ in timeline], dtype=np.int32)
    bxs = np.array([b for _,b in timeline], dtype=np.float32)  # Nx4
    t0, t1 = int(ts[0]), int(ts[-1])
    T = np.arange(t0, t1+1, dtype=np.int32)
    out = [np.interp(T, ts, bxs[:,k]) for k in range(4)]
    dense = np.stack(out, axis=1).astype(np.int32)  # Lx4
    return t0, t1, [tuple(map(int, b)) for b in dense]

# **Dense TV-L1 flow**

In [8]:
class TVL1FlowDownscaled:
    """
    Downscale to width<=640 for speed. For each decoded frame, returns flow magnitude map.
    """
    def __init__(self, target_width=640):
        self.tvl1 = cv2.optflow.DualTVL1OpticalFlow_create()
        self.target_width = target_width
        self.prev_gray = None
        self.scale = 1.0
        self.small_shape = None

    def _prep(self, frame_bgr):
        H, W = frame_bgr.shape[:2]
        scale = self.target_width / float(W) if W > self.target_width else 1.0
        small = cv2.resize(frame_bgr, (int(W*scale), int(H*scale)), interpolation=cv2.INTER_AREA) if scale < 1.0 else frame_bgr
        gray = cv2.cvtColor(small, cv2.COLOR_BGR2GRAY)
        return gray, scale, small.shape[:2]

    def mag(self, frame_bgr):
        gray, scale, shp = self._prep(frame_bgr)
        if self.prev_gray is None:
            self.prev_gray = gray
            self.scale = scale
            self.small_shape = shp
            return np.zeros(shp, dtype=np.float32)
        flow = self.tvl1.calc(self.prev_gray, gray, None)
        self.prev_gray = gray
        self.scale = scale
        self.small_shape = shp
        return np.sqrt(flow[...,0]**2 + flow[...,1]**2)

# **Compute actionness**

In [9]:
@dataclass
class ActionnessCfg:
    """
    Flow-based actionness. Keep simple and robust.
    """
    ma_len: int = 5           # moving-average window on z-scored series
    high_k: float = 1.2       # start when z >= mu + high_k*std
    low_k: float  = 0.4       # stop when z <  mu + low_k*std
    min_dur_s: float = 0.26   # keep segments >= ~0.26s (≈ 8 frames @30fps)
    max_dur_s: float = 1.20   # clamp segments to <= ~1.2s
    bbox_margin: float = 1.25 # enlarge crop a bit so racket/arm stays in

def moving_average(x: np.ndarray, k: int) -> np.ndarray:
    if k <= 1: return x
    w = np.ones(k, dtype=np.float32)/k
    return np.convolve(x, w, mode="same")

def hysteresis_segments(z_sm: np.ndarray, high_k: float, low_k: float) -> List[Tuple[int,int]]:
    mu, sd = float(np.mean(z_sm)), float(np.std(z_sm) + 1e-6)
    high, low = mu + high_k*sd, mu + low_k*sd
    segs, on, s0 = [], False, None
    for i,v in enumerate(z_sm):
        if not on and v >= high:
            on, s0 = True, i
        elif on and v < low:
            on = False
            segs.append((s0, i))
            s0 = None
    if on: segs.append((s0, len(z_sm)-1))
    return segs

def compute_actionness_segments(video_path: str,
                                timeline: List[Tuple[int, Tuple[int,int,int,int]]],
                                fps: float,
                                cfg: ActionnessCfg) -> Tuple[np.ndarray, List[Tuple[int,int]]]:
    """
    Build per-frame actionness inside the player's bbox:
      - TV-L1 flow magnitude (95th percentile) + frame-diff (75th percentile)
      - Per-track z-score, MA smooth, hysteresis
      - Clamp segment durations (min_dur_s .. max_dur_s)

    Returns:
      z_sm  : smoothed z-score array aligned to [t0..t1]
      segsA : list of (t0_abs, t1_abs) frame indices
    """
    # interpolate bboxes densely for this track
    t0, t1, dense_boxes = interpolate_track(timeline)    # inclusive indices
    L = t1 - t0 + 1

    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened(): raise RuntimeError(f"Cannot open: {video_path}")
    W = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)); H = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    # We'll decode frames from t0-1 .. t1 to initialize flow
    cap.set(cv2.CAP_PROP_POS_FRAMES, max(0, t0-1))
    flow = TVL1FlowDownscaled(target_width=640)
    mags, diffs = [], []
    prev_small_gray = None

    for t in tqdm(range(max(0,t0-1), t1+1), desc=f"      -> Calculating flow ({L} frames)"):
        ok, frame = cap.read()
        if not ok: break
        mag = flow.mag(frame)               # downscaled mag
        gray_small = cv2.cvtColor(cv2.resize(frame, (mag.shape[1], mag.shape[0]), interpolation=cv2.INTER_AREA),
                                  cv2.COLOR_BGR2GRAY)
        if prev_small_gray is None:
            diff = np.zeros_like(gray_small, dtype=np.float32)
        else:
            diff = cv2.absdiff(gray_small, prev_small_gray).astype(np.float32)
        prev_small_gray = gray_small

        if t >= t0:                         # align to dense series
            mags.append(mag); diffs.append(diff)

    cap.release()
    if len(mags) != L:                      # robust fallback
        mags = [np.zeros(flow.small_shape, np.float32) for _ in range(L)]
        diffs = [np.zeros(flow.small_shape, np.float32) for _ in range(L)]

    # reduce to scalar per frame inside expanded bbox
    z_raw = []
    scale = flow.scale
    for k in range(L):
        b = dense_boxes[k]
        x1,y1,x2,y2 = _expand_bbox(b, cfg.bbox_margin, W, H)
        h_s, w_s = mags[k].shape
        xs1,xs2 = int(x1*scale), int(x2*scale)
        ys1,ys2 = int(y1*scale), int(y2*scale)
        xs1 = _clamp(xs1, 0, w_s-1); xs2 = _clamp(xs2, 0, w_s-1)
        ys1 = _clamp(ys1, 0, h_s-1); ys2 = _clamp(ys2, 0, h_s-1)
        if xs2 <= xs1 or ys2 <= ys1:
            z_raw.append(0.0); continue
        rm = mags[k][ys1:ys2, xs1:xs2]
        rd = diffs[k][ys1:ys2, xs1:xs2]
        # robust percentiles: peak motion > background noise
        m95 = float(np.percentile(rm, 95))
        d75 = float(np.percentile(rd, 75))
        z_raw.append(0.7*m95 + 0.3*d75)

    z_raw = np.asarray(z_raw, dtype=np.float32)
    mu, sd = float(z_raw.mean()), float(z_raw.std() + 1e-6)
    z = (z_raw - mu) / sd
    z_sm = moving_average(z, cfg.ma_len)

    coarse = hysteresis_segments(z_sm, cfg.high_k, cfg.low_k)

    # clamp durations in frames
    min_len = max(1, int(round(cfg.min_dur_s * fps)))
    max_len = max(min_len+1, int(round(cfg.max_dur_s * fps)))

    refined = []
    for a,b in coarse:
        if (b-a+1) < min_len: continue
        # center sub-window around the local peak, limited to max_len
        sub = z_sm[a:b+1]
        peak = int(np.argmax(sub))
        half = max(min((b-a+1)//2, max_len//2), min_len//2)
        c0 = a + max(0, peak - half)
        c1 = a + min(len(sub)-1, peak + half)
        if (c1-c0+1) >= min_len:
            refined.append((c0,c1))

    # merge overlaps and map to absolute indices
    refined = sorted(refined)
    merged = []
    for seg in refined:
        if not merged or seg[0] > merged[-1][1]:
            merged.append(list(seg))
        else:
            merged[-1][1] = max(merged[-1][1], seg[1])
    segs_abs = [(t0 + s0, t0 + s1) for s0,s1 in merged]
    return z_sm, segs_abs

# **Sampling with SlowFast**

In [10]:
@dataclass
class SFInferCfg:
    slow_t: int = 8            # <- from your cfg
    alpha: int = 4             # <- from your cfg
    side: int = 224            # short-side resize
    crop: int = 224            # center crop size
    mean: Tuple[float,float,float] = (0.45,0.45,0.45)   # Kinetics defaults (pytorchvideo)
    std:  Tuple[float,float,float] = (0.225,0.225,0.225)
    bbox_margin: float = 1.25
    bbox_ema: float = 0.8

def sample_indices(L: int, slow_t: int, alpha: int):
    """
    Produce indices for fast (slow_t*alpha) and slow (slow_t) pathways
    uniformly spanning the segment of length L.
    """
    need_fast = slow_t * alpha
    idx_fast = _linspace_idx(0, max(0, L-1), need_fast)
    idx_slow = idx_fast[::alpha]
    if len(idx_slow) < slow_t:
        idx_slow += [idx_slow[-1]] * (slow_t - len(idx_slow))
    elif len(idx_slow) > slow_t:
        idx_slow = idx_slow[:slow_t]
    return idx_slow, idx_fast

def extract_clip_slowfast(video_path: str,
                          segment: Tuple[int,int],
                          dense_boxes: List[Tuple[int,int,int,int]],
                          sfcfg: SFInferCfg,
                          content_mode: str = "square"  # "square" | "center_crop" | "letterbox"
                          ):
    """
    Build (slow, fast) tensors:
        slow: (3, slow_t, 224, 224)  fast: (3, slow_t*alpha, 224, 224)
    content_mode:
      - "square": expand bbox to square with margin, then RESIZE to sfcfg.side (no center crop).  <-- matches training
      - "center_crop": current behavior (short-side resize to side, then center crop to crop).
      - "letterbox": keep aspect, pad to square side.
    """
    t0, t1 = segment
    L = t1 - t0 + 1

    # EMA bboxes
    smoothed = []
    prev = None
    for b in dense_boxes:
        arr = np.array(b, dtype=np.float32)
        prev = arr if prev is None else sfcfg.bbox_ema*prev + (1-sfcfg.bbox_ema)*arr
        smoothed.append(tuple(prev.astype(int)))

    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened(): raise RuntimeError(f"Cannot open {video_path}")
    W = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)); H = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    cap.set(cv2.CAP_PROP_POS_FRAMES, t0)
    frames = []
    for _ in range(L):
        ok, fr = cap.read()
        if not ok: break
        frames.append(fr)
    cap.release()
    if len(frames) < L and len(frames) > 0:
        frames += [frames[-1]] * (L - len(frames))

    # crops
    crops = []
    for k in range(L):
        x1,y1,x2,y2 = smoothed[k]
        if content_mode == "square":
            sx1, sy1, sx2, sy2 = _expand_to_square(x1, y1, x2, y2, W, H, factor=sfcfg.bbox_margin)
            crop = frames[k][sy1:sy2, sx1:sx2]
            if crop.size == 0: crop = frames[k]
            crop = _square_resize_no_crop(crop, sfcfg.side)        # <-- NO center-crop
        elif content_mode == "letterbox":
            # rectangular margin, then letterbox to square 'side'
            cx, cy = (x1+x2)*0.5, (y1+y2)*0.5
            ww, hh = (x2-x1)*sfcfg.bbox_margin, (y2-y1)*sfcfg.bbox_margin
            rx1 = _clamp(int(round(cx - ww*0.5)), 0, W-1)
            ry1 = _clamp(int(round(cy - hh*0.5)), 0, H-1)
            rx2 = _clamp(int(round(cx + ww*0.5)), 0, W-1)
            ry2 = _clamp(int(round(cy + hh*0.5)), 0, H-1)
            crop = frames[k][ry1:ry2, rx1:rx2] if rx2>rx1 and ry2>ry1 else frames[k]
            # letterbox to sfcfg.side
            h, w = crop.shape[:2]
            if h < w:
                new_h, new_w = sfcfg.side, int(round(w * sfcfg.side / h))
            else:
                new_w, new_h = sfcfg.side, int(round(h * sfcfg.side / w))
            rs = cv2.resize(crop, (new_w, new_h), interpolation=cv2.INTER_AREA)
            canvas = np.zeros((sfcfg.side, sfcfg.side, 3), dtype=rs.dtype)
            top = max(0, (sfcfg.side - new_h)//2); left = max(0, (sfcfg.side - new_w)//2)
            canvas[top:top+new_h, left:left+new_w] = rs[:sfcfg.side, :sfcfg.side]
            crop = canvas
        else:  # "center_crop" (previous behavior)
            x1e,y1e,x2e,y2e = _expand_bbox((x1,y1,x2,y2), sfcfg.bbox_margin, W, H)
            crop = frames[k][y1e:y2e, x1e:x2e]
            if crop.size == 0: crop = frames[k]
            crop = _center_resize_crop(crop, sfcfg.side, sfcfg.crop)

        crops.append(crop)

    # sample temporal indices as before
    idx_slow, idx_fast = sample_indices(L, sfcfg.slow_t, sfcfg.alpha)
    slow_frames = [crops[i] for i in idx_slow]
    fast_frames = [crops[i] for i in idx_fast]

    def _to_tensor(frames_bgr):
        arr = np.stack([cv2.cvtColor(im, cv2.COLOR_BGR2RGB) for im in frames_bgr], axis=0).astype(np.float32)/255.0
        mean = np.array(sfcfg.mean, dtype=np.float32).reshape(1,1,1,3)
        std  = np.array(sfcfg.std,  dtype=np.float32).reshape(1,1,1,3)
        arr = (arr - mean) / (std + 1e-6)
        arr = np.transpose(arr, (3,0,1,2))  # C,T,H,W
        return torch.from_numpy(arr)

    slow_t = _to_tensor(slow_frames)
    fast_t = _to_tensor(fast_frames)
    return slow_t, fast_t

# **Load SlowFast**

In [11]:
def load_slowfast_classifier(cfg, ckpt_path: str, device: Optional[torch.device] = None):
    """
    Loads slowfast_r101 (pytorchvideo), replaces head with len(cfg.labels),
    loads checkpoint, sets to eval(). Matches your training code.
    """
    device = device or _device()
    print(f"🧠 Loading SlowFast model from: {ckpt_path}")
    torch.hub._validate_not_a_forked_repo = lambda a,b,c: True
    model = torch.hub.load('facebookresearch/pytorchvideo', 'slowfast_r101', pretrained=True)
    in_dim = model.blocks[-1].proj.in_features
    in_dim = model.blocks[-1].proj.in_features
    model.blocks[-1].proj = nn.Sequential(
        nn.Dropout(p=0.4), # Add a dropout layer
        nn.Linear(in_dim, len(cfg.labels))
    )
    ckpt = torch.load(ckpt_path, map_location=device)
    model.load_state_dict(ckpt["model"], strict=True)
    model.eval().to(device)
    print(f"   -> Model loaded and set to evaluation mode on device: {device}")
    return model

In [12]:
class SlowFastPredictor:
    """Thin wrapper that accepts lists of (slow,fast) tensors and returns probs."""
    def __init__(self, model, device=None):
        self.model = model.eval()
        self.device = device or _device()

    @torch.no_grad()
    def predict_batch(self, slow_list: List[torch.Tensor], fast_list: List[torch.Tensor]) -> np.ndarray:
        slow = torch.stack(slow_list).to(self.device)  # (B,3,T,H,W)
        fast = torch.stack(fast_list).to(self.device)  # (B,3,T,H,W)
        logits = self.model([slow, fast])              # pytorchvideo expects [slow, fast]
        return torch.softmax(logits, dim=1).cpu().numpy()

In [13]:
class Config:
    def __init__(self):
        self.root_dir = "/content/drive/MyDrive/FIT3163,3164/SlowFast"
        self.clips_dir = os.path.join(self.root_dir, "05_clips/3in1")
        self.splits_dir = os.path.join(self.root_dir, "06_splits/3in1")
        self.models_dir = os.path.join(self.root_dir, "07_models/3in1_train3")
        self.best_model_path = os.path.join(self.root_dir, "07_models", "slowfast_3.pt")

        self.labels = [
            "smash", "jump_smash", "block",
            "drop", "clear", "lift", "drive",
            "straight_net", "cross_net", "serve",
            "push", "tap",
        ]

        # Dataset parameters
        self.side = 224             # ori: 224
        self.slow_t = 8             # 8 frames for slow pathway
        self.alpha = 4              # ratio between fast and slow
        self.fast_t = self.slow_t * self.alpha
        self.fast_target = 224      # ori: 224

        # Training parameters
        self.epochs = 30
        self.batch_size = 8
        self.learning_rate = 0.001
        self.weight_decay = 0.001

        self.early_stopping_patience = 5

# Create a configuration object
cfg = Config()

# **POSE**

In [14]:
class PoseEngineYOLO:
    """
    Lightweight wrapper around Ultralytics YOLO Pose.
    Works on *cropped* single-person images (your stabilized crops).
    Returns COCO joints in pixel coords with confidences.
    """
    def __init__(self, weights="yolov8s-pose.pt", device=None, conf=0.25):
        from ultralytics import YOLO
        self.model = YOLO(weights)
        self.conf = conf
        self.device = str(device) if device is not None else ("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

    @torch.no_grad()
    def infer_keypoints(self, img_bgr: np.ndarray):
        # Ultralytics expects RGB
        img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
        res = self.model.predict(
            img_rgb, conf=self.conf, verbose=False,
            device=self.device, imgsz=max(img_rgb.shape[:2])
        )
        if not res or res[0].keypoints is None or len(res[0].keypoints) == 0:
            return None

        kp = res[0].keypoints  # ultralytics.engine.results.Keypoints
        # xy: (N, K, 2), conf: (N, K) or (N, K, 1), data: (N, K, 3)
        # ---- extract numpy arrays
        if hasattr(kp, "xy") and kp.xy is not None:
            xy = kp.xy
            xy = xy.detach().cpu().numpy() if isinstance(xy, torch.Tensor) else np.asarray(xy)
        else:
            xy = kp.data[..., :2]
            xy = xy.detach().cpu().numpy() if isinstance(xy, torch.Tensor) else np.asarray(xy)

        if hasattr(kp, "conf") and kp.conf is not None:
            confs = kp.conf
            confs = confs.detach().cpu().numpy() if isinstance(confs, torch.Tensor) else np.asarray(confs)
            if confs.ndim == 3 and confs.shape[-1] == 1:
                confs = confs[..., 0]  # (N, K)
        else:
            # fallback from data[...,2]
            if hasattr(kp, "data") and kp.data is not None:
                confs = kp.data[..., 2]
                confs = confs.detach().cpu().numpy() if isinstance(confs, torch.Tensor) else np.asarray(confs)
            else:
                confs = None

        # shapes
        if xy.ndim != 3:
            return None
        N, K, _ = xy.shape
        if N == 0 or K < 13:  # need shoulders/elbows/wrists/hips (COCO ~17)
            return None

        # ---- choose a single person index j
        if confs is not None and confs.shape[:2] == (N, K):
            mean_scores = np.nanmean(confs, axis=1)  # (N,)
            j = int(np.nanargmax(mean_scores))
        else:
            # fallback: choose the detection closest to the crop center
            centers = xy.mean(axis=1)  # (N,2)
            cx, cy = img_rgb.shape[1] / 2.0, img_rgb.shape[0] / 2.0
            d2 = (centers[:, 0] - cx) ** 2 + (centers[:, 1] - cy) ** 2
            j = int(np.argmin(d2))

        xy_sel = xy[j]                 # (K, 2)
        if confs is not None and confs.shape[:2] == (N, K):
            conf_sel = confs[j]        # (K,)
        else:
            conf_sel = np.ones((K,), dtype=np.float32)

        # COCO indices (guard if model has fewer KPs)
        L_SH, R_SH = 5, 6
        L_EL, R_EL = 7, 8
        L_WR, R_WR = 9, 10
        L_HP, R_HP = 11, 12

        def safe_pack(i):
            if i is None or i >= K:
                return None
            x, y = float(xy_sel[i, 0]), float(xy_sel[i, 1])
            c = float(conf_sel[i]) if i < len(conf_sel) else 1.0
            return (x, y, c)

        return {
            "L_sh": safe_pack(L_SH), "R_sh": safe_pack(R_SH),
            "L_el": safe_pack(L_EL), "R_el": safe_pack(R_EL),
            "L_wr": safe_pack(L_WR), "R_wr": safe_pack(R_WR),
            "L_hp": safe_pack(L_HP), "R_hp": safe_pack(R_HP),
        }

In [15]:
def stabilized_crops_for_track_pose(
    video_path: str,
    timeline: List[Tuple[int, Tuple[int,int,int,int]]],
    out_side: int = 288,
    margin: float = 1.25,
    bbox_ema: float = 0.8,
    mode: str = "square",   # "square" | "letterbox"
):
    """
    Returns: t0, t1, crops[list of BGR], smoothed_dense_boxes[list[(x1,y1,x2,y2)]], (W,H)
    - square: expand to square with margin, then resize to out_side (no crop)
    - letterbox: rectangular margin crop -> letterbox to out_side (no crop)
    """
    t0, t1, dense_boxes = interpolate_track(timeline)
    # EMA smooth original boxes
    smoothed = []
    prev = None
    for b in dense_boxes:
        arr = np.array(b, dtype=np.float32)
        prev = arr if prev is None else bbox_ema*prev + (1.0-bbox_ema)*arr
        smoothed.append(tuple(prev.astype(int)))

    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened(): raise RuntimeError(f"Cannot open {video_path}")
    W = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)); H = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    cap.set(cv2.CAP_PROP_POS_FRAMES, t0)

    frames = []
    for _ in range(t1 - t0 + 1):
        ok, fr = cap.read()
        if not ok: break
        frames.append(fr)
    cap.release()
    L = t1 - t0 + 1
    if len(frames) < L and len(frames) > 0:
        frames += [frames[-1]] * (L - len(frames))

    crops = []
    for k in range(L):
        x1,y1,x2,y2 = smoothed[k]
        if mode == "square":
            sx1, sy1, sx2, sy2 = _expand_to_square(x1, y1, x2, y2, W, H, factor=margin)
            crop = frames[k][sy1:sy2, sx1:sx2]
            if crop.size == 0: crop = frames[k]
            crop = cv2.resize(crop, (out_side, out_side), interpolation=cv2.INTER_AREA)
        elif mode == "letterbox":
            # rectangular margin
            cx, cy = (x1+x2)*0.5, (y1+y2)*0.5
            w = (x2-x1) * margin; h = (y2-y1) * margin
            rx1 = _clamp(int(round(cx - w*0.5)), 0, W-1)
            ry1 = _clamp(int(round(cy - h*0.5)), 0, H-1)
            rx2 = _clamp(int(round(cx + w*0.5)), 0, W-1)
            ry2 = _clamp(int(round(cy + h*0.5)), 0, H-1)
            if rx2 <= rx1 or ry2 <= ry1:
                crop = frames[k]
            else:
                crop = frames[k][ry1:ry2, rx1:rx2]
            crop = _short_side_resize_then_letterbox(crop, out_side)
        else:
            raise ValueError("mode must be 'square' or 'letterbox'")
        crops.append(crop)

    return t0, t1, crops, smoothed, (W,H)

In [16]:
def _angle_deg(a, b, c):
    # angle ABC at point B
    a = np.array(a, np.float32); b = np.array(b, np.float32); c = np.array(c, np.float32)
    ba = a - b; bc = c - b
    num = float(np.dot(ba, bc))
    den = float(np.linalg.norm(ba)*np.linalg.norm(bc) + 1e-6)
    cosv = np.clip(num / den, -1.0, 1.0)
    return float(np.degrees(np.arccos(cosv)))

def _ema_series(x, alpha=0.7):
    y, m = [], None
    for v in x:
        m = v if m is None else alpha*m + (1.0-alpha)*v
        y.append(m)
    return np.asarray(y, np.float32)

def _central_diff(x):
    x = np.asarray(x, np.float32)
    if len(x) < 3: return np.zeros_like(x)
    d = np.zeros_like(x)
    d[1:-1] = 0.5*(x[2:] - x[:-2])
    d[0] = x[1] - x[0]
    d[-1] = x[-1] - x[-2]
    return d

def _robust_norm(x):
    x = np.asarray(x, np.float32)
    med = np.median(x); mad = np.median(np.abs(x - med)) + 1e-6
    return (x - med) / (1.4826*mad)

def _lin_interp_nans(x):
    x = np.asarray(x, np.float32)
    n = len(x)
    xs = np.arange(n)
    mask = ~np.isnan(x)
    if mask.sum() < 2:
        return np.nan_to_num(x, nan=0.0)
    x[~mask] = np.interp(xs[~mask], xs[mask], x[mask])
    return x

def _pick(pt):  # convert (x,y,conf)/None to (x,y) with possible NaNs
    if pt is None or pt[2] < 0.15:  # conf threshold
        return (np.nan, np.nan)
    return (pt[0], pt[1])

In [17]:
def pose_features_from_crops(crops: List[np.ndarray], pose_engine: "PoseEngineYOLO"):
    """
    Returns dict with per-frame features for left/right and a chosen dominant arm:
      elbow_vel_{L,R}, shoulder_vel_{L,R}, wrist_speed_{L,R}, dominant ('L'/'R')
    """
    L = len(crops)
    L_sh, R_sh = [], []
    L_el, R_el = [], []
    L_wr, R_wr = [], []
    L_hp, R_hp = [], []

    # 1) run pose
    for img in crops:
        kp = pose_engine.infer_keypoints(img)
        if kp is None:
            L_sh.append((np.nan,np.nan)); R_sh.append((np.nan,np.nan))
            L_el.append((np.nan,np.nan)); R_el.append((np.nan,np.nan))
            L_wr.append((np.nan,np.nan)); R_wr.append((np.nan,np.nan))
            L_hp.append((np.nan,np.nan)); R_hp.append((np.nan,np.nan))
            continue
        L_sh.append(_pick(kp["L_sh"])); R_sh.append(_pick(kp["R_sh"]))
        L_el.append(_pick(kp["L_el"])); R_el.append(_pick(kp["R_el"]))
        L_wr.append(_pick(kp["L_wr"])); R_wr.append(_pick(kp["R_wr"]))
        L_hp.append(_pick(kp["L_hp"])); R_hp.append(_pick(kp["R_hp"]))

    # 2) angles
    def angles_and_wrist(L_sh, L_el, L_wr, L_hp):
        # elbow angle at elbow; shoulder "torso" angle using shoulder-hip vector as torso
        el_ang = []
        sh_ang = []
        wspd   = []
        prev_w = None
        for t in range(L):
            S, E, W, H = L_sh[t], L_el[t], L_wr[t], L_hp[t]
            if any(np.isnan(v).any() for v in [S,E,W]):
                el_ang.append(np.nan); sh_ang.append(np.nan); wspd.append(np.nan)
                prev_w = None
                continue
            # torso reference: shoulder->hip (if hip missing, use vertical surrogate)
            if not np.isnan(H).any():
                torso_dir = (H[0], H[1]-100)
                sh_a = _angle_deg(E, S, torso_dir)  # angle between upper-arm and torso direction
            else:
                torso_dir = (S[0], S[1]-100)
                sh_a = _angle_deg(E, S, torso_dir)

            el_a = _angle_deg(S, E, W)
            # wrist speed (pixels per frame in crop coordinates)
            if prev_w is None or np.isnan(np.array(prev_w)).any():
                v = 0.0
            else:
                v = float(np.linalg.norm(np.array(W) - np.array(prev_w)))
            prev_w = W

            el_ang.append(el_a); sh_ang.append(sh_a); wspd.append(v)
        return np.array(el_ang, np.float32), np.array(sh_ang, np.float32), np.array(wspd, np.float32)

    L_el_ang, L_sh_ang, L_wspd = angles_and_wrist(L_sh, L_el, L_wr, L_hp)
    R_el_ang, R_sh_ang, R_wspd = angles_and_wrist(R_sh, R_el, R_wr, R_hp)

    # 3) smooth + differentiate
    def velocities(el, sh, ws):
        el = _lin_interp_nans(el); sh = _lin_interp_nans(sh); ws = _lin_interp_nans(ws)
        el_s = _ema_series(el, 0.7); sh_s = _ema_series(sh, 0.7); ws_s = _ema_series(ws, 0.7)
        el_vel = np.abs(_central_diff(el_s))
        sh_vel = np.abs(_central_diff(sh_s))
        return el_vel, sh_vel, ws_s

    L_elv, L_shv, L_w = velocities(L_el_ang, L_sh_ang, L_wspd)
    R_elv, R_shv, R_w = velocities(R_el_ang, R_sh_ang, R_wspd)

    # 4) choose dominant arm per track (more stable than per-frame)
    L_strength = np.nanpercentile(L_w, 80)
    R_strength = np.nanpercentile(R_w, 80)
    dominant = 'L' if L_strength >= R_strength else 'R'

    return {
        "L_elv": L_elv, "L_shv": L_shv, "L_w": L_w,
        "R_elv": R_elv, "R_shv": R_shv, "R_w": R_w,
        "dominant": dominant
    }

In [18]:
def backhand_score_from_crops(
    crops: List[np.ndarray],
    pose_engine: "PoseEngineYOLO",
    ma_len: int = 5,
    cross_pos_thresh: float = 0.6,   # how far across the midline counts as "on opposite side" (in shoulder-widths)
):
    """
    Compute a backhand-specific score per frame using pose only.
    Signals:
      - wrist on/over opposite side of torso (position)
      - wrist moving toward opposite side (signed lateral velocity)
      - torso yaw rate (small contribution)
      - small motion gate via wrist speed
    All signals are made scale-invariant via shoulder-width normalization.
    Returns: z_bh: (L,) float32
    """
    L = len(crops)
    if L == 0:
        return np.zeros((0,), np.float32)

    # ---- 1) Run pose once for this crop sequence
    L_sh = []; R_sh = []; L_wr = []; R_wr = []
    for img in crops:
        kp = pose_engine.infer_keypoints(img)
        if kp is None:
            L_sh.append((np.nan,np.nan)); R_sh.append((np.nan,np.nan))
            L_wr.append((np.nan,np.nan)); R_wr.append((np.nan,np.nan))
        else:
            L_sh.append(_pick(kp["L_sh"])); R_sh.append(_pick(kp["R_sh"]))
            L_wr.append(_pick(kp["L_wr"])); R_wr.append(_pick(kp["R_wr"]))

    # ---- 2) Interpolate missing points (per-dim) for stability
    def _interp2(arr2):
        arr2 = np.asarray(arr2, np.float32)
        out = np.zeros_like(arr2)
        for d in range(2):
            out[:, d] = _lin_interp_nans(arr2[:, d])
        return out

    L_sh = _interp2(L_sh); R_sh = _interp2(R_sh)
    L_wr = _interp2(L_wr); R_wr = _interp2(R_wr)

    # ---- 3) Torso frame: shoulder midpoint, unit shoulder axis ex, shoulder width sw
    M = 0.5 * (L_sh + R_sh)                        # (L,2)
    vLR = R_sh - L_sh                              # (L,2)
    sw = np.linalg.norm(vLR, axis=1) + 1e-6        # (L,)
    ex = vLR / sw[:, None]                         # (L,2) unit vector L->R

    # torso yaw (for a tiny boost)
    yaw = np.arctan2(ex[:,1], ex[:,0])
    yaw = np.unwrap(yaw)
    yaw = _ema_series(yaw, 0.7)
    yaw_rate = np.abs(_central_diff(yaw))
    yaw_rate = _robust_norm(yaw_rate)

    # ---- 4) Per-arm signed wrist position (in shoulder-widths) and velocities
    # lp = dot((wrist - mid), ex) / sw  (signed: + right side, - left side)
    def signed_lat_pos_vel(W):
        rel = (W - M)                                # (L,2)
        lp  = (rel[:,0]*ex[:,0] + rel[:,1]*ex[:,1]) / sw   # (L,)
        lp  = _ema_series(_lin_interp_nans(lp), 0.6)
        # central diff of lp (units: shoulder-widths / frame)
        vlat = _central_diff(lp)
        return lp, vlat

    L_lp, L_vlat = signed_lat_pos_vel(L_wr)
    R_lp, R_vlat = signed_lat_pos_vel(R_wr)

    # wrist speed gate (scale-invariant)
    def wrist_speed(W):
        W_s = _ema_series(W, 0.6)
        dW  = np.zeros_like(W_s)
        dW[1:-1] = 0.5 * (W_s[2:] - W_s[:-2])
        dW[0]  = W_s[1] - W_s[0]
        dW[-1] = W_s[-1] - W_s[-2]
        wsp = np.linalg.norm(dW, axis=1) / sw
        return _robust_norm(wsp)

    L_wspd = wrist_speed(L_wr)
    R_wspd = wrist_speed(R_wr)

    # ---- 5) Backhand signals per arm
    # Expected own-side sign: right arm expects + (right), left arm expects - (left)
    # Backhand = wrist on / moving toward the opposite side.
    def arm_backhand_score(lp, vlat, wspd, expect_sign):
        # position term: how far across the midline (0..1, ~1 when ≥ cross_pos_thresh)
        # lp is in shoulder-widths; opposite side => -expect_sign * lp > 0
        pos_raw = np.maximum(0.0, -expect_sign * lp)                      # 0..inf (in widths)
        pos_term = np.clip(pos_raw / max(1e-6, cross_pos_thresh), 0.0, 1.0)

        # motion term: moving toward the opposite side (robust z of vlat, positive part)
        vlat_z = _robust_norm(vlat)
        move_term = np.maximum(0.0, -expect_sign * vlat_z)                # >= 0 only when moving cross-body

        # small wrist-speed gate: encourage real motion (but don't kill static net blocks)
        wsp_gate = np.clip(0.5 + 0.5 * _robust_norm(wspd), 0.0, 1.0)

        # combine (keep simple; pos dominates, then motion; yaw added later globally)
        s = 0.65 * pos_term + 0.35 * move_term
        return s * wsp_gate

    sL = arm_backhand_score(L_lp, L_vlat, L_wspd, expect_sign=-1.0)  # left arm expects - (left side)
    sR = arm_backhand_score(R_lp, R_vlat, R_wspd, expect_sign=+1.0)  # right arm expects + (right side)

    # Fuse arms by max (no dominant-hand assumption)
    s = np.maximum(sL, sR)

    # Tiny torso-yaw boost (clipped) to help defense backhands with small wrist travel
    s = s + 0.12 * np.clip(yaw_rate, 0.0, 2.5)

    # Smooth
    z_bh = moving_average(s, ma_len).astype(np.float32)
    return z_bh

In [19]:
def pose_actionness_score(feat, ma_len=5):
    # Choose arm series based on track-dominant arm
    arm = feat["dominant"]
    elv = feat[f"{arm}_elv"]; shv = feat[f"{arm}_shv"]; w = feat[f"{arm}_w"]
    # robust normalization per track
    eN = _robust_norm(elv); sN = _robust_norm(shv); wN = _robust_norm(w)
    score = 0.5*eN + 0.3*sN + 0.2*wN
    return moving_average(score, ma_len)

In [20]:
def compute_pose_actionness_segments_old_working(
    video_path, timeline, fps, cfg, pose_engine, sfcfg=None, *,
    pose_side: int = 320,       # give pose more pixels
    pose_mode: str = "square",  # "square" (match training) or "letterbox" (no distortion)
    padding_frames: int = 10
):
    """
    Returns:
      z_sm : smoothed pose-actionness array aligned to [t0..t1]
      segs : list of (t0_abs, t1_abs)
    """
    sfcfg = sfcfg or SFInferCfg()
    # 1) stabilized crops for the whole track
    t0, t1, crops, smoothed_boxes, (W,H) = stabilized_crops_for_track_pose(
        video_path, timeline,
        out_side=pose_side,
        margin=cfg.bbox_margin,
        bbox_ema=sfcfg.bbox_ema,
        mode=pose_mode
    )
    L = t1 - t0 + 1
    if L <= 0: return np.zeros((0,), np.float32), []

    # 2) pose features + score
    feat = pose_features_from_crops(crops, pose_engine)
    z_sm = pose_actionness_score(feat, ma_len=cfg.ma_len)

    # for backhand:
    # z_bh = backhand_score_from_crops(crops, pose_engine, ma_len=cfg.ma_len)
    # z_sm = np.maximum(z_fh, z_bh)

    # 3) hysteresis
    # coarse = hysteresis_segments(z_fh, high_k=1.0, low_k=0.3)  # slightly lower for cleaner signal
    coarse = hysteresis_segments(z_sm, high_k=1.0, low_k=0.3)  # this line for backhand
    print(f"{len(coarse)} hysteresis segments found.")

    # 4) clamp durations + dead-time
    min_len = max(1, int(round(cfg.min_dur_s * fps)))
    max_len = max(min_len+1, int(round(cfg.max_dur_s * fps)))
    dead = max(1, int(round(0.08 * fps)))  # ~80ms refractory to avoid splitting

    refined = []
    last_end = -10**9
    for a,b in coarse:
        if (b-a+1) < min_len: continue
        # center around local peak but respect max_len
        sub = z_sm[a:b+1]
        peak = int(np.argmax(sub))
        half = max(min((b-a+1)//2, max_len//2), min_len//2)
        c0 = a + max(0, peak - half)
        c1 = a + min(len(sub)-1, peak + half)
        if c0 - last_end < dead:
            # merge with previous if too close
            if refined:
                refined[-1] = (refined[-1][0], max(refined[-1][1], c1))
            else:
                refined.append((c0,c1))
        else:
            refined.append((c0,c1))
        last_end = refined[-1][1]

    # 5) convert to absolute with padding, clip to video bounds, and re-merge
    #    (padding is in frames; use keyword when calling)
    cap = cv2.VideoCapture(video_path)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT) or 0)
    cap.release()
    lo, hi = 0, max(0, total_frames - 1)

    padded = []
    for s0, s1 in refined:
        a = max(lo, t0 + s0 - padding_frames)
        b = min(hi, t0 + s1 + padding_frames)
        if b >= a:
            padded.append((a, b))

    padded.sort()
    merged = []
    for a, b in padded:
        if not merged or a > merged[-1][1] + 1:
            merged.append([a, b])
        else:
            merged[-1][1] = max(merged[-1][1], b)

    segs_abs = [(int(a), int(b)) for a, b in merged]
    return z_sm, segs_abs

In [21]:
def compute_pose_actionness_segments(
    video_path, timeline, fps, cfg, pose_engine, sfcfg=None, *,
    pose_side: int = 320,       # give pose more pixels
    pose_mode: str = "square",  # "square" (match training) or "letterbox" (no distortion)
    padding_frames: int = 10
):
    """
    Returns:
      z_sm : your ORIGINAL forehand score (for debug/plots)
      segs : list of (t0_abs, t1_abs) = UNION(FH segments, BH segments)
    """
    sfcfg = sfcfg or SFInferCfg()
    # 1) stabilized crops for the whole track (no-loss pose crops)
    t0, t1, crops, smoothed_boxes, (W,H) = stabilized_crops_for_track_pose(
        video_path, timeline,
        out_side=pose_side,
        margin=cfg.bbox_margin,
        bbox_ema=sfcfg.bbox_ema,
        mode=pose_mode
    )
    L = t1 - t0 + 1
    if L <= 0: return np.zeros((0,), np.float32), []

    # 2) scores (FH = your original; BH = separate backhand score)
    feat = pose_features_from_crops(crops, pose_engine)                 # your existing code
    z_fh = pose_actionness_score(feat, ma_len=cfg.ma_len)               # forehand score (unchanged)
    z_bh = backhand_score_from_crops(crops, pose_engine, ma_len=cfg.ma_len)  # backhand score

    # 3) hysteresis per score (separate; scale differences don't matter)
    coarse_fh = hysteresis_segments(z_fh, high_k=1.0, low_k=0.3)
    coarse_bh = hysteresis_segments(z_bh, high_k=1.0, low_k=0.3)
    print(f"{len(coarse_fh)} FH and {len(coarse_bh)} BH hysteresis segments found.")

    # 4) same refinement as before, done independently, then union
    min_len = max(1, int(round(cfg.min_dur_s * fps)))
    max_len = max(min_len+1, int(round(cfg.max_dur_s * fps)))
    dead    = max(1, int(round(0.08 * fps)))  # ~80ms

    def _refine(coarse, z):
        refined, last_end = [], -10**9
        for a,b in coarse:
            if (b-a+1) < min_len:
                continue
            sub  = z[a:b+1]
            peak = int(np.argmax(sub))
            half = max(min((b-a+1)//2, max_len//2), min_len//2)
            c0 = a + max(0, peak - half)
            c1 = a + min(len(sub)-1, peak + half)
            if refined and (c0 - refined[-1][1]) < dead:
                refined[-1] = (refined[-1][0], max(refined[-1][1], c1))
            else:
                refined.append((c0,c1))
            last_end = refined[-1][1]
        return refined

    refined_fh = _refine(coarse_fh, z_fh)
    refined_bh = _refine(coarse_bh, z_bh)

    # 5) convert to absolute with padding, clip to video bounds, and re-merge (UNION)
    cap = cv2.VideoCapture(video_path)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT) or 0)
    cap.release()
    lo, hi = 0, max(0, total_frames - 1)

    def _pad_map(segs_rel):
        out = []
        for s0, s1 in segs_rel:
            a = max(lo, t0 + s0 - padding_frames)
            b = min(hi, t0 + s1 + padding_frames)
            if b >= a: out.append((a,b))
        return out

    padded = sorted(_pad_map(refined_fh) + _pad_map(refined_bh))  # union here
    merged = []
    for a, b in padded:
        if not merged or a > merged[-1][1] + 1:
            merged.append([a, b])
        else:
            merged[-1][1] = max(merged[-1][1], b)

    max_total_len = max_len + 2*padding_frames
    final = []
    for a, b in merged:
        Lseg = (b - a + 1)
        if Lseg <= max_total_len and Lseg >= min_len:
            print(f"adding segment from {a} to {b}")
            final.append((a, b))
        elif Lseg >= min_len:
            # split into chunks of at most max_total_len
            s = a
            while s <= b:
                e = min(s + max_total_len - 1, b)
                print(f"splitting, adding segment from {s} to {e}")
                if (e - s + 1) >= min_len:
                    final.append((s, e))
                s = e + 1

    segs_abs = [(int(a), int(b)) for a, b in final]

    # return your original FH score (unchanged) + unioned segments
    return z_fh.astype(np.float32), segs_abs

# **Main function**

In [22]:
def run_full_video_to_events(
    video_path: str,
    csv_path: str,
    cfg,                                 # your Config
    ckpt_path: str,
    action_cfg: Optional[ActionnessCfg] = None,
    sf_cfg: Optional[SFInferCfg] = None,
    pose_engine: Optional[PoseEngineYOLO] = None,   # <--- NEW
    use_pose_actionness: bool = True                # <--- toggle
) -> Dict:
    print("=========================================")
    print("🚀 Starting Action Event Inference Pipeline")
    print(f"Video: {os.path.basename(video_path)}")
    print(f"Tracking: {os.path.basename(csv_path)}")
    print("=========================================")

    action_cfg = action_cfg or ActionnessCfg()
    sf_cfg = sf_cfg or SFInferCfg(slow_t=cfg.slow_t, alpha=cfg.alpha, side=cfg.side, crop=cfg.side)

    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened(): raise RuntimeError(f"Cannot open: {video_path}")
    fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    print(f"🎥 Video Meta: FPS={fps:.2f}, Total Frames={total_frames}")
    cap.release()

    tracks = load_tracks_csv(csv_path)
    model = load_slowfast_classifier(cfg, ckpt_path, _device())
    predictor = SlowFastPredictor(model, _device())
    total_tracks = len(tracks)
    print(f"🎯 Inference will run on {total_tracks} tracks.")
    print("-" * 40)

    events, debug = [], {}
    track_ids = list(tracks.keys())

    # CHANGE HERE
    padding_seconds = 0.3
    padding_frames = int(round(padding_seconds * fps))

    for i, tid in enumerate(tqdm(track_ids, desc="Total Track Progress")):
        timeline = tracks[tid]
        print(f"\n   [TRACK {i+1}/{total_tracks}] ID: {tid} ({len(timeline)} frames over {timeline[-1][0]-timeline[0][0]+1} total frames)")

        # 3a. Compute Actionness & Segments
        if use_pose_actionness and pose_engine is not None:
            z_sm, segs = compute_pose_actionness_segments(
                video_path, timeline, fps, action_cfg, pose_engine,
                sfcfg=sf_cfg,
                pose_side=320,
                pose_mode="square",      # or "letterbox"
                padding_frames=padding_frames
            )
            debug[int(tid)] = {"t0": int(timeline[0][0]), "z_pose": list(map(float, z_sm)), "segments": [(int(a),int(b)) for a,b in segs]}
        else:
            z_sm, segs = compute_actionness_segments(video_path, timeline, fps, action_cfg)
            debug[int(tid)] = {"t0": int(timeline[0][0]), "z_flow": list(map(float, z_sm)), "segments": [(int(a),int(b)) for a,b in segs]}

        print(f"      -> Found {len(segs)} potential action segments.")
        if not segs:
            continue

        # 3b. Prepare Clips for SlowFast
        t0, t1, dense_boxes = interpolate_track(timeline)
        slow_batch, fast_batch, metas = [], [], []
        for (a,b) in tqdm(segs, desc="      -> Preparing Segments"):
            if a < t0 or b > t1: continue
            db = dense_boxes[(a - t0):(b - t0 + 1)]
            slow_t, fast_t = extract_clip_slowfast(
                video_path, (a,b), db, sf_cfg,
                content_mode='square'
            )
            slow_batch.append(slow_t); fast_batch.append(fast_t)
            metas.append((a,b))
        if not slow_batch:
            continue

        # 3c. Batch Predict
        print(f"      -> Predicting {len(slow_batch)} segments in a batch...")
        probs = predictor.predict_batch(slow_batch, fast_batch)  # (B, K)

        # 3d. Collect Events
        for (a,b), p in zip(metas, probs):
            k = int(np.argmax(p))
            events.append({
                "track_id": int(tid),
                "t0": int(a),
                "t1": int(b),
                "label": cfg.labels[k],
                "p": float(p[k])
            })
        print(f"      -> Finished Track {tid}. Found {len(probs)} events.")

    events.sort(key=lambda e: (e["t0"], e["track_id"]))
    print("\n=========================================")
    print(f"✅ Pipeline Complete. Total Events Found: {len(events)}")
    print("=========================================")
    # return {"events": events, "debug": debug}
    return {"events": events}

In [35]:
INPUT_VIDEO_PATH = "/content/drive/MyDrive/FIT3163,3164/PRESENTATION/cheras_rally_2.mp4"
# CSV_PATH = "/content/cheras_rally_2_rectified.csv"
CSV_PATH = "/content/cheras_rally_2_strongsort_rectified.csv"
SLOWFAST_PATH = "/content/drive/MyDrive/FIT3163,3164/SlowFast/07_models/slowfast_cheras_5b.pt"

device = _device()
sf_model_path = cfg.best_model_path  # from your training

sf_cfg = SFInferCfg(
    slow_t=cfg.slow_t, alpha=cfg.alpha,
    side=cfg.side,           # 224
    crop=cfg.side,           # 224 (unused in "square" mode)
    mean=(0.485,0.456,0.406),  # ImageNet, to match your dataset
    std=(0.229,0.224,0.225),
    bbox_margin=1.4,        # match your training expand_factor=1.25
    bbox_ema=0.8             # match your EMA
)

actionness_cfg=ActionnessCfg(
    min_dur_s=0.3,
    max_dur_s=0.8,
    bbox_margin=1.4,
    ma_len=5
)

pose_engine = PoseEngineYOLO(weights="yolo11s-pose.pt")

In [37]:
out = run_full_video_to_events(
    INPUT_VIDEO_PATH, CSV_PATH, cfg,
    ckpt_path=SLOWFAST_PATH,
    action_cfg=actionness_cfg,
    sf_cfg=sf_cfg,
    pose_engine=pose_engine, use_pose_actionness=True
)

# with open(os.path.join(cfg.models_dir, "events.json"), "w") as f:
with open("cheras_rally_2_events.json", "w") as f:
    json.dump(out, f, indent=2)
print(f"Predicted {len(out['events'])} action events")

🚀 Starting Action Event Inference Pipeline
Video: cheras_rally_2.mp4
Tracking: cheras_rally_2_strongsort_rectified.csv
🎥 Video Meta: FPS=28.99, Total Frames=572
🎬 Loading track data from: /content/cheras_rally_2_strongsort_rectified.csv


   -> Reading CSV lines:   0%|          | 0/1110 [00:00<?, ?it/s]

   -> Found 2 unique tracks. Sorting by frame...
🧠 Loading SlowFast model from: /content/drive/MyDrive/FIT3163,3164/SlowFast/07_models/slowfast_cheras_5b.pt


Using cache found in /root/.cache/torch/hub/facebookresearch_pytorchvideo_main


   -> Model loaded and set to evaluation mode on device: cuda
🎯 Inference will run on 2 tracks.
----------------------------------------


Total Track Progress:   0%|          | 0/2 [00:00<?, ?it/s]


   [TRACK 1/2] ID: 1 (570 frames over 570 total frames)
8 FH and 1 BH hysteresis segments found.
adding segment from 173 to 208
adding segment from 231 to 267
adding segment from 306 to 339
adding segment from 371 to 394
splitting, adding segment from 425 to 465
splitting, adding segment from 466 to 492
adding segment from 506 to 541
      -> Found 7 potential action segments.


      -> Preparing Segments:   0%|          | 0/7 [00:00<?, ?it/s]

      -> Predicting 7 segments in a batch...
      -> Finished Track 1. Found 7 events.

   [TRACK 2/2] ID: 2 (540 frames over 570 total frames)
13 FH and 5 BH hysteresis segments found.
adding segment from 0 to 38
adding segment from 191 to 230
adding segment from 274 to 306
adding segment from 334 to 365
adding segment from 398 to 427
adding segment from 445 to 473
splitting, adding segment from 478 to 518
splitting, adding segment from 519 to 543
      -> Found 8 potential action segments.


      -> Preparing Segments:   0%|          | 0/8 [00:00<?, ?it/s]

      -> Predicting 7 segments in a batch...
      -> Finished Track 2. Found 7 events.

✅ Pipeline Complete. Total Events Found: 14
Predicted 14 action events


# **Visualise pose**

In [25]:
def _pt(kp_dict, name, thr=0.15):
    v = kp_dict.get(name)
    if v is None or v[2] < thr:
        return None
    return (int(round(v[0])), int(round(v[1])))

def draw_pose_skeleton(img_bgr, kp_dict, conf_thr=0.15):
    """
    Draws a minimal skeleton: shoulders–elbows–wrists (+ hips & shoulder line).
    Returns a copy with drawings.
    """
    out = img_bgr.copy()
    names = ["L_sh","R_sh","L_el","R_el","L_wr","R_wr","L_hp","R_hp"]
    P = {n: _pt(kp_dict, n, conf_thr) for n in names}

    # Bones to draw (pairs of joint names)
    bones = [
        ("L_sh","L_el"), ("L_el","L_wr"),
        ("R_sh","R_el"), ("R_el","R_wr"),
        ("L_sh","R_sh"),
        ("L_hp","R_hp"),
        ("L_sh","L_hp"), ("R_sh","R_hp"),
    ]

    # Lines
    for a,b in bones:
        pa, pb = P.get(a), P.get(b)
        if pa is not None and pb is not None:
            cv2.line(out, pa, pb, (0, 255, 255), 2, cv2.LINE_AA)

    # Joints
    for n, p in P.items():
        if p is not None:
            cv2.circle(out, p, 3, (0, 128, 255), -1, cv2.LINE_AA)

    return out

In [26]:
def visualize_pose_on_track(
    video_path: str,
    timeline,                      # [(frame_idx, (x1,y1,x2,y2)), ...]
    pose_engine,                   # PoseEngineYOLO(...)
    out_path="pose_track_preview.mp4",
    *,
    # >>> reflect latest changes for pose crops <<<
    pose_side: int = 320,          # give pose more pixels than 224
    pose_mode: str = "square",     # "square" (expand_to_square+resize) or "letterbox"
    margin: float = 1.25,          # same as your training expand_factor
    bbox_ema: float = 0.8,         # same smoothing you use elsewhere
    show_score: bool = True,       # optional pose-actionness overlay
    conf_thr: float = 0.20
):
    """
    Visualizes YOLO-Pose on the SAME crops used for pose-actionness
    (square-expand OR letterbox; no center-crop content loss).
    """
    # --- build pose-friendly stabilized crops (reflects latest changes) ---
    t0, t1, crops, _, _ = stabilized_crops_for_track_pose(
        video_path, timeline,
        out_side=pose_side, margin=margin, bbox_ema=bbox_ema, mode=pose_mode
    )
    if len(crops) == 0:
        raise RuntimeError("No frames decoded for this track.")

    # FPS for writing
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
    cap.release()

    H, W = crops[0].shape[:2]
    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    writer = cv2.VideoWriter(out_path, fourcc, fps, (W, H))

    # Optional: compute pose-actionness score on THESE crops
    score = None
    if show_score:
        feat = pose_features_from_crops(crops, pose_engine)   # reuses same crops
        s = pose_actionness_score(feat, ma_len=5)
        s = (s - s.min()) / (s.max() - s.min() + 1e-6)
        score = s.astype(np.float32)

    # Run pose and draw
    for i, crop_img in enumerate(crops):
        kp = pose_engine.infer_keypoints(crop_img)
        canvas = crop_img if kp is None else draw_pose_skeleton(crop_img, kp, conf_thr=conf_thr)

        # HUD
        rel_t = i / fps
        cv2.putText(canvas, f"t={rel_t:5.2f}s  (f={t0+i})", (8, 20),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.55, (0, 0, 0), 3, cv2.LINE_AA)
        cv2.putText(canvas, f"t={rel_t:5.2f}s  (f={t0+i})", (8, 20),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.55, (255, 255, 255), 1, cv2.LINE_AA)

        if show_score and score is not None:
            bar_w = int(score[i] * (W - 20))
            y0 = H - 14
            cv2.rectangle(canvas, (10, y0), (10 + bar_w, y0 + 6), (0, 255, 0), -1)
            cv2.rectangle(canvas, (10, y0), (W - 10, y0 + 6), (255, 255, 255), 1)

        writer.write(canvas)

    writer.release()
    print(f"✅ wrote: {out_path}")


In [27]:
tracks = load_tracks_csv(CSV_PATH)
tid = list(tracks.keys())
print(tid)

🎬 Loading track data from: /content/cheras_rally_2_rectified.csv


   -> Reading CSV lines:   0%|          | 0/1111 [00:00<?, ?it/s]

   -> Found 2 unique tracks. Sorting by frame...
[1, 2]


In [28]:
# 1) Build pose engine
pose_engine = PoseEngineYOLO(weights="yolo11s-pose.pt", conf=0.30)

# 2) Load tracks
tracks = load_tracks_csv(CSV_PATH)

# 3) Visualize
tids = list(tracks.keys())

for tid in tids:
    output_path = f"pose_track_{tid}.mp4"

    visualize_pose_on_track(
        INPUT_VIDEO_PATH, tracks[tid], pose_engine,
        out_path=output_path,
        pose_side=320,          # bigger than 224 for better keypoints
        pose_mode="square",     # or "letterbox" if you prefer aspect-true padding
        margin=1.4,            # match training expand_factor
        bbox_ema=0.8,           # match smoothing
        show_score=True
    )

🎬 Loading track data from: /content/cheras_rally_2_rectified.csv


   -> Reading CSV lines:   0%|          | 0/1111 [00:00<?, ?it/s]

   -> Found 2 unique tracks. Sorting by frame...
✅ wrote: pose_track_1.mp4
✅ wrote: pose_track_2.mp4


In [29]:
!ffmpeg -i 'pose_track_1.mp4' -vcodec libx264 -pix_fmt yuv420p -y -loglevel error 'disp.mp4'
Video('disp.mp4', embed=True, width=640, height=480)

# **Overlay on video**

In [38]:
def load_events(events_json_path: str):
    with open(events_json_path, "r") as f:
        data = json.load(f)
    # be flexible (either {"events":[...]} or a plain list)
    events = data["events"] if isinstance(data, dict) and "events" in data else data
    return events

def build_event_map_by_frame(events):
    """
    Returns:
      event_by_frame: {frame_idx: {track_id: best_event_dict}}
      best_event = highest confidence if multiple overlap same frame & track.
    """
    event_by_frame = defaultdict(dict)
    for e in events:
        tid, t0, t1 = int(e["track_id"]), int(e["t0"]), int(e["t1"])
        lab, p = e["label"], float(e.get("p", 1.0))
        for t in range(t0, t1+1):
            cur = event_by_frame[t].get(tid)
            if (cur is None) or (p > float(cur.get("p", 0.0))):
                event_by_frame[t][tid] = {"track_id": tid, "t0": t0, "t1": t1, "label": lab, "p": p}
    return event_by_frame

def load_tracks_csv_overlay(csv_path: str):
    """
    Returns:
      boxes_by_frame: {frame_idx: [(tid, (x1,y1,x2,y2)) ...]}
    """
    boxes_by_frame = defaultdict(list)
    with open(csv_path, "r") as f:
        r = csv.DictReader(f)
        for row in r:
            fi = int(row["frame"])
            tid = int(row["id"])
            x1,y1,x2,y2 = int(row["x1"]), int(row["y1"]), int(row["x2"]), int(row["y2"])
            boxes_by_frame[fi].append((tid, (x1,y1,x2,y2)))
    return boxes_by_frame

In [39]:
def render_full_video_overlay(
    video_path: str,
    tracks_csv: str,
    events_json: str,
    out_path: str,
    show_ids: bool = True,
    label_bg_alpha: float = 0.4
):
    """
    Draws YOLO+StrongSORT boxes and overlays action labels on frames
    that fall inside classified segments for each track.

    - Progress bar drawn above the box shows position inside segment.
    - If track has no active segment on a frame, we draw just the box (and ID).
    """
    boxes_by_frame  = load_tracks_csv_overlay(tracks_csv)
    events          = load_events(events_json)
    event_by_frame  = build_event_map_by_frame(events)

    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        raise RuntimeError(f"Cannot open {video_path}")
    fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
    W   = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    H   = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    N   = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) or None

    out = cv2.VideoWriter(out_path, cv2.VideoWriter_fourcc(*"mp4v"), fps, (W, H))
    i = 0
    while True:
        ok, frame = cap.read()
        if not ok: break

        # draw tracks for this frame
        items = boxes_by_frame.get(i, [])
        for tid, (x1,y1,x2,y2) in items:
            color = _color_for_id(tid)
            cv2.rectangle(frame, (x1,y1), (x2,y2), color, 2)

            # label overlay if we have an event active now
            ev = event_by_frame.get(i, {}).get(tid)
            if ev is not None:
                label = ev["label"]; p = ev["p"]; t0 = ev["t0"]; t1 = ev["t1"]
                # progress 0..1
                prog = (i - t0) / max(1, (t1 - t0 + 1))
                # translucent box for text
                text = f"{label}  {p:.2f}"
                (tw, th), bl = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, 0.6, 2)
                # place above box if room, else inside
                tx, ty = x1, y1 - 8
                if ty - th - 6 < 0:
                    ty = y1 + th + 12
                # background rect
                bx1, by1 = tx - 2, ty - th - 6
                bx2, by2 = tx + tw + 6, ty + 4
                # alpha blend bg
                bg = frame.copy()
                cv2.rectangle(bg, (bx1,by1), (bx2,by2), color, -1)
                frame = cv2.addWeighted(bg, label_bg_alpha, frame, 1 - label_bg_alpha, 0)
                # text
                cv2.putText(frame, text, (tx, ty), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255,255,255), 2, cv2.LINE_AA)

                # progress bar on top edge of bbox
                bar_h = 6
                px1, py1 = x1, max(0, y1 - bar_h - 2)
                px2, py2 = x2, max(0, y1 - 2)
                # bar background (light)
                cv2.rectangle(frame, (px1,py1), (px2,py2), (200,200,200), -1)
                # bar fill with track color
                fill_w = int((px2 - px1) * _clamp(prog, 0.0, 1.0))
                cv2.rectangle(frame, (px1,py1), (px1 + fill_w, py2), color, -1)

            elif show_ids:
                # only ID label
                text = f"ID {tid}"
                (tw, th), bl = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, 0.6, 2)
                tx, ty = x1, y1 - 8
                if ty - th - 6 < 0: ty = y1 + th + 12
                cv2.putText(frame, text, (tx, ty), cv2.FONT_HERSHEY_SIMPLEX, 0.6, color, 2)

        out.write(frame)
        i += 1

    cap.release(); out.release()
    print(f"Overlay saved → {out_path}")

In [40]:
# EVENTS_JSON = os.path.join(cfg.models_dir, "events.json")
EVENTS_JSON = 'cheras_rally_2_events.json'

# render_full_video_overlay(
#     video_path=INPUT_VIDEO_PATH,
#     tracks_csv=CSV_PATH,
#     events_json=EVENTS_JSON,
#     out_path="/content/out_overlay.mp4",
# )

render_full_video_overlay(
    video_path=INPUT_VIDEO_PATH,
    tracks_csv="/content/cheras_rally_2_strongsort_rectified.csv",
    events_json=EVENTS_JSON,
    out_path="/content/cheras_rally_2_overlay.mp4",
)

Overlay saved → /content/cheras_rally_2_overlay.mp4


In [32]:
!cp /content/chronos_rally_2_overlay.mp4 '/content/drive/MyDrive/FIT3163,3164/YOLO Phua'

In [34]:
!ffmpeg -i '/content/chronos_rally_2_overlay.mp4' -vcodec libx264 -pix_fmt yuv420p -y -loglevel error 'disp.mp4'
Video('disp.mp4', embed=True, width=720, height=480)