## **Build label map & splits**

In [None]:
!pip -q install torchmetrics decord fvcore pytorchvideo

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.2/50.2 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.7/132.7 kB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m983.2/983.2 kB[0m [31m61.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.6/13.6 MB[0m [31m103.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m62.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for fvcore (setup.py) ... [?25l[?25hdone
  Building wheel for pytorchvideo (s

In [None]:
import json, random, csv, glob, os
import torch, torch.nn as nn, torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchmetrics.classification import MulticlassAccuracy, MulticlassF1Score, MulticlassConfusionMatrix
import torch.nn.functional as F
from torchvision.transforms import v2
from decord import VideoReader, cpu
import torchvision
import numpy as np
from typing import Dict, Tuple, Optional, List
import pandas as pd

In [None]:
# Colab-only: mount Google Drive at /content/drive. Every path in Config
# below is rooted under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Module-level compute device shared by the training cells: GPU when available, else CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## **1. CONFIGURATION**
### This class centralizes all hyperparameters and file paths.

In [None]:
class Config:
    """Single home for the paths and hyperparameters used throughout the notebook."""

    def __init__(self):
        # ---- Locations (all rooted on the mounted Drive) ----
        root = "/content/drive/MyDrive/FIT3163,3164/SlowFast"
        self.root_dir = root
        self.clips_dir = os.path.join(root, "05_clips/3in1")
        self.splits_dir = os.path.join(root, "06_splits/3in1")
        self.models_dir = os.path.join(root, "07_models/3in1_train3")
        self.best_model_path = os.path.join(self.models_dir, "best.pt")

        # ---- Class names; list position is the integer class id ----
        attacking = ["smash", "jump_smash", "block"]
        rear_court = ["drop", "clear", "lift", "drive"]
        front_court = ["straight_net", "cross_net", "serve"]
        others = ["push", "tap", "average_joe"]
        self.labels = attacking + rear_court + front_court + others

        # ---- Clip geometry ----
        self.side = 224                             # spatial size fed to the model
        self.slow_t, self.alpha = 8, 4              # slow-pathway frames, fast/slow ratio
        self.fast_t = self.slow_t * self.alpha      # fast-pathway frames
        self.fast_target = 224

        # ---- Optimisation ----
        self.epochs = 30
        self.batch_size = 8
        self.learning_rate = 0.001
        self.weight_decay = 0.001

        self.early_stopping_patience = 5

# Create a configuration object
cfg = Config()

## **2. DATA PREPARATION**
### This function handles all logic for splitting and saving the dataset.

In [None]:
def prepare_data_splits(config: "Config"):
    """
    Build the label map and an 80/10/10 train/val/test split of the clips.

    Clips are discovered as ``<clips_dir>/<label>/*.mp4`` for every label in
    ``config.labels``. The label map is written to
    ``<splits_dir>/labels_map.json`` and each split to a header-less CSV
    (``train.csv``, ``val.csv``, ``test.csv``) with rows ``path,label_index``.

    The split is reproducible: discovered paths are sorted before shuffling
    (``glob`` returns files in arbitrary, filesystem-dependent order) and the
    shuffle uses a private RNG seeded with 1337, so the global ``random``
    state is left untouched.

    Args:
        config: object exposing ``splits_dir``, ``models_dir``, ``clips_dir``
            and ``labels``.
    """
    os.makedirs(config.splits_dir, exist_ok=True)
    os.makedirs(config.models_dir, exist_ok=True)

    labels_map = {lab: i for i, lab in enumerate(config.labels)}
    with open(os.path.join(config.splits_dir, "labels_map.json"), "w") as f:
        json.dump(labels_map, f, indent=2)

    # Sorted so the pre-shuffle order does not depend on directory listing order.
    items = [
        (clip_path, labels_map[label])
        for label in config.labels
        for clip_path in sorted(glob.glob(os.path.join(config.clips_dir, label, "*.mp4")))
    ]

    # Local RNG: same permutation as seeding the module RNG with 1337, but
    # without resetting randomness for the rest of the notebook.
    random.Random(1337).shuffle(items)

    total_items = len(items)
    train_count = int(0.8 * total_items)
    val_count = int(0.1 * total_items)
    print(f"Found {total_items} clips in total, splitting to train ({train_count}) and val ({val_count}).")

    # Whatever remains after train and val becomes the test split.
    splits = {
        "train.csv": items[:train_count],
        "val.csv": items[train_count:train_count + val_count],
        "test.csv": items[train_count + val_count:]
    }

    for name, data in splits.items():
        with open(os.path.join(config.splits_dir, name), "w", newline="") as f:
            csv.writer(f).writerows(data)

    print({k: len(v) for k, v in splits.items()})

In [None]:
# Rewrites labels_map.json and train/val/test CSVs under cfg.splits_dir on every run.
prepare_data_splits(cfg)

Found 506 clips in total, splitting to train (404) and val (50).
{'train.csv': 404, 'val.csv': 50, 'test.csv': 52}


## **3. DATASET**
### The ClipDataset class handles video loading and preprocessing.

In [None]:
class ClipDataset(Dataset):
    """
    Dataset of video clips for a two-pathway (slow/fast) model.

    Each item is ``((slow_clip, fast_clip), label)`` where both clips are
    float tensors shaped (C, T, H, W): ``fast_clip`` holds ``config.fast_t``
    frames and ``slow_clip`` is every ``config.alpha``-th frame of it.

    Args:
        csv_path: header-less CSV with rows ``path,label_index``.
        config: object exposing ``side``, ``fast_t`` and ``alpha``.
        train: enables random temporal sampling and spatial/colour augmentation.
    """

    def __init__(self, csv_path: str, config: "Config", train: bool = True):
        # Context manager so the manifest handle is closed; blank rows are skipped.
        with open(csv_path, newline="") as f:
            self.items = [(row[0], int(row[1])) for row in csv.reader(f) if row]
        self.config = config
        self.train = train

        # Pre-compute normalization tensors (broadcast over (T, C, H, W))
        self.mean = torch.tensor([0.45, 0.45, 0.45]).view(3, 1, 1)
        self.std = torch.tensor([0.225, 0.225, 0.225]).view(3, 1, 1)

        # Augmentation pipeline, only built for training
        if self.train:
            self.train_transforms = v2.Compose([
                v2.RandomResizedCrop(
                    size=self.config.side,
                    scale=(0.7, 1.0),
                    ratio=(0.75, 1.333),
                    antialias=True
                ),
                v2.RandomHorizontalFlip(p=0.5),
                v2.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.2),
                v2.RandomGrayscale(p=0.2),
            ])

    def _get_frame_indices(self, num_frames: int):
        """
        Return ``(slow_idx, fast_idx)`` frame indices for a video of ``num_frames``.

        ``fast_idx`` is a contiguous window of ``config.fast_t`` frames: a
        random window when training, the centred window otherwise. Videos
        shorter than the window are padded by repeating the last frame.
        ``slow_idx`` is every ``config.alpha``-th entry of ``fast_idx``.

        Raises:
            ValueError: if ``num_frames`` is less than 1.
        """
        if num_frames < 1:
            raise ValueError(f"video has no frames (num_frames={num_frames})")
        need = self.config.fast_t
        if num_frames >= need:
            start = np.random.randint(0, num_frames - need + 1) if self.train else max((num_frames - need) // 2, 0)
            fast_idx = list(range(start, start + need))
        else:
            fast_idx = list(range(num_frames)) + [num_frames - 1] * (need - num_frames)
        slow_idx = fast_idx[::self.config.alpha]
        return slow_idx, fast_idx

    def _read_and_process_frames(self, vr: "VideoReader", indices: List[int]) -> torch.Tensor:
        """
        Decode ``indices`` from ``vr`` and return a normalized (C, T, H, W) tensor.

        When training, the augmentation pipeline is called once on the whole
        (T, C, H, W) stack of frames passed in. Frames are then resized to
        ``config.side`` and normalized with the stored mean/std.
        """
        last = len(vr) - 1
        safe_indices = [min(i, last) for i in indices]
        try:
            frames = vr.get_batch(safe_indices).asnumpy()
        except Exception:
            # Best-effort fallback: decode frame by frame when batch decoding fails.
            frames = np.stack([vr[i].asnumpy() for i in safe_indices], axis=0)

        # Convert to tensor and permute dimensions
        x = torch.from_numpy(frames).permute(0, 3, 1, 2).float() / 255.0  # (T, C, H, W)

        # Apply data augmentation only for training
        if self.train:
            x = self.train_transforms(x)

        # Resize to the required size (no-op size-wise after the training crop)
        x = F.interpolate(x, size=self.config.side, mode="bilinear", align_corners=False)

        # Normalize
        mean = self.mean.to(x)
        std = self.std.to(x)
        x = (x - mean) / std

        return x.permute(1, 0, 2, 3)  # (C, T, H, W)

    def __getitem__(self, i: int) -> Tuple[Tuple[torch.Tensor, torch.Tensor], int]:
        """
        Load and preprocess one clip and its label.

        The fast clip is decoded and augmented once; the slow clip is taken
        from it by temporal striding. Processing the two pathways in separate
        calls would draw independent random crops/flips/colour jitter for
        each, so the pathways would see spatially inconsistent views of the
        same clip (and every frame would be decoded twice).
        """
        path, label = self.items[i]
        vr = VideoReader(path, ctx=cpu(0))

        _, fast_indices = self._get_frame_indices(len(vr))

        fast_clip = self._read_and_process_frames(vr, fast_indices)
        # Same frames as slow_idx = fast_idx[::alpha], with identical augmentation.
        slow_clip = fast_clip[:, ::self.config.alpha]

        return (slow_clip, fast_clip), label

    def __len__(self) -> int:
        return len(self.items)

#### **Generate datasets and loaders for training, validation, and testing**

In [None]:
def slowfast_collate(batch):
    """
    Collate ``((slow, fast), label)`` samples into ``([slow_batch, fast_batch], labels)``.

    Each pathway is stacked along a new leading batch dimension, giving
    (B, C, T, H, W) tensors; labels become a 1-D int64 tensor.
    """
    slow = torch.stack([clips[0] for clips, _ in batch], dim=0)
    fast = torch.stack([clips[1] for clips, _ in batch], dim=0)
    y = torch.tensor([target for _, target in batch], dtype=torch.long)
    return [slow, fast], y

# Split manifests under cfg.splits_dir.
train_csv, val_csv, test_csv = (
    os.path.join(cfg.splits_dir, split_file)
    for split_file in ("train.csv", "val.csv", "test.csv")
)

# Only the training dataset is built with train=True (augmentation + random window).
train_ds, val_ds, test_ds = (
    ClipDataset(manifest, cfg, train=is_train)
    for manifest, is_train in ((train_csv, True), (val_csv, False), (test_csv, False))
)

# Options common to both loaders.
_loader_kwargs = dict(
    num_workers=2,
    pin_memory=True,
    collate_fn=slowfast_collate,
    persistent_workers=False,
)
train_loader = DataLoader(train_ds, batch_size=cfg.batch_size, shuffle=True, **_loader_kwargs)
val_loader = DataLoader(val_ds, batch_size=max(1, cfg.batch_size), shuffle=False, **_loader_kwargs)

num_classes = len(cfg.labels)
print("Classes:", num_classes, cfg.labels)

Classes: 13 ['smash', 'jump_smash', 'block', 'drop', 'clear', 'lift', 'drive', 'straight_net', 'cross_net', 'serve', 'push', 'tap', 'average_joe']


## **4. TRAINING AND EVALUATION**
### This section loads the pre-trained model, defines the training components, and runs the training/validation loop.

#### **Load pre-trained model from hub**

In [None]:
# =========================
# 3) Model: load hub, replace head
# =========================
# Replaces torch.hub's private fork-validation hook with a function that always returns True.
torch.hub._validate_not_a_forked_repo = lambda a,b,c: True
model = torch.hub.load('facebookresearch/pytorchvideo', 'slowfast_r101', pretrained=True)

# Replace classifier (ResNetBasicHead.proj) with Dropout -> Linear over our classes.
# The Sequential wrapper means the saved Linear weights live under "...proj.1.*",
# so any code that reloads the checkpoint must rebuild the same head layout.
in_dim = model.blocks[-1].proj.in_features
model.blocks[-1].proj = nn.Sequential(
    nn.Dropout(p=0.5), # Add a dropout layer
    nn.Linear(in_dim, num_classes)
)
model = model.to(device)

# Freeze every block except the last one (the head): only the new classifier is trained.
# NOTE(review): model.train() still puts any frozen normalization layers in training
# mode, so their running statistics may keep updating - confirm this is intended.
for p in model.blocks[:-1].parameters():
    p.requires_grad = False

Using cache found in /root/.cache/torch/hub/facebookresearch_pytorchvideo_main


#### **Optional: load weights from checkpoint**

In [None]:
# Optional warm start: overwrite the model's weights with a previously saved run.
# The path points at a different run directory than cfg.models_dir.
checkpoint_path = '/content/drive/MyDrive/FIT3163,3164/SlowFast/07_models/3in1_dropout0.2/best.pt'

# Load the saved checkpoint (a dict; the weights are stored under the "model" key)
checkpoint = torch.load(checkpoint_path, map_location=device)

# Load the model's state_dict from the checkpoint (strict by default, so the
# checkpoint must have been saved with the same head layout as `model`)
model.load_state_dict(checkpoint['model'])
print(f"Model weights loaded successfully from {checkpoint_path}")

#### **Define training components**

In [None]:
# =========================
# 4) Optimizer, loss, metrics
# =========================
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=cfg.learning_rate, weight_decay=cfg.weight_decay)
# mode='max' because the scheduler is stepped with validation macro-F1.
# NOTE(review): patience=10 is longer than cfg.early_stopping_patience (5) and a
# third of cfg.epochs (30), so the LR is rarely reduced - confirm this is intended.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=10)
# torch.cuda.amp.GradScaler is deprecated; torch.amp.GradScaler("cuda", ...) is the
# replacement. Disabled (pass-through) when not running on CUDA.
scaler = torch.amp.GradScaler("cuda", enabled=(torch.device(device).type == "cuda"))

# Shared metric objects; the training loop resets them before each train/val pass.
acc = MulticlassAccuracy(num_classes=num_classes, average='micro').to(device)
f1  = MulticlassF1Score(num_classes=num_classes, average='macro').to(device)

  scaler = torch.cuda.amp.GradScaler(enabled=(device.type == "cuda"))


#### **Main training loop**

In [None]:
# =========================
# 5) Train / validate
# =========================
# Each epoch: one pass over train_loader with mixed precision, one pass over
# val_loader, an LR-scheduler step on validation macro-F1, and a checkpoint
# whenever validation macro-F1 improves.
# NOTE(review): cfg.early_stopping_patience is defined but not used here; the
# loop always runs for cfg.epochs epochs.
best_f1 = -1.0
os.makedirs(cfg.models_dir, exist_ok=True)

# Resolve once and use for BOTH train and validation. Normalising through
# torch.device accepts `device` as either a torch.device or a plain string,
# and keeps the validation autocast switch identical to the training one.
device_type = torch.device(device).type
use_amp = device_type == "cuda"

for epoch in range(cfg.epochs):
    # --- Training ---
    model.train()
    acc.reset(); f1.reset()
    total_loss = 0.0

    for (slow_fast, y) in train_loader:
        # slow_fast is [slow, fast]; expected shapes (B,3,8,224,224) and (B,3,32,224,224)
        slow_fast = [t.to(device, non_blocking=True) for t in slow_fast]
        y = y.to(device, non_blocking=True)

        optimizer.zero_grad(set_to_none=True)
        with torch.amp.autocast(device_type, enabled=use_amp):
            logits = model(slow_fast)     # (B, num_classes)
            loss = criterion(logits, y)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # Weight by batch size so the epoch loss is a per-sample mean.
        total_loss += loss.item() * y.size(0)
        acc.update(logits, y)
        f1.update(logits, y)

    train_loss = total_loss / len(train_ds)
    train_acc  = acc.compute().item()
    train_f1   = f1.compute().item()

    # --- Validation ---
    model.eval()
    acc.reset(); f1.reset()
    val_loss = 0.0
    with torch.no_grad(), torch.amp.autocast(device_type, enabled=use_amp):
        for (slow_fast, y) in val_loader:
            slow_fast = [t.to(device, non_blocking=True) for t in slow_fast]
            y = y.to(device, non_blocking=True)
            logits = model(slow_fast)
            loss = criterion(logits, y)
            val_loss += loss.item() * y.size(0)
            acc.update(logits, y)
            f1.update(logits, y)

    val_loss /= len(val_ds)
    val_acc = acc.compute().item()
    val_f1  = f1.compute().item()

    scheduler.step(val_f1)

    print(f"\n[{epoch+1:02d}/{cfg.epochs}] "
          f"train_loss={train_loss:.4f} acc={train_acc*100:.2f}% f1={train_f1:.3f} | "
          f"val_loss={val_loss:.4f} acc={val_acc*100:.2f}% f1={val_f1:.3f}")

    # Keep only the best checkpoint (by validation macro-F1).
    if val_f1 > best_f1:
        best_f1 = val_f1
        torch.save({"model": model.state_dict(), "labels": cfg.labels}, cfg.best_model_path)
        print(f"  ↳ saved new best to {cfg.best_model_path} (val_f1={best_f1:.3f})")

print("Best val F1:", best_f1)

  with torch.cuda.amp.autocast(enabled=(device.type == "cuda")):
  with torch.no_grad(), torch.cuda.amp.autocast(enabled=(device == "cuda")):



[01/30] train_loss=2.3816 acc=18.56% f1=0.066 | val_loss=2.3100 acc=20.00% f1=0.060
  ↳ saved new best to /content/drive/MyDrive/FIT3163,3164/SlowFast/07_models/3in1_train3/best.pt (val_f1=0.060)

[02/30] train_loss=2.1701 acc=25.99% f1=0.119 | val_loss=2.1914 acc=26.00% f1=0.138
  ↳ saved new best to /content/drive/MyDrive/FIT3163,3164/SlowFast/07_models/3in1_train3/best.pt (val_f1=0.138)

[03/30] train_loss=1.9949 acc=31.68% f1=0.168 | val_loss=2.0802 acc=22.00% f1=0.066

[04/30] train_loss=1.9057 acc=35.40% f1=0.195 | val_loss=1.9340 acc=54.00% f1=0.348
  ↳ saved new best to /content/drive/MyDrive/FIT3163,3164/SlowFast/07_models/3in1_train3/best.pt (val_f1=0.348)

[05/30] train_loss=1.8550 acc=36.39% f1=0.221 | val_loss=1.8920 acc=44.00% f1=0.307

[06/30] train_loss=1.8198 acc=37.13% f1=0.235 | val_loss=1.9351 acc=38.00% f1=0.219

[07/30] train_loss=1.7714 acc=39.36% f1=0.253 | val_loss=1.7670 acc=56.00% f1=0.391
  ↳ saved new best to /content/drive/MyDrive/FIT3163,3164/SlowFast/07

## **Evaluate on test set**

In [None]:
class TestManager:
    """
    Manages the evaluation process for a SlowFast model on a test set.

    Construction loads the best checkpoint, builds the test DataLoader and
    creates the metric objects; ``run_inference`` then fills the metrics and
    returns per-sample predictions.

    Args:
        config: object exposing ``labels``, ``best_model_path``, ``splits_dir``,
            ``models_dir`` and ``batch_size``.
        device: ``"cuda"``/``"cpu"`` string or a ``torch.device``.
    """
    def __init__(self, config: 'Config', device: str):
        self.config = config
        self.device = device
        self.num_classes = len(config.labels)
        self.model = self._load_model()
        self.test_loader = self._create_dataloader()
        self.metrics = self._initialize_metrics()
        self.softmax = nn.Softmax(dim=1)

    def _load_model(self):
        """Loads the pre-trained SlowFast model and the fine-tuned checkpoint."""
        print("Loading model and best checkpoint...")

        # Replaces torch.hub's private fork-validation hook with a function that always returns True.
        torch.hub._validate_not_a_forked_repo = lambda a,b,c: True

        model = torch.hub.load('facebookresearch/pytorchvideo', 'slowfast_r101', pretrained=True)
        in_dim = model.blocks[-1].proj.in_features
        # Same Dropout -> Linear layout as the training head so state_dict keys
        # match. Dropout is inactive in eval mode, so its p value does not
        # affect the results.
        model.blocks[-1].proj = nn.Sequential(
            nn.Dropout(p=0.2),
            # Use the instance's class count, not a notebook-level global.
            nn.Linear(in_dim, self.num_classes)
        )

        # Load the state dictionary from the checkpoint file
        ckpt = torch.load(self.config.best_model_path, map_location=self.device)
        model.load_state_dict(ckpt["model"])
        model = model.to(self.device)
        model.eval()
        return model

    def _create_dataloader(self):
        """Creates and returns the DataLoader for the test set (fixed order, eval preprocessing)."""
        test_ds = ClipDataset(os.path.join(self.config.splits_dir, "test.csv"), self.config, train=False)
        return DataLoader(
            test_ds,
            batch_size=max(1, self.config.batch_size),
            shuffle=False,   # run_inference maps predictions back to dataset rows by position
            num_workers=2,
            pin_memory=True,
            # Default collation already batches ((slow, fast), label) into
            # ([slow, fast], labels), so no custom collate_fn is required.
            persistent_workers=False
        )

    def _initialize_metrics(self):
        """Initializes all the evaluation metrics."""
        return {
            'top1': MulticlassAccuracy(num_classes=self.num_classes, average="micro").to(self.device),
            # Micro-averaged like top1 so acc@1 and acc@3 are directly comparable
            # (the library default would macro-average over classes).
            'top3': MulticlassAccuracy(num_classes=self.num_classes, top_k=3, average="micro").to(self.device),
            'f1_macro': MulticlassF1Score(num_classes=self.num_classes, average="macro").to(self.device),
            'f1_perclass': MulticlassF1Score(num_classes=self.num_classes, average=None).to(self.device),
            'cm': MulticlassConfusionMatrix(num_classes=self.num_classes).to(self.device)
        }

    def run_inference(self):
        """
        Runs the inference loop, updating all metrics and collecting predictions.

        Returns:
            (test_loss, all_predictions): the per-sample mean cross-entropy and
            a list of dicts, one per test sample in dataset order, holding the
            file, true/predicted labels and the top-3 labels with probabilities.
        """
        print("Starting inference on the test set...")
        # Start from clean metric state so calling this twice does not double count.
        for metric in self.metrics.values():
            metric.reset()

        criterion = nn.CrossEntropyLoss()
        dataset = self.test_loader.dataset
        labels = self.config.labels
        # autocast needs a device-type string; this accepts str or torch.device.
        device_type = torch.device(self.device).type
        test_loss = 0.0
        all_predictions = []
        seen = 0  # samples consumed so far == dataset index of the batch's first sample

        with torch.no_grad(), torch.amp.autocast(device_type, enabled=(device_type == "cuda")):
            for slow_fast, y in self.test_loader:
                # Ensure input tensors are lists
                if not isinstance(slow_fast, list):
                    slow_fast = [slow_fast]

                slow_fast = [t.to(self.device, non_blocking=True) for t in slow_fast]
                y = y.to(self.device, non_blocking=True)

                logits = self.model(slow_fast)
                loss = criterion(logits, y)
                test_loss += loss.item() * y.size(0)

                # Update metrics
                for metric in self.metrics.values():
                    metric.update(logits, y)

                # Collect per-sample predictions for later saving
                probs = self.softmax(logits)
                conf, pred = probs.max(dim=1)
                topk_conf, topk_idx = probs.topk(3, dim=1)

                for i in range(y.size(0)):
                    path = dataset.items[seen + i][0]
                    row = {
                        "path": path,
                        "file": os.path.basename(path),
                        "true_idx": int(y[i]),
                        "true_label": labels[int(y[i])],
                        "pred_idx": int(pred[i]),
                        "pred_label": labels[int(pred[i])],
                        "pred_prob": float(conf[i]),
                        "top1_label": labels[int(topk_idx[i,0])],
                        "top1_prob":  float(topk_conf[i,0]),
                        "top2_label": labels[int(topk_idx[i,1])],
                        "top2_prob":  float(topk_conf[i,1]),
                        "top3_label": labels[int(topk_idx[i,2])],
                        "top3_prob":  float(topk_conf[i,2]),
                    }
                    all_predictions.append(row)
                seen += y.size(0)

        test_loss /= len(dataset)
        return test_loss, all_predictions

    def compute_and_print_results(self, test_loss):
        """Computes the accumulated metrics and prints them with the given test loss."""
        acc1 = self.metrics['top1'].compute().item()
        acc3 = self.metrics['top3'].compute().item()
        f1M = self.metrics['f1_macro'].compute().item()
        percls = self.metrics['f1_perclass'].compute().detach().cpu().tolist()
        confmat = self.metrics['cm'].compute().detach().cpu().numpy()

        print(f"\nTEST: loss={test_loss:.4f} | acc@1={acc1*100:.2f}% | acc@3={acc3*100:.2f}% | macro-F1={f1M:.3f}")
        print("\nPer-class F1:")
        # Best-scoring classes first
        for lab, s in sorted(zip(self.config.labels, percls), key=lambda x: x[1], reverse=True):
            print(f"  {lab:15s} {s:.3f}")

        print("\nConfusion Matrix (rows=true, cols=predicted):")
        print(confmat)

    def save_predictions(self, predictions: list, print_n: int=10):
        """Saves the predictions to <models_dir>/test_predictions.csv and prints the first print_n rows."""
        df = pd.DataFrame(predictions)
        save_path = os.path.join(self.config.models_dir, "test_predictions.csv")
        df.to_csv(save_path, index=False)
        print(f"\nSaved per-sample predictions to: {save_path}")
        print("\nQuick peek at the predictions:")
        print(df.head(print_n)[["file", "true_label", "pred_label", "pred_prob", "top2_label", "top2_prob", "top3_label", "top3_prob"]])

In [None]:
# Evaluate the best checkpoint on the held-out test split.
# A dedicated name is used for the device string: rebinding the notebook-wide
# `device` (a torch.device) to a plain str would make `device.type` in the
# training cells fail if they are re-run after this cell.
test_device = "cuda" if torch.cuda.is_available() else "cpu"
test_manager = TestManager(cfg, test_device)
test_loss, all_predictions = test_manager.run_inference()
test_manager.compute_and_print_results(test_loss)
# print_n=len(...) prints every prediction row, not just a preview.
test_manager.save_predictions(all_predictions, print_n=len(all_predictions))

Loading model and best checkpoint...


Using cache found in /root/.cache/torch/hub/facebookresearch_pytorchvideo_main


Starting inference on the test set...

TEST: loss=1.5431 | acc@1=57.69% | acc@3=75.73% | macro-F1=0.447

Per-class F1:
  serve           1.000
  jump_smash      0.857
  lift            0.769
  straight_net    0.696
  cross_net       0.500
  drop            0.400
  clear           0.250
  smash           0.000
  block           0.000
  drive           0.000
  push            0.000
  tap             0.000
  average_joe     0.000

Confusion Matrix (rows=true, cols=predicted):
[[ 0  0  0  0  0  0  0  0  1  0  0  0  0]
 [ 0  3  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  1  0  1  1  0  0  0  0  0  0  0  0]
 [ 0  0  0  1  1  1  0  0  0  0  0  0  0]
 [ 0  0  0  0  0 10  0  1  1  0  1  0  0]
 [ 0  0  0  0  3  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  1  8  0  0  1  0  0]
 [ 0  0  0  0  0  1  0  3  4  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  3  0  0  0]
 [ 0  0  0  0  0  1  1  1  2  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  

## **End-to-end match inference & overlay**

In [None]:
!pip -q install ultralytics opencv-python-headless

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━[0m [32m1.0/1.1 MB[0m [31m29.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m21.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import os, cv2, numpy as np, torch
from collections import deque, defaultdict
from ultralytics import YOLO
import torch.nn.functional as F

# (Re)bind the notebook-wide device as a torch.device; the inference helpers below read this global.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
def load_slowfast_classifier(cfg, ckpt_path):
    """
    Build a SlowFast R101 classifier and load fine-tuned weights from ``ckpt_path``.

    The classification head is rebuilt to match the checkpoint's layout:
    checkpoints written by the training loop store the classifier inside a
    ``Sequential(Dropout, Linear)`` (keys ``...proj.1.weight``), while a bare
    ``Linear`` head stores ``...proj.weight``. Building the wrong layout makes
    the strict ``load_state_dict`` fail, so the layout is detected from the
    checkpoint keys.

    Args:
        cfg: object exposing ``labels`` (one output unit per label).
        ckpt_path: path to a checkpoint dict with the weights under ``"model"``.

    Returns:
        The model in eval mode on the module-level ``device``.
    """
    torch.hub._validate_not_a_forked_repo = lambda a,b,c: True
    model = torch.hub.load('facebookresearch/pytorchvideo', 'slowfast_r101', pretrained=True)

    ckpt = torch.load(ckpt_path, map_location=device)
    state_dict = ckpt["model"]

    head = model.blocks[-1]
    classifier = torch.nn.Linear(head.proj.in_features, len(cfg.labels))
    if any(key.endswith(".proj.1.weight") for key in state_dict):
        # Dropout has no parameters and is inactive in eval mode; it is only
        # here so the Linear layer sits at index 1 like in the checkpoint.
        head.proj = torch.nn.Sequential(torch.nn.Dropout(p=0.5), classifier)
    else:
        head.proj = classifier

    model.load_state_dict(state_dict, strict=True)
    model.eval().to(device)
    return model

In [None]:
def resize_pad_square(img_rgb: np.ndarray, side: int = 224) -> np.ndarray:
    """
    Letterbox an image into a ``side`` x ``side`` square.

    The aspect ratio is kept: the longer edge is resized to ``side`` and the
    shorter edge is centred and padded with mid-grey (128). An input with a
    zero-sized dimension yields an all-zero square of the input dtype.

    Args:
        img_rgb: (H, W, 3) image array.
        side: output edge length in pixels.

    Returns:
        (side, side, 3) array.
    """
    h, w = img_rgb.shape[:2]
    if h == 0 or w == 0:
        return np.zeros((side, side, 3), dtype=img_rgb.dtype)
    scale = side / max(h, w)
    # Clamp to >= 1: for very thin crops the shorter edge can round to 0,
    # which cv2.resize rejects.
    nh = max(1, int(round(h * scale)))
    nw = max(1, int(round(w * scale)))
    resized = cv2.resize(img_rgb, (nw, nh), interpolation=cv2.INTER_LINEAR)
    # Split the padding as evenly as possible; any odd pixel goes bottom/right.
    top  = (side - nh) // 2
    bottom = side - nh - top
    left = (side - nw) // 2
    right = side - nw - left
    out = cv2.copyMakeBorder(resized, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(128,128,128))
    return out

def expand_box(x1, y1, x2, y2, scale: float, W: int, H: int):
    """
    Scale a box about its centre by ``scale`` and clamp it to the frame.

    Returns integer ``(x1, y1, x2, y2)`` with x limited to [0, W - 1] and
    y limited to [0, H - 1]; fractional coordinates are truncated by ``int``.
    """
    center_x = (x1 + x2) / 2.0
    center_y = (y1 + y2) / 2.0
    half_w = (x2 - x1) * scale / 2
    half_h = (y2 - y1) * scale / 2

    left = int(max(0, center_x - half_w))
    top = int(max(0, center_y - half_h))
    right = int(min(W - 1, center_x + half_w))
    bottom = int(min(H - 1, center_y + half_h))
    return left, top, right, bottom

In [None]:
class SlowFastPredictor:
    """Wraps a SlowFast model: turns a list of RGB frames into class probabilities."""

    def __init__(self, cfg, model):
        self.cfg = cfg
        self.model = model
        # Per-channel normalization constants, shaped to broadcast over (T, C, H, W).
        self.mean = torch.tensor([0.45, 0.45, 0.45]).view(3,1,1).to(device)
        self.std  = torch.tensor([0.225, 0.225, 0.225]).view(3,1,1).to(device)

    def _prep(self, frames_rgb_list):
        """
        Convert ``frames_rgb_list`` (equally sized HxWx3 RGB frames) into the
        model input ``[slow, fast]``, each shaped (1, C, T, H, W).

        The fast pathway keeps every frame; the slow pathway keeps every
        ``cfg.alpha``-th frame.
        """
        stacked = np.stack(frames_rgb_list)                                   # (T,H,W,3)
        clip = torch.from_numpy(stacked).permute(0,3,1,2).float() / 255.0      # (T,C,H,W)

        # Spatial resize; the time axis plays the role of the batch axis here.
        clip = F.interpolate(clip, size=self.cfg.side, mode="bilinear", align_corners=False)

        # Bring the constants to the clip's device/dtype, then normalize.
        norm_mean = self.mean.to(device=clip.device, dtype=clip.dtype)
        norm_std  = self.std.to(device=clip.device, dtype=clip.dtype)
        clip = (clip - norm_mean) / norm_std

        clip = clip.permute(1,0,2,3)                                           # (C,T,H,W)
        fast = clip.unsqueeze(0).to(device)
        slow = clip[:, ::self.cfg.alpha, :, :].unsqueeze(0).to(device)
        return [slow, fast]

    @torch.no_grad()
    def predict_probs(self, frames_rgb_list):
        """Return a 1-D numpy array of softmax probabilities for exactly ``cfg.fast_t`` frames."""
        assert len(frames_rgb_list) == self.cfg.fast_t  # 32
        on_gpu = device.type == "cuda"
        with torch.amp.autocast('cuda', enabled=on_gpu):
            logits = self.model(self._prep(frames_rgb_list))    # (1, num_classes)
            first_row = torch.softmax(logits, dim=1)[0]
            probs = first_row.detach().cpu().numpy()
        return probs  # (num_classes,)

In [None]:
def annotate_match_video(
    cfg,
    video_path,
    out_path,
    yolo_weights="yolo11n.pt", # change to your custom weights if you have them
    person_class=0,            # COCO 'person'
    det_conf=0.5,
    iou=0.5,
    pred_thr=0.60,             # minimum prob to show label
    cooldown=12                # frames to cool after showing a shot to reduce spam
):
    """
    Track players in a video, classify their shots, and write an annotated copy.

    For every tracked box the (enlarged) crop is letterboxed to
    ``cfg.side`` and appended to a per-track buffer of ``cfg.fast_t`` frames.
    Once a buffer is full, each new frame triggers a classification of the
    buffered window. A label is drawn on a frame when, among the track's last
    (up to) three predictions, at least one is a non-``average_joe`` class
    with probability >= ``pred_thr`` and ``cooldown`` frames have passed since
    the track's previous label.

    Args:
        cfg: config exposing ``side``, ``fast_t``, ``labels``, ``best_model_path``.
        video_path: input video.
        out_path: output video path (written with the ``mp4v`` codec).
        yolo_weights: detector weights passed to ``YOLO``.
        person_class: detector class id to track.
        det_conf: detection confidence threshold.
        iou: IoU threshold passed to the tracker.
        pred_thr: minimum classifier probability for a label to be considered.
        cooldown: minimum number of frames between two labels on one track.
    """
    # Get video props for the writer
    cap = cv2.VideoCapture(video_path)
    fps = max(1.0, cap.get(cv2.CAP_PROP_FPS))
    W   = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    H   = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    cap.release()

    # Load detector+tracker and classifier before opening the writer, so a
    # failed model load does not leave an open writer behind.
    yolo = YOLO(yolo_weights)
    clf_model = load_slowfast_classifier(cfg, cfg.best_model_path)
    clf = SlowFastPredictor(cfg, clf_model)

    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    writer = cv2.VideoWriter(out_path, fourcc, fps, (W, H))

    # Per-track state
    buffers = defaultdict(lambda: deque(maxlen=cfg.fast_t))            # 32-frame RGB crops per track
    last_shown_frame = defaultdict(lambda: -99999)                     # cooldown control
    hist = defaultdict(lambda: deque(maxlen=5))                        # recent (class, prob) predictions

    frame_idx = 0
    try:
        for res in yolo.track(source=video_path, stream=True, persist=True,
                              classes=[person_class], conf=det_conf, iou=iou, verbose=False):
            frame_bgr = res.orig_img  # BGR
            h, w = frame_bgr.shape[:2]

            # If no boxes/ids in this frame, just write it
            if res.boxes is None or res.boxes.id is None:
                writer.write(frame_bgr)
                frame_idx += 1
                continue

            ids = res.boxes.id.int().cpu().numpy()
            xyxy = res.boxes.xyxy.int().cpu().numpy()  # (N,4)

            to_draw = []  # (x1,y1,x2,y2,label,prob,tid)

            for j, tid in enumerate(ids):
                x1, y1, x2, y2 = xyxy[j]
                x1, y1 = max(0, x1), max(0, y1)
                x2, y2 = min(w-1, x2), min(h-1, y2)
                if x2 <= x1 or y2 <= y1:
                    continue

                # Enlarge a bit for context (e.g. the racket); try 1.2-1.4
                x1, y1, x2, y2 = expand_box(x1, y1, x2, y2, scale=1.25, W=w, H=h)

                # Crop -> RGB -> letterbox to fixed square
                crop = frame_bgr[y1:y2, x1:x2, :]
                if crop.size == 0:
                    continue
                crop_rgb = cv2.cvtColor(crop, cv2.COLOR_BGR2RGB)
                crop_rgb = resize_pad_square(crop_rgb, side=cfg.side)  # every buffered frame has the same size

                buffers[tid].append(crop_rgb)

                label_to_show = None
                prob_to_show  = 0.0

                # Classify when we have a full 32-frame clip
                if len(buffers[tid]) == cfg.fast_t:
                    probs = clf.predict_probs(list(buffers[tid]))  # (C,)
                    ci = int(probs.argmax())
                    pi = float(probs[ci])
                    hist[tid].append((ci, pi))

                    # Smoothing window: the track's most recent (up to) 3 predictions.
                    last3 = list(hist[tid])[-3:]

                    # Candidates: confident, non-background predictions in the window.
                    classes = [c for c, p in last3 if cfg.labels[c] != "average_joe" and p >= pred_thr]
                    if classes:
                        # Pick the most frequent candidate; break ties by the
                        # higher mean probability over the window.
                        best_c, best_score = None, -1.0
                        for u in set(classes):
                            avgp = np.mean([p for (c, p) in last3 if c == u])
                            score = (classes.count(u), avgp)  # (count, avgp)
                            if score > (classes.count(best_c) if best_c is not None else -1, best_score):
                                best_c, best_score = u, avgp
                        if best_c is not None and (frame_idx - last_shown_frame[tid] >= cooldown):
                            label_to_show = cfg.labels[best_c]
                            prob_to_show = float(best_score)
                            last_shown_frame[tid] = frame_idx

                # Queue drawing if we have a confident non-background label
                if label_to_show is not None:
                    to_draw.append((x1, y1, x2, y2, label_to_show, prob_to_show, int(tid)))

            # ---- Draw all overlays on this frame ----
            for (x1, y1, x2, y2, lab, p, tid) in to_draw:
                color = (0, 220, 0)
                cv2.rectangle(frame_bgr, (x1, y1), (x2, y2), color, 2)
                txt = f"#{tid} {lab} {p*100:.1f}%"
                cv2.putText(frame_bgr, txt, (x1, max(20, y1-10)),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.6, color, 2, cv2.LINE_AA)

            writer.write(frame_bgr)
            frame_idx += 1
    finally:
        # Always release the writer, even if tracking/classification raises.
        writer.release()

    print(f"Saved annotated video to: {out_path}")

In [None]:
# Input match video on Drive; the annotated copy is written to the local Colab disk.
in_video  = "/content/drive/MyDrive/FIT3163,3164/SlowFast/01_raw/lcw_ld_2016_short/1/master.mp4"
out_video = "/content/match_annotated.mp4"
# Custom detector weights used instead of the function's default "yolo11n.pt".
yolo_weights = "/content/drive/MyDrive/FIT3163,3164/YOLO/my_yolov8_1.pt"

# det_conf is lowered from the function default (0.5) to 0.35; the other values equal the defaults.
annotate_match_video(cfg, in_video, out_video,
                     yolo_weights=yolo_weights,  # swap if you have a better person/badminton model
                     det_conf=0.35, iou=0.5,
                     pred_thr=0.60, cooldown=12)

Using cache found in /root/.cache/torch/hub/facebookresearch_pytorchvideo_main


Saved annotated video to: /content/match_annotated.mp4
