## **Build label map & splits**

In [None]:
!pip -q install torchmetrics decord fvcore pytorchvideo

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.2/50.2 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.7/132.7 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m983.2/983.2 kB[0m [31m36.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.6/13.6 MB[0m [31m113.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m61.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for fvcore (setup.py) ... [?25l[?25hdone
  Building wheel for pytorchvideo (s

In [None]:
import json, random, csv, glob, os
import torch, torch.nn as nn, torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchmetrics.classification import MulticlassAccuracy, MulticlassF1Score, MulticlassConfusionMatrix
import torch.nn.functional as F
import torchvision.transforms.functional as TF
from torchvision.transforms import InterpolationMode
from torchvision.transforms import v2
from decord import VideoReader, cpu
from PIL import Image
from dataclasses import dataclass
import torchvision
import numpy as np
from typing import Dict, Tuple, Optional, List
import pandas as pd

In [None]:
# Mount Google Drive at /content/drive; the project paths configured later in
# this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Global compute device: the GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## **1. CONFIGURATION**

In [None]:
# Per-channel normalisation statistics, shaped (3, 1, 1) so they broadcast
# over (C, H, W) frames and (T, C, H, W) clips.
IMAGENET_MEAN = torch.tensor([0.485, 0.456, 0.406]).view(3,1,1)
IMAGENET_STD  = torch.tensor([0.229, 0.224, 0.225]).view(3,1,1)

In [None]:
class Config:
    """Project paths, the label vocabulary and training hyper-parameters."""

    def __init__(self):
        # Everything lives below a single project folder on Google Drive.
        root = "/content/drive/MyDrive/FIT3163,3164/SlowFast"
        join = os.path.join

        self.root_dir = root
        self.clips_dir = join(root, "05_clips/3in1")
        self.splits_dir = join(root, "06_splits/3in1")
        self.models_dir = join(root, "07_models/3in1_train4")
        self.best_model_path = join(self.models_dir, "best.pt")

        # The position of a label in this list is its integer class index.
        self.labels = [
            "smash", "jump_smash", "block",
            "drop", "clear", "lift", "drive",
            "straight_net", "cross_net", "serve",
            "push", "tap",
            "average_joe",
        ]

        # Optimisation settings.
        self.epochs = 30
        self.batch_size = 8
        self.learning_rate = 0.005
        self.weight_decay = 0.001

        # Number of non-improving evaluations tolerated before stopping.
        self.early_stopping_patience = 5

# Single shared configuration instance used by the cells below.
cfg = Config()

In [None]:
@dataclass
class VideoCfg:
    """Settings read by the clip dataset (sampling, crop size, augmentation)."""
    clips_dir: str            # folder holding one sub-folder of .mp4 clips per label
    splits_dir: str           # folder where the split CSVs / label map are stored
    labels: List[str]         # class names; list position is the class index
    side: int = 224           # input size (square)
    num_frames: int = 8       # frames per clip
    sample: str = "uniform"   # "uniform" or "rand" (random contiguous window, training only)
    train_scale: Tuple[float,float] = (0.7, 1.0)     # area-scale range for the training random-resized crop
    train_ratio: Tuple[float,float] = (0.75, 1.333)  # aspect-ratio range for the training random-resized crop

## **2. DATA PREPARATION**
### This function handles all logic for splitting and saving the dataset.

In [None]:
def prepare_data_splits(cfg: "VideoCfg"):
    """Write the label map and an 80/10/10 train/val/test split to disk.

    Reads ``cfg.labels``, ``cfg.clips_dir`` and ``cfg.splits_dir``. Every
    ``*.mp4`` under ``<clips_dir>/<label>/`` becomes one ``(path, label_index)``
    row. The rows are shuffled with a fixed seed and written to ``train.csv``,
    ``val.csv`` and ``test.csv`` in ``cfg.splits_dir`` next to
    ``labels_map.json``. Existing files are overwritten. The split sizes are
    printed.

    Note: this seeds Python's global ``random`` module with 1337.
    """
    os.makedirs(cfg.splits_dir, exist_ok=True)
    labels_map = {lab: i for i, lab in enumerate(cfg.labels)}
    with open(os.path.join(cfg.splits_dir, "labels_map.json"), "w") as f:
        json.dump(labels_map, f, indent=2)

    # glob returns matches in arbitrary (filesystem) order. Sorting first makes
    # the seeded shuffle below yield the same split on every run and machine,
    # so a clip cannot silently move between train and test across sessions.
    items = [
        (path, labels_map[lab])
        for lab in cfg.labels
        for path in sorted(glob.glob(os.path.join(cfg.clips_dir, lab, "*.mp4")))
    ]

    random.seed(1337)
    random.shuffle(items)
    n = len(items)
    n_tr = int(0.8*n)
    n_va = int(0.1*n)

    # The test split takes whatever is left after the integer truncation above.
    splits = {
        "train.csv": items[:n_tr],
        "val.csv":   items[n_tr:n_tr+n_va],
        "test.csv":  items[n_tr+n_va:],
    }
    for name, rows in splits.items():
        with open(os.path.join(cfg.splits_dir, name), "w", newline="") as f:
            csv.writer(f).writerows(rows)
    print({k: len(v) for k,v in splits.items()})

In [None]:
# (Re)writes labels_map.json and the train/val/test CSVs into cfg.splits_dir.
prepare_data_splits(cfg)

{'train.csv': 404, 'val.csv': 50, 'test.csv': 52}


## **3. DATASET**
### The VideoClipDataset class handles video loading and preprocessing.

In [None]:
class VideoClipDataset(Dataset):
    """
    Clip dataset backed by a two-column CSV of (video path, integer label).

    Returns:
      x: (T, C, H, W) float32 normalized (ImageNet) in [~N(0,1)]
      y: int label
    """
    def __init__(self, csv_path: str, cfg: "VideoCfg", train: bool=True):
        """Read the (path, label) rows from ``csv_path``.

        ``cfg`` supplies ``num_frames``, ``sample``, ``side``, ``train_scale``
        and ``train_ratio``. ``train`` selects random augmentation instead of
        the deterministic resize + centre crop.
        """
        # Use a context manager so the CSV handle is closed right away instead
        # of being leaked until garbage collection.
        with open(csv_path, newline="") as f:
            self.items = [(p, int(y)) for p, y in csv.reader(f)]
        self.cfg = cfg
        self.train = train

    def __len__(self): return len(self.items)

    def _sample_indices(self, num_frames_total: int) -> List[int]:
        """Choose ``cfg.num_frames`` frame indices for a video of the given length."""
        T = self.cfg.num_frames
        if num_frames_total <= 0:
            return [0]*T
        if self.cfg.sample == "uniform" or not self.train:
            # uniform over the whole clip
            idx = np.linspace(0, max(0, num_frames_total-1), T)
            return [int(round(i)) for i in idx]
        else:
            # random contiguous window of T frames (training only)
            if num_frames_total >= T:
                start = np.random.randint(0, num_frames_total - T + 1)
                return list(range(start, start+T))
            # pad by repeating last frame
            base = list(range(num_frames_total))
            return base + [num_frames_total-1]*(T-num_frames_total)

    @staticmethod
    def _to_tensor(frames: np.ndarray) -> torch.Tensor:
        """Convert decoded frames to a float tensor scaled to [0, 1]."""
        # frames: (T, H, W, C) uint8 -> (T, C, H, W) float32 [0,1]
        x = torch.from_numpy(frames).permute(0,3,1,2).float() / 255.0
        return x

    def _train_augment_same(self, x: torch.Tensor) -> torch.Tensor:
        """
        Apply the SAME random crop/flip to all frames in a clip.
        x: (T,C,H,W)
        """
        Tn, C, H, W = x.shape
        # RandomResizedCrop params (once) – use first frame for sizing
        pil0 = Image.fromarray((x[0].permute(1,2,0).cpu().numpy()*255).astype(np.uint8))
        i, j, h, w = torchvision.transforms.RandomResizedCrop.get_params(
            pil0, scale=self.cfg.train_scale, ratio=self.cfg.train_ratio
        )
        out_frames = []
        for t in range(Tn):
            fr = TF.resized_crop(x[t], i, j, h, w,
                                 size=[self.cfg.side, self.cfg.side],
                                 interpolation=InterpolationMode.BILINEAR, antialias=True)
            out_frames.append(fr)
        x = torch.stack(out_frames, dim=0)  # (T,C,side,side)

        # Same horizontal flip for all frames
        if random.random() < 0.5:
            x = torch.flip(x, dims=[-1])  # flip width

        return x

    def _eval_resize_center(self, x: torch.Tensor) -> torch.Tensor:
        """Deterministic preprocessing: resize the shorter side, then centre-crop."""
        # Resize shorter side to side, then center crop to (side, side)
        Tn, C, H, W = x.shape
        out = []
        for t in range(Tn):
            fr = x[t]
            # keep aspect: resize so min(H,W) -> side
            scale = self.cfg.side / min(H, W)
            newH, newW = int(round(H*scale)), int(round(W*scale))
            fr = TF.resize(fr, [newH, newW], interpolation=InterpolationMode.BILINEAR, antialias=True)
            # center crop
            top = max((newH - self.cfg.side)//2, 0)
            left = max((newW - self.cfg.side)//2, 0)
            fr = TF.crop(fr, top, left, self.cfg.side, self.cfg.side)
            out.append(fr)
        return torch.stack(out, dim=0)

    def __getitem__(self, idx: int):
        """Decode, sample, augment and normalise the clip at position ``idx``.

        Raises:
            RuntimeError: if the video reports zero frames.
        """
        path, label = self.items[idx]
        vr = VideoReader(path, ctx=cpu(0))
        T_total = len(vr)
        if T_total <= 0:
            # Without this guard the clamp below yields index -1 and the
            # decoder fails with an error that does not name the file.
            raise RuntimeError(f"Video has no decodable frames: {path}")
        indices = self._sample_indices(T_total)

        # Clamp defensively so a sampled index can never run past the last frame.
        clamped = [min(i, T_total-1) for i in indices]
        try:
            frames = vr.get_batch(clamped).asnumpy()  # (T,H,W,C)
        except Exception:
            # Best-effort fallback: decode frame by frame if batch decoding fails.
            frames = np.stack([vr[i].asnumpy() for i in clamped], axis=0)

        x = self._to_tensor(frames)  # (T,C,H,W)

        if self.train:
            x = self._train_augment_same(x)
        else:
            x = self._eval_resize_center(x)

        # Normalize (broadcast)
        mean = IMAGENET_MEAN.to(x)
        std  = IMAGENET_STD.to(x)
        x = (x - mean) / std
        return x, label

#### **Generate datasets and loaders for training, validation, and testing**

In [None]:
# Locations of the split files inside the splits folder.
train_csv, val_csv, test_csv = (
    os.path.join(cfg.splits_dir, fname)
    for fname in ("train.csv", "val.csv", "test.csv")
)

# Dataset settings: 8 uniformly spaced frames per clip at 224x224.
vcfg = VideoCfg(
    clips_dir=cfg.clips_dir,
    splits_dir=cfg.splits_dir,
    labels=cfg.labels,
    side=224,
    num_frames=8,
    sample="uniform",
)

# Training clips get random augmentation; validation clips are deterministic.
train_ds = VideoClipDataset(train_csv, vcfg, train=True)
val_ds = VideoClipDataset(val_csv, vcfg, train=False)

train_loader = DataLoader(
    train_ds,
    batch_size=cfg.batch_size,
    shuffle=True,
    num_workers=2,
    pin_memory=True,
    persistent_workers=False,
)

val_loader = DataLoader(
    val_ds,
    batch_size=max(1, cfg.batch_size),
    shuffle=False,
    num_workers=2,
    pin_memory=True,
    persistent_workers=False,
)

num_classes = len(vcfg.labels)
print("Classes:", num_classes, vcfg.labels)

Classes: 13 ['smash', 'jump_smash', 'block', 'drop', 'clear', 'lift', 'drive', 'straight_net', 'cross_net', 'serve', 'push', 'tap', 'average_joe']


## **4. TRAINING AND EVALUATION**

In [None]:
def compute_loss(model, loss_fn, loader):
    """Return the mean per-batch loss of ``model`` over ``loader``.

    The model runs in eval mode with gradients disabled. Afterwards its
    previous train/eval mode is restored, rather than being forced to train
    mode, so evaluating a model that was in eval mode does not silently
    re-enable dropout for the caller.

    Uses the module-level ``device`` and ``tqdm``.

    Args:
        model: network returning logits for a batch of inputs.
        loss_fn: callable ``loss_fn(logits, targets)`` returning a scalar tensor.
        loader: iterable of ``(inputs, targets)`` batches that supports ``len``.

    Returns:
        Sum of the batch losses divided by the number of batches. Every batch
        has equal weight, including a smaller final batch.
    """
    total = 0.0
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for (batchX, batchY) in tqdm(loader, desc="Computing Loss"):
                batchX = batchX.to(device).float()
                batchY = batchY.to(device).long()
                total += loss_fn(model(batchX), batchY).item()
    finally:
        # Put the model back in whichever mode the caller had it in.
        model.train(was_training)
    return total / len(loader)

In [None]:
def compute_acc(model, loader):
    """Return top-1 accuracy (fraction in [0, 1]) of ``model`` over ``loader``.

    Switches the model to eval mode and leaves it there. Gradients are
    disabled. Uses the module-level ``device`` and ``tqdm``.
    """
    n_correct, n_seen = 0, 0
    model.eval()
    with torch.no_grad():
        for inputs, targets in tqdm(loader, desc="Computing Accuracy"):
            inputs = inputs.to(device).float()
            targets = targets.to(device)
            # Predicted class = index of the largest logit.
            predicted = model(inputs).argmax(dim=1)
            n_seen += targets.size(0)
            n_correct += (predicted == targets).sum().item()
    return n_correct / n_seen

### **DINOv3**

In [None]:
class AttnPool(nn.Module):
    """Attention pooling over the time axis: (B, T, D) -> (B, D).

    Each frame embedding is scored by a small tanh MLP. The scores are
    softmax-normalised across T and used as weights for a weighted sum of the
    frame embeddings.
    """

    def __init__(self, dim, hidden=256):
        super().__init__()
        self.proj = nn.Linear(dim, hidden)
        self.ctx  = nn.Linear(hidden, 1, bias=False)

    def forward(self, x):           # x: (B, T, D)
        hidden_act = self.proj(x).tanh()
        scores = self.ctx(hidden_act).squeeze(-1)    # (B, T)
        weights = scores.softmax(dim=1)              # (B, T), rows sum to 1
        weighted = weights.unsqueeze(-1) * x         # (B, T, D)
        return weighted.sum(dim=1)                   # (B, D)

In [None]:
class DINOv3VideoClassifier(nn.Module):
    """Frame-wise DINOv3 backbone with temporal pooling and an MLP classifier.

    Each frame of a clip is embedded independently by the backbone. The
    per-frame embeddings are pooled over time and the pooled vector is
    classified.

    Args:
        repo_dir: local checkout of the DINOv3 repository (loaded through
            ``torch.hub.load(..., source='local')``).
        model: hub entry-point name inside that repository.
        num_classes: number of output logits.
        pretrained: forwarded to the hub entry point.
        weights_path: forwarded to the hub entry point as ``weights``.
        fine_tune_backbone: when False, every backbone parameter is frozen.
        complex_head: three-layer MLP head when True, otherwise dropout plus
            one linear layer.
        temporal_pool: ``"mean"``, ``"max"`` or ``"attn"``. Any other value
            falls back to mean pooling.
    """
    def __init__(self,
                 repo_dir, model,
                 num_classes=20,
                 pretrained=True,
                 weights_path=None,
                 fine_tune_backbone=False,
                 complex_head=True,
                 temporal_pool: str = "mean"  # "mean", "max" or "attn"
                 ):
        super().__init__()
        # Skip torch.hub's forked-repo validation.
        torch.hub._validate_not_a_forked_repo = lambda *a, **k: True

        # Load DINOv3 backbone (returns per-image embedding)
        self.backbone = torch.hub.load(
            repo_or_dir=repo_dir,
            model=model,
            source='local',
            pretrained=pretrained,
            weights=weights_path
        )

        # (optionally) freeze
        if not fine_tune_backbone:
            for p in self.backbone.parameters():
                p.requires_grad = False

        # infer embedding dim with a single dummy forward pass
        with torch.no_grad():
            mock = torch.randn(1, 3, 224, 224)
            emb = self.backbone(mock)
        embed_dim = emb.shape[-1]

        if complex_head:
            self.classifier = nn.Sequential(
                nn.Linear(embed_dim, 512), nn.ReLU(), nn.Dropout(0.5),
                nn.Linear(512, 256), nn.ReLU(), nn.Dropout(0.4),
                nn.Linear(256, num_classes)
            )
        else:
            self.classifier = nn.Sequential(nn.Dropout(0.5), nn.Linear(embed_dim, num_classes))

        self.temporal_pool = temporal_pool

        if temporal_pool == "attn":
            self.attn_pool = AttnPool(embed_dim)

    def forward(self, x):
        """
        x can be:
          - (B, T, C, H, W) -> video clips (preferred); frames are embedded
                               individually and pooled over T
          - (B, C, H, W)    -> batch of images, no temporal pooling
          - (C, H, W)       -> single image, treated as a batch of one

        A 4-D tensor is ALWAYS treated as a batch of images, because
        (T, C, H, W) cannot be told apart from (B, C, H, W). Add the batch
        dimension yourself (``clip.unsqueeze(0)``) to classify a single clip.

        Returns logits of shape (B, num_classes).
        Raises ValueError for any other rank.
        """
        if x.dim() == 3:  # (C,H,W) rare path
            x = x.unsqueeze(0)

        if x.dim() == 4:
            # Images: (B,C,H,W)
            feats = self.backbone(x)
            return self.classifier(feats)

        if x.dim() == 5:
            B, T, C, H, W = x.shape
            # Flatten time into batch for backbone pass
            x_2d = x.reshape(B*T, C, H, W)
            feats = self.backbone(x_2d)             # (B*T, D)
            # reshape rather than view: also valid for non-contiguous outputs
            feats = feats.reshape(B, T, -1)         # (B, T, D)

            if self.temporal_pool == "max":
                clip_feat, _ = feats.max(dim=1)
            elif self.temporal_pool == "attn":
                clip_feat = self.attn_pool(feats)
            else:
                clip_feat = feats.mean(dim=1)

            return self.classifier(clip_feat)       # (B, num_classes)

        raise ValueError(f"Unexpected input shape {tuple(x.shape)}")

In [None]:
!git clone https://github.com/facebookresearch/dinov3.git

Cloning into 'dinov3'...
remote: Enumerating objects: 409, done.[K
remote: Counting objects: 100% (198/198), done.[K
remote: Compressing objects: 100% (137/137), done.[K
remote: Total 409 (delta 124), reused 61 (delta 61), pack-reused 211 (from 2)[K
Receiving objects: 100% (409/409), 9.83 MiB | 20.83 MiB/s, done.
Resolving deltas: 100% (156/156), done.


In [None]:
# Get base dinov3 L model
!gdown --fuzzy https://drive.google.com/file/d/1_JqEppwurlG0V0WNsfTiKNzejXP9Rs-U/view?usp=sharing

Downloading...
From (original): https://drive.google.com/uc?id=1_JqEppwurlG0V0WNsfTiKNzejXP9Rs-U
From (redirected): https://drive.google.com/uc?id=1_JqEppwurlG0V0WNsfTiKNzejXP9Rs-U&confirm=t&uuid=d3ba1bd4-c4fe-47b7-8710-2e49e7976280
To: /content/dinov3_vitl16_pretrain_lvd1689m-8aa4cbdd.pth
100% 1.21G/1.21G [00:13<00:00, 87.3MB/s]


In [None]:
# Build the DINOv3 ViT-L/16 clip classifier with a frozen backbone.
# num_classes is passed explicitly: the constructor default is 20, which would
# give the head more outputs than there are labels in this dataset.
# NOTE: checkpoints saved from a model built with the 20-way default head will
# not load into this model; rebuild with num_classes=20 to read those.
dinov3_L_model = DINOv3VideoClassifier(
    repo_dir='dinov3',
    model='dinov3_vitl16',
    num_classes=num_classes,
    pretrained=True,
    weights_path='dinov3_vitl16_pretrain_lvd1689m-8aa4cbdd.pth',
    fine_tune_backbone=False,   # start frozen, then unfreeze
    complex_head=True,
    temporal_pool="attn"
).to(device)

#### **Optional: load weights from checkpoint**

In [None]:
# Load weights from previous run
!gdown --fuzzy https://drive.google.com/file/d/1oO037Eer4aU2OmY4XjJGNiNgzNImVOaq/view?usp=sharing

# Name the checkpoint once so the load call and the log message cannot drift
# apart (the message previously referenced an undefined variable).
checkpoint_path = 'L_iter1_base.pth'
# map_location keeps the load working on a CPU-only runtime.
dinov3_L_model.load_state_dict(torch.load(checkpoint_path, map_location=device))
print(f"Model weights loaded successfully from {checkpoint_path}")

#### **Main training loop**

In [None]:
import time
import copy
from tqdm import tqdm

def _new_history():
    """Return an empty history dict in the layout ``fit`` reads and updates."""
    return {
        'val_loss': [], 'val_acc': [], 'train_loss': [], 'train_acc': [],
        'best_valacc': 0, 'best_valacc_model': None,
        'best_valloss': 100, 'best_valloss_model': None,
        'best_trainacc': 0, 'best_trainacc_model': None,
        'best_trainloss': 100, 'best_trainloss_model': None,
    }


def _step_scheduler(scheduler, val_loss=None, val_acc=None):
    """Advance ``scheduler`` by one epoch, passing a metric when it needs one."""
    if isinstance(scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
        # Plateau schedulers need a fresh metric; skip epochs without evaluation.
        if val_loss is None or val_acc is None:
            return
        scheduler.step(val_acc if scheduler.mode == 'max' else val_loss)
    else:
        scheduler.step()


def fit(model= None, train_loader = None, valid_loader= None,
        optimizer = None, scheduler = None,
        num_epochs = 50, patience = 5, verbose = True,
        use_mixup = False, mixup_alpha = 0.15, mixup_prob = 0.4,
        use_cutmix = False, cutmix_alpha = 1.0, cutmix_prob = 0.4,
        history = None, check_interval = 1
       ):
    """Train ``model`` and track the best checkpoints in ``history``.

    Relies on module-level names: ``device``, ``loss_fn``, ``compute_loss``,
    ``compute_acc`` and, when mixup/cutmix are enabled, ``mixup_data``,
    ``cutmix_data`` and ``mix_criterion``.

    Args:
        model: network to train; moved to ``device``.
        train_loader / valid_loader: loaders of ``(inputs, labels)`` batches.
        optimizer: optimizer to use; defaults to AdamW(lr=0.001) over all
            model parameters.
        scheduler: LR scheduler stepped once per epoch. Defaults to a 5-epoch
            linear warm-up followed by cosine decay. A ``ReduceLROnPlateau``
            is stepped only on evaluation epochs, with validation accuracy
            (mode ``'max'``) or validation loss.
        num_epochs: maximum number of epochs.
        patience: consecutive evaluations without a validation-accuracy
            improvement tolerated before stopping early.
        verbose: print per-epoch summaries.
        use_mixup / mixup_alpha / mixup_prob: mixup settings.
        use_cutmix / cutmix_alpha / cutmix_prob: cutmix settings (only tried
            when mixup was not applied to the batch).
        history: dict from a previous call to continue from, or None for a
            fresh one.
        check_interval: evaluate on the train and validation sets every this
            many epochs.

    Returns:
        The history dict: metric lists plus the best value and a deep-copied
        ``state_dict`` for each of validation accuracy/loss and training
        accuracy/loss.
    """
    model.to(device)
    print(f"Training on device: {device}")

    # Local name deliberately differs from the ``optim`` module alias.
    opt = torch.optim.AdamW(model.parameters(), lr=0.001) if optimizer is None else optimizer

    def lr_lambda(epoch):
        warmup_epochs = 5

        if epoch < warmup_epochs:
            return (epoch + 1) / warmup_epochs

        # Guard against ZeroDivisionError when training is shorter than warm-up.
        cos_epochs = num_epochs - warmup_epochs
        if cos_epochs <= 0:
            return 1.0

        cos_epoch = epoch - warmup_epochs
        return 0.5 * (1 + np.cos(np.pi * cos_epoch / cos_epochs))

    if scheduler is None:
        scheduler = torch.optim.lr_scheduler.LambdaLR(opt, lr_lambda)

    if history is None:
        history = _new_history()

    # Evaluations since validation accuracy last improved.
    patience_counter = 0

    for epoch in range(num_epochs):
        start = time.time()
        model.train()

        train_loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} - Training")

        for (X, y) in train_loop:
            # Move the batch to the model's device and fix dtypes once, so all
            # three branches below see the same inputs.
            X = X.to(device).float()
            y = y.to(device).long()

            # Forward pass
            if use_mixup and np.random.rand() < mixup_prob:
                mixed_X, y_a, y_b, lam = mixup_data(X, y, alpha=mixup_alpha)
                outputs = model(mixed_X.float())
                loss = mix_criterion(loss_fn, outputs, y_a.long(), y_b.long(), lam)

            elif use_cutmix and np.random.rand() < cutmix_prob:
                mixed_X, y_a, y_b, lam = cutmix_data(X, y, alpha=cutmix_alpha)
                outputs = model(mixed_X.float())
                loss = mix_criterion(loss_fn, outputs, y_a.long(), y_b.long(), lam)

            else:
                outputs = model(X)
                loss = loss_fn(outputs, y)

            # Backward and optimize
            opt.zero_grad()
            loss.backward()
            opt.step()

        # Clear cache to prevent CUDA OOM for extremely large models
        torch.cuda.empty_cache()

        val_loss = val_acc = None

        # Losses and accuracies for epoch
        if (epoch + 1) % check_interval == 0:
            val_loss = compute_loss(model, loss_fn, valid_loader)
            history['val_loss'].append(val_loss)
            val_acc = compute_acc(model, valid_loader)
            history['val_acc'].append(val_acc)
            train_loss = compute_loss(model, loss_fn, train_loader)
            history['train_loss'].append(train_loss)
            train_acc = compute_acc(model, train_loader)
            history['train_acc'].append(train_acc)

            # Track four models: highest validation accuracy, lowest validation loss,
            # highest training accuracy, lowest training loss.
            # Only validation accuracy drives early stopping: a training-accuracy
            # gain must not reset the counter, or an overfitting run never stops.
            if val_acc > history['best_valacc']:
                print('Best val acc change:', val_acc)
                history['best_valacc'] = val_acc
                history['best_valacc_model'] = copy.deepcopy(model.state_dict())
                patience_counter = 0
            else:
                patience_counter += 1

            if val_loss < history['best_valloss']:
                print('Best val loss change:', val_loss)
                history['best_valloss'] = val_loss
                history['best_valloss_model'] = copy.deepcopy(model.state_dict())

            if train_acc > history['best_trainacc']:
                print('Best train acc change:', train_acc)
                history['best_trainacc'] = train_acc
                history['best_trainacc_model'] = copy.deepcopy(model.state_dict())

            if train_loss < history['best_trainloss']:
                print('Best train loss change:', train_loss)
                history['best_trainloss'] = train_loss
                history['best_trainloss_model'] = copy.deepcopy(model.state_dict())

            end = time.time()
            print(f"total time for each epoch {end - start}") # time in seconds
            if verbose:
                print(f"train loss= {train_loss:.4f} - train acc= {train_acc*100:.2f}% - valid loss= {val_loss:.4f} - valid acc= {val_acc*100:.2f}%\n")

        else:
            if verbose:
                end = time.time()
                print(f"Epoch {epoch+1}/{num_epochs} - total time for training loop {end - start}\n")

        # Advance the LR schedule. Without this call the LambdaLR default stays
        # at its first warm-up multiplier (1/5 of the base LR) for the whole run.
        _step_scheduler(scheduler, val_loss, val_acc)

        if patience_counter >= patience:
            print(f"Early stopping triggered at epoch {epoch+1}! No improvement for {patience} epochs.\n")
            break

    return history

#### **Define training components**

In [None]:
# Loss, optimizer and helpers shared by the training cells below.
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(dinov3_L_model.parameters(), lr=cfg.learning_rate, weight_decay=cfg.weight_decay)
# mode='max': intended to be stepped with a metric that should increase.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=10)
# torch.cuda.amp.GradScaler is deprecated; torch.amp.GradScaler takes the
# device type as its first argument.
scaler = torch.amp.GradScaler("cuda", enabled=(device.type == "cuda"))
# `fit` and `compute_loss` read the loss through this module-level name.
loss_fn = nn.CrossEntropyLoss()

acc = MulticlassAccuracy(num_classes=num_classes, average='micro').to(device)
f1  = MulticlassF1Score(num_classes=num_classes, average='macro').to(device)

  scaler = torch.cuda.amp.GradScaler(enabled=(device.type == "cuda"))


In [None]:
# Run this to obtain a fresh set of history

# Empty training history: per-evaluation metric lists plus, for each tracked
# metric, the best value seen so far and the weights that achieved it.
new_history = {
    'val_loss': [],
    'val_acc': [],
    'train_loss': [],
    'train_acc': [],
    'best_valacc': 0,
    'best_valacc_model': None,
    'best_valloss': 100,
    'best_valloss_model': None,
    'best_trainacc': 0,
    'best_trainacc_model': None,
    'best_trainloss': 100,
    'best_trainloss_model': None,
}

In [None]:
# Train the DINOv3 clip classifier for up to 10 epochs.
history = fit(
    model=dinov3_L_model,
    train_loader=train_loader,
    valid_loader=val_loader,
    optimizer=optimizer,
    # None -> fit falls back to its built-in LambdaLR schedule.
    scheduler=None,
    num_epochs=10,
    verbose=True,
    patience=5,
    # Both augmentations are switched off; the alpha/prob values are unused.
    use_mixup=False,
    mixup_alpha=0.75,
    mixup_prob=0.4,
    use_cutmix=False,
    cutmix_alpha=1.0,
    cutmix_prob=0.75,
    # Change to history to continue from previous run.
    history=new_history,
    # Only calculate training and validation accuracy and loss every n epochs.
    check_interval=2,
)

Training on device: cuda


Epoch 1/10 - Training: 100%|██████████| 51/51 [01:02<00:00,  1.23s/it]


Epoch 1/10 - total time for training loop 62.69083213806152



Epoch 2/10 - Training: 100%|██████████| 51/51 [01:02<00:00,  1.22s/it]
Computing Loss: 100%|██████████| 7/7 [00:08<00:00,  1.18s/it]
Computing Accuracy: 100%|██████████| 7/7 [00:08<00:00,  1.18s/it]
Computing Loss: 100%|██████████| 51/51 [01:03<00:00,  1.25s/it]
Computing Accuracy: 100%|██████████| 51/51 [01:03<00:00,  1.24s/it]


Best val acc change: 0.2
Best val loss change: 2.204367092677525
Best train acc change: 0.27970297029702973
Best train loss change: 2.085348218095069
total time for each epoch 206.37662601470947
train loss= 2.0853 - train acc= 27.97% - valid loss= 2.2044 - valid acc= 20.00%



Epoch 3/10 - Training: 100%|██████████| 51/51 [01:02<00:00,  1.23s/it]


Epoch 3/10 - total time for training loop 63.30129957199097



Epoch 4/10 - Training: 100%|██████████| 51/51 [01:02<00:00,  1.23s/it]
Computing Loss: 100%|██████████| 7/7 [00:08<00:00,  1.17s/it]
Computing Accuracy: 100%|██████████| 7/7 [00:08<00:00,  1.18s/it]
Computing Loss: 100%|██████████| 51/51 [01:03<00:00,  1.24s/it]
Computing Accuracy: 100%|██████████| 51/51 [01:03<00:00,  1.24s/it]


Best val acc change: 0.26
Best val loss change: 1.9819190161568778
Best train acc change: 0.38613861386138615
Best train loss change: 1.8100769706800872
total time for each epoch 206.37111473083496
train loss= 1.8101 - train acc= 38.61% - valid loss= 1.9819 - valid acc= 26.00%



Epoch 5/10 - Training: 100%|██████████| 51/51 [01:02<00:00,  1.23s/it]


Epoch 5/10 - total time for training loop 63.17489814758301



Epoch 6/10 - Training: 100%|██████████| 51/51 [01:02<00:00,  1.23s/it]
Computing Loss: 100%|██████████| 7/7 [00:08<00:00,  1.18s/it]
Computing Accuracy: 100%|██████████| 7/7 [00:08<00:00,  1.17s/it]
Computing Loss: 100%|██████████| 51/51 [01:03<00:00,  1.24s/it]
Computing Accuracy: 100%|██████████| 51/51 [01:03<00:00,  1.24s/it]


Best val acc change: 0.28
Best val loss change: 1.8436562504087175
Best train loss change: 1.6530042068631041
total time for each epoch 206.31783866882324
train loss= 1.6530 - train acc= 34.65% - valid loss= 1.8437 - valid acc= 28.00%



Epoch 7/10 - Training: 100%|██████████| 51/51 [01:02<00:00,  1.23s/it]


Epoch 7/10 - total time for training loop 63.57696795463562



Epoch 8/10 - Training: 100%|██████████| 51/51 [01:01<00:00,  1.20s/it]
Computing Loss: 100%|██████████| 7/7 [00:08<00:00,  1.18s/it]
Computing Accuracy: 100%|██████████| 7/7 [00:08<00:00,  1.18s/it]
Computing Loss: 100%|██████████| 51/51 [01:03<00:00,  1.24s/it]
Computing Accuracy: 100%|██████████| 51/51 [01:02<00:00,  1.23s/it]


Best val acc change: 0.3
Best val loss change: 1.7584921632494246
Best train acc change: 0.41089108910891087
Best train loss change: 1.544458109958499
total time for each epoch 204.41567826271057
train loss= 1.5445 - train acc= 41.09% - valid loss= 1.7585 - valid acc= 30.00%



Epoch 9/10 - Training: 100%|██████████| 51/51 [01:01<00:00,  1.20s/it]


Epoch 9/10 - total time for training loop 61.951104402542114



Epoch 10/10 - Training: 100%|██████████| 51/51 [01:01<00:00,  1.20s/it]
Computing Loss: 100%|██████████| 7/7 [00:08<00:00,  1.18s/it]
Computing Accuracy: 100%|██████████| 7/7 [00:08<00:00,  1.18s/it]
Computing Loss: 100%|██████████| 51/51 [01:03<00:00,  1.25s/it]
Computing Accuracy: 100%|██████████| 51/51 [01:02<00:00,  1.22s/it]

Best val acc change: 0.34
Best train acc change: 0.4306930693069307
Best train loss change: 1.4450528937227585
total time for each epoch 204.20862555503845
train loss= 1.4451 - train acc= 43.07% - valid loss= 1.7689 - valid acc= 34.00%






In [None]:
# Saves the weights as they are after the last epoch. The best-scoring
# snapshots are kept separately in the history dict returned by fit().
torch.save(dinov3_L_model.state_dict(), '/content/L_10e.pth')

## **Evaluate on test set**

In [None]:
class TestManager:
    """
    Manages the evaluation process for a SlowFast model on a test set.

    Construction loads the model and checkpoint, builds the test DataLoader
    and creates the metric objects. Call ``run_inference``, then
    ``compute_and_print_results`` and ``save_predictions``.
    """
    def __init__(self, config: 'Config', device: str):
        self.config = config
        self.device = device
        self.num_classes = len(config.labels)
        self.model = self._load_model()
        self.test_loader = self._create_dataloader()
        self.metrics = self._initialize_metrics()
        self.softmax = nn.Softmax(dim=1)

    def _load_model(self):
        """Loads the pre-trained SlowFast model and the fine-tuned checkpoint."""
        print("Loading model and best checkpoint...")

        # Disable the internal hub check (accept any call signature).
        torch.hub._validate_not_a_forked_repo = lambda *a, **k: True

        model = torch.hub.load('facebookresearch/pytorchvideo', 'slowfast_r101', pretrained=True)
        in_dim = model.blocks[-1].proj.in_features
        # Size the head from this manager's own config rather than from a
        # notebook-level global that may be stale or undefined.
        model.blocks[-1].proj = nn.Sequential(
            nn.Dropout(p=0.2), # Add a dropout layer
            nn.Linear(in_dim, self.num_classes)
        )

        # Load the state dictionary from the checkpoint file
        ckpt = torch.load(self.config.best_model_path, map_location=self.device)
        model.load_state_dict(ckpt["model"])
        model = model.to(self.device)
        model.eval()
        return model

    def _create_dataloader(self):
        """Creates and returns the DataLoader for the test set."""
        # NOTE(review): ClipDataset is not defined in the visible part of this
        # notebook (the dataset class above is VideoClipDataset) — confirm
        # where it comes from before running this cell in a fresh session.
        test_ds = ClipDataset(os.path.join(self.config.splits_dir, "test.csv"), self.config, train=False)
        return DataLoader(
            test_ds,
            batch_size=max(1, self.config.batch_size),
            shuffle=False,  # run_inference maps batch positions back to dataset rows
            num_workers=2,
            pin_memory=True,
            # collate_fn=slowfast_collate,  # Make sure this is imported if needed
            persistent_workers=False
        )

    def _initialize_metrics(self):
        """Initializes all the evaluation metrics."""
        return {
            'top1': MulticlassAccuracy(num_classes=self.num_classes, average="micro").to(self.device),
            # Micro-averaged like top1, so acc@1 and acc@3 are directly
            # comparable (the library default would be a per-class macro average).
            'top3': MulticlassAccuracy(num_classes=self.num_classes, top_k=3, average="micro").to(self.device),
            'f1_macro': MulticlassF1Score(num_classes=self.num_classes, average="macro").to(self.device),
            'f1_perclass': MulticlassF1Score(num_classes=self.num_classes, average=None).to(self.device),
            'cm': MulticlassConfusionMatrix(num_classes=self.num_classes).to(self.device)
        }

    def run_inference(self):
        """Runs the inference loop and computes all metrics and predictions.

        Returns:
            (test_loss, all_predictions): the sample-weighted mean
            cross-entropy over the test set (NaN if the set is empty) and one
            dict per sample with the true label and the top-3 predictions.
        """
        print("Starting inference on the test set...")
        test_loss = 0.0
        criterion = nn.CrossEntropyLoss()
        all_predictions = []

        # Start from clean metric state so a second call does not accumulate
        # on top of the first.
        for metric in self.metrics.values():
            metric.reset()

        # Accept both "cuda"/"cpu" strings and torch.device objects.
        device_type = torch.device(self.device).type

        with torch.no_grad(), torch.amp.autocast(device_type, enabled=(device_type == "cuda")):
            for batch_idx, (slow_fast, y) in enumerate(self.test_loader):
                # Ensure input tensors are lists
                if not isinstance(slow_fast, list):
                    slow_fast = [slow_fast]

                slow_fast = [t.to(self.device, non_blocking=True) for t in slow_fast]
                y = y.to(self.device, non_blocking=True)

                logits = self.model(slow_fast)
                loss = criterion(logits, y)
                test_loss += loss.item() * y.size(0)

                # Update metrics
                for metric in self.metrics.values():
                    metric.update(logits, y)

                # Collect per-sample predictions for later saving
                probs = self.softmax(logits)
                conf, pred = probs.max(dim=1)
                topk_conf, topk_idx = probs.topk(3, dim=1)

                # Valid because the loader is built with shuffle=False.
                start_idx = batch_idx * self.test_loader.batch_size

                for i in range(y.size(0)):
                    idx = start_idx + i
                    path = self.test_loader.dataset.items[idx][0]
                    row = {
                        "path": path,
                        "file": os.path.basename(path),
                        "true_idx": int(y[i]),
                        "true_label": self.config.labels[int(y[i])],
                        "pred_idx": int(pred[i]),
                        "pred_label": self.config.labels[int(pred[i])],
                        "pred_prob": float(conf[i]),
                        "top1_label": self.config.labels[int(topk_idx[i,0])],
                        "top1_prob":  float(topk_conf[i,0]),
                        "top2_label": self.config.labels[int(topk_idx[i,1])],
                        "top2_prob":  float(topk_conf[i,1]),
                        "top3_label": self.config.labels[int(topk_idx[i,2])],
                        "top3_prob":  float(topk_conf[i,2]),
                    }
                    all_predictions.append(row)

        n_samples = len(self.test_loader.dataset)
        test_loss = test_loss / n_samples if n_samples else float("nan")
        return test_loss, all_predictions

    def compute_and_print_results(self, test_loss):
        """Computes and prints the final metrics."""
        acc1 = self.metrics['top1'].compute().item()
        acc3 = self.metrics['top3'].compute().item()
        f1M = self.metrics['f1_macro'].compute().item()
        percls = self.metrics['f1_perclass'].compute().detach().cpu().tolist()
        confmat = self.metrics['cm'].compute().detach().cpu().numpy()

        print(f"\nTEST: loss={test_loss:.4f} | acc@1={acc1*100:.2f}% | acc@3={acc3*100:.2f}% | macro-F1={f1M:.3f}")
        print("\nPer-class F1:")
        for lab, s in sorted(zip(self.config.labels, percls), key=lambda x: x[1], reverse=True):
            print(f"  {lab:15s} {s:.3f}")

        print("\nConfusion Matrix (rows=true, cols=predicted):")
        print(confmat)

    def save_predictions(self, predictions: list, print_n: int=10):
        """Saves the list of predictions to a CSV file.

        The file is written to ``<config.models_dir>/test_predictions.csv``;
        the directory is created if needed. The first ``print_n`` rows are
        printed as a quick check.
        """
        df = pd.DataFrame(predictions)
        os.makedirs(self.config.models_dir, exist_ok=True)
        save_path = os.path.join(self.config.models_dir, "test_predictions.csv")
        df.to_csv(save_path, index=False)
        print(f"\nSaved per-sample predictions to: {save_path}")
        if df.empty:
            # No rows means no columns to select for the preview below.
            return
        print("\nQuick peek at the predictions:")
        print(df.head(print_n)[["file", "true_label", "pred_label", "pred_prob", "top2_label", "top2_prob", "top3_label", "top3_prob"]])

In [None]:
# NOTE: this rebinds the notebook-level `device` to a plain string
# ("cuda"/"cpu"); earlier cells defined it as a torch.device.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Load the model and checkpoint, evaluate on the test split, print the
# metrics, then write the per-sample predictions CSV and print every row.
test_manager = TestManager(cfg, device)
test_loss, all_predictions = test_manager.run_inference()
test_manager.compute_and_print_results(test_loss)
test_manager.save_predictions(all_predictions, print_n=len(all_predictions))

Loading model and best checkpoint...


Using cache found in /root/.cache/torch/hub/facebookresearch_pytorchvideo_main


Starting inference on the test set...

TEST: loss=1.5431 | acc@1=57.69% | acc@3=75.73% | macro-F1=0.447

Per-class F1:
  serve           1.000
  jump_smash      0.857
  lift            0.769
  straight_net    0.696
  cross_net       0.500
  drop            0.400
  clear           0.250
  smash           0.000
  block           0.000
  drive           0.000
  push            0.000
  tap             0.000
  average_joe     0.000

Confusion Matrix (rows=true, cols=predicted):
[[ 0  0  0  0  0  0  0  0  1  0  0  0  0]
 [ 0  3  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  1  0  1  1  0  0  0  0  0  0  0  0]
 [ 0  0  0  1  1  1  0  0  0  0  0  0  0]
 [ 0  0  0  0  0 10  0  1  1  0  1  0  0]
 [ 0  0  0  0  3  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  1  8  0  0  1  0  0]
 [ 0  0  0  0  0  1  0  3  4  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  3  0  0  0]
 [ 0  0  0  0  0  1  1  1  2  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  

## **end-to-end match inference & overlay**

In [None]:
!pip -q install ultralytics opencv-python-headless

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━[0m [32m1.0/1.1 MB[0m [31m29.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m21.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import os, cv2, numpy as np, torch
from collections import deque, defaultdict
from ultralytics import YOLO
import torch.nn.functional as F

# Notebook-wide compute device as a torch.device; the functions defined in the following cells read this global.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
def load_slowfast_classifier(cfg, ckpt_path):
    """Build a SlowFast-R101 with a `len(cfg.labels)`-way head and load fine-tuned weights.

    Args:
        cfg: configuration object; only `cfg.labels` is read here (its length
            sets the number of output classes).
        ckpt_path: path to a checkpoint whose "model" entry is a state dict for
            this exact architecture (it is loaded with strict=True).

    Returns:
        The model in eval mode, moved to the module-level `device`.

    Raises:
        KeyError: if the loaded checkpoint has no "model" entry.
        RuntimeError: if the state dict keys/shapes do not match the model.
    """
    # Presumably a workaround for torch.hub's forked-repo validation hitting
    # GitHub rate limits; it replaces a private torch.hub hook process-wide.
    torch.hub._validate_not_a_forked_repo = lambda a, b, c: True

    # pretrained=False: the strict load below overwrites every tensor in the
    # state dict, so fetching the Kinetics weights first would be wasted work.
    model = torch.hub.load('facebookresearch/pytorchvideo', 'slowfast_r101', pretrained=False)

    # Swap the final projection for one sized to our label set.
    in_dim = model.blocks[-1].proj.in_features
    model.blocks[-1].proj = torch.nn.Linear(in_dim, len(cfg.labels))

    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    ckpt = torch.load(ckpt_path, map_location=device)
    model.load_state_dict(ckpt["model"], strict=True)
    model.eval().to(device)
    return model

In [None]:
def resize_pad_square(img_rgb: np.ndarray, side: int = 224) -> np.ndarray:
    """Letterbox an image into a (side, side) square without distorting it.

    The longer edge is resized to `side`, the aspect ratio is kept, and the
    remainder is filled with mid-gray (128) split evenly on both sides.

    Args:
        img_rgb: H x W x 3 image array.
        side: edge length of the square output, in pixels.

    Returns:
        A (side, side, 3) array. An input with zero height or width yields an
        all-zero array of the input's dtype.
    """
    h, w = img_rgb.shape[:2]
    if h == 0 or w == 0:
        return np.zeros((side, side, 3), dtype=img_rgb.dtype)

    scale = side / max(h, w)
    # Clamp to [1, side]: for very elongated crops the shorter edge would
    # otherwise round to 0 px, and cv2.resize rejects an empty target size.
    nh = min(side, max(1, int(round(h * scale))))
    nw = min(side, max(1, int(round(w * scale))))
    resized = cv2.resize(img_rgb, (nw, nh), interpolation=cv2.INTER_LINEAR)

    # Centre the resized image; any odd leftover pixel goes to bottom/right.
    top = (side - nh) // 2
    bottom = side - nh - top
    left = (side - nw) // 2
    right = side - nw - left
    out = cv2.copyMakeBorder(resized, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(128, 128, 128))
    return out

def expand_box(x1, y1, x2, y2, scale: float, W: int, H: int):
    """Grow (or shrink) a box about its centre and clip it to the frame.

    Args:
        x1, y1, x2, y2: corners of the input box.
        scale: multiplier applied to the box width and height.
        W, H: frame width and height; the far corner is clipped to
            (W - 1, H - 1) and the near corner to (0, 0).

    Returns:
        (nx1, ny1, nx2, ny2) as Python ints, truncated after clipping.
    """
    center_x = (x1 + x2) / 2.0
    center_y = (y1 + y2) / 2.0
    half_w = (x2 - x1) * scale / 2
    half_h = (y2 - y1) * scale / 2

    left = int(max(0, center_x - half_w))
    top = int(max(0, center_y - half_h))
    right = int(min(W - 1, center_x + half_w))
    bottom = int(min(H - 1, center_y + half_h))
    return left, top, right, bottom

In [None]:
class SlowFastPredictor:
    """Turn a fixed-length clip of RGB frames into SlowFast class probabilities.

    The predictor runs on whatever device the wrapped model's parameters live
    on, so it does not depend on any notebook-level `device` variable.

    Attributes read from `cfg`: `side` (spatial size fed to the model),
    `alpha` (temporal stride used to build the slow pathway) and `fast_t`
    (number of frames expected per clip).
    """

    def __init__(self, cfg, model):
        self.cfg = cfg
        self.model = model
        # Follow the model's own placement; a parameter-free model falls back to CPU.
        first_param = next(model.parameters(), None)
        self.device = first_param.device if first_param is not None else torch.device("cpu")
        # Per-channel normalization constants, kept on CPU because _prep
        # normalizes on CPU before transferring the clip.
        # NOTE(review): confirm 0.45 / 0.225 match the normalization used when
        # the checkpoint was trained.
        self.mean = torch.tensor([0.45, 0.45, 0.45]).view(3, 1, 1)
        self.std = torch.tensor([0.225, 0.225, 0.225]).view(3, 1, 1)

    def _prep(self, frames_rgb_list):
        """Convert a list of HxWx3 RGB frames into the model's two-pathway input.

        Returns:
            [slow, fast]: tensors shaped (1, C, T_slow, side, side) and
            (1, C, T, side, side) on `self.device`, where the slow pathway
            keeps every `cfg.alpha`-th frame of the fast one.
        """
        # (T,H,W,3) uint8 -> (T,C,H,W) float in [0, 1]
        x = torch.from_numpy(np.stack(frames_rgb_list)).permute(0, 3, 1, 2).float() / 255.0
        # Resize spatially, treating the time axis as the batch axis.
        x = F.interpolate(x, size=self.cfg.side, mode="bilinear", align_corners=False)
        # Normalize; (3,1,1) constants broadcast over (T,C,H,W).
        x = (x - self.mean.to(dtype=x.dtype)) / self.std.to(dtype=x.dtype)
        # (T,C,H,W) -> (C,T,H,W)
        x = x.permute(1, 0, 2, 3)
        fast = x.unsqueeze(0).to(self.device)
        slow = x[:, ::self.cfg.alpha, :, :].unsqueeze(0).to(self.device)
        return [slow, fast]

    @torch.no_grad()
    def predict_probs(self, frames_rgb_list):
        """Return softmax class probabilities for one clip as a 1-D numpy array.

        Raises:
            ValueError: if the clip does not contain exactly `cfg.fast_t` frames.
        """
        if len(frames_rgb_list) != self.cfg.fast_t:
            raise ValueError(
                f"expected {self.cfg.fast_t} frames, got {len(frames_rgb_list)}"
            )
        # Mixed precision only when the model actually sits on a CUDA device.
        with torch.amp.autocast('cuda', enabled=(self.device.type == "cuda")):
            inp = self._prep(frames_rgb_list)
            logits = self.model(inp)                  # (1, num_classes)
            probs = torch.softmax(logits, dim=1)[0].detach().cpu().numpy()
        return probs  # (num_classes,)

In [None]:
def annotate_match_video(
    cfg,
    video_path,
    out_path,
    yolo_weights="yolo11n.pt", # change to your custom weights if you have them
    person_class=0,            # COCO 'person'
    det_conf=0.5,
    iou=0.5,
    pred_thr=0.60,             # minimum prob to show label
    cooldown=12                # frames to cool after showing a shot to reduce spam
):
    """Detect/track players, classify their shots with SlowFast and write an annotated video.

    For every tracked box the (slightly enlarged) crop is letterboxed to
    `cfg.side` and pushed into a per-track rolling buffer of `cfg.fast_t`
    frames. Once a buffer is full it is classified on every frame; a label is
    drawn only on the frame where it is emitted, after which that track is
    silenced for `cooldown` frames.

    Args:
        cfg: configuration; reads `fast_t`, `side`, `labels` and `best_model_path`.
        video_path: input video.
        out_path: destination of the annotated mp4 (mp4v codec).
        yolo_weights: detector weights passed to `YOLO(...)`.
        person_class: detector class index to keep.
        det_conf, iou: detection confidence / IoU thresholds forwarded to `yolo.track`.
        pred_thr: minimum top-1 probability for a prediction to count.
        cooldown: minimum number of frames between two labels on the same track.

    Raises:
        IOError: if the input video cannot be opened or the output writer
            cannot be created.
    """
    # Probe the input once for the writer's frame rate and geometry.
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            raise IOError(f"Could not open input video: {video_path}")
        fps = max(1.0, cap.get(cv2.CAP_PROP_FPS))
        W   = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        H   = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    finally:
        cap.release()

    # Load detector+tracker
    yolo = YOLO(yolo_weights)

    # Load classifier
    clf_model = load_slowfast_classifier(cfg, cfg.best_model_path)
    clf = SlowFastPredictor(cfg, clf_model)

    # Per-track state
    buffers = defaultdict(lambda: deque(maxlen=cfg.fast_t))            # rolling RGB crops per track
    last_shown_frame = defaultdict(lambda: -99999)                     # cooldown control
    hist = defaultdict(lambda: deque(maxlen=5))                        # recent (class, prob) predictions

    # Open the writer only after the models loaded, so a failed model load
    # does not leave an empty output file behind.
    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    writer = cv2.VideoWriter(out_path, fourcc, fps, (W, H))
    if not writer.isOpened():
        writer.release()
        raise IOError(f"Could not open output video for writing: {out_path}")

    frame_idx = 0
    try:
        for res in yolo.track(source=video_path, stream=True, persist=True,
                              classes=[person_class], conf=det_conf, iou=iou, verbose=False):
            frame_bgr = res.orig_img  # BGR
            h, w = frame_bgr.shape[:2]

            # If no boxes/ids in this frame, just write it
            if res.boxes is None or res.boxes.id is None:
                writer.write(frame_bgr)
                frame_idx += 1
                continue

            ids = res.boxes.id.int().cpu().numpy()
            xyxy = res.boxes.xyxy.int().cpu().numpy()  # (N,4)

            to_draw = []  # (x1,y1,x2,y2,label,prob,tid)

            for j, tid in enumerate(ids):
                x1, y1, x2, y2 = xyxy[j]
                x1, y1 = max(0, x1), max(0, y1)
                x2, y2 = min(w-1, x2), min(h-1, y2)
                if x2 <= x1 or y2 <= y1:
                    continue

                # Enlarge the box a bit to keep context around the player (try 1.2–1.4)
                x1, y1, x2, y2 = expand_box(x1, y1, x2, y2, scale=1.25, W=w, H=h)

                # Crop -> RGB -> letterbox to fixed square
                crop = frame_bgr[y1:y2, x1:x2, :]
                if crop.size == 0:
                    continue
                crop_rgb = cv2.cvtColor(crop, cv2.COLOR_BGR2RGB)
                crop_rgb = resize_pad_square(crop_rgb, side=cfg.side)  # every buffered frame is side x side

                buffers[tid].append(crop_rgb)

                label_to_show = None
                prob_to_show  = 0.0

                # Classify once the rolling buffer holds a full clip
                if len(buffers[tid]) == cfg.fast_t:
                    probs = clf.predict_probs(list(buffers[tid]))  # (C,)
                    ci = int(probs.argmax())
                    pi = float(probs[ci])
                    hist[tid].append((ci, pi))

                    # Smoothing window: the (up to) three most recent predictions.
                    last3 = list(hist[tid])[-3:]

                    # Candidates: non-background predictions at or above the threshold.
                    # A single qualifying prediction is enough to produce a label.
                    classes = [c for c, p in last3 if cfg.labels[c] != "average_joe" and p >= pred_thr]
                    if classes:
                        # Pick the most frequent candidate; break ties by the higher
                        # mean probability of that class over the whole window.
                        best_c, best_key = None, (-1, -1.0)
                        for u in set(classes):
                            avgp = float(np.mean([p for (c, p) in last3 if c == u]))
                            key = (classes.count(u), avgp)
                            if key > best_key:
                                best_c, best_key = u, key
                        if best_c is not None and (frame_idx - last_shown_frame[tid] >= cooldown):
                            label_to_show = cfg.labels[best_c]
                            prob_to_show = best_key[1]
                            last_shown_frame[tid] = frame_idx

                # Queue drawing if we have a confident non-background label
                if label_to_show is not None:
                    to_draw.append((x1, y1, x2, y2, label_to_show, prob_to_show, int(tid)))

            # ---- Draw all overlays on this frame ----
            for (x1, y1, x2, y2, lab, p, tid) in to_draw:
                color = (0, 220, 0)
                cv2.rectangle(frame_bgr, (x1, y1), (x2, y2), color, 2)
                txt = f"#{tid} {lab} {p*100:.1f}%"
                cv2.putText(frame_bgr, txt, (x1, max(20, y1-10)),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.6, color, 2, cv2.LINE_AA)

            writer.write(frame_bgr)
            frame_idx += 1
    finally:
        # Always finalize the container, even if detection/classification fails midway.
        writer.release()

    print(f"Saved annotated video to: {out_path}")

In [None]:
# Input match clip and custom detector weights (read from the mounted Drive), plus the local output path.
in_video  = "/content/drive/MyDrive/FIT3163,3164/SlowFast/01_raw/lcw_ld_2016_short/1/master.mp4"
out_video = "/content/match_annotated.mp4"
yolo_weights = "/content/drive/MyDrive/FIT3163,3164/YOLO/my_yolov8_1.pt"

# Run the full detect -> track -> classify -> overlay pass.
# det_conf is lowered to 0.35 here (the function default is 0.5).
# NOTE(review): person_class keeps its default of 0 — confirm index 0 is the player class in these custom weights.
annotate_match_video(cfg, in_video, out_video,
                     yolo_weights=yolo_weights,  # swap if you have a better person/badminton model
                     det_conf=0.35, iou=0.5,
                     pred_thr=0.60, cooldown=12)

Using cache found in /root/.cache/torch/hub/facebookresearch_pytorchvideo_main


Saved annotated video to: /content/match_annotated.mp4
