In [11]:
import os, math
from transformers import AutoImageProcessor, AutoModelForVideoClassification, AutoConfig
import numpy as np 

from pathlib import Path 
from datasets import Dataset, Features, Array3D, ClassLabel, Value, load_from_disk
from multiprocessing import Pool, cpu_count
from tqdm.notebook import tqdm\

from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch
import torch.nn.functional as F
from torch.optim import AdamW


from datasets import load_from_disk
import matplotlib.pyplot as plt

# the fine tuning stuff

from peft import LoraConfig, get_peft_model
from transformers import TrainingArguments, Trainer, get_linear_schedule_with_warmup


In [12]:
# load dat aset

preprocesed_dataset = load_from_disk("C:\\Users\\brand\\desktop\\workspace\\f1-telemetry-app\\fine-tune\\preprocessed_dataset")

# C:\Users\brand\desktop\workspace\f1-telemetry-app\fine-tune\preprocessed_dataset

Loading dataset from disk:   0%|          | 0/71 [00:00<?, ?it/s]

In [13]:

# confirm data set save is correct

#first_example = preprocesed_dataset[1]
#frames = np.array(first_example["features"])
#label = first_example["label"]

sample = preprocesed_dataset[0]
print("Video shape:", preprocesed_dataset.features)
torch.tensor(sample["pixel_values"]).shape

#print("Label:", label)

# for idx in range(0, len(frames)) :
#    plt.imshow(frames[idx])
#     plt.title(f"Label: {label}")
#     plt.axis("off")
#     plt.show()


Video shape: {'label': List(Value('int64'), length=3), 'pixel_values': List(List(List(List(Value('float32'))))), 'labels': List(Value('int64'))}


torch.Size([60, 3, 224, 224])

In [14]:
model_name = model_name = "MCG-NJU/videomae-base-finetuned-kinetics"



# condfig to overide number of classes
config = AutoConfig.from_pretrained(model_name)
config.num_labels = len(preprocesed_dataset["label"])
# take raw video frames, resize to right size, convert to pytorch tensors, normalize pixel values
processor = AutoImageProcessor.from_pretrained(model_name)

model = AutoModelForVideoClassification.from_pretrained(
    model_name,
    config = config,
    ignore_mismatched_sizes = True
)


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Some weights of VideoMAEForVideoClassification were not initialized from the model checkpoint at MCG-NJU/videomae-base-finetuned-kinetics and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([400]) in the checkpoint and torch.Size([972]) in the model instantiated
- classifier.weight: found shape torch.Size([400, 768]) in the checkpoint and torch.Size([972, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# wrap VideoMae model with LoRA


# LoRA config
lora_config = LoraConfig(
    r = 16, # try 4-32
    lora_alpha = 32, # scaling factor,
    target_modules = ["query", "value"], # attention layers to apply LoRA to
    lora_dropout = 0.1,
    bias = "none",
    task_type = "SEQ_CLS" # find a better way for this
)

model = get_peft_model(model, lora_config) # wrapping model

model.print_trainable_parameters() # see trainable parameters



trainable params: 1,337,292 || all params: 88,311,960 || trainable%: 1.5143


In [16]:
dataset = preprocesed_dataset.train_test_split(test_size=0.1)
train_dataset = dataset["train"]
eval_dataset = dataset["test"]

In [17]:
# to collate jagged array

def collate_fn(batch, num_frames = 60) :
    pixel_values = []
    labels = []

    for item in batch :
        vid = torch.tensor(item["pixel_values"])
                           
        if vid.shape[0] > num_frames :
            vid = vid[:num_frames]
        elif vid.shape[0] < num_frames :
            pad = num_frames - vid.shape[0]
            vid = F.pad(vid, (0,0,0,0,0,0,0,pad))  # pad frames dimension
        
        pixel_values.append(vid)
        labels.append(torch.tensor(item["labels"]).argmax().long())
    
    pixel_values = torch.stack(pixel_values)
    labels = torch.stack(labels)

    return {"pixel_values": pixel_values, "labels": labels}



In [18]:
train_loader = DataLoader(train_dataset, batch_size = 4, shuffle = True, collate_fn = collate_fn)
eval_loader = DataLoader(eval_dataset, batch_size = 4, collate_fn = collate_fn)

In [19]:
batch = next(iter(train_loader))


In [20]:
backbone = model.base_model.model.videomae
pos_embed = backbone.embeddings.position_embeddings  # shape: [1, old_seq_len, dim]
print(pos_embed.shape)

torch.Size([1, 1568, 768])


In [21]:
seq_len = pos_embed.shape[1]  # frames * patches/frame
dim = pos_embed.shape[2]

num_patches_per_frame = seq_len // 16
pos_embed_reshaped = pos_embed.view(1, 16, num_patches_per_frame, dim)  # [1, 16, P, D]

pos_embed_reshaped = pos_embed_reshaped.permute(0, 3, 2, 1)             # [1, D, P, 16]
pos_embed_reshaped = pos_embed_reshaped.reshape(1, dim * num_patches_per_frame, 16)  # [1, D*P, 16]

# interpolate frames from 16 → 60
pos_embed_resized = F.interpolate(
    pos_embed_reshaped, size=60, mode="linear", align_corners=False
)  # [1, D*P, 60]

# reshape back
pos_embed_resized = pos_embed_resized.view(1, dim, num_patches_per_frame, 60)  
# [1, D, P, 60]

pos_embed_resized = pos_embed_resized.permute(0, 3, 2, 1)  # [1, 60, P, D]

# final flatten
pos_embed_new = pos_embed_resized.reshape(1, 60 * num_patches_per_frame, dim)

backbone.embeddings.position_embeddings = torch.nn.Parameter(pos_embed_new)


In [22]:

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = model.to(device)  # move model to gpu


pixel_values = torch.tensor(batch["pixel_values"]).to(device) # move inputs to gpu
labels = torch.tensor(batch["labels"]).to(device).long() # move inputs to gpu

outputs = model(pixel_values = pixel_values, labels = labels)
print(outputs.loss, outputs.logits.shape)

print("pixel_values:", pixel_values.shape)
print("labels:", labels.shape)

  pixel_values = torch.tensor(batch["pixel_values"]).to(device) # move inputs to gpu
  labels = torch.tensor(batch["labels"]).to(device).long() # move inputs to gpu


tensor(6.6806, device='cuda:0', grad_fn=<NllLossBackward0>) torch.Size([4, 972])
pixel_values: torch.Size([4, 60, 3, 224, 224])
labels: torch.Size([4])


In [23]:
# ---------- HYPERPARAMS ----------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
epochs = 8
train_batch_size = 4           # actual dataloader batch size
eval_batch_size = 4
learning_rate = 3e-4           # a good starting point for LoRA / small adapter training
weight_decay = 0.01
gradient_accumulation_steps = 1  # increase if you need larger effective batch
max_grad_norm = 1.0
save_dir = "./checkpoints"
os.makedirs(save_dir, exist_ok=True)
use_amp = True                 # mixed precision (recommended when using CUDA)

In [24]:
# optimizer: only parameters that require_grad (PEFT typically sets base params frozen)
optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()),
                  lr=learning_rate, weight_decay=weight_decay)

In [25]:
num_update_steps_per_epoch = math.ceil(len(train_loader) / gradient_accumulation_steps)
max_train_steps = epochs * num_update_steps_per_epoch
warrmup_steps = int(0.06 * max_train_steps)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = warrmup_steps, num_training_steps = max_train_steps)
scaler = torch.cuda.amp.GradScaler(enabled = (use_amp and torch.cuda.is_available()))

  scaler = torch.cuda.amp.GradScaler(enabled = (use_amp and torch.cuda.is_available()))


In [26]:
def evaluate(model, eval_loader, device):
    model.eval()
    total = 0
    correct = 0
    loss_sum = 0.0

    with torch.no_grad():
        for batch in tqdm(eval_loader, desc="Eval", leave=False):
            # move to device
            pixel_values = batch["pixel_values"].to(device).float()
            labels = batch["labels"].to(device).long()   # already class indices now

            # forward (model returns .loss when labels provided)
            outputs = model(pixel_values=pixel_values, labels=labels)
            loss = outputs.loss
            logits = outputs.logits

            preds = logits.argmax(dim=1)   # [B]
            correct += (preds == labels).sum().item()
            total += labels.size(0)
            loss_sum += loss.item() * labels.size(0)

    avg_loss = loss_sum / total if total > 0 else 0.0
    accuracy = correct / total if total > 0 else 0.0
    model.train()
    return avg_loss, accuracy

In [None]:
# ---------- TRAIN LOOP ----------
global_step = 0
best_val_acc = 0.0

for epoch in range(epochs):
    model.train()                           # put model in training mode
    epoch_loss = 0.0
    pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}")

    optimizer.zero_grad()
    for step, batch in enumerate(pbar):
        pixel_values = batch["pixel_values"].to(device).float()  # ensure float
        labels = batch["labels"].to(device).long()               # ensure long

        # mixed precision context
        with torch.cuda.amp.autocast(enabled=(use_amp and torch.cuda.is_available())):
            outputs = model(pixel_values=pixel_values, labels=labels)
            loss = outputs.loss
            # if using gradient accumulation, scale loss down here
            loss = loss / gradient_accumulation_steps

        # scale the loss, backprop with scaler (AMP)
        scaler.scale(loss).backward()

        # gradient accumulation step
        if (step + 1) % gradient_accumulation_steps == 0:
            # unscale before clipping
            scaler.unscale_(optimizer)
            # optional gradient clipping
            torch.nn.utils.clip_grad_norm_(filter(lambda p: p.requires_grad, model.parameters()), max_grad_norm)

            # step optimizer and scaler
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

            # scheduler step (after optimizer.step)
            scheduler.step()
            global_step += 1

        epoch_loss += loss.item() * gradient_accumulation_steps
        pbar.set_postfix({"avg_loss": epoch_loss / (step + 1)})

    # --- end epoch: evaluate ---
    val_loss, val_acc = evaluate(model, eval_loader, device)
    print(f"Epoch {epoch+1} finished — train_loss: {epoch_loss/len(train_loader):.4f} — val_loss: {val_loss:.4f} — val_acc: {val_acc:.4f}")

    # save checkpoint if improved
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        print(f"New best val acc: {best_val_acc:.4f}. Saving checkpoint...")
        # 1) save full state (model + optimizer + scheduler) for resuming
        torch.save({
            "epoch": epoch + 1,
            "model_state_dict": model.state_dict(),
            "optimizer_state_dict": optimizer.state_dict(),
            "scheduler_state_dict": scheduler.state_dict(),
            "best_val_acc": best_val_acc,
            "scaler_state_dict": scaler.state_dict()
        }, os.path.join(save_dir, "best_full.pt"))

        # 2) save the PEFT adapters (recommended) - this saves LoRA adapter config & weights
        # model here is a PeftModel; this will save the adapter so you can re-apply it to base later
        model.save_pretrained(os.path.join(save_dir, "peft_adapter"))

# after training you can save the final model weights / adapter again:
model.save_pretrained(os.path.join(save_dir, "final_peft_adapter"))

Epoch 1/8:   0%|          | 0/219 [00:00<?, ?it/s]

  with torch.cuda.amp.autocast(enabled=(use_amp and torch.cuda.is_available())):


Eval:   0%|          | 0/25 [00:00<?, ?it/s]

Epoch 1 finished — train_loss: 2.7715 — val_loss: 0.6009 — val_acc: 0.7653
New best val acc: 0.7653. Saving checkpoint...


Epoch 2/8:   0%|          | 0/219 [00:00<?, ?it/s]

Eval:   0%|          | 0/25 [00:00<?, ?it/s]

Epoch 2 finished — train_loss: 0.6469 — val_loss: 0.5053 — val_acc: 0.7959
New best val acc: 0.7959. Saving checkpoint...


Epoch 3/8:   0%|          | 0/219 [00:00<?, ?it/s]

In [None]:
print(loss, loss.requires_grad, loss.grad_fn)
