<a href="https://colab.research.google.com/github/ZsofiaK/masterthesis/blob/main/Implementation/Benchmarking/VideoMAE_Benchmarking.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Benchmarking the AR task with VideoMAE fine-tuning

## Preliminaries

In [1]:
# Setting up dictionaries for later ease of use.
dataset_dict = {'fishClips' : 'Fish clips', 'AK-fish' : 'AK fish'}

# Storing current date for checkpoint saving.
from datetime import datetime

today = datetime.now().strftime('%d%m%Y')

In [2]:
# Set the dataset, model and embedding specifics for the classification.

dataset_name = 'fishClips'

frame_selection = 'motionAbsdiff_10'

image_size = 448

evaluate_only = False    # If model should be loaded from checkpoint instead of training.

load_checkpoint_date = '12062024'   # Checkpoint to use for evaluation only.

In [3]:
# Set training hyperparameters.
batch_size = 32

learning_rate = 1e-3

num_epochs = 10

In [4]:
# Additional notebook parameters, filled automatically if correct root
# directories are given.

pred_path = f'/content/drive/MyDrive/UvA/M Thesis/Data/Results/Predictions/pred_{dataset_name}_{frame_selection}_VideoMAE_{image_size}.csv'

params_path = f'/content/drive/MyDrive/UvA/M Thesis/Data/Results/Parameters/params_{dataset_name}_{frame_selection}_VideoMAE_{image_size}.csv'

checkpoint_path = f'/content/drive/MyDrive/UvA/M Thesis/Data/Results/Benchmarking/VideoMAE/Checkpoints/checkpoint_{today}_{dataset_name}_{frame_selection}_{image_size}.pth'

load_checkpoint_path = f'/content/drive/MyDrive/UvA/M Thesis/Data/Results/Benchmarking/VideoMAE/Checkpoints/checkpoint_{load_checkpoint_date}_{dataset_name}_{frame_selection}_{image_size}.pth'

dataset_dir = dataset_dict[dataset_name]

dataset_path = f'/content/drive/MyDrive/UvA/M Thesis/Data/{dataset_dir}'

clips_path = f'{dataset_path}/Clips'

selected_frames_path = f'{dataset_path}/Selected frames/{dataset_name}_{frame_selection}.csv'

nr_frames = int(frame_selection.split('_')[-1])

In [5]:
# Mount Drive.
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
# CUSTOM VIDEO DATASET CLASS.
import cv2
import numpy as np
import os
from torch.utils.data import Dataset, DataLoader
import pandas as pd

class CustomVideoDataset(Dataset):
    def __init__(self, clips_file, clips_folder, split, transform=None):
        data_frame = pd.read_csv(clips_file)

        self.data_frame = data_frame[data_frame['type']==split].copy().reset_index(drop=True)
        self.clips_folder = clips_folder
        self.transform = transform

    def __len__(self):
        return len(self.data_frame)

    def __getitem__(self, idx):
        if dataset_name == 'fishClips':
          video_path = os.path.join(self.clips_folder, f'{self.data_frame["video"][idx]}')

        elif dataset_name == 'AK-fish':
          video_path = os.path.join(self.clips_folder, f'{self.data_frame["video"][idx]}.mp4')

        label = self.data_frame['label'][idx]
        video = self.load_video(video_path)

        if self.transform:
            video = self.transform(video)

        return video, label

    def load_video(self, video_path):
        # Read the indices of selected frames.
        frame_selection_df = pd.read_csv(selected_frames_path, index_col='video')

        video_file = video_path.split('/')[-1]

        selected_frames = eval(frame_selection_df['frames'][video_file])

        # Load selected frames.
        cap = cv2.VideoCapture(video_path)
        frames = []

        for frame_idx in selected_frames:
          cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
          ret, frame = cap.read()

          if not ret:
              break

          # Convert frame from BGR (OpenCV default) to RGB
          frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
          frames.append(frame)

        cap.release()

        # Convert list of frames to a numpy array
        video = np.array(frames)

        # Resize frames to a consistent size.
        video = np.array([cv2.resize(frame, (image_size, image_size)) for frame in video])

        # Optional: Normalize the video frames if they are not already.
        if np.min(video) < 0.0 or np.max(video) > 1.0:
          video = video / 255.0  # Scale pixel values to [0, 1]

        # Transpose video dimensions to match the model's expected input format: (T, C, H, W)
        video = video.transpose(0, 3, 1, 2)

        # Ensure the video has 16 frames as that is the requirement for VideoMAE
        video = self.ensure_16_frames(video)

        return video

    def ensure_16_frames(self, video):
      import random

      num_frames = video.shape[0]

      if num_frames == 16:
          return video

      # Downsample frames randomly if there are more than 16.
      elif num_frames > 16:
          start_idx = random.randint(0, num_frames - 16)
          return video[start_idx:start_idx + 16]

      # Add padding if there are less than 16 frames.
      else:
          padding = np.zeros((16 - num_frames, video.shape[1], video.shape[2], video.shape[3]))
          video = np.concatenate((video, padding), axis=0)
          return video

## Fine tune the VideoMAE model on dataset

In [7]:
import torch
from transformers import VideoMAEFeatureExtractor, VideoMAEModel

# Load the VideoMAE model
model = VideoMAEModel.from_pretrained('MCG-NJU/videomae-base')
feature_extractor = VideoMAEFeatureExtractor.from_pretrained('MCG-NJU/videomae-base')

# Replace the classification head of the model
num_classes = 1  # Output continuous values between 0 and 1
model.classifier = torch.nn.Linear(model.config.hidden_size, num_classes)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/725 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/377M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/271 [00:00<?, ?B/s]



In [8]:
# Load your dataset and prepare it for training
train_dataset = CustomVideoDataset(clips_file=f'{dataset_path}/clips.csv', clips_folder=clips_path, split='train')
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [9]:
# Define the optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
loss_fn = torch.nn.BCEWithLogitsLoss()  # For binary classification with continuous output

In [10]:
# Set gradient accumulation steps
accumulation_steps = 4  # Adjust based on your needs

# Fine-tune the model on your dataset
from tqdm.notebook import tqdm

if not evaluate_only:   # If model should be trained.
  for epoch in range(num_epochs):
      model.train()
      optimizer.zero_grad()

      for batch_idx, batch in enumerate(
          tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}", unit="batch")):

          # print('Finished loader.')

          inputs, labels = batch

          # print('Inputs', type(inputs), inputs.shape)
          # print('Inputs', type(labels), labels.shape)

          labels = labels.float().unsqueeze(1)  # Reshape labels for BCEWithLogitsLoss

          # Inputs: batch size, temporal dimension, channels, height, width.
          B, T, C, H, W = inputs.shape
          inputs = inputs.permute(0, 2, 1, 3, 4).reshape(B * T, C, H, W)

          # print('Finished permutation.')
          # print('Inputs after permutation and reshape:', inputs.shape)

          # Apply the feature extractor
          features = feature_extractor(list(inputs), return_tensors="pt")

          # print('Finished extracting features.')
          # print('Features:', {key: value.shape for key, value in features.items()})

          # Reshape the features to fit the model input dimensions.
          features = {key: value.view(B, T, -1, value.size(-2), value.size(-1)) for key, value in features.items()}

          # print('Finished reshaping features.')
          # print('Features after reshape:', {key: value.shape for key, value in features.items()})

          # Pass the features through the VideoMAE model.
          outputs = model(**features)

          # Extract the last hidden state.
          last_hidden_state = outputs.last_hidden_state

          # Average pool last hidden state for classification.
          pooled_last_hidden_state = torch.mean(last_hidden_state, dim=1)

          # Pass the last hidden state through the classifier to get the logits
          logits = model.classifier(pooled_last_hidden_state)

          loss = loss_fn(logits, labels)
          loss = loss / accumulation_steps  # Scale the loss by the accumulation steps
          loss.backward()

          if (batch_idx + 1) % accumulation_steps == 0:
              optimizer.step()
              optimizer.zero_grad()

      # Ensure any remaining gradients are updated
      optimizer.step()
      optimizer.zero_grad()

      # Save model checkpoint
      torch.save({'epoch': epoch,
                  'model_state_dict': model.state_dict(),
                  'optimizer_state_dict': optimizer.state_dict(),
                  'loss': loss}, checkpoint_path)

  print(f'Finished training!')

Epoch 1/10:   0%|          | 0/6 [00:00<?, ?batch/s]

It looks like you are trying to rescale already rescaled images. If the input images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again.
  return torch.tensor(value)


Epoch 2/10:   0%|          | 0/6 [00:00<?, ?batch/s]

Epoch 3/10:   0%|          | 0/6 [00:00<?, ?batch/s]

Epoch 4/10:   0%|          | 0/6 [00:00<?, ?batch/s]

Epoch 5/10:   0%|          | 0/6 [00:00<?, ?batch/s]

Epoch 6/10:   0%|          | 0/6 [00:00<?, ?batch/s]

Epoch 7/10:   0%|          | 0/6 [00:00<?, ?batch/s]

Epoch 8/10:   0%|          | 0/6 [00:00<?, ?batch/s]

Epoch 9/10:   0%|          | 0/6 [00:00<?, ?batch/s]

Epoch 10/10:   0%|          | 0/6 [00:00<?, ?batch/s]

Finished training!


## Evaluate the model on the dataset

In [11]:
# Load test dataset
test_dataset = CustomVideoDataset(clips_file=f'{dataset_path}/clips.csv', clips_folder=clips_path, split='test')
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [12]:
# Load saved model if training was not run.
if evaluate_only:
  # Load the checkpoint
  checkpoint = torch.load(load_checkpoint_path)

  # Restore the model and optimizer states
  model.load_state_dict(checkpoint['model_state_dict'])
  optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

  # If needed, you can also restore the epoch and loss
  epoch = checkpoint['epoch']
  loss = checkpoint['loss']

In [13]:
# Evaluate the performance of the fine-tuned model and save predictions
from tqdm.notebook import tqdm

results = []
model.eval()

with torch.no_grad():
    for batch_idx, (inputs, labels) in enumerate(tqdm(test_loader, desc=f"Progress", unit="batch")):
        # Inputs: batch size, temporal dimension, channels, height, width.
        B, T, C, H, W = inputs.shape
        inputs = inputs.permute(0, 2, 1, 3, 4).reshape(B * T, C, H, W)

        # Apply the feature extractor
        features = feature_extractor(list(inputs), return_tensors="pt")

        # Reshape the features to fit the model input dimensions.
        features = {key: value.view(B, T, -1, value.size(-2), value.size(-1)) for key, value in features.items()}

        # Pass the features through the VideoMAE model.
        outputs = model(**features)

        # Extract the last hidden state
        last_hidden_state = outputs.last_hidden_state

        # Pass the last hidden state through the classifier to get the logits
        logits = model.classifier(last_hidden_state[:, 0, :])

        predictions = torch.sigmoid(logits).squeeze().tolist()  # Apply sigmoid to get continuous values between 0 and 1

        # Store predictions for the given batch.
        for i in range(B):
            video_name = test_dataset.data_frame.iloc[batch_idx * batch_size + i]['video']
            true_label = test_dataset.data_frame.iloc[batch_idx * batch_size + i]['label']
            prediction = predictions[i]

            results.append({
                'video': video_name,
                'prediction': prediction,
                'label': true_label
            })

# Save predictions to a CSV file
results_df = pd.DataFrame(results)
results_df.to_csv(pred_path, index=False)

print('Predictions saved to the specified location.')

Progress:   0%|          | 0/2 [00:00<?, ?batch/s]

Predictions saved to the specified location.


In [14]:
# Save test predictions and test labels.
y_test = list(results_df['label'])
y_pred_raw = list(results_df['prediction'])

In [15]:
# Choose best threshold for positive predictions.
# Threshold is selected to maximize F1 score.
from sklearn.metrics import precision_recall_curve

best_params = {}

precision, recall, thresholds = precision_recall_curve(y_test, y_pred_raw)

# Calculate F1 Scores for different thresholds
f1_scores = 2 * recall * precision / (recall + precision)

f1_scores = f1_scores[:-1]    # Drop last score as that corresponds to -inf threshold.

f1_scores_nonnull = f1_scores[~np.isnan(f1_scores)]

thresholds_nonnull = thresholds[~np.isnan(f1_scores)]

optimal_idx = np.argmax(f1_scores_nonnull)

pos_threshold = thresholds_nonnull[optimal_idx]

best_params['pos_threshold'] = pos_threshold

print('Optimal threshold:', pos_threshold)

Optimal threshold: 0.1865510493516922


  f1_scores = 2 * recall * precision / (recall + precision)


In [16]:
# Save best model parameters.
params_path = f'/content/drive/MyDrive/UvA/M Thesis/Data/Results/Parameters/params_{dataset_name}_{frame_selection}_VideoMAE_{image_size}.csv'

best_params = {'pos_threshold' : pos_threshold}

best_params_df = pd.DataFrame([best_params])

best_params_df.to_csv(params_path, index=False)

### Display classification report

In [17]:
# Display classification report.
from sklearn.metrics import classification_report

y_pred = (y_pred_raw > pos_threshold).astype(int)

report = classification_report(y_test, y_pred, target_names=['No Attack', 'Attack'])

print(report)

              precision    recall  f1-score   support

   No Attack       0.78      0.50      0.61        36
      Attack       0.18      0.44      0.26         9

    accuracy                           0.49        45
   macro avg       0.48      0.47      0.43        45
weighted avg       0.66      0.49      0.54        45

