<a href="https://colab.research.google.com/github/YanivZimmer/collision/blob/main/finetune_videomae.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; later cells read checkpoints from and
# write checkpoints/submissions to paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
# Folder that holds the CSV files and the train/test video folders; every
# data path in later cells is built by joining onto this value.
root_data = "hide_from_git"
# List the folder contents as a quick check of what is available.
os.listdir(root_data)


['sample_submission.csv',
 'test.csv',
 'train.csv',
 '.DS_Store',
 'test',
 'train',
 'weights',
 'submission.csv']

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import os
import cv2
from transformers import VideoMAEForVideoClassification, VideoMAEFeatureExtractor

from google.colab.patches import cv2_imshow
from transformers import VideoMAEForVideoClassification, VideoMAEFeatureExtractor
from google.colab import drive

#drive.mount('/content/drive')

# Define dataset class
class AccidentDataset(Dataset):
    """Labelled video-clip dataset backed by a CSV file and a folder of ``.mp4`` files.

    Every item is a clip of ``frames_per_clip`` RGB frames sampled evenly
    across one video, run through ``feature_extractor``, together with the
    row's ``target`` value as a float32 tensor.

    Args:
        csv_file: Path to a CSV with an ``id`` column and a ``target`` column.
        video_dir: Directory containing ``<id zero-padded to 5 digits>.mp4``.
        feature_extractor: Callable invoked as
            ``feature_extractor(frames, return_tensors="pt")`` whose result
            is indexed with ``'pixel_values'``.
        frames_per_clip: Number of frames every clip must contain.
    """

    def __init__(self, csv_file, video_dir, feature_extractor, frames_per_clip=16):
        self.data = pd.read_csv(csv_file)
        self.video_dir = video_dir
        self.feature_extractor = feature_extractor
        self.frames_per_clip = frames_per_clip

    def __len__(self):
        """Return the number of rows in the CSV."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(pixel_values, label)`` for CSV row ``idx``.

        Raises:
            ValueError: If no frame could be decoded from the video file.
        """
        row = self.data.iloc[idx]
        video_id = str(int(row['id'])).zfill(5)
        label = row['target']

        video_path = os.path.join(self.video_dir, f"{video_id}.mp4")

        frames = self.load_video(video_path)
        if not frames:
            # Fail with the offending path instead of an opaque error from
            # the feature extractor when it is handed an empty list.
            raise ValueError(f"No frames could be read from {video_path}")

        inputs = self.feature_extractor(frames, return_tensors="pt")
        return inputs['pixel_values'].squeeze(0), torch.tensor(label, dtype=torch.float32)

    @staticmethod
    def _sample_frame_indices(total_frames, frames_per_clip):
        """Return ``frames_per_clip`` evenly spaced frame indices in ``[0, total_frames - 1]``.

        Indices repeat when the video has fewer frames than the clip length.
        """
        return torch.linspace(0, total_frames - 1, frames_per_clip).long().tolist()

    def load_video(self, video_path):
        """Decode ``video_path`` into a list of 224x224 RGB frames.

        Returns exactly ``self.frames_per_clip`` frames whenever at least one
        frame can be decoded: repeated sample indices yield repeated frames,
        and if decoding stops before the reported frame count the last decoded
        frame is reused for the remaining slots. Returns an empty list when
        the container reports no frames or nothing can be decoded.
        """
        cap = cv2.VideoCapture(video_path)
        decoded = {}
        frame_idxs = []
        try:
            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            if total_frames <= 0:
                return []

            frame_idxs = self._sample_frame_indices(total_frames, self.frames_per_clip)
            wanted = set(frame_idxs)  # O(1) membership test per decoded frame

            for i in range(total_frames):
                ret, frame = cap.read()
                if not ret:
                    # The reported frame count can exceed what is decodable.
                    break
                if i in wanted:
                    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    frame = cv2.resize(frame, (224, 224))
                    decoded[i] = frame
        finally:
            # Always free the capture handle, even if decoding raised.
            cap.release()

        frames = []
        for frame_idx in frame_idxs:
            if frame_idx in decoded:
                frames.append(decoded[frame_idx])
            elif frames:
                # Requested index was never decoded: pad with the last good frame.
                frames.append(frames[-1])
        return frames



# Load feature extractor
# The same extractor instance is shared by every dataset built in later cells.
feature_extractor = VideoMAEFeatureExtractor.from_pretrained("MCG-NJU/videomae-base")


# Load pre-trained VideoMAE model
model = VideoMAEForVideoClassification.from_pretrained("MCG-NJU/videomae-base", num_labels=1)
# Swap the classifier for a small MLP that ends in a Sigmoid. Downstream code
# therefore treats the model's `.logits` as probabilities: the loss below is
# BCELoss, and evaluation thresholds the output at 0.5.
model.classifier = nn.Sequential(
    nn.Linear(model.classifier.in_features, 512),
    nn.ReLU(),
    nn.Linear(512, 1),
    nn.Sigmoid()
)

# Training setup
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#device = torch.device('mps')

model.to(device)
criterion = nn.BCELoss()
# All model parameters are optimised (nothing is frozen here).
optimizer = optim.AdamW(model.parameters(), lr=1e-4)
# Bare expression: displays the selected device as the cell output.
device
# The training loop itself lives in a later cell.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/271 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/725 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/377M [00:00<?, ?B/s]

Some weights of VideoMAEForVideoClassification were not initialized from the model checkpoint at MCG-NJU/videomae-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


device(type='cuda')

In [None]:
# Labelled evaluation split used by `evaluate`.
# NOTE(review): the labels are read from sample_submission.csv; confirm that
# its `target` column holds real ground truth rather than placeholder values.
test_csv = os.path.join(root_data,"sample_submission.csv")
test_video_dir = os.path.join(root_data,"test")

testset = AccidentDataset(test_csv, test_video_dir, feature_extractor)
# Evaluation needs a single deterministic pass, so the loader is not shuffled.
testloader = DataLoader(testset, batch_size=32, shuffle=False)


In [None]:
# from moviepy.editor import VideoFileClip

# video = VideoFileClip(video_path)
# video.ipython_display(width=480)


In [None]:
# Evaluation function
from tqdm import tqdm

def evaluate(model, dataloader):
    """Compute binary-classification accuracy of ``model`` over ``dataloader``.

    The model is switched to eval mode and run without gradient tracking.
    Its ``.logits`` output is thresholded at 0.5 and compared with the labels.

    Args:
        model: Callable as ``model(pixel_values=batch)`` returning an object
            with a ``.logits`` tensor of shape ``(batch, 1)``.
        dataloader: Iterable yielding ``(inputs, labels)`` batches, with
            ``labels`` of shape ``(batch,)``.

    Returns:
        The fraction of correctly classified samples, or ``nan`` when the
        dataloader yields no samples.
    """
    model.eval()
    total, correct = 0, 0
    with torch.no_grad():
        for inputs, labels in tqdm(dataloader):
            # Labels arrive as (batch,); add a trailing dim to match (batch, 1) outputs.
            inputs, labels = inputs.to(device), labels.to(device).unsqueeze(1)
            outputs = model(pixel_values=inputs).logits
            predictions = (outputs > 0.5).float()
            correct += (predictions == labels).sum().item()
            total += labels.size(0)

    if total == 0:
        # Nothing was evaluated: report that explicitly instead of dividing by zero.
        print("Evaluation Accuracy: n/a (dataloader yielded no samples)")
        return float("nan")

    accuracy = correct / total
    print(f"Evaluation Accuracy: {accuracy:.4f}")
    return accuracy

#evaluate(model,testloader)

In [None]:
# Define paths
data_csv = os.path.join(root_data,"train.csv")
video_dir = os.path.join(root_data,"train")

trainset = AccidentDataset(data_csv, video_dir, feature_extractor)
trainloader = DataLoader(trainset, batch_size=32, shuffle=True)

# Smoke-test one sample, but show only its shape and label: displaying the
# raw pixel tensor floods the cell output without telling the reader anything.
sample_pixels, sample_label = trainset[0]
sample_pixels.shape, sample_label


(tensor([[[[ 0.1083,  0.1768,  0.1939,  ...,  2.1804,  2.1804,  2.1804],
           [ 0.1597,  0.2282,  0.2453,  ...,  2.1804,  2.1804,  2.1804],
           [ 0.1939,  0.2624,  0.2796,  ...,  2.1804,  2.1804,  2.1804],
           ...,
           [-1.3815, -1.3815, -1.3815,  ...,  0.4166,  0.4166,  0.4166],
           [-1.3815, -1.3815, -1.3815,  ...,  0.4166,  0.4166,  0.4166],
           [-1.3815, -1.3815, -1.3815,  ...,  0.4166,  0.4166,  0.4166]],
 
          [[ 1.9559,  1.9384,  1.9384,  ...,  2.3585,  2.3585,  2.3585],
           [ 2.0084,  1.9909,  1.9909,  ...,  2.3585,  2.3585,  2.3585],
           [ 2.0434,  2.0259,  2.0259,  ...,  2.3585,  2.3585,  2.3585],
           ...,
           [-1.2129, -1.2129, -1.2129,  ...,  0.7654,  0.7654,  0.7654],
           [-1.2129, -1.2129, -1.2129,  ...,  0.7654,  0.7654,  0.7654],
           [-1.2129, -1.2129, -1.2129,  ...,  0.7654,  0.7654,  0.7654]],
 
          [[ 2.5529,  2.5703,  2.5703,  ...,  2.5703,  2.5703,  2.5703],
           [ 

In [None]:
# Restore the fine-tuned weights saved by the training loop.
model_path = "/content/drive/MyDrive/Data/videomae_epoch_19.pth"
# map_location lets a checkpoint saved on GPU load on a CPU-only runtime.
# weights_only=True restricts unpickling to tensor data, which is all a
# state_dict contains, and avoids the torch.load FutureWarning.
model.load_state_dict(torch.load(model_path, map_location=device, weights_only=True))
model.to(device)
# Trailing semicolon suppresses the full module repr in the cell output.
model.eval();

  model.load_state_dict(torch.load(model_path))


VideoMAEForVideoClassification(
  (videomae): VideoMAEModel(
    (embeddings): VideoMAEEmbeddings(
      (patch_embeddings): VideoMAEPatchEmbeddings(
        (projection): Conv3d(3, 768, kernel_size=(2, 16, 16), stride=(2, 16, 16))
      )
    )
    (encoder): VideoMAEEncoder(
      (layer): ModuleList(
        (0-11): 12 x VideoMAELayer(
          (attention): VideoMAESdpaAttention(
            (attention): VideoMAESdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=False)
              (key): Linear(in_features=768, out_features=768, bias=False)
              (value): Linear(in_features=768, out_features=768, bias=False)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): VideoMAESelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): VideoMAEIntermediate(
            (den

In [None]:


epochs = 16

# The loop header used to be `range(100, epochs+5)`, which is an empty range
# (100 > 21), so this cell silently did nothing. That skip is now an explicit
# switch; the default keeps the previous behaviour (no training).
RUN_TRAINING = False

# NOTE(review): checkpoints are named videomae_epoch_{epoch+1}.pth, so the
# videomae_epoch_19.pth file loaded above was written after epoch index 18.
# Resuming at index 19 is therefore presumed to be the intent - confirm.
start_epoch = 19
end_epoch = epochs + 5

if RUN_TRAINING:
    for epoch in range(start_epoch, end_epoch):
        model.train()
        epoch_loss = 0
        for inputs, labels in tqdm(trainloader):
            # Labels arrive as (batch,); BCELoss needs them shaped like the (batch, 1) outputs.
            inputs, labels = inputs.to(device), labels.to(device).unsqueeze(1)
            optimizer.zero_grad()
            outputs = model(pixel_values=inputs).logits
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()

        # Report progress against the real end of the range, not `epochs`.
        print(f"Epoch {epoch+1}/{end_epoch}, Loss: {epoch_loss/len(trainloader):.4f}")
        # One checkpoint per epoch (the duplicate save inside the branch below was redundant).
        torch.save(model.state_dict(), f"/content/drive/MyDrive/Data/videomae_epoch_{epoch+1}.pth")

        if epoch % 30 == 0:
            # evaluate() switches to eval mode; model.train() above restores
            # training mode at the start of the next epoch.
            evaluate(model,testloader)


In [None]:
# Report accuracy of the current weights on `testloader`. When the cells are run
# in order, this is still the labelled loader built from sample_submission.csv.
evaluate(model,testloader)


  0%|          | 0/42 [00:54<?, ?it/s]


KeyboardInterrupt: 

In [None]:
# prompt: inherit AccidentDataset and change only get item to not return label
class BlindAccidentDataset(AccidentDataset):
    """Unlabelled variant of ``AccidentDataset``: each item is the pixel tensor only."""

    def __getitem__(self, idx):
        """Return the preprocessed clip for CSV row ``idx`` without reading a label."""
        numeric_id = int(self.data.iloc[idx]['id'])
        # Video files are named after the id, zero-padded to five digits.
        clip_name = f"{str(numeric_id).zfill(5)}.mp4"
        clip_frames = self.load_video(os.path.join(self.video_dir, clip_name))

        encoded = self.feature_extractor(clip_frames, return_tensors="pt")
        pixel_values = encoded['pixel_values']
        # Drop the leading batch dimension added by the extractor.
        return pixel_values.squeeze(0)


In [None]:
# prompt: for the dataset in test.csv (not the other csv) return the model predictions for it. this csv does not contain the true prediction

import pandas as pd
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

# Assuming the necessary imports and model definition are already present from the previous code

# Load the trained model (replace with your actual model path)
model_path = "/content/drive/MyDrive/Data/videomae_epoch_19.pth"
# map_location lets a checkpoint saved on GPU load on a CPU-only runtime.
# weights_only=True restricts unpickling to tensor data, which is all a
# state_dict contains, and avoids the torch.load FutureWarning.
model.load_state_dict(torch.load(model_path, map_location=device, weights_only=True))
model.to(device)
model.eval()


# Create a prediction function
def predict_on_test(model, dataloader):
    """Run ``model`` over an unlabelled ``dataloader`` and collect its outputs.

    The model is put in eval mode here, so the result does not depend on the
    caller having done so, and inference runs without gradient tracking.

    Args:
        model: Callable as ``model(pixel_values=batch)`` returning an object
            with a ``.logits`` tensor.
        dataloader: Iterable yielding input batches only (no labels).

    Returns:
        A list with one NumPy array per sample (one row of the model's
        ``.logits``), in the order the dataloader produced the samples.
    """
    model.eval()
    predictions = []
    with torch.no_grad():
        for inputs in tqdm(dataloader):  # No labels for test set
            inputs = inputs.to(device)
            outputs = model(pixel_values=inputs).logits
            # extend() iterates over the batch dimension: one array per sample.
            predictions.extend(outputs.cpu().numpy())
    return predictions

# Load the unlabelled test split under its own names. Rebinding `test_csv` /
# `testset` / `testloader` here would replace the labelled loader that
# `evaluate` expects and break any later call to it.
blind_test_csv = os.path.join(root_data, "test.csv")
test_video_dir = os.path.join(root_data, "test")

blind_testset = BlindAccidentDataset(blind_test_csv, test_video_dir, feature_extractor)
blind_testloader = DataLoader(blind_testset, batch_size=32, shuffle=False)  # Important: shuffle=False keeps predictions in CSV row order

# Create the output folder up front so a missing directory is discovered
# before the long inference pass rather than after it.
submission_path = '/content/drive/MyDrive/Data/nexar-collision-prediction/submission.csv'
os.makedirs(os.path.dirname(submission_path), exist_ok=True)

# Make predictions
test_predictions = predict_on_test(model, blind_testloader)


# Process and print the predictions (example)
# Assuming you want to print the prediction probability for each video
test_df = pd.read_csv(blind_test_csv)

# Predictions are matched to ids purely by position, so the counts must agree.
assert len(test_predictions) == len(test_df), (
    f"got {len(test_predictions)} predictions for {len(test_df)} test rows"
)

# for i, pred in enumerate(test_predictions):
#   print(f"Video {test_df.iloc[i]['id']}: Prediction Probability = {pred[0]:.4f}")


#Or save them to a CSV
submission_df = pd.DataFrame({'id': test_df['id'], 'target': [p[0] for p in test_predictions]})
submission_df.to_csv(submission_path, index=False)


  model.load_state_dict(torch.load(model_path))
100%|██████████| 42/42 [44:49<00:00, 64.04s/it]
