<a href="https://colab.research.google.com/github/YanivZimmer/collision/blob/main/finetune_vllm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive/ for this Colab session.
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import cv2
import os
from transformers import AutoProcessor, AutoModelForVision2Seq

# --- Run configuration ---
root_data = "hide"  # base directory that holds train.csv and the train/ folder

TRAIN_CSV = root_data + "/train.csv"
DATASET_DIR = root_data + "/train"
BATCH_SIZE = 8
NUM_FRAMES = 30 * 45  # number of frames sampled from each video
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
QUERY = (
    "This is a video of a car dash camera. "
    "Give a score between 0 to 1 for the probability of an accident in it."
)

# Read the training metadata table from disk.
df = pd.read_csv(TRAIN_CSV)

# Build the processor and the vision-language model a single time, move the
# model to the selected device and switch it to evaluation mode.
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B")
model = AutoModelForVision2Seq.from_pretrained("Qwen/Qwen2-VL-7B")
model = model.to(DEVICE)
model.eval()

def extract_features(video_path, processor, model):
    """Sample frames from a video and run them through the vision-language model.

    Up to ``NUM_FRAMES`` evenly spaced frame positions are read with OpenCV,
    converted from BGR to RGB, and passed to ``processor`` together with
    ``QUERY``. The processed inputs are then given to ``model.generate``.

    Args:
        video_path: Path of the video file to open.
        processor: Callable invoked as
            ``processor(images=..., text=..., return_tensors="pt")``.
        model: Object whose ``generate`` method accepts the processed inputs.

    Returns:
        The tensor returned by ``model.generate`` after ``squeeze(0)``.

    Raises:
        FileNotFoundError: If OpenCV cannot open ``video_path``.
        ValueError: If not a single frame could be decoded.
    """
    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        if not cap.isOpened():
            raise FileNotFoundError(f"Could not open video: {video_path}")

        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        # Clamp the upper bound so a reported frame count of zero still probes
        # frame 0, and drop repeated indices so a video shorter than NUM_FRAMES
        # is not decoded (and fed to the model) several times per frame.
        last_index = max(total_frames - 1, 0)
        frame_indices = torch.linspace(0, last_index, NUM_FRAMES).long().unique()

        for idx in frame_indices.tolist():
            cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
            ret, frame = cap.read()
            if not ret:
                # Keep whatever was decoded so far, as before.
                break
            frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    finally:
        # Release the capture even when opening or decoding fails.
        cap.release()

    if not frames:
        # Fail with a clear message instead of handing the processor an empty
        # image list.
        raise ValueError(f"No frames could be read from video: {video_path}")

    inputs = processor(images=frames, text=QUERY, return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        # NOTE(review): generate() presumably yields generated token ids rather
        # than a fixed-size feature vector - confirm this is what the
        # downstream linear head is meant to consume.
        features = model.generate(**inputs)
    return features.squeeze(0)

# Define dataset class
class AccidentDataset(Dataset):
    """Map-style dataset that yields ``(video_path, target)`` pairs.

    Args:
        df: Table with an ``id`` column (video file stem) and a ``target``
            column (label).
        data_dir: Directory containing the ``<id>.mp4`` files.
        num_frames: Stored on the instance; not read by this class itself.
    """

    def __init__(self, df, data_dir, num_frames=16):
        self.df = df
        self.data_dir = data_dir
        self.num_frames = num_frames

    def __len__(self):
        """Return the number of rows in the metadata table."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the video path and the float32 target for row ``idx``."""
        # Read each column on its own. ``df.iloc[idx]`` builds one Series for
        # the whole row, which upcasts an integer id to float as soon as any
        # other column is float, turning the file name into "<id>.0.mp4".
        video_id = self.df['id'].iloc[idx]
        target = self.df['target'].iloc[idx]
        video_path = os.path.join(self.data_dir, f"{video_id}.mp4")
        return video_path, torch.tensor(target, dtype=torch.float32)

# Wrap the metadata table in a dataset, then batch it with shuffling enabled.
dataset = AccidentDataset(df=df, data_dir=DATASET_DIR, num_frames=NUM_FRAMES)
dataloader = DataLoader(dataset=dataset, shuffle=True, batch_size=BATCH_SIZE)

# Define the model
class AccidentPredictor(nn.Module):
    """Single linear layer followed by a sigmoid, producing one score per input.

    Args:
        input_dim: Size of the last dimension of the input features.
            NOTE(review): the default of 4096 should be confirmed against the
            feature size actually produced upstream.
    """

    def __init__(self, input_dim=4096):
        super().__init__()
        self.fc = nn.Linear(in_features=input_dim, out_features=1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Project ``x`` to a single value and squash it into (0, 1)."""
        return self.sigmoid(self.fc(x))

# Classification head, binary cross-entropy loss and Adam optimizer.
predictor_model = AccidentPredictor().to(DEVICE)
criterion = nn.BCELoss()
optimizer = optim.Adam(predictor_model.parameters(), lr=1e-4)

# Training loop: only predictor_model is optimized; features come from
# extract_features, which runs the large model under torch.no_grad().
EPOCHS = 10
for epoch in range(EPOCHS):
    predictor_model.train()
    total_loss = 0
    for video_paths, targets in dataloader:
        # NOTE(review): torch.stack needs every per-video tensor to have the
        # same shape - confirm extract_features guarantees that.
        features = torch.stack([extract_features(vp, processor, model) for vp in video_paths]).to(DEVICE)
        targets = targets.to(DEVICE)
        optimizer.zero_grad()
        # Drop only the trailing size-1 dimension. A bare squeeze() would also
        # collapse the batch dimension when the last batch holds one sample,
        # leaving a 0-d output that no longer matches the shape of targets.
        outputs = predictor_model(features).squeeze(-1)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # max(..., 1) avoids a ZeroDivisionError when the dataloader is empty.
    print(f"Epoch [{epoch+1}/{EPOCHS}], Loss: {total_loss / max(len(dataloader), 1):.4f}")

print("Training complete!")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/347 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/3.56k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/56.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/5 [00:00<?, ?it/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/1.09G [00:00<?, ?B/s]

`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/244 [00:00<?, ?B/s]

IndexError: list index out of range