<a href="https://colab.research.google.com/github/YanivZimmer/collision/blob/main/fe_frame_and_sequence_process.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive into the Colab VM under /content/drive/.
from google.colab import drive
drive.mount('/content/drive/')
import os
# Dataset root; later cells build their CSV and video paths from this value.
root_data = "hide/nexar-collision-prediction"
# Trailing expression: the notebook echoes the directory listing as cell output.
os.listdir(root_data)


Mounted at /content/drive/


['sample_submission.csv',
 'test.csv',
 'train.csv',
 '.DS_Store',
 'test',
 'train',
 'weights',
 'submission.csv']

In [None]:
import torch
import os
import cv2
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image
import torch.nn as nn
from tqdm import tqdm
# Dataset Class
class AccidentDataset(Dataset):
    """Dataset of video clips represented as per-frame feature vectors.

    Every row of ``csv_file`` provides an ``id`` (zero-padded to five digits
    to form ``<id>.mp4`` inside ``video_dir``) and a ``target`` label.  Evenly
    spaced frames are decoded with OpenCV, preprocessed with ImageNet
    normalisation and pushed through ``feature_extractor`` in one batched,
    gradient-free forward pass.

    Args:
        csv_file: Path of the CSV holding the ``id`` and ``target`` columns.
        video_dir: Directory that contains the ``.mp4`` files.
        feature_extractor: Frame-level model.  Its ``classifier`` attribute is
            replaced in place with ``torch.nn.Identity``.
        frames_per_clip: Number of evenly spaced frame indices to sample.
        max_frames: Accepted for backward compatibility; not used.
    """

    def __init__(self, csv_file, video_dir, feature_extractor,frames_per_clip=16, max_frames=30*45):
        self.data = pd.read_csv(csv_file)
        self.video_dir = video_dir
        self.frames_per_clip = frames_per_clip

        # NOTE: this mutates the caller's model object, not a copy of it.
        self.feature_extractor = feature_extractor
        self.feature_extractor.classifier = torch.nn.Identity()

        self.transform = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])

    def __len__(self):
        """Return the number of rows (videos) listed in the CSV."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(features, label, num_frames)`` for the video at ``idx``."""
        row = self.data.iloc[idx]
        video_id = str(int(row['id'])).zfill(5)
        label = torch.tensor(row['target'], dtype=torch.float32)
        video_path = os.path.join(self.video_dir, f"{video_id}.mp4")

        frames = self.load_video(video_path)
        features = self.extract_features(frames)

        return features, label, len(features)

    def load_video(self, video_path):
        """Decode the sampled frames of ``video_path`` as 224x224 RGB PIL images.

        Returns an empty list when the container reports no frames.  Frames
        are read sequentially; decoding stops early if a read fails.
        """
        cap = cv2.VideoCapture(video_path)
        try:
            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            if total_frames <= 0:
                return []

            # A set gives O(1) membership and collapses duplicate indices that
            # linspace produces when the clip is shorter than frames_per_clip.
            wanted = set(
                torch.linspace(0, total_frames - 1, self.frames_per_clip).long().tolist()
            )
            if not wanted:
                return []

            frames = []
            # Nothing after the largest requested index is needed.
            for i in range(max(wanted) + 1):
                ret, frame = cap.read()
                if not ret:
                    break
                if i in wanted:
                    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    frame = cv2.resize(frame, (224, 224))
                    frames.append(Image.fromarray(frame))
            return frames
        finally:
            # Release the capture handle even if decoding raises.
            cap.release()

    def extract_features(self, frames):
        """Return a ``(num_frames, feature_dim)`` tensor for ``frames``.

        All frames go through the extractor in a single batch instead of one
        forward pass per frame.  An empty ``frames`` list yields a single
        all-zero row of width 1280 so that callers always receive at least one
        time step.
        """
        if not frames:
            return torch.zeros(1, 1280)

        batch = torch.stack([self.transform(frame) for frame in frames])
        # NOTE(review): features are computed without gradients, so an
        # optimizer that holds the extractor's parameters cannot update them
        # through this path -- confirm whether fine-tuning was intended.
        with torch.no_grad():
            return self.feature_extractor(batch)

# Collate Function
def collate_fn(batch):
    """Pad variable-length feature sequences into one batch tensor.

    Args:
        batch: Sequence of ``(features, label, length)`` triples where
            ``features`` is a ``(length, feature_dim)`` tensor.

    Returns:
        ``(padded_features, labels, mask)``: ``padded_features`` has shape
        ``(batch, max_length, feature_dim)`` and is zero past each sequence's
        length; ``mask`` is a bool tensor that is ``True`` at padded positions.
    """
    features, labels, lengths = zip(*batch)
    max_length = max(lengths)
    # Take the width from the data rather than hard-coding the backbone size.
    feature_dim = features[0].shape[-1]
    padded_features = torch.zeros(len(batch), max_length, feature_dim)
    mask = torch.ones(len(batch), max_length, dtype=torch.bool)

    for i, (feat, length) in enumerate(zip(features, lengths)):
        padded_features[i, :length, :] = feat
        mask[i, :length] = False

    return padded_features, torch.tensor(labels), mask

# Transformer Model
class VideoTransformer(nn.Module):
    """Transformer encoder over a sequence of frame features.

    Args:
        feature_dim: Width of each frame feature vector.
        num_heads: Number of attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
        max_len: Number of learned positional embeddings, i.e. the longest
            sequence the model accepts.
    """

    def __init__(self, feature_dim=1280, num_heads=8, num_layers=4, max_len=100):
        super(VideoTransformer, self).__init__()
        self.pos_embedding = nn.Parameter(torch.randn(1, max_len, feature_dim))
        self.transformer = nn.TransformerEncoder(
            # Inputs arrive as (batch, time, dim).  Without batch_first=True the
            # encoder treats axis 0 as time and attends across the batch.
            nn.TransformerEncoderLayer(d_model=feature_dim, nhead=num_heads, batch_first=True),
            num_layers=num_layers,
            # Keep the output length equal to the input length.
            enable_nested_tensor=False,
        )
        self.fc = nn.Linear(feature_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, frame_features, mask):
        """Score each clip with a probability in ``[0, 1]``.

        Args:
            frame_features: ``(batch, time, feature_dim)`` tensor.
            mask: ``(batch, time)`` bool tensor, ``True`` at padded positions,
                or ``None`` when there is no padding.

        Returns:
            ``(batch, 1)`` tensor of sigmoid outputs.

        Raises:
            ValueError: If the sequence is longer than the positional table.
        """
        seq_length = frame_features.size(1)
        if seq_length > self.pos_embedding.size(1):
            raise ValueError(
                f"Sequence length {seq_length} exceeds the positional embedding "
                f"table size {self.pos_embedding.size(1)}"
            )
        if mask is None:
            mask = torch.zeros(
                frame_features.shape[:2], dtype=torch.bool, device=frame_features.device
            )

        # Out-of-place add so the caller's tensor is left untouched.
        embedded = frame_features + self.pos_embedding[:, :seq_length, :]
        encoded = self.transformer(embedded, src_key_padding_mask=mask)

        # Average over real time steps only; padded steps carry no signal.
        valid = (~mask).unsqueeze(-1).to(encoded.dtype)
        pooled = (encoded * valid).sum(dim=1) / valid.sum(dim=1).clamp(min=1.0)
        return self.sigmoid(self.fc(pooled))

# Training Function
def train(model,fe, dataloader, epochs=10, lr=1e-4, save_dir="/content/drive/MyDrive/Data"):
    """Train ``model`` with binary cross-entropy and checkpoint every epoch.

    Args:
        model: Sequence model called as ``model(frame_features, mask)`` and
            returning probabilities of shape ``(batch, 1)``.
        fe: Frame feature extractor.  Its parameters are given to the optimizer
            with a learning rate of 1e-5 and its weights are saved each epoch.
        dataloader: Yields ``(frame_features, labels, mask)`` batches.
        epochs: Number of passes over ``dataloader``.
        lr: Learning rate for ``model``'s parameters.
        save_dir: Directory that receives ``vit_<epoch>.pth`` and
            ``mobile_<epoch>.pth``; created if missing.
    """
    # NOTE(review): this loop never calls ``fe`` itself.  Its parameters only
    # change if the features coming out of ``dataloader`` carry gradients.
    optimizer = torch.optim.Adam([
        {'params': fe.parameters(), 'lr': 1e-5},  # Smaller LR for MobileNet
        {'params': model.parameters(), 'lr': lr}  # Larger LR for Transformer
    ])

    criterion = nn.BCELoss()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    os.makedirs(save_dir, exist_ok=True)

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for frame_features, labels, mask in tqdm(dataloader):
            frame_features, labels, mask = frame_features.to(device), labels.to(device), mask.to(device)
            optimizer.zero_grad()
            outputs = model(frame_features, mask)
            # squeeze(-1) keeps the batch axis, so a batch of one still matches
            # the shape of ``labels``; BCELoss needs float targets.
            loss = criterion(outputs.squeeze(-1), labels.float())
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        # Guard against an empty dataloader.
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {total_loss / max(len(dataloader), 1):.4f}")
        torch.save(model.state_dict(), os.path.join(save_dir, f"vit_{epoch+1}.pth"))
        torch.save(fe.state_dict(), os.path.join(save_dir, f"mobile_{epoch+1}.pth"))



# Load Dataset and Start Training
fe = models.mobilenet_v2(pretrained=True)
# Drop the classification head so the network outputs pooled features.
fe.classifier = nn.Identity()

# Fine-tuning strategy: the first ten feature stages stay frozen and the
# remaining stages are marked trainable.
for stage_index, stage in enumerate(fe.features):
    stage_trainable = stage_index >= 10
    for param in stage.parameters():
        param.requires_grad = stage_trainable

# Put MobileNet in training mode.
fe.train()

dataset = AccidentDataset(
    csv_file=f"{root_data}/train.csv",
    video_dir=f"{root_data}/train",
    feature_extractor=fe,
)
dataloader = DataLoader(dataset, shuffle=True, batch_size=32, collate_fn=collate_fn)

model = VideoTransformer()
train(model, fe, dataloader)


Downloading: "https://download.pytorch.org/models/mobilenet_v2-b0353104.pth" to /root/.cache/torch/hub/checkpoints/mobilenet_v2-b0353104.pth
100%|██████████| 13.6M/13.6M [00:00<00:00, 209MB/s]
  0%|          | 0/47 [00:15<?, ?it/s]


KeyboardInterrupt: 

In [None]:
import torch
import os
import cv2
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
from PIL import Image
import torch.nn as nn
from tqdm import tqdm
from transformers import AutoModel, AutoProcessor

# Load the DINOv2 backbone and its image processor.
# `device` is module-level state that the dataset, collate function and model
# defined later in this cell read directly.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# The backbone is moved to `device` and put in eval mode.
fe = AutoModel.from_pretrained("facebook/dinov2-base").to(device).eval()
processor = AutoProcessor.from_pretrained("facebook/dinov2-base",use_fast=True)

# Dataset Class
class AccidentDataset(Dataset):
    """Dataset of video clips represented as DINOv2 per-frame features.

    Every row of ``csv_file`` provides an ``id`` (zero-padded to five digits
    to form ``<id>.mp4`` inside ``video_dir``) and a ``target`` label.

    Args:
        csv_file: Path of the CSV holding the ``id`` and ``target`` columns.
        video_dir: Directory that contains the ``.mp4`` files.
        feature_extractor: Model whose output exposes ``last_hidden_state``.
        processor: Callable that turns PIL images into model inputs via
            ``processor(images=..., return_tensors="pt")``.
        frames_per_clip: Number of evenly spaced frame indices to sample.
    """

    def __init__(self, csv_file, video_dir, feature_extractor, processor, frames_per_clip=16):
        self.data = pd.read_csv(csv_file)
        self.video_dir = video_dir
        self.frames_per_clip = frames_per_clip
        self.feature_extractor = feature_extractor
        self.processor = processor

    def __len__(self):
        """Return the number of rows (videos) listed in the CSV."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(features, label, num_frames)`` for the video at ``idx``."""
        row = self.data.iloc[idx]
        video_id = str(int(row['id'])).zfill(5)
        label = torch.tensor(row['target'], dtype=torch.float32)
        video_path = os.path.join(self.video_dir, f"{video_id}.mp4")

        frames = self.load_video(video_path)
        features = self.extract_features(frames)

        return features, label, len(features)

    def load_video(self, video_path):
        """Decode the sampled frames of ``video_path`` as RGB PIL images.

        Returns an empty list when the container reports no frames.  Frames
        are read sequentially; decoding stops early if a read fails.
        """
        cap = cv2.VideoCapture(video_path)
        try:
            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            if total_frames <= 0:
                return []

            # A set gives O(1) membership and collapses duplicate indices that
            # linspace produces when the clip is shorter than frames_per_clip.
            wanted = set(
                torch.linspace(0, total_frames - 1, self.frames_per_clip).long().tolist()
            )
            if not wanted:
                return []

            frames = []
            # Nothing after the largest requested index is needed.
            for i in range(max(wanted) + 1):
                ret, frame = cap.read()
                if not ret:
                    break
                if i in wanted:
                    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    frames.append(Image.fromarray(frame))
            return frames
        finally:
            # Release the capture handle even if decoding raises.
            cap.release()

    def extract_features(self, frames):
        """Return a ``(num_frames, hidden_dim)`` tensor of mean-pooled tokens.

        The whole clip is preprocessed and passed through the backbone in one
        batch instead of one call per frame.  An empty ``frames`` list yields
        a single all-zero row of width 768 (the DINOv2-base hidden size).
        """
        if not frames:
            return torch.zeros(1, 768)  # DINOv2 base has 768-dim features

        # `device` is the module-level device the backbone was moved to.
        inputs = self.processor(images=frames, return_tensors="pt").to(device)
        with torch.no_grad():
            tokens = self.feature_extractor(**inputs).last_hidden_state
        # Average over the token axis to get one vector per frame.
        return tokens.mean(dim=1)

# Collate Function
def collate_fn(batch):
    """Pad variable-length feature sequences into one batch on ``device``.

    Args:
        batch: Sequence of ``(features, label, length)`` triples where
            ``features`` is a ``(length, feature_dim)`` tensor.

    Returns:
        ``(padded_features, labels, mask)``, all allocated on the module-level
        ``device``.  ``padded_features`` has shape
        ``(batch, max_length, feature_dim)`` and is zero past each sequence's
        length; ``mask`` is a bool tensor that is ``True`` at padded positions.
    """
    features, labels, lengths = zip(*batch)
    max_length = max(lengths)
    # Take the width from the data rather than hard-coding the backbone size.
    feature_dim = features[0].shape[-1]
    padded_features = torch.zeros(len(batch), max_length, feature_dim, device=device)
    mask = torch.ones(len(batch), max_length, dtype=torch.bool, device=device)

    for i, (feat, length) in enumerate(zip(features, lengths)):
        padded_features[i, :length, :] = feat
        mask[i, :length] = False

    return padded_features, torch.tensor(labels, device=device), mask

# Transformer Model
class VideoTransformer(nn.Module):
    """Transformer encoder over a sequence of frame features.

    All parameters are created on the default device; move the module with
    ``.to(device)`` as the training and inference code already does.

    Args:
        feature_dim: Width of each frame feature vector.
        num_heads: Number of attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
        max_len: Number of learned positional embeddings, i.e. the longest
            sequence the model accepts.
    """

    def __init__(self, feature_dim=768, num_heads=8, num_layers=4, max_len=100):
        super(VideoTransformer, self).__init__()
        # Created on the same device as the other weights so the module is
        # never split across devices before ``.to(...)`` is called.
        self.pos_embedding = nn.Parameter(torch.randn(1, max_len, feature_dim))
        self.transformer = nn.TransformerEncoder(
            # Inputs arrive as (batch, time, dim).  Without batch_first=True the
            # encoder treats axis 0 as time and attends across the batch.
            nn.TransformerEncoderLayer(d_model=feature_dim, nhead=num_heads, batch_first=True),
            num_layers=num_layers,
            # Keep the output length equal to the input length.
            enable_nested_tensor=False,
        )
        self.fc = nn.Linear(feature_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, frame_features, mask):
        """Score each clip with a probability in ``[0, 1]``.

        Args:
            frame_features: ``(batch, time, feature_dim)`` tensor.
            mask: ``(batch, time)`` bool tensor, ``True`` at padded positions,
                or ``None`` when there is no padding.

        Returns:
            ``(batch, 1)`` tensor of sigmoid outputs.

        Raises:
            ValueError: If the sequence is longer than the positional table.
        """
        seq_length = frame_features.size(1)
        if seq_length > self.pos_embedding.size(1):
            raise ValueError(
                f"Sequence length {seq_length} exceeds the positional embedding "
                f"table size {self.pos_embedding.size(1)}"
            )
        if mask is None:
            mask = torch.zeros(
                frame_features.shape[:2], dtype=torch.bool, device=frame_features.device
            )

        # Out-of-place add so the caller's tensor is left untouched.
        embedded = frame_features + self.pos_embedding[:, :seq_length, :]
        encoded = self.transformer(embedded, src_key_padding_mask=mask)

        # Average over real time steps only; padded steps carry no signal.
        valid = (~mask).unsqueeze(-1).to(encoded.dtype)
        pooled = (encoded * valid).sum(dim=1) / valid.sum(dim=1).clamp(min=1.0)
        return self.sigmoid(self.fc(pooled))

# Training Function
def train(model, fe, dataloader, epochs=10, lr=1e-4):
    """Train ``model`` with binary cross-entropy and checkpoint every epoch.

    Args:
        model: Sequence model called as ``model(frame_features, mask)`` and
            returning probabilities of shape ``(batch, 1)``.
        fe: Frame feature extractor; accepted for signature compatibility and
            not used inside this function.
        dataloader: Yields ``(frame_features, labels, mask)`` batches.
        epochs: Number of passes over ``dataloader``.
        lr: Learning rate for ``model``'s parameters.

    Writes ``vit_<epoch>.pth`` to the working directory after each epoch.
    """
    optimizer = torch.optim.Adam([
        {'params': model.parameters(), 'lr': lr}  # Fine-tuning only Transformer
    ])

    criterion = nn.BCELoss()
    model.to(device)

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for frame_features, labels, mask in tqdm(dataloader):
            frame_features, labels, mask = frame_features.to(device), labels.to(device), mask.to(device)
            optimizer.zero_grad()
            outputs = model(frame_features, mask)
            # squeeze(-1) keeps the batch axis, so a batch of one still matches
            # the shape of ``labels``; BCELoss needs float targets.
            loss = criterion(outputs.squeeze(-1), labels.float())
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        # Guard against an empty dataloader.
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {total_loss / max(len(dataloader), 1):.4f}")
        torch.save(model.state_dict(), f"vit_{epoch+1}.pth")

# Load Dataset and Start Training
# Paths of the training annotations and the directory of training clips.
train_csv_path = f"{root_data}/train.csv"
train_video_dir = f"{root_data}/train"

dataset = AccidentDataset(
    csv_file=train_csv_path,
    video_dir=train_video_dir,
    feature_extractor=fe,
    processor=processor,
)
dataloader = DataLoader(dataset, shuffle=True, collate_fn=collate_fn, batch_size=16)

# Fresh sequence model with default hyper-parameters, trained on DINOv2 features.
model = VideoTransformer()
train(model, fe, dataloader)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/548 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/436 [00:00<?, ?B/s]

`use_fast` is set to `True` but the image processor class does not have a fast version.  Falling back to the slow version.
100%|██████████| 94/94 [1:25:41<00:00, 54.69s/it]


Epoch [1/10], Loss: 0.6951


100%|██████████| 94/94 [1:10:48<00:00, 45.20s/it]


Epoch [2/10], Loss: 0.5941


100%|██████████| 94/94 [1:09:33<00:00, 44.40s/it]


Epoch [3/10], Loss: 0.5890


  2%|▏         | 2/94 [01:12<56:24, 36.79s/it]

In [None]:
class BlindAccidentDataset(AccidentDataset):
    """Label-free variant of ``AccidentDataset`` for the test split.

    Each item is the clip's preprocessed frames rather than extracted
    features, so no ``target`` column is read.
    """

    def __getitem__(self, idx):
        """Return the ``pixel_values`` tensor for the video at ``idx``.

        Raises:
            ValueError: If no frame could be decoded from the video file.
        """
        video_id = str(int(self.data.iloc[idx]['id'])).zfill(5)

        video_path = os.path.join(self.video_dir, f"{video_id}.mp4")

        frames = self.load_video(video_path)
        if len(frames) == 0:
            raise ValueError(f"No frames could be decoded from {video_path}")

        # Preprocessing belongs to the image processor; ``feature_extractor``
        # is the backbone model and cannot consume PIL images.
        inputs = self.processor(images=frames, return_tensors="pt")
        # The leading (frame) axis is kept even for a one-frame clip so every
        # item has the same rank.
        return inputs['pixel_values']



In [None]:

import pandas as pd
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model.to(device)
model.eval()


# Create a prediction function
def predict_on_test(model, dataloader, feature_extractor=None):
    """Run ``model`` over an unlabeled dataloader and collect its outputs.

    Args:
        model: Trained model.
        dataloader: Yields batches of preprocessed frames.
        feature_extractor: Optional frame backbone whose output exposes
            ``last_hidden_state``.  When given, batches are expected to be
            shaped ``(batch, frames, channels, height, width)``; each frame is
            encoded, mean-pooled over tokens, and the resulting sequence is
            passed to ``model(features, mask)`` with an all-``False`` mask.
            When omitted, ``model`` is called as
            ``model(pixel_values=...)`` and its ``.logits`` are used.

    Returns:
        List with one NumPy row per test sample.
    """
    predictions = []
    with torch.no_grad():
        for inputs in tqdm(dataloader):  # No labels for test set
            inputs = inputs.to(device)
            if feature_extractor is None:
                outputs = model(pixel_values=inputs).logits
            else:
                batch_size, clip_len = inputs.shape[:2]
                # Fold the frame axis into the batch axis for the backbone.
                tokens = feature_extractor(pixel_values=inputs.flatten(0, 1)).last_hidden_state
                features = tokens.mean(dim=1).view(batch_size, clip_len, -1)
                # Default collation only succeeds for equal-length clips, so
                # no position is padding.
                mask = torch.zeros(batch_size, clip_len, dtype=torch.bool, device=inputs.device)
                outputs = model(features, mask)
            predictions.extend(outputs.cpu().numpy())
    return predictions

# Load the test dataset and create dataloader
test_csv = os.path.join(root_data, "test.csv")
test_video_dir = os.path.join(root_data, "test")

# The dataset constructor requires the image processor as well as the backbone.
testset = BlindAccidentDataset(test_csv, test_video_dir, fe, processor)
testloader = DataLoader(testset, batch_size=32, shuffle=False)  # Important: shuffle=False

# Make predictions
test_predictions = predict_on_test(model, testloader, feature_extractor=fe)


test_df = pd.read_csv(test_csv)

# for i, pred in enumerate(test_predictions):
#   print(f"Video {test_df.iloc[i]['id']}: Prediction Probability = {pred[0]:.4f}")


# Save them to a CSV
submission_df = pd.DataFrame({'id': test_df['id'], 'target': [p[0] for p in test_predictions]})
submission_df.to_csv('hide/submission_fe_and_transformers.csv', index=False)


# Load video on dataset
is inefficient. Improve it, since it is the bottleneck regardless of the model or GPU.

In [None]:
!pip install decord

Collecting decord
  Downloading decord-0.6.0-py3-none-manylinux2010_x86_64.whl.metadata (422 bytes)
Downloading decord-0.6.0-py3-none-manylinux2010_x86_64.whl (13.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.6/13.6 MB[0m [31m68.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: decord
Successfully installed decord-0.6.0


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import os
import cv2
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
from PIL import Image
from tqdm import tqdm
from transformers import AutoModel

import decord
from decord import VideoReader, cpu
from PIL import Image
decord.bridge.set_bridge('torch')  # Use PyTorch-native tensor output

# Load the DINOv2 base backbone and switch it to eval mode.
# It is not moved to a device here.
fe = AutoModel.from_pretrained("facebook/dinov2-base").eval()

# Dataset Class
class AccidentDataset(Dataset):
    """Dataset of video clips represented as DINOv2 per-frame features.

    Every row of ``csv_file`` provides an ``id`` (zero-padded to five digits
    to form ``<id>.mp4`` inside ``video_dir``) and a ``target`` label.  Frames
    are decoded with decord by default; two alternative loaders that return
    PIL images are kept alongside it.

    Args:
        csv_file: Path of the CSV holding the ``id`` and ``target`` columns.
        video_dir: Directory that contains the ``.mp4`` files.
        feature_extractor: ``nn.Module`` whose output exposes
            ``last_hidden_state``.
        frames_per_clip: Number of evenly spaced frame indices to sample.
    """

    def __init__(self, csv_file, video_dir, feature_extractor, frames_per_clip=16):
        self.data = pd.read_csv(csv_file)
        self.video_dir = video_dir
        self.frames_per_clip = frames_per_clip
        self.feature_extractor = feature_extractor

        # No ToTensor step: this pipeline expects float tensors, not PIL images.
        self.transform = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])

    def __len__(self):
        """Return the number of rows (videos) listed in the CSV."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(features, label, num_frames)`` for the video at ``idx``."""
        row = self.data.iloc[idx]
        video_id = str(int(row['id'])).zfill(5)
        label = torch.tensor(row['target'], dtype=torch.float32)
        video_path = os.path.join(self.video_dir, f"{video_id}.mp4")

        frames = self.load_video(video_path)
        features = self.extract_features(frames)

        return features, label, len(features)

    def load_videocv(self, video_path):
        """OpenCV loader: seek to each sampled index and return 224x224 PIL images.

        Returns an empty list if the file cannot be opened or reports no
        frames; stops at the first failed read.
        """
        cap = cv2.VideoCapture(video_path)
        try:
            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            if total_frames <= 0 or not cap.isOpened():
                return []

            frame_idxs = torch.linspace(0, total_frames - 1, self.frames_per_clip).long().tolist()

            frames = []
            for idx in frame_idxs:
                cap.set(cv2.CAP_PROP_POS_FRAMES, idx)  # Seek to frame
                ret, frame = cap.read()
                if not ret:
                    break
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frame = cv2.resize(frame, (224, 224))
                frames.append(Image.fromarray(frame))
            return frames
        finally:
            # Release the capture handle even if decoding raises.
            cap.release()

    def load_video_dec(self, video_path):
        """decord loader that returns the sampled frames as RGB PIL images."""
        decord.bridge.set_bridge('torch')  # Enable PyTorch tensor output
        vr = VideoReader(video_path, ctx=cpu(0))  # Load video with CPU decoding
        total_frames = len(vr)

        if total_frames == 0:
            return []

        frame_idxs = torch.linspace(0, total_frames - 1, self.frames_per_clip).long().tolist()

        # get_batch already yields (T, H, W, C), which is the layout
        # Image.fromarray needs, so no permute is applied.
        frames = vr.get_batch(frame_idxs).byte().cpu().numpy()

        # Ensure frames are RGB (3 channels)
        if frames.shape[-1] == 1:  # If grayscale, convert to RGB
            frames = frames.repeat(3, axis=-1)

        return [Image.fromarray(frame, mode="RGB") for frame in frames]

    def load_video(self, video_path, frames_per_clip=None, device=None):
        """decord loader that returns a float ``(T, C, H, W)`` tensor in [0, 1].

        Args:
            video_path: Path of the video file.
            frames_per_clip: Number of frames to sample; defaults to the value
                given to the constructor.
            device: Target device for the frames; defaults to the device of
                the feature extractor's parameters.

        Returns:
            The sampled frames, or an empty tensor if the video has no frames.
        """
        if frames_per_clip is None:
            frames_per_clip = self.frames_per_clip
        if device is None:
            device = next(self.feature_extractor.parameters()).device

        vr = VideoReader(video_path)  # Load video
        total_frames = len(vr)

        if total_frames == 0:
            print(f"Warning: {video_path} has no frames.")
            return torch.empty(0, device=device)  # Return an empty tensor on the correct device

        frame_idxs = torch.linspace(0, total_frames - 1, frames_per_clip, dtype=torch.int64)

        # Load frames efficiently and normalize in-place
        frames = vr.get_batch(frame_idxs).permute(0, 3, 1, 2).to(torch.float32)  # (T, H, W, C) -> (T, C, H, W)
        frames.div_(255.0)  # In-place normalization to [0, 1]

        return frames.to(device, non_blocking=True)

    def extract_features(self, frames):
        """Return a ``(num_frames, hidden_dim)`` tensor of mean-pooled tokens.

        The whole clip is resized, normalised and passed through the backbone
        as one batch on the backbone's device.  Empty input yields a single
        all-zero row of width 768.
        """
        device = next(self.feature_extractor.parameters()).device  # Get model's device
        if len(frames) == 0:
            return torch.zeros(1, 768, device=device)  # Ensure consistency

        if isinstance(frames, (list, tuple)):
            frames = torch.stack(list(frames))

        batch = self.transform(frames.to(device))
        with torch.no_grad():
            tokens = self.feature_extractor(batch).last_hidden_state
        # Average over the token axis to get one vector per frame.
        return tokens.mean(dim=1)

    def extract_features2(self, frames):
        """Per-frame variant of ``extract_features``.

        Frames are not moved to another device, and the zero fallback is
        created on the default device.
        """
        features = []
        for frame in frames:
            frame = self.transform(frame).unsqueeze(0)
            with torch.no_grad():
                feature = self.feature_extractor(frame).last_hidden_state.mean(dim=1).squeeze(0)
            features.append(feature)

        return torch.stack(features) if features else torch.zeros(1, 768)

# Collate Function
def collate_fn(batch):
    """Pad variable-length feature sequences into one batch tensor.

    Args:
        batch: Sequence of ``(features, label, length)`` triples where
            ``features`` is a ``(length, feature_dim)`` tensor.

    Returns:
        ``(padded_features, labels, mask)``: ``padded_features`` has shape
        ``(batch, max_length, feature_dim)`` and is zero past each sequence's
        length; ``mask`` is a bool tensor that is ``True`` at padded positions.
    """
    features, labels, lengths = zip(*batch)
    max_length = max(lengths)
    # Take the width from the data rather than hard-coding the backbone size.
    feature_dim = features[0].shape[-1]
    padded_features = torch.zeros(len(batch), max_length, feature_dim)
    mask = torch.ones(len(batch), max_length, dtype=torch.bool)

    for i, (feat, length) in enumerate(zip(features, lengths)):
        padded_features[i, :length, :] = feat
        mask[i, :length] = False

    return padded_features, torch.tensor(labels), mask

# Bi-LSTM Model
class VideoBiLSTM(nn.Module):
    """Bidirectional LSTM classifier over a sequence of frame features.

    Args:
        input_dim: Width of each frame feature vector.
        hidden_dim: Hidden size of each LSTM direction.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, input_dim=768, hidden_dim=512, num_layers=2):
        super(VideoBiLSTM, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, 1)  # Bi-directional, so 2x hidden_dim
        self.sigmoid = nn.Sigmoid()

    def forward(self, frame_features, mask):
        """Score each clip with a probability in ``[0, 1]``.

        Args:
            frame_features: ``(batch, time, input_dim)`` tensor.
            mask: ``(batch, time)`` bool tensor, ``True`` at padded positions,
                or ``None`` when there is no padding.  Padding is assumed to
                trail the real frames.

        Returns:
            ``(batch, 1)`` tensor of sigmoid outputs.
        """
        batch_size, seq_length = frame_features.shape[:2]
        if mask is None:
            lengths = torch.full((batch_size,), seq_length, dtype=torch.int64)
        else:
            lengths = (~mask).sum(dim=1)
        # pack_padded_sequence needs CPU lengths of at least 1.
        lengths = lengths.clamp(min=1).cpu()

        # Packing makes the LSTM stop at each clip's true end, so padded steps
        # never reach either direction.
        packed = nn.utils.rnn.pack_padded_sequence(
            frame_features, lengths, batch_first=True, enforce_sorted=False
        )
        _, (h_n, _) = self.lstm(packed)

        # Final hidden states of the last layer: forward direction (after the
        # last real frame) and backward direction (after the first frame).
        summary = torch.cat([h_n[-2], h_n[-1]], dim=1)
        return self.sigmoid(self.fc(summary))

# Training Function
def train(model, fe, dataloader, epochs=10, lr=1e-4):
    """Train ``model`` with binary cross-entropy and checkpoint every epoch.

    Args:
        model: Sequence model called as ``model(frame_features, mask)`` and
            returning probabilities of shape ``(batch, 1)``.
        fe: Frame feature extractor.  It is moved to the training device and
            put in eval mode; its parameters are not given to the optimizer.
        dataloader: Yields ``(frame_features, labels, mask)`` batches.
        epochs: Number of passes over ``dataloader``.
        lr: Learning rate for ``model``'s parameters.

    Writes ``bilstm_<epoch>.pth`` and ``dinov2_<epoch>.pth`` to the working
    directory after each epoch.
    """
    optimizer = optim.Adam([
        {'params': model.parameters(), 'lr': lr}
    ])
    criterion = nn.BCELoss()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    fe.to(device)
    fe.eval()
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for frame_features, labels, mask in tqdm(dataloader):
            frame_features, labels, mask = frame_features.to(device), labels.to(device), mask.to(device)
            optimizer.zero_grad()
            outputs = model(frame_features, mask)
            # squeeze(-1) keeps the batch axis, so a batch of one still matches
            # the shape of ``labels``; BCELoss needs float targets.
            loss = criterion(outputs.squeeze(-1), labels.float())
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        # Guard against an empty dataloader.
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {total_loss / max(len(dataloader), 1):.4f}")
        torch.save(model.state_dict(), f"bilstm_{epoch+1}.pth")
        # NOTE(review): the extractor is never optimised in this function, so
        # every epoch writes the same weights -- confirm this is still wanted.
        torch.save(fe.state_dict(), f"dinov2_{epoch+1}.pth")



In [None]:
# prompt: load model from saved

# Create a VideoBiLSTM with its default sizes. No checkpoint is loaded here.
model = VideoBiLSTM()

# To restore saved weights instead, uncomment the lines below and set
# model_path to the checkpoint file:
# model_path = "bilstm_1.pth" #@param {type:"string"}
# model.load_state_dict(torch.load(model_path))
# model.to(device)
# model.eval()


In [None]:
import shutil

# Define the source and destination directories
source_directory = root_data
destination_directory = "/content/destination_directory"

# Copy the entire directory. dirs_exist_ok=True lets the cell be re-run after
# a previous copy instead of failing with FileExistsError.
shutil.copytree(source_directory, destination_directory, dirs_exist_ok=True)


'/content/destination_directory'

In [None]:
# Load Dataset and Start Training
# Build the dataset from the copy under /content/destination_directory.
dataset = AccidentDataset(
    csv_file="/content/destination_directory/train.csv",
    video_dir="/content/destination_directory/train",
    feature_extractor=fe,
)
dataloader = DataLoader(dataset, shuffle=True, collate_fn=collate_fn, batch_size=16)

train(model, fe, dataloader)


 43%|████▎     | 40/94 [50:07<1:05:56, 73.27s/it]