In [1]:
import os
import glob
import re
from collections import defaultdict
import numpy as np
from sklearn.model_selection import train_test_split
import torch

import cv2
import numpy as np

from torch.utils.data import Dataset
from transformers import AutoImageProcessor, TimesformerForVideoClassification, Trainer, TrainingArguments

# Step 1: Load and Filter Dataset
class_to_videos = defaultdict(list)

# Use glob to traverse the dataset
video_paths = glob.glob('/kaggle/input/*/*/*/*.MOV')
print(f"Total videos found: {len(video_paths)}")

# Process each video path to extract class names
for video_path in video_paths:
    # Extract class name (second last element of path)
    class_name = re.findall(r"[a-zA-Z]+", video_path.split('/')[-2])[0].lower()
    class_to_videos[class_name].append(video_path)

# Sort classes by the number of videos
sorted_classes = sorted(class_to_videos.items(), key=lambda x: len(x[1]), reverse=True)
all_classes = sorted_classes  # Use all classes instead of top 5
print(f"Total classes: {len(all_classes)}")

# Create mappings for class names to indices
class_to_idx = {cls: idx for idx, (cls, _) in enumerate(all_classes)}
print(f"Class to index mapping: {class_to_idx}")

# Collect videos and labels for all classes
videos = []
labels = []
for cls, video_list in all_classes:
    videos.extend(video_list)
    labels.extend([class_to_idx[cls]] * len(video_list))

# Shuffle data for robustness
data = list(zip(videos, labels))
np.random.shuffle(data)
videos, labels = zip(*data)

# Step 2: Dataset Class
class VideoDataset(Dataset):
    def __init__(self, video_paths, labels):
        self.video_paths = video_paths
        self.labels = labels

    def __len__(self):
        return len(self.video_paths)

    def __getitem__(self, idx):
        video_path = self.video_paths[idx]
        video_frames = self.load_video(video_path)
        label = self.labels[idx]

        # Return frames and label without processing
        return {'video_frames': video_frames, 'labels': label}

    def load_video(self, video_path):
        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            raise ValueError("Error: Could not open video file.")

        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        frame_step = max(total_frames // 8, 1)

        video_frames = []
        for i in range(8):
            cap.set(cv2.CAP_PROP_POS_FRAMES, i * frame_step)
            ret, frame = cap.read()
            if ret:
                # Resize frame to 224x224 and convert to RGB
                frame_resized = cv2.resize(frame, (224, 224))
                frame_rgb = cv2.cvtColor(frame_resized, cv2.COLOR_BGR2RGB)
                video_frames.append(frame_rgb)
            else:
                break

        cap.release()
        return video_frames

Total videos found: 3652
Total classes: 255
Class to index mapping: {'good': 0, 'train': 1, 'you': 2, 'hot': 3, 'warm': 4, 'cool': 5, 'new': 6, 'bad': 7, 'old': 8, 'young': 9, 'wet': 10, 'loud': 11, 'happy': 12, 'quiet': 13, 'dry': 14, 'sick': 15, 'healthy': 16, 'alright': 17, 'how': 18, 'hello': 19, 'tall': 20, 'wide': 21, 'long': 22, 'short': 23, 'she': 24, 'i': 25, 'it': 26, 'he': 27, 'fast': 28, 'narrow': 29, 'slow': 30, 'small': 31, 'big': 32, 'they': 33, 'we': 34, 'pleased': 35, 'thank': 36, 'cold': 37, 'brother': 38, 'boy': 39, 'grey': 40, 'library': 41, 'market': 42, 'store': 43, 'temple': 44, 'hospital': 45, 'restaurant': 46, 'street': 47, 'city': 48, 'house': 49, 'school': 50, 'park': 51, 'office': 52, 'university': 53, 'court': 54, 'ground': 55, 'bank': 56, 'location': 57, 'india': 58, 'brown': 59, 'clothing': 60, 'shoes': 61, 'pocket': 62, 'pant': 63, 't': 64, 'woman': 65, 'baby': 66, 'man': 67, 'sister': 68, 'shirt': 69, 'hat': 70, 'dress': 71, 'neighbour': 72, 'president'

In [2]:
# Step 3: Split Data and Load Processor/Model
train_videos, val_videos, train_labels, val_labels = train_test_split(videos, labels, test_size=0.2, random_state=42)

# Load the pretrained processor
processor = AutoImageProcessor.from_pretrained("facebook/timesformer-base-finetuned-k400")

# Load the pretrained model and redefine the classifier
model = TimesformerForVideoClassification.from_pretrained(
    "facebook/timesformer-base-finetuned-k400",
    num_labels=len(class_to_idx),  # Updated to total number of classes
    ignore_mismatched_sizes=True  # Ignore mismatched sizes for classifier layer
)

train_dataset = VideoDataset(train_videos, train_labels)
val_dataset = VideoDataset(val_videos, val_labels)

# Step 4: Define Training Arguments
training_args = TrainingArguments(
    output_dir="./timesformer-finetuned",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    fp16=True,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="none",
    remove_unused_columns=False  # Add this line
)

# Step 5: Define Trainer
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = (predictions == labels).mean()
    return {"accuracy": accuracy}

# Specify a data collator
def collate_fn(batch):
    video_frames = [item['video_frames'] for item in batch]
    labels = [item['labels'] for item in batch]

    # Process batch of videos
    inputs = processor(images=video_frames, return_tensors="pt")
    labels = torch.tensor(labels, dtype=torch.long)
    inputs['labels'] = labels

    return inputs

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=processor,
    data_collator=collate_fn,
    compute_metrics=compute_metrics
)

# Step 6: Train and Evaluate
trainer.train()
metrics = trainer.evaluate()
print("Validation Metrics:", metrics)

# Save the model
trainer.save_model("./timesformer-finetuned")

preprocessor_config.json:   0%|          | 0.00/412 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/22.7k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/486M [00:00<?, ?B/s]

Some weights of TimesformerForVideoClassification were not initialized from the model checkpoint at facebook/timesformer-base-finetuned-k400 and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([400, 768]) in the checkpoint and torch.Size([255, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([400]) in the checkpoint and torch.Size([255]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
1,4.1896,4.014014,0.060192
2,2.7792,2.804811,0.262654
3,1.4851,1.786491,0.487004
4,0.8726,1.174929,0.667579
5,0.5513,0.837874,0.777018
6,0.2465,0.620412,0.833105
7,0.0246,0.532992,0.865937
8,0.0478,0.505591,0.864569
9,0.0014,0.459343,0.879617
10,0.001,0.465851,0.883721


Validation Metrics: {'eval_loss': 0.4658510386943817, 'eval_accuracy': 0.8837209302325582, 'eval_runtime': 647.6213, 'eval_samples_per_second': 1.129, 'eval_steps_per_second': 0.565, 'epoch': 10.0}


In [3]:
trainer.save_model("./timesformer-finetuned")

In [4]:
# Save the class-to-index mapping
import json

idx_to_class = {idx: cls for cls, idx in class_to_idx.items()}
with open('./timesformer-finetuned/idx_to_class.json', 'w') as f:
    json.dump(idx_to_class, f)

# Step 7: Inference Function
def predict_video(video_path):
    # Load video frames
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        raise ValueError("Error: Could not open video file.")

    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    frame_step = max(total_frames // 8, 1)

    video_frames = []
    for i in range(8):
        cap.set(cv2.CAP_PROP_POS_FRAMES, i * frame_step)
        ret, frame = cap.read()
        if ret:
            # Resize frame to 224x224 and convert to RGB
            frame_resized = cv2.resize(frame, (224, 224))
            frame_rgb = cv2.cvtColor(frame_resized, cv2.COLOR_BGR2RGB)
            video_frames.append(frame_rgb)
        else:
            break

    cap.release()

    # Process frames
    inputs = processor(images=[video_frames], return_tensors="pt").to(model.device)

    # Make prediction
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predicted_class_idx = logits.argmax(-1).item()

    # Map index to class name
    idx_to_class = {idx: cls for cls, idx in class_to_idx.items()}
    predicted_class = idx_to_class[predicted_class_idx]

    print(f"Predicted class: {predicted_class}")



In [5]:
# Step 7: Inference Function
def predict_video(video_path):
    # Load video frames
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        raise ValueError("Error: Could not open video file.")

    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    frame_step = max(total_frames // 8, 1)

    video_frames = []
    for i in range(8):
        cap.set(cv2.CAP_PROP_POS_FRAMES, i * frame_step)
        ret, frame = cap.read()
        if ret:
            # Resize frame to 224x224 and convert to RGB
            frame_resized = cv2.resize(frame, (224, 224))
            frame_rgb = cv2.cvtColor(frame_resized, cv2.COLOR_BGR2RGB)
            video_frames.append(frame_rgb)
        else:
            break

    cap.release()

    # Process frames
    inputs = processor(images=[video_frames], return_tensors="pt").to(model.device)

    # Make prediction
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predicted_class_idx = logits.argmax(-1).item()

    # Map index to class name
    idx_to_class = {idx: cls for cls, idx in class_to_idx.items()}
    predicted_class = idx_to_class[predicted_class_idx]

    print(f"Predicted class: {predicted_class}")

In [6]:
predict_video('/kaggle/input/include/Adjectives_7of8/Adjectives/94. good/MVI_5159.MOV')

ValueError: Error: Could not open video file.

In [None]:
predict_video('/kaggle/input/include/Adjectives_6of8/Adjectives/89. warm/MVI_5222.MOV')

In [None]:
predict_video('/kaggle/input/include/Adjectives_6of8/Adjectives/87. hot/MVI_5137.MOV')

In [None]:
predict_video('/kaggle/input/include/Adjectives_7of8/Adjectives/93. young/MVI_5155.MOV')