In [1]:
# =====================================
# IMPORTS AND SETUP
# =====================================

!pip install torch torchvision pytorchvideo > /dev/null 2>&1

import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import json
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import cv2
from tqdm import tqdm
import wandb
from pytorchvideo.models.hub import slowfast_r50
import torchvision.transforms as transforms
from kaggle_secrets import UserSecretsClient
import random
from huggingface_hub import HfApi, hf_hub_download, create_repo
import torch.nn as nn
from typing import Dict, Any, Optional
from torchvision import transforms
import pytorchvideo.models.hub as models
from sklearn.metrics import f1_score, confusion_matrix, classification_report
import torch.nn.functional as F
import os
import shutil
import torch.optim.lr_scheduler as lr_scheduler
import re
import glob
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
import albumentations as A
from albumentations.pytorch import ToTensorV2  # Fixed import
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from tenacity import retry, stop_after_attempt, wait_exponential

  check_for_updates()


In [2]:
# Reuse your existing configurations and dataset class
class Config:
    """Shared preprocessing and batching constants used by the dataset and loaders."""

    # --- Temporal sampling -------------------------------------------------
    # Number of consecutive frames taken from each clip.
    num_frames = 32

    # --- Spatial pipeline --------------------------------------------------
    # Frames are resized to `resize_size` first and then cropped to `crop_size`.
    resize_size = (256, 256)
    crop_size = (224, 224)

    # --- Normalisation -----------------------------------------------------
    # Per-channel mean / std handed to the normalisation transform.
    dataset_mean = [0.45] * 3
    dataset_std = [0.225] * 3

    # --- Batching ----------------------------------------------------------
    # Clips per DataLoader batch.
    batch_size = 16

In [3]:
# =====================================
# DATASET CLASS
# =====================================

class MELDDataset(Dataset):
    """Frame-folder video dataset that yields SlowFast pathway tensors.

    Every metadata item is a dict providing:
      * ``frames_dir`` - directory of extracted ``.jpg`` / ``.png`` frames whose
        file names start with the frame number,
      * ``mask_info``  - per-frame flags, indexed like the sorted frame list
        (missing entries are padded with 0),
      * ``y``          - the label, returned unchanged.

    ``__getitem__`` returns ``(slow, fast, slow_mask, fast_mask, label)`` where
    ``fast`` is ``[C, T, H, W]`` with ``T == Config.num_frames`` and ``slow``
    keeps every ``slow_stride``-th frame of it.

    Loading failures never raise: they are appended to ``error_log_path`` and
    an all-zero clip (with masks of ones) is returned instead.
    """

    # Failures are appended to this file. It is opened per write (not kept open
    # on the instance) so the dataset stays picklable for DataLoader workers and
    # every entry reaches disk immediately.
    error_log_path = "dataset_errors.log"

    # Temporal subsampling factor between the fast and the slow pathway.
    slow_stride = 4

    # Frame files are ordered by the integer their name starts with.
    _frame_number = re.compile(r'^(\d+)')

    def __init__(self, metadata, split, train=True):
        """
        Args:
            metadata: Mapping from split name to a list of item dicts.
            split: Key of ``metadata`` to read items from.
            train: If True, use random augmentations and a random temporal
                window; otherwise use the deterministic evaluation pipeline.
        """
        self.data = metadata[split]
        self.train = train
        self.transform = self._build_transforms()

    def _build_transforms(self):
        """Build the per-frame albumentations pipeline for the current mode."""
        normalize = A.Normalize(
            mean=Config.dataset_mean,
            std=Config.dataset_std,
            max_pixel_value=255.0
        )

        if self.train:
            return A.Compose([
                A.Resize(Config.resize_size[0], Config.resize_size[1]),
                A.RandomCrop(height=Config.crop_size[0], width=Config.crop_size[1], p=1.0),
                A.HorizontalFlip(p=0.5),
                A.Rotate(limit=15, p=0.4),
                A.RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0.2, p=0.5),
                normalize,
                ToTensorV2()
            ])
        else:
            return A.Compose([
                A.Resize(Config.resize_size[0], Config.resize_size[1]),
                A.CenterCrop(height=Config.crop_size[0], width=Config.crop_size[1], p=1.0),
                normalize,
                ToTensorV2()
            ])

    def _list_frame_files(self, frames_dir):
        """Return the image files in ``frames_dir`` ordered by frame number.

        Images whose name does not start with digits are skipped, so a single
        stray file no longer invalidates the whole clip.
        """
        numbered = []
        for fname in os.listdir(frames_dir):
            if not fname.endswith(('.jpg', '.png')):
                continue
            match = self._frame_number.match(fname)
            if match is None:
                continue
            numbered.append((int(match.group(1)), fname))
        # Sorting (number, name) pairs makes ties deterministic as well.
        numbered.sort()
        return [fname for _, fname in numbered]

    def _window_start(self, total_frames):
        """Pick the first frame of the ``Config.num_frames``-long window.

        Training samples a random window; evaluation takes the centred window so
        that repeated runs (e.g. feature extraction) see identical frames.
        """
        slack = total_frames - Config.num_frames
        if slack <= 0:
            return 0
        if self.train:
            return random.randint(0, slack)
        return slack // 2

    def _slow_length(self):
        """Number of frames ``[::slow_stride]`` keeps out of ``Config.num_frames``."""
        return len(range(0, Config.num_frames, self.slow_stride))

    def _log_error(self, idx, exc):
        """Append one failure record to ``error_log_path``."""
        with open(self.error_log_path, "a") as log:
            log.write(f"Error loading index {idx}: {str(exc)}\n")

    def __getitem__(self, idx):
        """Load one clip; on any loading error return a zero clip and log it."""
        item = self.data[idx]
        frames_dir = item['frames_dir']
        mask_info = item['mask_info']
        label = item['y']

        try:
            if not os.path.exists(frames_dir):
                raise FileNotFoundError(f"Directory not found: {frames_dir}")

            frame_files = self._list_frame_files(frames_dir)

            if len(frame_files) < Config.num_frames:
                raise ValueError(f"Only {len(frame_files)} frames found, need {Config.num_frames}")

            start_idx = self._window_start(len(frame_files))
            stop_idx = start_idx + Config.num_frames
            selected_files = frame_files[start_idx:stop_idx]

            # Always pad the mask to the full window length: a mask shorter
            # than the clip would otherwise produce ragged tensors that the
            # default collate function cannot stack.
            selected_mask_info = list(mask_info[start_idx:stop_idx])
            selected_mask_info += [0] * (Config.num_frames - len(selected_mask_info))

            # NOTE(review): the transform is invoked once per frame, so in
            # training mode every frame draws its own random crop / flip /
            # rotation. Confirm that this per-frame randomness is intended.
            frames = []
            for fname in selected_files:
                frame_path = os.path.join(frames_dir, fname)
                frame = cv2.imread(frame_path)
                if frame is None:
                    raise IOError(f"Failed to read {frame_path}")
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frames.append(self.transform(image=frame)["image"])

            video_tensor = torch.stack(frames)  # [T, C, H, W]
            slow_pathway = video_tensor[::self.slow_stride].permute(1, 0, 2, 3)  # [C, T/stride, H, W]
            fast_pathway = video_tensor.permute(1, 0, 2, 3)                      # [C, T, H, W]

            fast_mask = torch.tensor(selected_mask_info, dtype=torch.float32)
            slow_mask = fast_mask[::self.slow_stride]

            return slow_pathway, fast_pathway, slow_mask, fast_mask, label

        except Exception as e:
            # Best-effort loading: record the problem and hand back a blank
            # clip with the same shapes as a real one so batching keeps working.
            self._log_error(idx, e)
            slow_len = self._slow_length()
            slow = torch.zeros(3, slow_len, *Config.crop_size)
            fast = torch.zeros(3, Config.num_frames, *Config.crop_size)
            slow_mask = torch.ones(slow_len)
            fast_mask = torch.ones(Config.num_frames)
            return slow, fast, slow_mask, fast_mask, label

    def __len__(self):
        """Number of items in the selected split."""
        return len(self.data)

In [4]:
# =====================================
# MODEL ARCHITECTURE
# =====================================

class MaskedSlowFast(nn.Module):
    """SlowFast-R50 backbone with frame masking and an MLP classification head.

    The last block of the hub model (its classification head) is dropped and
    the flattened backbone output is batch-normalised and passed to
    ``classifier``. All backbone parameters start frozen; ``unfreeze_layers``
    re-enables selected ``backbone.blocks`` entries as training progresses.
    """

    # Width of the flattened backbone output fed to ``feature_bn``.
    feature_dim = 2304

    def __init__(self, num_classes, pretrained=True, progress=True):
        """
        Args:
            num_classes: Size of the classifier output.
            pretrained: Forwarded to ``slowfast_r50``. Pass False to skip the
                hub download when a full checkpoint is loaded right afterwards.
            progress: Forwarded to ``slowfast_r50``.
        """
        super().__init__()
        self.backbone = slowfast_r50(pretrained=pretrained, progress=progress)
        self.backbone.blocks = self.backbone.blocks[:-1]  # Remove classification head

        # Add batch normalization before classifier
        self.feature_bn = nn.BatchNorm1d(self.feature_dim)

        # Simplified classifier
        self.classifier = nn.Sequential(
            nn.Linear(self.feature_dim, 1024),
            nn.BatchNorm1d(1024),
            nn.GELU(),
            nn.Dropout(0.3),
            nn.Linear(1024, 512),
            nn.BatchNorm1d(512),
            nn.GELU(),
            nn.Dropout(0.2),
            nn.Linear(512, num_classes)
        )

        # Freeze the whole backbone initially
        for param in self.backbone.parameters():
            param.requires_grad = False

        # Gradual unfreezing setup: block index -> already unfrozen?
        # NOTE(review): the keys index ``backbone.blocks``. Verify that index 5
        # actually holds trainable parameters (in pytorchvideo's SlowFast it
        # appears to be the pooling/concat block); if not, the epoch-3 step
        # unfreezes nothing.
        self.unfreeze_stages = {5: False, 4: False, 3: False}

    def unfreeze_layers(self, epoch):
        """Unfreeze block 5 from epoch 3, block 4 from epoch 6 and block 3 from epoch 9.

        Safe to call every epoch: a block that is already unfrozen is skipped.
        """
        if epoch >= 3 and not self.unfreeze_stages[5]:
            self._unfreeze_stage(5)
        if epoch >= 6 and not self.unfreeze_stages[4]:
            self._unfreeze_stage(4)
        if epoch >= 9 and not self.unfreeze_stages[3]:
            self._unfreeze_stage(3)

    def _unfreeze_stage(self, stage):
        """Enable gradients for every parameter of ``backbone.blocks[stage]``."""
        for param in self.backbone.blocks[stage].parameters():
            param.requires_grad = True
        self.unfreeze_stages[stage] = True
        print(f"Unfroze stage {stage} layers")

    @staticmethod
    def _apply_frame_mask(clip, mask):
        """Multiply a ``[B, C, T, H, W]`` clip by a ``[B, T]`` per-frame mask.

        The mask is cast to the clip's dtype and device first, so a float64,
        integer or boolean mask cannot promote the clip to a dtype the backbone
        weights do not accept.
        """
        mask = mask.to(device=clip.device, dtype=clip.dtype)
        return clip * mask[:, None, :, None, None]

    def forward(self, slow_input, fast_input, slow_mask, fast_mask):
        """Return ``classifier`` outputs for a batch of masked SlowFast inputs.

        Args:
            slow_input: Slow-pathway clips, ``[B, C, T_slow, H, W]``.
            fast_input: Fast-pathway clips, ``[B, C, T_fast, H, W]``.
            slow_mask: ``[B, T_slow]`` mask; frames with 0 are zeroed out.
            fast_mask: ``[B, T_fast]`` mask; frames with 0 are zeroed out.
        """
        # Apply masks to zero out padded frames
        slow_input = self._apply_frame_mask(slow_input, slow_mask)
        fast_input = self._apply_frame_mask(fast_input, fast_mask)

        # Get features; flatten (unlike view) also accepts non-contiguous outputs
        features = self.backbone([slow_input, fast_input])
        features = features.flatten(start_dim=1)

        # Apply feature batch normalization
        features = self.feature_bn(features)

        return self.classifier(features)

In [5]:
# Extended dataset to return video IDs
class FeatureExtractionDataset(MELDDataset):
    """MELDDataset variant that additionally returns each sample's video ID.

    The video ID is the first key of the metadata item that is not one of the
    standard keys listed in ``STANDARD_KEYS``.
    """

    # Keys with a fixed meaning; any other key is treated as the video ID.
    STANDARD_KEYS = frozenset({'y', 'label', 'frames_dir', 'mask_info'})

    def __getitem__(self, idx):
        """Return ``(slow, fast, slow_mask, fast_mask, label, video_id)``.

        Raises:
            KeyError: If the item has no non-standard key. Returning ``None``
                instead would only fail later inside batch collation with a
                much less helpful message.
        """
        item = self.data[idx]
        video_id = next((key for key in item if key not in self.STANDARD_KEYS), None)
        if video_id is None:
            raise KeyError(
                f"Metadata item {idx} has no video ID key; found only {sorted(item)}"
            )

        # Resolve the ID before the parent decodes any frames so that a
        # malformed item is rejected cheaply.
        slow, fast, slow_mask, fast_mask, label = super().__getitem__(idx)
        return slow, fast, slow_mask, fast_mask, label, video_id

In [6]:
def _find_video_id(item, standard_keys=frozenset({'y', 'label', 'frames_dir', 'mask_info'})):
    """Return the first key of ``item`` that is not a standard metadata key, else None."""
    return next((key for key in item if key not in standard_keys), None)


def extract_features(
    metadata_path="/kaggle/input/meld-extracted-video-frames-rgb/extraction_checkpoint.json",
    output_path="Video_Features_SlowFast.json",
    repo_id="prakanda/hatsu-meld-emotion-recognition-new",
    checkpoint_filename="best_model.pth",
    num_classes=7,
    splits=('train', 'dev', 'test'),
    num_workers=4,
):
    """Extract SlowFast features for every clip and write them next to the metadata.

    The fine-tuned checkpoint is downloaded from the Hugging Face Hub, the
    classifier is replaced by an identity so the model returns the
    batch-normalised backbone features, and each item of every split receives a
    ``"<video_id>__slowfast_features"`` entry holding its feature vector as a
    list. The augmented metadata is then dumped to ``output_path`` as JSON.

    Args:
        metadata_path: JSON file whose ``"metadata"`` entry maps split names to
            item lists.
        output_path: Destination of the augmented metadata JSON.
        repo_id: Hugging Face Hub repository that holds the checkpoint.
        checkpoint_filename: Checkpoint file name inside ``repo_id``.
        num_classes: Classifier width the checkpoint was trained with.
        splits: Split names to process.
        num_workers: DataLoader worker processes.

    Requires a ``HF_TOKEN`` secret to be available through ``UserSecretsClient``.
    """
    # Setup device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load metadata
    with open(metadata_path) as f:
        metadata = json.load(f)["metadata"]

    # Load model
    model = MaskedSlowFast(num_classes).to(device)

    # Download model checkpoint from Hugging Face Hub
    hf_token = UserSecretsClient().get_secret("HF_TOKEN")
    checkpoint_path = hf_hub_download(
        repo_id=repo_id,
        filename=checkpoint_filename,
        token=hf_token
    )

    # SECURITY NOTE: weights_only=False unpickles arbitrary Python objects, so
    # only load checkpoints from a repository you trust. Switch to
    # weights_only=True if the checkpoint contains nothing but tensors and
    # plain containers.
    checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=False)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.classifier = torch.nn.Identity()  # Return features instead of predictions
    model.eval()

    for split in splits:
        dataset = FeatureExtractionDataset(metadata, split, train=False)
        loader = DataLoader(
            dataset,
            batch_size=Config.batch_size,
            shuffle=False,
            num_workers=num_workers
        )

        # Features are collected per split: a video ID that occurs in more
        # than one split must not have its features overwritten by (or leaked
        # from) another split.
        split_features = {}
        with torch.no_grad():
            for batch in tqdm(loader, desc=f"Extracting {split} features"):
                slow, fast, slow_mask, fast_mask, _labels, video_ids = batch
                features = model(
                    slow.to(device),
                    fast.to(device),
                    slow_mask.to(device),
                    fast_mask.to(device),
                )
                features = features.cpu().numpy()

                for vid, vector in zip(video_ids, features):
                    split_features[vid] = vector.tolist()

        # Attach the features only after the split has been fully extracted, so
        # the new key cannot interfere with the video-ID lookup during loading.
        for item in metadata[split]:
            video_id = _find_video_id(item)
            if video_id and video_id in split_features:
                item[f"{video_id}__slowfast_features"] = split_features[video_id]

    # Save results
    with open(output_path, "w") as f:
        json.dump(metadata, f, indent=4)

In [7]:
# Entry point: run the full feature extraction with its default settings when
# this code is executed as the main module.
if __name__ == "__main__":
    extract_features()

Downloading: "https://dl.fbaipublicfiles.com/pytorchvideo/model_zoo/kinetics/SLOWFAST_8x8_R50.pyth" to /root/.cache/torch/hub/checkpoints/SLOWFAST_8x8_R50.pyth
100%|██████████| 264M/264M [00:01<00:00, 245MB/s]


best_model.pth:   0%|          | 0.00/426M [00:00<?, ?B/s]

Extracting train features: 100%|██████████| 625/625 [14:50<00:00,  1.42s/it]
Extracting dev features: 100%|██████████| 70/70 [01:40<00:00,  1.44s/it]
Extracting test features: 100%|██████████| 164/164 [04:08<00:00,  1.52s/it]
