In [None]:
!pip install -U -q sentence-transformers

In [None]:
import numpy as np
import os
import random
import torch
import torch.nn as nn
from transformers import Wav2Vec2Processor, Wav2Vec2Model, ViTModel, ViTImageProcessor
from sentence_transformers import SentenceTransformer
import torchaudio
import h5py
import torch.nn.functional as F
import math
# Pin the cuBLAS workspace configuration. The PyTorch reproducibility notes
# list this variable as a prerequisite for deterministic cuBLAS behaviour;
# it is set here, before any model is created.
os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'

# One shared seed for the NumPy and Python random number generators.
# The torch generators are seeded separately inside each classifier's __init__.
global_seed = 0
np.random.seed(global_seed)
random.seed(global_seed)

In [None]:
class AudioHateClassifier(nn.Module):
    """Binary classifier over the wav2vec 2.0 features of a single audio file.

    The pretrained ``facebook/wav2vec2-base`` encoder converts a waveform into
    a sequence of feature vectors. These are linearly projected, offset by one
    learned embedding vector, refined by a stack of residual self-attention
    layers, mean-pooled over the sequence axis and mapped to a probability by
    a small MLP ending in a sigmoid.

    Args:
        hidden_size: Width of the projection/attention stack. ``audio_proj``
            is applied directly to the encoder output, so this must equal the
            encoder's feature width.
        num_heads: Number of heads in every ``nn.MultiheadAttention`` layer.
        num_layers: Number of attention + layer-norm pairs.
        dropout: Dropout probability used in attention, on the residual
            branch and inside the classification head.
        train_option: ``"transfer"`` sets ``requires_grad=False`` on every
            encoder parameter; ``"finetune"`` leaves them untouched.
            NOTE(review): feature extraction runs under ``torch.no_grad()``,
            so no gradient reaches the encoder in either mode -- confirm
            whether real fine-tuning was intended.
        seed: Seed passed to the torch CPU and CUDA generators before any
            layer is created.
    """

    def __init__(self, hidden_size=768, num_heads=2, num_layers=2, dropout=0.1, train_option="finetune", seed=0):
        assert train_option in ["finetune","transfer"]
        # Seed before building layers so their random initialisation is repeatable.
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        super().__init__()

        self.hidden_size = hidden_size
        # Device used for tensors created inside this module (inputs, fallback output).
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # Initialize audio extractor
        self.wav2vec_model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base")
        self.wav2vec_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base")

        if train_option == "transfer":
            for param in self.wav2vec_model.parameters():
                param.requires_grad = False

        # Audio projection and position embedding.
        # The embedding is one (1, 1, hidden_size) vector that is broadcast-added
        # to every position of the projected sequence.
        self.audio_proj = nn.Linear(hidden_size, hidden_size, bias=False)
        self.audio_pos_embed = nn.Parameter(torch.randn(1, 1, hidden_size))

        # Attention layers
        self.attention_layers = nn.ModuleList([
            nn.MultiheadAttention(hidden_size, num_heads, dropout=dropout, batch_first=True)
            for _ in range(num_layers)
        ])

        self.layer_norms = nn.ModuleList([
            nn.LayerNorm(hidden_size)
            for _ in range(num_layers)
        ])

        self.dropout = nn.Dropout(dropout)

        # Classification head
        self.output_head = nn.Sequential(
            nn.Linear(hidden_size, 128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, 1),
            nn.Sigmoid()
        )

    def _extract_audio_features(self, audio_path):
        """Load an audio file and return its wav2vec features.

        Args:
            audio_path: Path of an audio file readable by ``torchaudio.load``.

        Returns:
            The encoder's ``last_hidden_state`` with the leading batch axis
            removed, or ``None`` when the clip is rejected by the amplitude
            check below.
        """
        waveform, sample_rate = torchaudio.load(audio_path)
        # Down-mix multi-channel audio to a single channel.
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)

        target_rate = self.wav2vec_processor.feature_extractor.sampling_rate
        if sample_rate != target_rate:
            resampler = torchaudio.transforms.Resample(sample_rate, target_rate)
            waveform = resampler(waveform)

        # Reject clips whose mean absolute amplitude exceeds 0.9.
        # NOTE(review): presumably a filter for corrupt/saturated audio -- confirm.
        if torch.abs(waveform).float().mean().item() * 100 > 90:
            return None

        # squeeze(0) removes only the channel axis; a bare squeeze() would also
        # collapse a one-sample waveform to a 0-d array.
        input_values = self.wav2vec_processor(waveform.squeeze(0).numpy(),
            sampling_rate=target_rate,
            return_tensors="pt").input_values.to(self.device)

        # The encoder is always run without gradient tracking.
        with torch.no_grad():
            outputs = self.wav2vec_model(input_values)
            features = outputs.last_hidden_state.squeeze(0)

        return features

    def forward(self, audio_path):
        """Return the predicted probability for one audio file.

        Args:
            audio_path: Path of the audio file to classify.

        Returns:
            A tensor of shape ``(1,)`` holding the sigmoid output. When no
            features can be extracted, a constant ``0.5`` tensor with
            ``requires_grad=True`` is returned instead (presumably so a
            training step can still call ``backward()`` on the loss).
        """
        features = self._extract_audio_features(audio_path)
        if features is None:
            return torch.tensor([0.5]).to(self.device).detach().clone().requires_grad_(True)

        # Broadcasting with the (1, 1, hidden) embedding adds a leading batch axis of 1.
        features = self.audio_proj(features) + self.audio_pos_embed

        # Apply attention layers (residual connection, then layer norm)
        for attention, norm in zip(self.attention_layers, self.layer_norms):
            attended_features, _ = attention(features, features, features)
            features = features + self.dropout(attended_features)
            features = norm(features)

        # Pool and classify.
        # Only the batch axis is squeezed: a bare squeeze() would also drop the
        # sequence axis for a one-frame clip, turning the mean into a scalar and
        # breaking the classification head.
        pooled_features = features.squeeze(0).mean(dim=0)
        output = self.output_head(pooled_features)
        return output

    def predict(self, audio_path):
        """Classify one audio file in evaluation mode.

        Note that this switches the module to ``eval()`` and does not switch
        it back.

        Returns:
            A ``(decision, probability)`` tuple where ``decision`` is ``True``
            when the probability is strictly greater than 0.5.
        """
        self.eval()
        with torch.no_grad():
            output = self.forward(audio_path)
            return (output > 0.5).item(), output.item()

class ImageHateClassifier(nn.Module):
    """Binary classifier over ViT embeddings of the frames stored in an HDF5 file.

    Every frame is embedded with the pretrained ``google/vit-base-patch16-224``
    model (its ``pooler_output``). The per-frame embeddings are projected,
    offset by one learned embedding vector, passed through residual
    self-attention layers, averaged over frames and scored by an MLP with a
    sigmoid output.

    Args:
        hidden_size: Width of the projection/attention stack; the projection
            is applied directly to the ViT output, so the two must match.
        num_heads: Heads per ``nn.MultiheadAttention`` layer.
        num_layers: Number of attention + layer-norm pairs.
        dropout: Dropout probability (attention, residual branch, head).
        train_option: ``"transfer"`` disables gradients on all ViT parameters;
            ``"finetune"`` leaves them as loaded.
        seed: Seed for the torch CPU and CUDA generators, applied before any
            layer is built.
    """

    def __init__(self, hidden_size=768, num_heads=2, num_layers=2, dropout=0.1, train_option="finetune", seed=0):
        assert train_option in ["finetune","transfer"]
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        super().__init__()

        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.hidden_size = hidden_size

        # Pretrained frame encoder and its matching pre-processor.
        checkpoint = "google/vit-base-patch16-224"
        self.vit_model = ViTModel.from_pretrained(checkpoint)
        self.vit_processor = ViTImageProcessor.from_pretrained(checkpoint)

        # In transfer mode the encoder weights are frozen.
        if train_option == "transfer":
            self.vit_model.requires_grad_(False)

        # Projection plus a single (1, 1, hidden) embedding that is
        # broadcast-added to every frame embedding.
        self.image_proj = nn.Linear(hidden_size, hidden_size, bias=False)
        self.image_pos_embed = nn.Parameter(torch.randn(1, 1, hidden_size))

        # Self-attention stack and the layer norms applied after each layer.
        self.attention_layers = nn.ModuleList()
        for _ in range(num_layers):
            self.attention_layers.append(
                nn.MultiheadAttention(hidden_size, num_heads, dropout=dropout, batch_first=True)
            )

        self.layer_norms = nn.ModuleList()
        for _ in range(num_layers):
            self.layer_norms.append(nn.LayerNorm(hidden_size))

        self.dropout = nn.Dropout(dropout)

        # Scoring head: hidden -> 128 -> 1 -> sigmoid.
        head_layers = [
            nn.Linear(hidden_size, 128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, 1),
            nn.Sigmoid(),
        ]
        self.output_head = nn.Sequential(*head_layers)

    def _extract_image_features(self, h5_path, batch_size=32):
        """Embed every frame of an HDF5 file with the ViT encoder.

        Args:
            h5_path: Path of an HDF5 file containing a ``'frames'`` dataset.
            batch_size: Number of frames sent through the encoder at once.

        Returns:
            The concatenated ``pooler_output`` of all batches, or ``None``
            when the file holds no frames.
        """
        with h5py.File(h5_path, 'r') as h5_file:
            frames = h5_file['frames'][:]

        # A 3-D array has no channel axis: replicate it three times along a new last axis.
        if frames.ndim == 3:
            frames = np.stack((frames, frames, frames), axis=-1)

        embedded = []
        for start in range(0, len(frames), batch_size):
            chunk = frames[start:start + batch_size]
            pixel_values = self.vit_processor(images=chunk, return_tensors="pt").pixel_values
            pixel_values = pixel_values.to(self.device)

            # The encoder always runs without gradient tracking.
            with torch.no_grad():
                embedded.append(self.vit_model(pixel_values).pooler_output)

        if not embedded:
            return None
        return torch.cat(embedded, dim=0)

    def forward(self, image_path):
        """Return the predicted probability for one frame sequence.

        Args:
            image_path: Path of the HDF5 file to classify.

        Returns:
            A tensor of shape ``(1,)``; a constant ``0.5`` tensor with
            ``requires_grad=True`` when no features are available.
        """
        frame_features = self._extract_image_features(image_path)
        if frame_features is None:
            return torch.tensor([0.5], device=self.device).requires_grad_(True)

        # Broadcasting against the (1, 1, hidden) embedding adds a batch axis of 1.
        hidden = self.image_proj(frame_features) + self.image_pos_embed

        # Residual self-attention followed by layer norm, once per layer.
        for attn_layer, layer_norm in zip(self.attention_layers, self.layer_norms):
            context, _ = attn_layer(hidden, hidden, hidden)
            hidden = layer_norm(hidden + self.dropout(context))

        # Drop the batch axis, average over frames, score.
        pooled = hidden.squeeze(0).mean(dim=0)
        return self.output_head(pooled)

    def predict(self, image_path):
        """Classify one frame sequence in evaluation mode.

        Switches the module to ``eval()`` (and leaves it there).

        Returns:
            ``(decision, probability)`` with ``decision`` being ``True`` when
            the probability is strictly above 0.5.
        """
        self.eval()
        with torch.no_grad():
            probability = self.forward(image_path)
            decision = (probability > 0.5).item()
            return decision, probability.item()

class TextHateClassifier(nn.Module):
    """Binary classifier over sentence embeddings of a transcript file.

    The transcript is split into sentences, each sentence is embedded with the
    ``sentence-transformers/all-mpnet-base-v2`` model, and the embeddings are
    projected, offset by one learned embedding vector, refined by residual
    self-attention layers, averaged over sentences and scored by an MLP with a
    sigmoid output.

    Args:
        hidden_size: Width of the projection/attention stack; the projection
            is applied directly to the sentence embeddings, so the widths
            must match.
        num_heads: Heads per ``nn.MultiheadAttention`` layer.
        num_layers: Number of attention + layer-norm pairs.
        dropout: Dropout probability (attention, residual branch, head).
        train_option: ``"transfer"`` disables gradients on all parameters of
            the sentence encoder; ``"finetune"`` leaves them as loaded.
        seed: Seed for the torch CPU and CUDA generators, applied before any
            layer is built.
    """

    def __init__(self, hidden_size=768, num_heads=2, num_layers=2, dropout=0.1, train_option="finetune", seed=0):
        assert train_option in ["finetune","transfer"]
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        super().__init__()

        self.hidden_size = hidden_size
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # Pretrained sentence encoder.
        self.text_model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

        # Freeze the encoder when only the layers on top should be trained.
        if train_option == "transfer":
            self.text_model.requires_grad_(False)

        # Projection plus a single (1, 1, hidden) embedding that is
        # broadcast-added to every sentence embedding.
        self.text_proj = nn.Linear(hidden_size, hidden_size, bias=False)
        self.text_pos_embed = nn.Parameter(torch.randn(1, 1, hidden_size))

        # Self-attention stack with one layer norm per attention layer.
        attention_stack = [
            nn.MultiheadAttention(hidden_size, num_heads, dropout=dropout, batch_first=True)
            for _ in range(num_layers)
        ]
        self.attention_layers = nn.ModuleList(attention_stack)

        norm_stack = [nn.LayerNorm(hidden_size) for _ in range(num_layers)]
        self.layer_norms = nn.ModuleList(norm_stack)

        self.dropout = nn.Dropout(dropout)

        # Scoring head: hidden -> 128 -> 1 -> sigmoid.
        self.output_head = nn.Sequential(
            nn.Linear(hidden_size, 128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, 1),
            nn.Sigmoid(),
        )

    def _extract_text_features(self, txt_path):
        """Read a transcript and embed each of its sentences.

        Sentences are obtained by splitting the text on ``'. '`` and dropping
        pieces that are empty after stripping whitespace.

        Args:
            txt_path: Path of a UTF-8 text file.

        Returns:
            The tensor returned by the encoder's ``encode`` call, moved to
            ``self.device``, or ``None`` when the file contains no sentence.
        """
        with open(txt_path, 'r', encoding='utf-8') as handle:
            raw_text = handle.read()

        sentences = []
        for piece in raw_text.split('. '):
            cleaned = piece.strip()
            if cleaned:
                sentences.append(cleaned)

        if len(sentences) == 0:
            return None

        with torch.no_grad():
            encoded = self.text_model.encode(sentences, convert_to_tensor=True)
            encoded = encoded.to(self.device)

        return encoded

    def forward(self, text_path):
        """Return the predicted probability for one transcript.

        Args:
            text_path: Path of the transcript file to classify.

        Returns:
            A tensor of shape ``(1,)``; a constant ``0.5`` tensor with
            ``requires_grad=True`` when the transcript has no sentences.
        """
        sentence_features = self._extract_text_features(text_path)
        if sentence_features is None:
            neutral = torch.tensor([0.5]).to(self.device)
            return neutral.detach().clone().requires_grad_(True)

        # Broadcasting against the (1, 1, hidden) embedding adds a batch axis of 1.
        state = self.text_proj(sentence_features) + self.text_pos_embed

        # Residual self-attention followed by layer norm, layer by layer.
        for index, attention in enumerate(self.attention_layers):
            attended, _ = attention(state, state, state)
            state = state + self.dropout(attended)
            state = self.layer_norms[index](state)

        # Drop the batch axis, average over sentences, score.
        sentence_mean = state.squeeze(0).mean(dim=0)
        return self.output_head(sentence_mean)

    def predict(self, text_path):
        """Classify one transcript in evaluation mode.

        Switches the module to ``eval()`` (and leaves it there).

        Returns:
            ``(decision, probability)`` with ``decision`` being ``True`` when
            the probability is strictly above 0.5.
        """
        self.eval()
        with torch.no_grad():
            probability = self.forward(text_path)
            return (probability > 0.5).item(), probability.item()

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
from tqdm import tqdm
import os
import gc
import time

class HateVideoDataset(Dataset):
    """Dataset that maps rows of a metadata table to file paths and a label.

    Each item is a dict with the keys ``'audio_path'``, ``'image_path'``,
    ``'text_path'`` (strings built from ``base_path``, a fixed sub-directory
    and the file name stored in the row) and ``'label'`` (a float tensor of
    shape ``(1,)``). No file is opened here.

    Args:
        dataframe: Table with the columns ``'audio'``, ``'imageseq'``,
            ``'transcript'`` and ``'label'``; rows are accessed by position.
        base_path: Directory containing the ``audio``, ``image_sequences``
            and ``transcripts`` sub-directories.
    """

    # (output key, sub-directory under base_path, dataframe column)
    _PATH_FIELDS = (
        ('audio_path', 'audio', 'audio'),
        ('image_path', 'image_sequences', 'imageseq'),
        ('text_path', 'transcripts', 'transcript'),
    )

    def __init__(self, dataframe, base_path):
        self.base_path = base_path
        self.data = dataframe

    def __len__(self):
        """Number of rows in the metadata table."""
        return len(self.data)

    def __getitem__(self, idx):
        """Build the path/label dict for the row at position ``idx``."""
        record = self.data.iloc[idx]
        sample = {
            key: os.path.join(self.base_path, folder, record[column])
            for key, folder, column in self._PATH_FIELDS
        }
        sample['label'] = torch.tensor([record['label']], dtype=torch.float)
        return sample

class EarlyStopping:
    """Track a validation loss and flag when it has stopped improving.

    Call the instance once per epoch with the current validation loss. A loss
    counts as an improvement when it is at most ``best_loss - min_delta``;
    an improvement resets ``counter``, anything else increments it. Once
    ``counter`` reaches ``patience`` the ``early_stop`` flag is set (it is
    never cleared again).

    Args:
        patience: Number of consecutive non-improving calls that triggers
            the stop flag.
        min_delta: Minimum decrease of the loss that counts as improvement.
    """

    def __init__(self, patience=7, min_delta=1e-5):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0          # consecutive calls without improvement
        self.best_loss = None     # lowest loss accepted so far
        self.early_stop = False   # set once patience is exhausted

    def __call__(self, val_loss):
        """Record one validation loss and update ``counter``/``early_stop``.

        Args:
            val_loss: Validation loss of the current epoch. A NaN value is
                treated as "no improvement" and never becomes ``best_loss``.
        """
        if math.isnan(val_loss):
            # Every comparison with NaN is False. Without this guard a NaN
            # would be stored as the best loss, reset the counter, and make
            # all later losses look like improvements, so the stopper would
            # never fire.
            improved = False
        elif self.best_loss is None:
            # First valid loss: establishes the baseline, counter untouched.
            self.best_loss = val_loss
            return
        else:
            improved = val_loss <= self.best_loss - self.min_delta

        if improved:
            self.best_loss = val_loss
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True

def validate_models(models, val_loader, criterion):
    """Compute the mean validation loss of every model.

    Each model is switched to ``eval()`` and run, without gradient tracking,
    over the whole loader.

    Args:
        models: Dict mapping a modality name to its model. The model is
            called with ``data["<name>_path"]``.
        val_loader: Iterable with a length whose items are dicts holding a
            ``'label'`` tensor and one ``"<name>_path"`` entry per model.
        criterion: Dict mapping each model name to its loss function.

    Returns:
        Dict mapping each model name to its loss summed over the loader and
        divided by ``len(val_loader)``.
    """
    losses = {name: 0.0 for name in models.keys()}

    for model_name, model in models.items():
        path_key = f"{model_name}_path"
        model.eval()
        with torch.no_grad():
            for data in tqdm(val_loader):
                # Free cached GPU memory and unreachable objects between samples.
                torch.cuda.empty_cache()
                gc.collect()
                outputs = model(data[path_key])
                # Follow the model's device instead of assuming CUDA, so the
                # function also works on CPU-only hosts.
                label = data['label'].to(outputs.device)
                losses[model_name] += criterion[model_name](outputs, label).item()

        losses[model_name] /= len(val_loader)

    return losses

def train_multi_modal(models, train_loader, val_loader, criterion, optimizers,
                     schedulers=None, num_epochs=20, patience=5, 
                     checkpoint_dir="checkpoints"):
    """Train several single-modality models side by side.

    In every epoch each model that has not been stopped is trained for one
    pass over ``train_loader``, then validated. Per model, the scheduler (if
    any) is stepped with the validation loss, the early stopper is updated,
    and the weights are saved to ``<checkpoint_dir>/<name>_best.pth`` whenever
    the validation loss reaches a new minimum. A model whose early stopper has
    fired is no longer trained or validated, so its best checkpoint is not
    overwritten afterwards. Training ends after ``num_epochs`` epochs or as
    soon as every model has been stopped.

    Args:
        models: Dict mapping a modality name to its model; the model is called
            with ``data["<name>_path"]``.
        train_loader: Training loader yielding dicts with a ``'label'`` tensor
            and one ``"<name>_path"`` entry per model.
        val_loader: Validation loader with the same item layout.
        criterion: Dict mapping each model name to its loss function.
        optimizers: Dict mapping each model name to its optimizer.
        schedulers: Optional dict mapping model names to schedulers whose
            ``step`` accepts the validation loss.
        num_epochs: Maximum number of epochs.
        patience: Patience handed to each model's ``EarlyStopping``.
        checkpoint_dir: Directory for the best-weights files (created if
            missing).

    Returns:
        The ``models`` dict. The models carry the weights of their last
        training step, not those of the saved best checkpoint.
    """
    # Gradient tracking is switched on globally for the training run.
    torch.set_grad_enabled(True)
    os.makedirs(checkpoint_dir, exist_ok=True)
    early_stoppers = {name: EarlyStopping(patience=patience) for name in models.keys()}
    best_losses = {name: float('inf') for name in models.keys()}

    for epoch in range(num_epochs):
        print(f"Epoch {epoch+1}/{num_epochs}")

        # Only models whose early stopper has not fired keep training.
        active_models = {
            name: model for name, model in models.items()
            if not early_stoppers[name].early_stop
        }

        # Training phase
        for model_name, model in active_models.items():
            model.train()
            running_loss = 0.0
            path_key = f"{model_name}_path"
            print(model_name)
            for data in tqdm(train_loader):
                # Free cached GPU memory and unreachable objects between samples.
                torch.cuda.empty_cache()
                gc.collect()
                optimizers[model_name].zero_grad()
                outputs = model(data[path_key])
                # Follow the model's device instead of assuming CUDA.
                label = data['label'].to(outputs.device)
                loss = criterion[model_name](outputs.float(), label.float())
                loss.backward()
                optimizers[model_name].step()
                running_loss += loss.item()

            epoch_loss = running_loss / len(train_loader)
            print(f"{model_name} Training Loss: {epoch_loss:.4f}")

        # Validation phase (stopped models are skipped)
        val_losses = validate_models(active_models, val_loader, criterion)

        # Update schedulers, early stoppers and best checkpoints
        for model_name in active_models:
            if schedulers and model_name in schedulers:
                schedulers[model_name].step(val_losses[model_name])

            early_stoppers[model_name](val_losses[model_name])
            print(f"{model_name} Validation Loss: {val_losses[model_name]:.4f}")

            if val_losses[model_name] < best_losses[model_name]:
                best_losses[model_name] = val_losses[model_name]
                torch.save(models[model_name].state_dict(), 
                         os.path.join(checkpoint_dir, f"{model_name}_best.pth"))
                print(f"{model_name} model saved")

        # The stop flag is sticky, so this lists every model stopped so far.
        should_stop = [name for name in models if early_stoppers[name].early_stop]
        if should_stop:
            print(f"Early stopping for models: {', '.join(should_stop)}")
            if len(should_stop) == len(models):
                break
    
    return models

def evaluate_multi_modal(models, test_loader):
    """Evaluate every model on the test loader and collect binary metrics.

    Args:
        models: Dict mapping a modality name to its model; the model is called
            with ``data["<name>_path"]`` and returns a probability tensor.
        test_loader: Loader yielding dicts with a ``'label'`` tensor and one
            ``"<name>_path"`` entry per model.

    Returns:
        Dict mapping each model name to a dict with the keys ``'accuracy'``,
        ``'precision'``, ``'recall'``, ``'f1'`` and ``'roc_auc'``. The first
        four use predictions thresholded at 0.5 (strictly greater counts as
        positive); ``'roc_auc'`` is computed from the raw probabilities.
    """
    results = {}

    for model_name, model in models.items():
        model.eval()
        path_key = f"{model_name}_path"
        y_true = []
        y_score = []

        with torch.no_grad():
            for data in tqdm(test_loader):
                # Free cached GPU memory and unreachable objects between samples.
                torch.cuda.empty_cache()
                gc.collect()
                outputs = model(data[path_key])
                # Labels are only needed on the host, so no device transfer
                # (and no dependency on CUDA) is required here.
                y_true.extend(data['label'].cpu().numpy())
                y_score.extend(outputs.cpu().numpy())

        y_true = np.array(y_true).astype(int)
        y_score = np.array(y_score)
        y_pred = (y_score > 0.5).astype(int)
        accuracy = accuracy_score(y_true, y_pred)
        precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary')
        # ROC AUC is a ranking metric: it needs the continuous scores.
        # Feeding it the thresholded 0/1 predictions reduces it to balanced accuracy.
        roc_auc = roc_auc_score(y_true, y_score)
        results[model_name] = {
                'accuracy': accuracy,
                'precision': precision,
                'recall': recall,
                'f1': f1,
                'roc_auc': roc_auc
            }
    return results

In [None]:
# Root directory of the dataset; HateVideoDataset joins it with the
# 'audio', 'image_sequences' and 'transcripts' sub-directories.
base_path = "/kaggle/input/mmhate/HateMM"
# Release cached CUDA memory and collect unreachable Python objects before
# the data and models below are created.
torch.cuda.empty_cache()
gc.collect()

def get_duration(file):
    """Return the duration of a WAV file in whole seconds, rounded up.

    Args:
        file: Path (or file object) accepted by ``wave.open``.

    Returns:
        The number of frames divided by the frame rate, rounded up to the
        next integer.
    """
    import math
    import wave

    with wave.open(file, 'r') as wav_file:
        frame_count = wav_file.getnframes()
        frame_rate = wav_file.getframerate()
        seconds = frame_count / float(frame_rate)
    return math.ceil(seconds)

# Collect the WAV files of at most 3 minutes (durations are rounded up to
# whole seconds by get_duration).
audio_dir = os.path.join(base_path, 'audio')

# os.listdir returns entries in arbitrary order; sorting makes the row order,
# and therefore the seeded train/val/test splits built from it, reproducible.
files = [
    file for file in sorted(os.listdir(audio_dir))
    if file.endswith('.wav') and get_duration(os.path.join(audio_dir, file)) <= 180
]

# One row per clip: the three modality file names share the audio file's stem.
# A stem containing "non" is labelled 0, every other stem 1.
records = []
for file in files:
    # splitext removes only the extension, so stems containing dots survive.
    filename = os.path.splitext(file)[0]
    records.append({
        'audio': filename + '.wav',
        'imageseq': filename + '.h5',
        'transcript': filename + '.txt',
        'label': 0 if "non" in filename else 1,
    })

# Build the table once instead of concatenating a one-row frame per file.
df = pd.DataFrame(records, columns=['audio', 'imageseq', 'transcript', 'label'])
                                                                                                                                           

# Stratified split: 15% test, then 17.6% of the remaining 85% (about 15% of
# the whole set) for validation, leaving roughly 70% for training.
train, test = train_test_split(df, test_size=0.15, random_state=1, stratify=df['label'])
train, val = train_test_split(train, test_size=0.176, random_state=0, stratify=train['label'])

# Create datasets
train_dataset = HateVideoDataset(train, base_path)
val_dataset = HateVideoDataset(val, base_path)
test_dataset = HateVideoDataset(test, base_path)

# Create dataloaders. batch_size=None disables automatic batching, so every
# item is a single sample dict (the models take one file path at a time).
train_loader = DataLoader(train_dataset, batch_size=None, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=None)
test_loader = DataLoader(test_dataset, batch_size=None)

# Initialize models on the same device the classifiers pick for themselves.
device = "cuda" if torch.cuda.is_available() else "cpu"
models = {
    'audio': AudioHateClassifier().to(device),
    'image': ImageHateClassifier().to(device),
    'text': TextHateClassifier().to(device)
}

# Initialize optimizers and criteria
num_epochs = 20
learning_rate = 1e-4
optimizers = {
    name: torch.optim.AdamW(model.parameters(), lr=learning_rate)
    for name, model in models.items()
}
schedulers = {
    name: torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer)
    for name, optimizer in optimizers.items()
}
# The models end in a sigmoid, hence plain binary cross-entropy.
criteria = {name: nn.BCELoss() for name in models}

# Train models (num_epochs is passed explicitly so the setting above takes effect)
trained_models = train_multi_modal(
    models, train_loader, val_loader, criteria,
    optimizers, schedulers, num_epochs=num_epochs
)

# Evaluate
# NOTE(review): this evaluates the weights returned by train_multi_modal;
# the "<name>_best.pth" checkpoints it writes are not reloaded here -- confirm
# this is intended.
results = evaluate_multi_modal(trained_models, test_loader)
for k, v in results.items():
    print(k)
    print(v)