In [20]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
from transformers import (
    AutoModel, AutoTokenizer,
    BertModel, BertTokenizer,
    ViTModel, ViTFeatureExtractor,
    XLMRobertaModel, XLMRobertaTokenizer
)
from torchvision import transforms, models
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import tensorflow as tf
import os
import pickle
import glob
import re
from torch.utils.data import Dataset, DataLoader
import warnings
warnings.filterwarnings('ignore')

In [21]:
# Load dataset
df = pd.read_csv("/kaggle/input/multilingual-meme-datasets/final_datasets.csv")
print(f"Dataset loaded with shape: {df.shape}")

# Take 30% sample
sample_size = int(len(df) * 0.3)
df_sample = df.sample(n=sample_size, random_state=42)
print(f"Working with sample of {len(df_sample)} rows")

# Image directory (update this path to your image directory)
image_dir = "/kaggle/input/multilingual-meme-datasets/datasets/datasets"

Dataset loaded with shape: (25600, 11)
Working with sample of 7680 rows


In [22]:
# ViT + BERT Model Definition
class ViTBertClassifier(nn.Module):
    def __init__(self, num_classes=2):
        super(ViTBertClassifier, self).__init__()
        
        # Load pre-trained Vision Transformer
        self.vit = ViTModel.from_pretrained("google/vit-base-patch16-224")
        
        # Load pre-trained BERT
        self.bert = BertModel.from_pretrained("bert-base-multilingual-cased")
        
        # Dimensionality of ViT and BERT embeddings
        vit_hidden_size = self.vit.config.hidden_size  # 768 for ViT base
        bert_hidden_size = self.bert.config.hidden_size  # 768 for BERT base
        
        # Enhanced classification head
        self.classifier = nn.Sequential(
            nn.Linear(vit_hidden_size + bert_hidden_size, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, num_classes)
        )
        
    def forward(self, input_ids, attention_mask, pixel_values):
        # Process image with ViT
        vit_outputs = self.vit(pixel_values=pixel_values)
        vit_embeddings = vit_outputs.last_hidden_state[:, 0, :]  # CLS token
        
        # Process text with BERT
        bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        bert_embeddings = bert_outputs.last_hidden_state[:, 0, :]  # CLS token
        
        # Concatenate image and text features
        combined_embeddings = torch.cat((vit_embeddings, bert_embeddings), dim=1)
        
        # Classification
        logits = self.classifier(combined_embeddings)
        
        return logits

# XLM-RoBERTa Model Definition
class XLMRClassifier(nn.Module):
    def __init__(self, num_classes=2, dropout_rate=0.3):
        super(XLMRClassifier, self).__init__()
        
        # Load pre-trained XLM-RoBERTa
        self.xlmr = XLMRobertaModel.from_pretrained("xlm-roberta-base")
        
        # Get hidden size from config
        hidden_size = self.xlmr.config.hidden_size  # 768 for base model
        
        # Enhanced classification head
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, 512),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(256, num_classes)
        )
        
    def forward(self, input_ids, attention_mask):
        # Process text with XLM-R
        outputs = self.xlmr(input_ids=input_ids, attention_mask=attention_mask)
        
        # Get the [CLS] token embedding (first token)
        embeddings = outputs.last_hidden_state[:, 0, :]
        
        # Classification
        logits = self.classifier(embeddings)
        
        return logits

In [23]:
# MuRIL Image Classifier Model Definition
class MuRILImageClassifier(nn.Module):
    def __init__(self, num_classes=2):
        super(MuRILImageClassifier, self).__init__()
        
        # Load pretrained MuRIL
        self.muril = AutoModel.from_pretrained("google/muril-base-cased")
        
        # Image encoder (simplified CNN instead of ResNet50)
        self.image_encoder = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, stride=2, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(32, 64, kernel_size=3, stride=2, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(64, 128, kernel_size=3, stride=2, padding=1),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d((1, 1)),
            nn.Flatten()
        )
        
        # Freeze MuRIL weights
        for param in self.muril.parameters():
            param.requires_grad = False
            
        # Unfreeze the last few layers of MuRIL
        for param in self.muril.encoder.layer[-2:].parameters():
            param.requires_grad = True
        
        # Dimensions
        muril_hidden_size = self.muril.config.hidden_size  # 768 for muril-base
        image_feature_size = 128  # From our simplified CNN
        
        # Enhanced classification head
        self.classifier = nn.Sequential(
            nn.Linear(muril_hidden_size + image_feature_size, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, num_classes)
        )
        
    def forward(self, input_ids, attention_mask, token_type_ids, image):
        # Process text with MuRIL
        muril_outputs = self.muril(
            input_ids=input_ids, 
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        muril_embeddings = muril_outputs.last_hidden_state[:, 0, :]  # CLS token
        
        # Process image with our image encoder
        image_features = self.image_encoder(image)
        
        # Concatenate MuRIL and image features
        combined_embeddings = torch.cat((muril_embeddings, image_features), dim=1)
        
        # Classification
        logits = self.classifier(combined_embeddings)
        
        return logits

# ResNet50 + BERT Model Definition
class ResNet50BertClassifier(nn.Module):
    def __init__(self, num_classes=2):
        super(ResNet50BertClassifier, self).__init__()
        
        # Load pretrained ResNet50
        self.resnet = models.resnet50(pretrained=True)
        self.resnet = nn.Sequential(*list(self.resnet.children())[:-1])
        
        # Load BERT
        self.bert = BertModel.from_pretrained("bert-base-multilingual-cased")
        
        # Classifier head
        self.classifier = nn.Sequential(
            nn.Linear(2048 + self.bert.config.hidden_size, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, num_classes)
        )
        
    def forward(self, input_ids, attention_mask, image):
        image_features = self.resnet(image)
        image_features = image_features.view(image_features.size(0), -1)
        
        bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        bert_embeddings = bert_outputs.last_hidden_state[:, 0, :]
        
        combined = torch.cat((image_features, bert_embeddings), dim=1)
        logits = self.classifier(combined)
        return logits

# DenseNet121 + BERT Model Definition
class DenseNet121BertClassifier(nn.Module):
    def __init__(self, num_classes=2):
        super(DenseNet121BertClassifier, self).__init__()
        
        # Load pretrained DenseNet121
        self.densenet = models.densenet121(pretrained=True)
        self.densenet.classifier = nn.Identity()
        
        # Load pre-trained BERT
        self.bert = BertModel.from_pretrained("bert-base-multilingual-cased")
        
        # Dimensionality of DenseNet and BERT embeddings
        densenet_hidden_size = 1024  # DenseNet121 output feature size
        bert_hidden_size = self.bert.config.hidden_size  # 768 for BERT base
        
        # Enhanced classification head
        self.classifier = nn.Sequential(
            nn.Linear(densenet_hidden_size + bert_hidden_size, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, num_classes)
        )
        
    def forward(self, input_ids, attention_mask, image):
        # Process image with DenseNet121
        image_features = self.densenet.features(image)
        image_features = nn.functional.adaptive_avg_pool2d(image_features, (1, 1))
        image_features = image_features.view(image_features.size(0), -1)  # Flatten the features
        
        # Process text with BERT
        bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        bert_embeddings = bert_outputs.last_hidden_state[:, 0, :]  # CLS token
        
        # Concatenate image and text features
        combined_embeddings = torch.cat((image_features, bert_embeddings), dim=1)
        
        # Classification
        logits = self.classifier(combined_embeddings)
        
        return logits

In [24]:
# Create image mapping
def create_image_mapping(dataframe, image_directory):
    """
    Create a mapping between dataset 'name' column and actual image filenames in the directory.
    """
    # Get all image files in the directory
    image_files = glob.glob(os.path.join(image_directory, '*'))
    image_mapping = {}
    
    # Create a set of all available image filenames (without path)
    available_images = {os.path.basename(f) for f in image_files}
    print(f"Found {len(available_images)} images in directory")
    
    # Method 1: Exact match
    for name in dataframe['name'].unique():
        if name in available_images:
            image_mapping[name] = name
    
    # Method 2: Check if the id is part of the filename
    unmapped_names = set(dataframe['name']) - set(image_mapping.keys())
    for name in unmapped_names:
        # Extract ID from name (assuming name has some ID pattern)
        id_match = re.search(r'\d+', name)
        if id_match:
            id_value = id_match.group()
            # Look for files containing this ID
            matching_files = [f for f in available_images if id_value in f]
            if matching_files:
                image_mapping[name] = matching_files[0]
    
    # Method 3: Try matching using 'ids' or 'id' column if available
    if 'ids' in dataframe.columns or 'id' in dataframe.columns:
        id_col = 'ids' if 'ids' in dataframe.columns else 'id'
        id_to_name = dict(zip(dataframe[id_col], dataframe['name']))
        
        for id_value, name in id_to_name.items():
            if name not in image_mapping:
                # Look for files containing this ID
                matching_files = [f for f in available_images if str(id_value) in f]
                if matching_files:
                    image_mapping[name] = matching_files[0]
    
    print(f"Successfully mapped {len(image_mapping)} out of {len(dataframe['name'].unique())} unique names")
    return image_mapping

# Create image mapping
image_mapping = create_image_mapping(df_sample, image_dir)

Found 25716 images in directory
Successfully mapped 7661 out of 7661 unique names


In [25]:
# Dataset class for batch processing
class HarmfulContentDataset(Dataset):
    def __init__(self, dataframe, image_dir, tokenizer, muril_tokenizer, xlmr_tokenizer, 
                 image_mapping=None, max_len=128):
        self.dataframe = dataframe
        self.image_dir = image_dir
        self.tokenizer = tokenizer  # BERT tokenizer
        self.muril_tokenizer = muril_tokenizer
        self.xlmr_tokenizer = xlmr_tokenizer
        self.max_len = max_len
        self.image_mapping = image_mapping or {}
        
        # Create a default blank image
        self.blank_image = Image.new('RGB', (224, 224), color='white')
        
        # Image transformation
        self.transform = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])
        
        # ViT feature extractor
        self.vit_feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224")
    
    def __len__(self):
        return len(self.dataframe)
    
    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        
        # Combine text features
        combined_features = f"{row.get('gender', '')} {row.get('age', '')} {row.get('age_bucket', '')} {row.get('dominant_emotion', '')} {row.get('dominant_race', '')} {row.get('translated_text', '')}"
        
        # BERT tokenization
        bert_encoding = self.tokenizer.encode_plus(
            combined_features,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        
        # MuRIL tokenization
        muril_encoding = self.muril_tokenizer(
            combined_features,
            return_tensors='pt',
            padding='max_length',
            truncation=True,
            max_length=self.max_len
        )
        
        # XLM-RoBERTa tokenization
        xlmr_encoding = self.xlmr_tokenizer.encode_plus(
            combined_features,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        
        # Image processing
        try:
            # Get the correct image filename using the mapping
            image_filename = self.image_mapping.get(row['name'], row['name'])
            image_path = os.path.join(self.image_dir, image_filename)
            
            # Check if the file exists
            if os.path.exists(image_path):
                image = Image.open(image_path).convert('RGB')
            else:
                # Try alternate approach with different extensions
                base_name = os.path.splitext(image_path)[0]
                for ext in ['.jpg', '.jpeg', '.png', '.gif']:
                    alt_path = base_name + ext
                    if os.path.exists(alt_path):
                        image = Image.open(alt_path).convert('RGB')
                        break
                else:
                    # If still not found, use blank image
                    image = self.blank_image
                    if idx % 100 == 0:  # Limit logging
                        print(f"Image not found for {row['name']}, using blank image")
            
            # Apply transformations
            image_tensor = self.transform(image)
            
            # Get ViT pixel values
            vit_pixel_values = self.vit_feature_extractor(images=image, return_tensors="pt")['pixel_values'].squeeze(0)
            
        except Exception as e:
            if idx % 100 == 0:  # Limit logging
                print(f"Error processing image for {row['name']}: {e}")
            # Create blank image features
            image_tensor = torch.zeros((3, 224, 224))
            vit_pixel_values = torch.zeros((3, 224, 224))
        
        # Text for BiLSTM (just keeping the text for now)
        text_only = combined_features
        
        return {
            # BERT inputs
            'bert_input_ids': bert_encoding['input_ids'].flatten(),
            'bert_attention_mask': bert_encoding['attention_mask'].flatten(),
            
            # MuRIL inputs
            'muril_input_ids': muril_encoding['input_ids'].flatten(),
            'muril_attention_mask': muril_encoding['attention_mask'].flatten(),
            'muril_token_type_ids': muril_encoding['token_type_ids'].flatten(),
            
            # XLM-R inputs
            'xlmr_input_ids': xlmr_encoding['input_ids'].flatten(),
            'xlmr_attention_mask': xlmr_encoding['attention_mask'].flatten(),
            
            # Image inputs
            'image': image_tensor,
            'vit_pixel_values': vit_pixel_values,
            
            # Text for BiLSTM
            'text': text_only,
            
            # Label
            'labels': torch.tensor(row['label'], dtype=torch.long)
        }


In [26]:
# Loading models and tokenizers
def load_all_models(device):
    print("Loading models and tokenizers...")
    
    # Load tokenizers
    bert_tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
    muril_tokenizer = AutoTokenizer.from_pretrained("google/muril-base-cased")
    xlmr_tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")
    
    # Load BiLSTM tokenizer
    with open("/kaggle/input/bilstm/keras/default/1/tokenizer.pkl", "rb") as f:
        bilstm_tokenizer = pickle.load(f)
    
    # Load models
    # MuRIL model
    muril_model = MuRILImageClassifier(num_classes=2)
    muril_model.load_state_dict(torch.load("/kaggle/input/muril/pytorch/default/1/best_muril_model.pth", map_location=device))
    muril_model.to(device)
    muril_model.eval()
    
    # ResNet50 model
    resnet_model = ResNet50BertClassifier(num_classes=2)
    resnet_model.load_state_dict(torch.load("/kaggle/input/resnet50/pytorch/default/1/best_resnet50_bert_model.pth", map_location=device))
    resnet_model.to(device)
    resnet_model.eval()
    
    # DenseNet121 model
    densenet_model = DenseNet121BertClassifier(num_classes=2)
    densenet_model.load_state_dict(torch.load("/kaggle/input/densenet121/pytorch/default/1/best_densenet_bert_model.pth", map_location=device))
    densenet_model.to(device)
    densenet_model.eval()
    
    # BiLSTM model
    bilstm_model = tf.keras.models.load_model("/kaggle/input/bilstm/keras/default/1/bilstm_model.keras")
    
    # ViT model
    vit_model = ViTBertClassifier(num_classes=2)
    vit_model.load_state_dict(torch.load("/kaggle/input/vision-transformer/pytorch/default/1/best_vit_model.pth", map_location=device))
    vit_model.to(device)
    vit_model.eval()
    
    # XLM-R model
    xlmr_model = XLMRClassifier(num_classes=2)
    xlmr_model.load_state_dict(torch.load("/kaggle/input/xlm-roberta/pytorch/default/1/best_xlmr_model.pth", map_location=device))
    xlmr_model.to(device)
    xlmr_model.eval()
    
    print("All models loaded successfully!")
    
    return {
        'tokenizers': {
            'bert': bert_tokenizer,
            'muril': muril_tokenizer,
            'xlmr': xlmr_tokenizer,
            'bilstm': bilstm_tokenizer
        },
        'models': {
            'muril': muril_model,
            'resnet': resnet_model,
            'densenet': densenet_model,
            'bilstm': bilstm_model,
            'vit': vit_model,
            'xlmr': xlmr_model
        }
    }

In [27]:
# Voting ensemble prediction function
def predict_with_ensemble(batch, models, tokenizers, device, max_len=106):
    """
    Makes predictions using all models and returns the majority vote
    """
    with torch.no_grad():
        predictions = {
            'muril': [],
            'resnet': [],
            'densenet': [],
            'bilstm': [],
            'vit': [],
            'xlmr': []
        }
        
        # MuRIL prediction
        muril_outputs = models['muril'](
            input_ids=batch['muril_input_ids'].to(device),
            attention_mask=batch['muril_attention_mask'].to(device),
            token_type_ids=batch['muril_token_type_ids'].to(device),
            image=batch['image'].to(device)
        )
        muril_probs = torch.nn.functional.softmax(muril_outputs, dim=1)
        muril_preds = torch.argmax(muril_probs, dim=1)
        predictions['muril'] = muril_preds.cpu().numpy()
        
        # ResNet50 prediction
        resnet_outputs = models['resnet'](
            batch['bert_input_ids'].to(device),
            batch['bert_attention_mask'].to(device),
            batch['image'].to(device)
        )
        resnet_probs = torch.nn.functional.softmax(resnet_outputs, dim=1)
        resnet_preds = torch.argmax(resnet_probs, dim=1)
        predictions['resnet'] = resnet_preds.cpu().numpy()
        
        # DenseNet121 prediction
        densenet_outputs = models['densenet'](
            batch['bert_input_ids'].to(device),
            batch['bert_attention_mask'].to(device),
            batch['image'].to(device)
        )
        densenet_probs = torch.nn.functional.softmax(densenet_outputs, dim=1)
        densenet_preds = torch.argmax(densenet_probs, dim=1)
        predictions['densenet'] = densenet_preds.cpu().numpy()
        
        # BiLSTM prediction
        # Need to tokenize and pad text for BiLSTM
        batch_texts = batch['text']  # Assume this is a list of strings
        tokenized_sequences = []
        for text in batch_texts:
            # Convert tensor to string if needed
            if isinstance(text, torch.Tensor):
                text = text.item() if text.numel() == 1 else str(text)
            sequence = tokenizers['bilstm'].texts_to_sequences([text])
            padded = tf.keras.preprocessing.sequence.pad_sequences(sequence, maxlen=max_len)
            tokenized_sequences.append(padded[0])  # Take the first (only) item
        
        if tokenized_sequences:
            stacked_sequences = np.stack(tokenized_sequences)
            bilstm_probs = models['bilstm'].predict(stacked_sequences)
            bilstm_preds = (bilstm_probs > 0.7).astype(int).flatten()
            predictions['bilstm'] = bilstm_preds
        else:
            predictions['bilstm'] = np.zeros(len(batch['text']))
        
        # ViT prediction
        vit_outputs = models['vit'](
            input_ids=batch['bert_input_ids'].to(device),
            attention_mask=batch['bert_attention_mask'].to(device),
            pixel_values=batch['vit_pixel_values'].to(device)
        )
        vit_probs = torch.nn.functional.softmax(vit_outputs, dim=1)
        vit_preds = torch.argmax(vit_probs, dim=1)
        predictions['vit'] = vit_preds.cpu().numpy()
        
        # XLM-R prediction
        xlmr_outputs = models['xlmr'](
            input_ids=batch['xlmr_input_ids'].to(device),
            attention_mask=batch['xlmr_attention_mask'].to(device)
        )
        xlmr_probs = torch.nn.functional.softmax(xlmr_outputs, dim=1)
        xlmr_preds = torch.argmax(xlmr_probs, dim=1)
        predictions['xlmr'] = xlmr_preds.cpu().numpy()
        
        # Combine predictions (voting)
        all_preds = np.vstack([
            predictions['muril'],
            predictions['resnet'],
            predictions['densenet'],
            predictions['bilstm'],
            predictions['vit'],
            predictions['xlmr']
        ])
        
        # Simple majority voting
        ensemble_preds = np.apply_along_axis(
            lambda x: np.bincount(x).argmax(), 
            axis=0, 
            arr=all_preds
        )
        
        return ensemble_preds, predictions

In [28]:
# Main evaluation function
def evaluate_ensemble(df_sample, image_dir, image_mapping, batch_size=8):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")
    
    # Load models and tokenizers
    resources = load_all_models(device)
    models = resources['models']
    tokenizers = resources['tokenizers']
    
    # Create dataset
    dataset = HarmfulContentDataset(
        df_sample, 
        image_dir, 
        tokenizers['bert'], 
        tokenizers['muril'],
        tokenizers['xlmr'],
        image_mapping=image_mapping
    )
    
    # Create dataloader
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
    
    # Predictions storage
    all_labels = []
    ensemble_predictions = []
    individual_predictions = {
        'muril': [],
        'resnet': [],
        'densenet': [],
        'bilstm': [],
        'vit': [],
        'xlmr': []
    }
    
    print(f"Running predictions on {len(dataset)} samples...")
    
    # Prediction loop
    for batch in dataloader:
        # Get labels
        labels = batch['labels'].numpy()
        all_labels.extend(labels)
        
        # Get combined text features for BiLSTM
        batch_texts = [f"{row.get('gender', '')} {row.get('age', '')} {row.get('age_bucket', '')} {row.get('dominant_emotion', '')} {row.get('dominant_race', '')} {row.get('translated_text', '')}" 
                      for _, row in df_sample.iloc[len(ensemble_predictions):len(ensemble_predictions)+len(labels)].iterrows()]
        batch['text'] = batch_texts
        
        # Make predictions
        ensemble_preds, model_preds = predict_with_ensemble(batch, models, tokenizers, device)
        
        # Store predictions
        ensemble_predictions.extend(ensemble_preds)
        for model_name, preds in model_preds.items():
            individual_predictions[model_name].extend(preds)
    
    # Convert to numpy arrays
    all_labels = np.array(all_labels)
    ensemble_predictions = np.array(ensemble_predictions)
    
    for model_name in individual_predictions:
        individual_predictions[model_name] = np.array(individual_predictions[model_name])
    
    # Calculate metrics
    print("\n===== ENSEMBLE MODEL EVALUATION =====")
    print(classification_report(all_labels, ensemble_predictions, target_names=['Non-Harmful', 'Harmful']))
    ensemble_cm = confusion_matrix(all_labels, ensemble_predictions)
    
    # Plot confusion matrix for ensemble
    plt.figure(figsize=(8, 6))
    sns.heatmap(ensemble_cm, annot=True, fmt='d', cmap='Blues', 
               xticklabels=['Non-Harmful', 'Harmful'], 
               yticklabels=['Non-Harmful', 'Harmful'])
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix - Ensemble Model')
    plt.tight_layout()
    plt.savefig('ensemble_confusion_matrix.png')
    plt.close()
    
    # Calculate individual model performance
    print("\n===== INDIVIDUAL MODEL EVALUATION =====")
    individual_accuracies = {}
    
    for model_name, preds in individual_predictions.items():
        accuracy = accuracy_score(all_labels, preds)
        individual_accuracies[model_name] = accuracy
        print(f"{model_name.upper()} Accuracy: {accuracy:.4f}")
        
        # Generate classification report
        print(f"\n{model_name.upper()} Classification Report:")
        print(classification_report(all_labels, preds, target_names=['Non-Harmful', 'Harmful']))
        
        # Confusion matrix
        cm = confusion_matrix(all_labels, preds)
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                   xticklabels=['Non-Harmful', 'Harmful'],
                   yticklabels=['Non-Harmful', 'Harmful'])
        plt.xlabel('Predicted')
        plt.ylabel('True')
        plt.title(f'Confusion Matrix - {model_name.upper()}')
        plt.tight_layout()
        plt.savefig(f'{model_name}_confusion_matrix.png')
        plt.close()
    
    # Plot comparison of model accuracies
    plt.figure(figsize=(10, 6))
    models_names = list(individual_accuracies.keys()) + ['ensemble']
    accuracies = list(individual_accuracies.values()) + [accuracy_score(all_labels, ensemble_predictions)]
    sns.barplot(x=models_names, y=accuracies)
    plt.xlabel('Model')
    plt.ylabel('Accuracy')
    plt.title('Model Accuracy Comparison')
    plt.ylim(0, 1)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig('model_comparison.png')
    plt.close()
    
    print("\nEvaluation complete! Results saved as PNG files.")
    
    return {
        'ensemble_accuracy': accuracy_score(all_labels, ensemble_predictions),
        'individual_accuracies': individual_accuracies,
        'ensemble_report': classification_report(all_labels, ensemble_predictions, output_dict=True),
        'ensemble_cm': ensemble_cm
    }


In [29]:
# Define models (reusing the models from the provided code)
# MuRILImageClassifier, ResNet50BertClassifier, DenseNet121BertClassifier, ViTBertClassifier, XLMRClassifier
# ... (model definitions from the original code)
# Run the evaluation
if __name__ == "__main__":
    results = evaluate_ensemble(df_sample, image_dir, image_mapping)
    
    # Optionally, save the results to a CSV file
    ensemble_metrics = results['ensemble_report']['weighted avg']
    metrics_df = pd.DataFrame({
        'Model': ['Ensemble'] + list(results['individual_accuracies'].keys()),
        'Accuracy': [results['ensemble_accuracy']] + list(results['individual_accuracies'].values()),
        'Precision': [ensemble_metrics['precision']] + [0] * len(results['individual_accuracies']),  # You'd need to calculate these for individual models
        'Recall': [ensemble_metrics['recall']] + [0] * len(results['individual_accuracies']),
        'F1-Score': [ensemble_metrics['f1-score']] + [0] * len(results['individual_accuracies'])
    })
    
    metrics_df.to_csv('model_comparison_results.csv', index=False)
    print("\nResults exported to model_comparison_results.csv")

Using device: cuda
Loading models and tokenizers...


pytorch_model.bin:   0%|          | 0.00/953M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/953M [00:00<?, ?B/s]

Downloading: "https://download.pytorch.org/models/densenet121-a639ec97.pth" to /root/.cache/torch/hub/checkpoints/densenet121-a639ec97.pth
100%|██████████| 30.8M/30.8M [00:00<00:00, 222MB/s]
I0000 00:00:1745841821.679835      31 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 10588 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1745841821.680456      31 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 13942 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5


config.json:   0%|          | 0.00/69.7k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

All models loaded successfully!


preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

Running predictions on 7680 samples...


I0000 00:00:1745841849.644650     127 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms