In [None]:
# Install required packages
!pip install transformers datasets torch scikit-learn pandas matplotlib seaborn nltk emoji tensorflow openpyxl --quiet

# Check if running in Google Colab.
# Only a failed import means "not in Colab"; catching ImportError (rather than a
# bare `except`) keeps KeyboardInterrupt and unrelated errors visible.
try:
    import google.colab  # noqa: F401 - imported only to probe the environment
    IN_COLAB = True
    print("✓ Running in Google Colab")
except ImportError:
    IN_COLAB = False
    print("✓ Running locally")

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/608.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m604.2/608.4 kB[0m [31m28.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m608.4/608.4 kB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
[?25h

 **Import Libraries**

In [None]:
import pandas as pd
import numpy as np
import re, nltk, emoji
import torch
import seaborn as sns
import matplotlib.pyplot as plt
import os

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,confusion_matrix,classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from torch.utils.data import Dataset, DataLoader
from torch import nn
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,LSTM,Dense,Dropout

# Check if running in Google Colab.
# The probe is repeated here so this cell defines IN_COLAB even when it is the
# first cell executed. Only ImportError is treated as "not in Colab".
try:
    import google.colab  # noqa: F401 - imported only to probe the environment
    IN_COLAB = True
    print("✓ Running in Google Colab")
except ImportError:
    IN_COLAB = False
    print("✓ Running locally")

**Option: Upload Files Directly (Google Colab)**

If you want to upload files directly in Colab, uncomment and run the cell below:


In [None]:
# Uncomment the lines below if you want to upload files directly in Colab
# from google.colab import files
# uploaded = files.upload()
# # After uploading, files will be in the current directory
# # You can then proceed to the next cell


**Load Dataset Files**

**For Google Colab Users:**
1. Upload both dataset files using the file uploader below, OR
2. Mount Google Drive and place files in your Drive folder

In [None]:
# Google Colab: File Upload Option (Uncomment if uploading files directly)
# from google.colab import files
# uploaded = files.upload()  # This will prompt you to upload files

# Google Colab: Mount Google Drive Option (Uncomment if using Drive)
# from google.colab import drive
# drive.mount('/content/drive')
# # Then update file paths below to: '/content/drive/MyDrive/your_folder/filename.xlsx'

import os

# Candidate locations for both dataset files.
# The working directory is always searched; in Colab the upload folder and the
# two common Google Drive mount points are searched as well, in that order.
possible_paths_csv = ["Urdu Abusive Dataset.csv"]
possible_paths_xlsx = ["Hate Speech Roman Urdu (HS-RU-20).xlsx"]

if IN_COLAB:
    possible_paths_csv += [
        "/content/Urdu Abusive Dataset.csv",
        "/content/drive/MyDrive/Urdu Abusive Dataset.csv",
        "/content/drive/My Drive/Urdu Abusive Dataset.csv",
    ]
    possible_paths_xlsx += [
        "/content/Hate Speech Roman Urdu (HS-RU-20).xlsx",
        "/content/drive/MyDrive/Hate Speech Roman Urdu (HS-RU-20).xlsx",
        "/content/drive/My Drive/Hate Speech Roman Urdu (HS-RU-20).xlsx",
    ]

# Find and load first dataset
# Start from an empty frame so later cells can test `.empty` even if loading fails.
df_csv = pd.DataFrame()
csv_file_path = next((path for path in possible_paths_csv if os.path.exists(path)), None)

if csv_file_path:
    try:
        # The file is named *.csv but may actually be an Excel workbook saved with
        # the wrong extension. XLSX files are ZIP archives, so sniff the magic
        # bytes instead of trusting the name.
        with open(csv_file_path, "rb") as fh:
            is_workbook = fh.read(4) == b"PK\x03\x04"

        if is_workbook:
            df_csv = pd.read_excel(csv_file_path, engine='openpyxl')
        else:
            try:
                # utf-8-sig reads plain UTF-8 and also strips a leading BOM.
                df_csv = pd.read_csv(csv_file_path, encoding="utf-8-sig")
            except UnicodeDecodeError:
                # Keep as much text as possible rather than losing the whole file.
                print("  Note: file is not valid UTF-8; undecodable bytes were replaced")
                df_csv = pd.read_csv(csv_file_path, encoding="utf-8", encoding_errors="replace")
        print(f"✓ Loaded Urdu Abusive Dataset.csv from: {csv_file_path}")
    except Exception as e:
        # Best-effort loading: report the problem and continue with an empty frame.
        print(f"✗ Error loading first file: {e}")
        df_csv = pd.DataFrame()
else:
    print("✗ Urdu Abusive Dataset.csv not found. Please upload it or check the path.")
    print("  Searched in:", possible_paths_csv)

# Find and load second dataset
# Start from an empty frame so later cells can test `.empty` even if loading fails.
df_tsv = pd.DataFrame()
xlsx_file_path = next(filter(os.path.exists, possible_paths_xlsx), None)

if not xlsx_file_path:
    print("✗ Hate Speech Roman Urdu (HS-RU-20).xlsx not found. Please upload it or check the path.")
    print("  Searched in:", possible_paths_xlsx)
else:
    try:
        df_tsv = pd.read_excel(xlsx_file_path, engine='openpyxl')
    except Exception as e:
        # Best-effort loading: report the problem and continue with an empty frame.
        print(f"✗ Error loading second file: {e}")
        df_tsv = pd.DataFrame()
    else:
        print(f"✓ Loaded Hate Speech Roman Urdu (HS-RU-20).xlsx from: {xlsx_file_path}")

# Display dataset info: shape and column names of each frame, or a notice when
# the corresponding file could not be loaded.
if df_csv.empty:
    print("\n✗ First dataset is empty")
else:
    print(f"\n✓ First dataset shape: {df_csv.shape}")
    print(f"✓ First dataset columns: {df_csv.columns.tolist()}")

if df_tsv.empty:
    print("\n✗ Second dataset is empty")
else:
    print(f"\n✓ Second dataset shape: {df_tsv.shape}")
    print(f"✓ Second dataset columns: {df_tsv.columns.tolist()}")

UnicodeDecodeError: 'utf-8' codec can't decode bytes in position 15-16: invalid continuation byte

**Standardize Columns**

In [None]:
# Auto-detect and standardize columns to 'text' and 'label'
# Common text column names (checked in this order)
text_cols = ['comment', 'tweet', 'message', 'content', 'text', 'Comment', 'Tweet', 'Message', 'Content', 'Text',
             'comment_text', 'Comment_Text', 'sentence', 'Sentence']
# Common label column names (checked in this order)
label_cols = ['label', 'Label', 'class', 'Class', 'category', 'Category', 'toxic', 'Toxic', 'hate', 'Hate',
              'comment_class', 'Comment_Class', 'Neutral (N) / Hostile (H)', 'neutral (n) / hostile (h)']


def _match_column(df, candidates, canonical):
    """Return the column of ``df`` matching ``candidates``, or None.

    Lookup order:
      1. ``canonical`` itself, so an existing 'text'/'label' column is never
         shadowed by renaming another column onto the same name.
      2. The first exact hit from ``candidates``.
      3. The first column whose stripped, lower-cased name equals a candidate
         (spreadsheet headers often carry stray whitespace or odd casing).
    """
    if canonical in df.columns:
        return canonical
    for name in candidates:
        if name in df.columns:
            return name
    wanted = {name.strip().lower() for name in candidates}
    for col in df.columns:
        if str(col).strip().lower() in wanted:
            return col
    return None


def standardize_columns(df, dataset_name):
    """Auto-detect and rename columns to standard 'text' and 'label'.

    Args:
        df: Input DataFrame; it is copied, never modified in place.
        dataset_name: Label used in the progress/warning messages.

    Returns:
        A copy of ``df`` where the detected text column is named 'text' and the
        detected label column is named 'label'. A column that cannot be
        detected is left untouched and a warning is printed.
    """
    df = df.copy()

    text_col = _match_column(df, text_cols, 'text')
    label_col = _match_column(df, label_cols, 'label')

    # If not found, fall back to the first string-like column for the text ...
    if text_col is None:
        for col in df.columns:
            dtype = df[col].dtype
            # 'object' is the classic string container; is_string_dtype also
            # accepts pandas' dedicated string dtype.
            if col != label_col and (dtype == 'object' or pd.api.types.is_string_dtype(dtype)):
                text_col = col
                break

    # ... and to the first numeric or categorical column for the label.
    if label_col is None:
        for col in df.columns:
            if col != text_col and (df[col].dtype in ['int64', 'float64'] or df[col].dtype.name == 'category'):
                label_col = col
                break

    if text_col is not None:
        df = df.rename(columns={text_col: 'text'})
        print(f"{dataset_name}: Text column '{text_col}' -> 'text'")
    else:
        print(f"Warning: Could not find text column in {dataset_name}")

    if label_col is not None:
        df = df.rename(columns={label_col: 'label'})
        print(f"{dataset_name}: Label column '{label_col}' -> 'label'")
    else:
        print(f"Warning: Could not find label column in {dataset_name}")

    return df

# Standardize both frames (first dataset first), then show the resulting columns.
df_csv, df_tsv = (
    standardize_columns(frame, tag)
    for frame, tag in ((df_csv, "Dataset 1"), (df_tsv, "Dataset 2"))
)

print("\nDataset 1 columns after standardization:", df_csv.columns.tolist())
print("Dataset 2 columns after standardization:", df_tsv.columns.tolist())

**Merge Datasets**

In [None]:
# Merge datasets
df = pd.concat([df_csv, df_tsv], axis=0, ignore_index=True)

# Keep only text and label columns. Fail immediately with an actionable message
# when one is missing; the steps below would otherwise raise an opaque KeyError.
missing_cols = [col for col in ('text', 'label') if col not in df.columns]
if missing_cols:
    print("Error: 'text' or 'label' columns not found after merging")
    print("Available columns:", df.columns.tolist())
    raise KeyError(
        f"Required column(s) {missing_cols} missing after merging; "
        "check the column standardization step"
    )
df = df[['text', 'label']]

# Remove nulls first, then duplicates. In the opposite order an unlabeled copy
# of a text could be kept as "the first duplicate" and then dropped for its
# missing label, losing the labeled copy as well.
df = df.dropna(subset=["text", "label"])
df = df.drop_duplicates(subset=["text"])

# Standardize labels to binary (0/1)
# Handle various label formats
# Spellings (lower-cased, stripped) that map to class 0 / class 1.
_NON_TOXIC_LABELS = frozenset({
    '0', '0.0', '0.00',
    'non-toxic', 'nontoxic', 'non_toxic', 'negative', 'normal', 'clean', 'safe',
    'no', 'false', 'n', 'neutral',
})
_TOXIC_LABELS = frozenset({
    '1', '1.0', '1.00',
    'toxic', 'hate', 'abusive', 'positive', 'yes', 'true', 'h', 'hostile',
})


def standardize_label(label):
    """Convert various label formats to binary 0/1.

    Args:
        label: Raw label value (bool, number, or string such as 'H' / 'neutral').

    Returns:
        None for missing values, otherwise 0 (non-toxic) or 1 (toxic).
        Numeric values not listed explicitly are thresholded at 0.5; strings
        that are neither recognised nor numeric default to 0.
    """
    if pd.isna(label):
        return None

    # Handle boolean values directly
    if isinstance(label, bool):
        return 1 if label else 0

    # Compare on a normalised string form
    label_str = str(label).lower().strip()

    if label_str in _NON_TOXIC_LABELS:
        return 0
    if label_str in _TOXIC_LABELS:
        return 1

    # Any other numeric value: threshold at 0.5
    try:
        num = float(label_str)
    except ValueError:
        # float() on a string fails only with ValueError; catching just that
        # keeps unrelated errors (and KeyboardInterrupt) from being swallowed.
        # Default: assume non-toxic if unclear
        return 0
    return int(num > 0.5)

# Convert labels to 0/1, drop rows whose label came back as missing, then
# shuffle with a fixed seed so the row order is reproducible.
df = (
    df.assign(label=df['label'].apply(standardize_label))
      .dropna(subset=["label"])
      .sample(frac=1, random_state=42)
      .reset_index(drop=True)
)

label_counts = df['label'].value_counts()
print("Merged dataset size:", df.shape)
print("\nLabel distribution:")
print(label_counts.sort_index())
print(f"\nLabel value counts:\n{label_counts}")

**Roman Urdu Normalization & Cleaning**

In [None]:
nltk.download('stopwords', quiet=True)
from nltk.corpus import stopwords

# Extended Roman Urdu stopwords
stopwords_roman = ["hai", "hay", "he", "hain", "kya", "ha", "me", "tum", "nai", "nahi", "na", 
                   "mein", "main", "acha", "accha", "bohat", "bahut", "nh", "h", "ho", "hoon",
                   "ka", "ki", "ke", "ko", "se", "par", "aur", "ya", "bhi", "to", "tu"]

# Urdu-script stopwords are optional: fall back to an empty set when NLTK cannot
# provide them. `except Exception` (not a bare `except`) keeps KeyboardInterrupt
# and SystemExit from being swallowed.
try:
    stopwords_urdu = set(stopwords.words('urdu'))
except Exception:
    stopwords_urdu = set()

# Comprehensive slang and spelling variation mapping for Roman Urdu.
# Keys are lower-case variants; values are the canonical spelling. Identity
# entries (e.g. "bhai": "bhai") document the canonical form itself.
slang_map = {
    # Common spelling variations
    "yar": "yaar", "yarr": "yaar", "yaaar": "yaar",
    "bhai": "bhai", "bhaii": "bhai", "bro": "bhai",
    "ganda": "gandi", "gandha": "gandi",
    "lanat": "laanat", "lanath": "laanat",
    "chutiya": "chutiya", "chutia": "chutiya", "chootiya": "chutiya",
    "bkwas": "bakwas", "bakwaas": "bakwas", "bakwass": "bakwas",
    "larki": "ladki", "larkee": "ladki", "ladkee": "ladki",
    "larka": "ladka", "larkaa": "ladka", "ladkaa": "ladka",
    "tum": "tm", "tu": "tm", "tumhe": "tmhe",
    "mein": "main", "me": "main", "mujhe": "mujhe",
    "hai": "hai", "hay": "hai", "he": "hai",
    "nahi": "nahi", "nai": "nahi", "na": "nahi",
    "acha": "accha", "accha": "accha", "achha": "accha",
    "bohat": "bahut", "bahut": "bahut", "bohot": "bahut",
    # More variations
    "kya": "kya", "kyaa": "kya",
    "kar": "kar", "karr": "kar",
    "de": "de", "dey": "de",
    "le": "le", "ley": "le",
    "ja": "ja", "jaa": "ja",
    "aa": "aa", "aao": "aao",
    # The literal previously listed "gaya" twice; the second entry silently
    # overwrote the first, so a single entry is equivalent.
    "gaya": "gaya",
    "aya": "aya", "aaya": "aya",
}

def normalize_roman_urdu(text):
    """Normalize Roman Urdu spelling variations.

    Splits ``text`` on whitespace and replaces every token found in
    ``slang_map`` with its canonical spelling. An exact match is tried first,
    then a lower-cased match; unknown tokens are kept unchanged.

    Returns:
        The tokens re-joined with single spaces.
    """
    normalized_words = []
    for word in text.split():
        replacement = slang_map.get(word)
        if replacement is None:
            replacement = slang_map.get(word.lower(), word)
        normalized_words.append(replacement)
    return " ".join(normalized_words)

def clean_text(text):
    """Comprehensive text cleaning and normalization.

    Steps: lower-case, strip emojis and URLs, replace every character that is
    not a Latin letter, digit, Urdu letter or space with a space, remove
    stopwords, and normalize Roman Urdu spellings.

    Args:
        text: Raw text; missing values are accepted.

    Returns:
        The cleaned string ("" for missing input or when nothing survives).
    """
    if pd.isna(text):
        return ""

    text = str(text).lower()

    # Remove emojis
    text = emoji.replace_emoji(text, "")

    # Remove URLs
    text = re.sub(r"http\S+|www\S+", "", text)

    # Remove extra whitespace
    text = re.sub(r"\s+", " ", text)

    # Keep Urdu letters, English letters, numbers, and spaces.
    # The Urdu range runs from hamza (U+0621) to U+06D3 so that bari ye "ے"
    # (U+06D2) is kept; a range ending at "ی" (U+06CC) would replace it with a
    # space and break words apart.
    text = re.sub(r"[^a-zA-Z0-9\u0621-\u06D3 ]", " ", text)

    def _is_stopword(token):
        return token in stopwords_roman or token in stopwords_urdu

    # Check stopwords both before and after spelling normalization: the slang
    # map rewrites "tum"/"tu" to "tm", which is not in the stopword list, so
    # checking only after normalization lets those stopwords through.
    kept_words = []
    for raw_word in text.split():
        if _is_stopword(raw_word):
            continue
        word = normalize_roman_urdu(raw_word)
        if len(word) > 1 and not _is_stopword(word):
            kept_words.append(word)

    text = " ".join(kept_words)

    # Final cleanup
    text = re.sub(r"\s+", " ", text).strip()

    return text

# Clean every text, then discard rows whose text became empty.
print("Cleaning and normalizing text...")
df = df.assign(text=df['text'].apply(clean_text))
df = df.loc[df['text'].str.len() > 0]

print(f"Dataset size after cleaning: {df.shape}")
print("\nSample cleaned texts:")
print(df.head(10))

**Train-Test Split**

In [None]:
# 80/20 train/test split, stratified on the label so both splits keep the same
# class proportions; the seed fixes the partition.
X_train, X_test, y_train, y_test = train_test_split(
    df["text"], df["label"],
    test_size=0.2, random_state=42, stratify=df["label"],
)

# Labels are cast to integers for the downstream models
y_train, y_test = y_train.astype(int), y_test.astype(int)

print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")
print(f"\nTraining label distribution:\n{y_train.value_counts().sort_index()}")
print(f"\nTest label distribution:\n{y_test.value_counts().sort_index()}")

# **PART 1 – ML Baselines**

**TF-IDF Vectorization**

In [None]:
# Learn the TF-IDF vocabulary (capped at 5000 features) on the training texts
# only, then encode the test texts with that same vocabulary.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec, X_test_vec = vectorizer.fit_transform(X_train), vectorizer.transform(X_test)

**Logistic Regression**

In [None]:
# Integer copies of the labels for this model
y_train_lr, y_test_lr = y_train.astype(int), y_test.astype(int)

# Fit on the TF-IDF features and predict the test split
lr = LogisticRegression(max_iter=500, random_state=42)
lr.fit(X_train_vec, y_train_lr)
lr_pred = lr.predict(X_test_vec)

print("\n" + "="*50)
print("Logistic Regression Results")
print("="*50)
print(f"Accuracy:  {accuracy_score(y_test_lr, lr_pred):.4f}")
# Precision / recall / F1 are support-weighted averages over both classes
for caption, scorer in (("Precision: ", precision_score), ("Recall:    ", recall_score), ("F1-Score:  ", f1_score)):
    print(f"{caption}{scorer(y_test_lr, lr_pred, average='weighted', zero_division=0):.4f}")
print("\nDetailed Classification Report:")
print(classification_report(y_test_lr, lr_pred, zero_division=0))

**SVM**

In [None]:
# Integer copies of the labels for this model
y_train_svm, y_test_svm = y_train.astype(int), y_test.astype(int)

# Linear-kernel SVM on the TF-IDF features
svm = SVC(kernel='linear', random_state=42)
svm.fit(X_train_vec, y_train_svm)
svm_pred = svm.predict(X_test_vec)

print("\n" + "="*50)
print("SVM Results")
print("="*50)
print(f"Accuracy:  {accuracy_score(y_test_svm, svm_pred):.4f}")
# Precision / recall / F1 are support-weighted averages over both classes
for caption, scorer in (("Precision: ", precision_score), ("Recall:    ", recall_score), ("F1-Score:  ", f1_score)):
    print(f"{caption}{scorer(y_test_svm, svm_pred, average='weighted', zero_division=0):.4f}")
print("\nDetailed Classification Report:")
print(classification_report(y_test_svm, svm_pred, zero_division=0))

# **PART 2 – LSTM Deep Learning**

**Prepare Tokenizer & Sequences**

In [None]:
# Every sequence is padded (at the end) or truncated to this many tokens
max_len = 50

# The vocabulary is learned from the training texts only
keras_tokenizer = Tokenizer(num_words=8000)
keras_tokenizer.fit_on_texts(X_train)

X_train_seq, X_test_seq = (
    keras_tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)
X_train_pad, X_test_pad = (
    pad_sequences(sequences, maxlen=max_len, padding='post')
    for sequences in (X_train_seq, X_test_seq)
)

**Build & Train LSTM Model**

In [None]:
# Ensure labels are numeric and properly formatted for LSTM
y_train_lstm = y_train.astype(int).values
y_test_lstm = y_test.astype(int).values

# Seed Python, NumPy and TensorFlow so this run is repeatable, matching the
# fixed seed (42) used by the other models in this notebook.
from tensorflow.keras.utils import set_random_seed
set_random_seed(42)

model_lstm = Sequential([
    # input_dim must cover the tokenizer vocabulary size (num_words=8000).
    # The deprecated `input_length` argument is omitted; Keras infers the
    # sequence length from the padded input at fit time.
    Embedding(input_dim=8000, output_dim=128),
    LSTM(64, return_sequences=False),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')  # single probability for the positive class
])

model_lstm.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model (20% of the training data is held out for validation)
print("Training LSTM model...")
history = model_lstm.fit(
    X_train_pad, y_train_lstm, 
    validation_split=0.2, 
    epochs=4, 
    batch_size=32,
    verbose=1
)

# Make predictions: threshold the sigmoid output at 0.5
lstm_pred_proba = model_lstm.predict(X_test_pad, verbose=0)
lstm_pred = (lstm_pred_proba > 0.5).astype(int).flatten()

# Ensure predictions are same length as test labels
if len(lstm_pred) != len(y_test_lstm):
    print(f"Warning: Prediction length {len(lstm_pred)} != test length {len(y_test_lstm)}")
    min_len = min(len(lstm_pred), len(y_test_lstm))
    lstm_pred = lstm_pred[:min_len]
    y_test_lstm = y_test_lstm[:min_len]

print("\n" + "="*50)
print("LSTM Results")
print("="*50)
print(f"Accuracy:  {accuracy_score(y_test_lstm, lstm_pred):.4f}")
print(f"Precision: {precision_score(y_test_lstm, lstm_pred, average='weighted', zero_division=0):.4f}")
print(f"Recall:    {recall_score(y_test_lstm, lstm_pred, average='weighted', zero_division=0):.4f}")
print(f"F1-Score:  {f1_score(y_test_lstm, lstm_pred, average='weighted', zero_division=0):.4f}")
print("\nDetailed Classification Report:")
print(classification_report(y_test_lstm, lstm_pred, zero_division=0))

# **PART 3 – Transformer Model (XLM-RoBERTa)**

**Tokenizer & Model**

In [None]:
# Checkpoint identifier passed to both from_pretrained calls below.
model_name = "cardiffnlp/twitter-xlm-roberta-base"
# Tokenizer matching the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Sequence-classification model configured for two labels (binary task).
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

**Tokenize Data**

In [None]:
# Integer label arrays for the XLM-RoBERTa datasets
y_train_xlm, y_test_xlm = (labels.astype(int).values for labels in (y_train, y_test))

# Tokenize both splits with identical settings: pad to the longest sequence,
# truncate at 128 tokens, return PyTorch tensors.
xlm_tokenize_kwargs = dict(padding=True, truncation=True, max_length=128, return_tensors="pt")
train_enc = tokenizer(list(X_train), **xlm_tokenize_kwargs)
test_enc = tokenizer(list(X_test), **xlm_tokenize_kwargs)

class HSDDataset(Dataset):
    """Torch dataset pairing tokenizer encodings with integer class labels.

    Args:
        encodings: Mapping from field name to an indexable batch whose first
            axis enumerates the examples.
        labels: Sequence of class ids; stored as a ``torch.long`` tensor.
    """

    def __init__(self, encodings, labels):
        self.labels = torch.tensor(labels, dtype=torch.long)
        self.encodings = encodings

    def __len__(self):
        # One example per label
        return len(self.labels)

    def __getitem__(self, idx):
        # Take row `idx` of every encoding field and attach the label under
        # the 'labels' key.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample['labels'] = self.labels[idx]
        return sample

# Wrap the encodings and labels of each split in a torch dataset
train_dataset_xlm, test_dataset_xlm = (
    HSDDataset(train_enc, y_train_xlm),
    HSDDataset(test_enc, y_test_xlm),
)

print(f"Train dataset size: {len(train_dataset_xlm)}")
print(f"Test dataset size: {len(test_dataset_xlm)}")

**Train Transformer**

In [None]:
# Checkpoints go under /content in Colab, next to the notebook otherwise
output_dir_xlm = "/content/results_xlm" if IN_COLAB else "./results_xlm"

# Two epochs, batch size 8; evaluate and checkpoint after every epoch and
# reload the checkpoint with the highest accuracy when training ends.
training_args = TrainingArguments(**{
    "output_dir": output_dir_xlm,
    "num_train_epochs": 2,
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    "eval_strategy": "epoch",
    "logging_steps": 50,
    "save_strategy": "epoch",
    "load_best_model_at_end": True,
    "metric_for_best_model": "accuracy",
    "greater_is_better": True,
    "seed": 42,
    "fp16": False,  # Set to True if using GPU in Colab
})

# Metric callback handed to the Trainer
def compute_metrics(eval_pred):
    """Compute accuracy and support-weighted precision/recall/F1.

    Args:
        eval_pred: Pair ``(predictions, labels)``; the predicted class is the
            argmax of ``predictions`` along axis 1.

    Returns:
        Dict with keys 'accuracy', 'precision', 'recall' and 'f1'.
    """
    scores, labels = eval_pred
    predicted = np.argmax(scores, axis=1)

    metrics = {'accuracy': accuracy_score(labels, predicted)}
    for key, scorer in (('precision', precision_score), ('recall', recall_score), ('f1', f1_score)):
        metrics[key] = scorer(labels, predicted, average='weighted', zero_division=0)
    return metrics

# NOTE(review): the test split doubles as the evaluation set, so checkpoint
# selection (load_best_model_at_end) sees test data - consider a separate
# validation split.
trainer_xlm = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset_xlm,
    eval_dataset=test_dataset_xlm,
)

print("Training XLM-RoBERTa model...")
trainer_xlm.train()

**Evaluate Transformer**

In [None]:
# Evaluate XLM-RoBERTa on the test split
print("\nEvaluating XLM-RoBERTa...")
try:
    predictions_xlm = trainer_xlm.predict(test_dataset_xlm)
    # Predicted class = index of the largest score per example
    preds_xlm = np.argmax(predictions_xlm.predictions, axis=1)
except NameError:
    raise NameError("trainer_xlm is not defined. Please run the training cell first.")

# Trim both arrays to the shorter length if they ever disagree
if len(preds_xlm) != len(y_test_xlm):
    min_len = min(len(preds_xlm), len(y_test_xlm))
    preds_xlm, y_test_xlm = preds_xlm[:min_len], y_test_xlm[:min_len]

print("\n" + "="*50)
print("XLM-RoBERTa Results")
print("="*50)
print(f"Accuracy:  {accuracy_score(y_test_xlm, preds_xlm):.4f}")
# Precision / recall / F1 are support-weighted averages over both classes
for caption, scorer in (("Precision: ", precision_score), ("Recall:    ", recall_score), ("F1-Score:  ", f1_score)):
    print(f"{caption}{scorer(y_test_xlm, preds_xlm, average='weighted', zero_division=0):.4f}")
print("\nDetailed Classification Report:")
print(classification_report(y_test_xlm, preds_xlm, zero_division=0))

# **PART 4 – Transformer Model (mBERT)**


**Tokenizer & Model (mBERT)**


In [None]:
# Load mBERT model
# Checkpoint identifier passed to both from_pretrained calls below.
model_name_mbert = "bert-base-multilingual-cased"
# Tokenizer matching the checkpoint.
tokenizer_mbert = AutoTokenizer.from_pretrained(model_name_mbert)
# Sequence-classification model configured for two labels (binary task).
model_mbert = AutoModelForSequenceClassification.from_pretrained(model_name_mbert, num_labels=2)

print(f"Loaded mBERT model: {model_name_mbert}")


**Tokenize Data (mBERT)**


In [None]:
# Integer label arrays for the mBERT datasets
y_train_mbert, y_test_mbert = (labels.astype(int).values for labels in (y_train, y_test))

# Tokenize both splits with identical settings: pad to the longest sequence,
# truncate at 128 tokens, return PyTorch tensors.
mbert_tokenize_kwargs = dict(padding=True, truncation=True, max_length=128, return_tensors="pt")
train_enc_mbert = tokenizer_mbert(list(X_train), **mbert_tokenize_kwargs)
test_enc_mbert = tokenizer_mbert(list(X_test), **mbert_tokenize_kwargs)

# Wrap the encodings and labels of each split in a torch dataset
train_dataset_mbert, test_dataset_mbert = (
    HSDDataset(train_enc_mbert, y_train_mbert),
    HSDDataset(test_enc_mbert, y_test_mbert),
)

print(f"Train dataset size: {len(train_dataset_mbert)}")
print(f"Test dataset size: {len(test_dataset_mbert)}")


**Train Transformer (mBERT)**


In [None]:
# Checkpoints go under /content in Colab, next to the notebook otherwise
output_dir_mbert = "/content/results_mbert" if IN_COLAB else "./results_mbert"

# Two epochs, batch size 8; evaluate and checkpoint after every epoch and
# reload the checkpoint with the highest accuracy when training ends.
training_args_mbert = TrainingArguments(**{
    "output_dir": output_dir_mbert,
    "num_train_epochs": 2,
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    "eval_strategy": "epoch",
    "logging_steps": 50,
    "save_strategy": "epoch",
    "load_best_model_at_end": True,
    "metric_for_best_model": "accuracy",
    "greater_is_better": True,
    "seed": 42,
    "fp16": False,  # Set to True if using GPU in Colab
})

# NOTE(review): the test split doubles as the evaluation set, so checkpoint
# selection (load_best_model_at_end) sees test data - consider a separate
# validation split.
trainer_mbert = Trainer(
    args=training_args_mbert,
    model=model_mbert,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset_mbert,
    eval_dataset=test_dataset_mbert,
)

print("Training mBERT model...")
trainer_mbert.train()


**Evaluate Transformer (mBERT)**


In [None]:
# Evaluate mBERT on the test split
print("\nEvaluating mBERT...")
try:
    predictions_mbert = trainer_mbert.predict(test_dataset_mbert)
    # Predicted class = index of the largest score per example
    preds_mbert = np.argmax(predictions_mbert.predictions, axis=1)
except NameError:
    raise NameError("trainer_mbert is not defined. Please run the training cell first.")

# Trim both arrays to the shorter length if they ever disagree
if len(preds_mbert) != len(y_test_mbert):
    min_len = min(len(preds_mbert), len(y_test_mbert))
    preds_mbert, y_test_mbert = preds_mbert[:min_len], y_test_mbert[:min_len]

print("\n" + "="*50)
print("mBERT Results")
print("="*50)
print(f"Accuracy:  {accuracy_score(y_test_mbert, preds_mbert):.4f}")
# Precision / recall / F1 are support-weighted averages over both classes
for caption, scorer in (("Precision: ", precision_score), ("Recall:    ", recall_score), ("F1-Score:  ", f1_score)):
    print(f"{caption}{scorer(y_test_mbert, preds_mbert, average='weighted', zero_division=0):.4f}")
print("\nDetailed Classification Report:")
print(classification_report(y_test_mbert, preds_mbert, zero_division=0))


**Confusion Matrices for All Models**

In [None]:
# Reference test labels shared by every model
y_test_final = y_test.astype(int).values

# Collect the length of each per-model label array that exists in this session
# (a model whose cells were skipped simply has no such variable) and take the
# smallest, so all models can be compared on the same number of examples.
test_lengths = [len(y_test_final)]
for label_var in ("y_test_lr", "y_test_svm", "y_test_lstm", "y_test_xlm", "y_test_mbert"):
    if label_var in globals():
        test_lengths.append(len(globals()[label_var]))

min_test_len = min(test_lengths)
print(f"Using minimum test length: {min_test_len} for consistent evaluation")

# Bring a prediction array to a common length
def align_predictions(pred, target_len):
    """Return ``pred`` truncated or zero-padded to ``target_len`` items.

    Longer inputs are sliced, shorter ones are right-padded with zeros via
    ``np.pad``, and inputs already at the target length are returned as-is.
    """
    current_len = len(pred)
    if current_len > target_len:
        return pred[:target_len]
    if current_len < target_len:
        # Pad with zeros if shorter (shouldn't happen, but safety check)
        return np.pad(pred, (0, target_len - current_len), mode='constant')
    return pred

# Align all predictions and labels (only if they exist)
def _aligned_pair(pred_name, label_name):
    """Look up a model's predictions and labels by variable name.

    Returns ``(aligned predictions, truncated labels)``, or ``(None, None)``
    when either variable was never created (the model's cells were not run).
    """
    scope = globals()
    if pred_name not in scope or label_name not in scope:
        return None, None
    return align_predictions(scope[pred_name], min_test_len), scope[label_name][:min_test_len]

lr_pred_aligned, y_test_lr_aligned = _aligned_pair("lr_pred", "y_test_lr")
svm_pred_aligned, y_test_svm_aligned = _aligned_pair("svm_pred", "y_test_svm")
lstm_pred_aligned, y_test_lstm_aligned = _aligned_pair("lstm_pred", "y_test_lstm")
preds_xlm_aligned, y_test_xlm_aligned = _aligned_pair("preds_xlm", "y_test_xlm")
preds_mbert_aligned, y_test_mbert_aligned = _aligned_pair("preds_mbert", "y_test_mbert")

y_test_final_aligned = y_test_final[:min_test_len]

# Build model_preds dictionary only for available models:
# display name -> (predictions, true labels)
model_preds = {}
for display_name, pair in (
    ("Logistic Regression", (lr_pred_aligned, y_test_lr_aligned)),
    ("SVM", (svm_pred_aligned, y_test_svm_aligned)),
    ("LSTM", (lstm_pred_aligned, y_test_lstm_aligned)),
    ("XLM-RoBERTa", (preds_xlm_aligned, y_test_xlm_aligned)),
    ("mBERT", (preds_mbert_aligned, y_test_mbert_aligned)),
):
    if pair[0] is not None and pair[1] is not None:
        model_preds[display_name] = pair

# Plot confusion matrices: one heatmap per available model on a 2x3 grid
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.flatten()

for idx, (name, (pred, y_true)) in enumerate(model_preds.items()):
    # Ensure same length
    min_len = min(len(pred), len(y_true))
    pred_trimmed = pred[:min_len]
    y_true_trimmed = y_true[:min_len]
    
    cm = confusion_matrix(y_true_trimmed, pred_trimmed)
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=axes[idx])
    axes[idx].set_title(f"{name} Confusion Matrix", fontsize=12, fontweight='bold')
    axes[idx].set_xlabel("Predicted")
    axes[idx].set_ylabel("Actual")

# Hide every subplot that did not receive a model. Hiding only the last axis
# leaves empty frames visible whenever fewer than five models were trained.
for unused_ax in axes[len(model_preds):]:
    unused_ax.axis('off')

plt.tight_layout()
plt.show()

# Create comprehensive comparison table: one row per available model with
# accuracy and support-weighted precision / recall / F1, formatted to 4 decimals.
print("\n" + "="*80)
print("COMPREHENSIVE MODEL COMPARISON")
print("="*80)

results_summary = []

for name, (pred, y_true) in model_preds.items():
    min_len = min(len(pred), len(y_true))
    y_true_trimmed, pred_trimmed = y_true[:min_len], pred[:min_len]

    acc = accuracy_score(y_true_trimmed, pred_trimmed)
    prec, rec, f1 = (
        scorer(y_true_trimmed, pred_trimmed, average='weighted', zero_division=0)
        for scorer in (precision_score, recall_score, f1_score)
    )

    results_summary.append(dict(zip(
        ('Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score'),
        (name, f"{acc:.4f}", f"{prec:.4f}", f"{rec:.4f}", f"{f1:.4f}"),
    )))

results_df = pd.DataFrame(results_summary)
print("\n" + results_df.to_string(index=False))
print("\n" + "="*80)