In [2]:
import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from transformers import BertModel, BertTokenizer
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.optim as optim
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import seaborn as sns

In [3]:
import re
import emoji
import pandas as pd

# Load data
train_df = pd.read_csv("/kaggle/input/caste-and-migration-hate-speech-detection/train.csv")
dev_df = pd.read_csv("/kaggle/input/caste-and-migration-hate-speech-detection/dev.csv")
df = pd.concat([train_df, dev_df], ignore_index=True)

# Function to clean each text
def preprocess_text(text):
    # Replace mentions
    text = re.sub(r'@\w+', '<USER>', text)
    
    # Replace hashtags
    text = re.sub(r'#\w+', '<HASHTAG>', text)
    
    # Convert emojis to text (e.g., 😊 -> :smiling_face_with_smiling_eyes:)
    text = emoji.demojize(text, delimiters=(" ", " "))  # adds spaces around emoji descriptions
    
    # Optional: Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

# Apply preprocessing to your column (assuming it's called 'text')
df['text'] = df['text'].apply(preprocess_text)
df.sample(10)

Unnamed: 0,id,text,label
4272,4843,இந்திகாரர்கள் தமிழர்களை அடித்து விரட்டி இது இந...,1
4616,4700,100 rubai kooda sambarika mudilanu thaan varan...,0
4414,2720,Ena da tharkuri thailees mari video podringa n...,1
613,6438,இந்த வேளாளர் குடியினர் ஏன் பள்ளர்களுக்கு வேளாள...,0
5029,1925,வீடு தந்தவனை செருப்பால அடிக்கனும்,0
3186,3358,தமிழ்நாட்டில் படிக்காமல் வேலை இல்லாமல் நிறைய ப...,0
3058,933,திட்டம் போட்டு நாட்டை பிடிக்கிறான்....,0
130,1498,நாம் தமிழர் கட்சியை எந்த கொம்பனாலும் ஒன்றும் ச...,0
3666,397,நான் என்னுடைய பட்டறையில் தமிழ் ஆட்கள் தான் வைத...,0
1280,3470,"ஒருநாள் நம்மை அனைத்திலும் அழிப்பார்கள், அதற்கு...",1


In [4]:
texts = df['text'].tolist()
labels = df['label'].tolist()
len(texts), len(labels)

(6299, 6299)

In [5]:
from transformers import AutoTokenizer, AutoModel

# Load the MurIL tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('google/muril-base-cased')
model = AutoModel.from_pretrained('google/muril-base-cased')

# Check if CUDA is available and move the model to the appropriate device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


tokenizer_config.json:   0%|          | 0.00/206 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/3.16M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/113 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/953M [00:00<?, ?B/s]

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(197285, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=Fals

In [6]:
def tokenize_texts(texts):
    return tokenizer(texts, padding=True, truncation=True, return_tensors='pt', max_length=128)

class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels
        self.tokenized_data = tokenize_texts(texts)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return {
            'input_ids': self.tokenized_data['input_ids'][idx],
            'attention_mask': self.tokenized_data['attention_mask'][idx],
            'label': self.labels[idx]
        }

dataset = TextDataset(texts, labels)
dataloader = DataLoader(dataset, batch_size=16, shuffle=False)

In [7]:
all_embeddings = []
# Forward pass
with torch.no_grad():
    for batch in dataloader:
        inputs = {k: v.to(device) for k, v in batch.items() if k in ['input_ids', 'attention_mask']}
        outputs = model(**inputs)
        cls_embeddings = outputs.last_hidden_state[:, 0, :]  # CLS token
        all_embeddings.append(cls_embeddings.cpu())
        
X = torch.cat(all_embeddings).numpy()
y = np.array(labels)

# Save embeddings
np.save("muril_embeddings.npy", X)

In [8]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

In [None]:
models = [
    LogisticRegression(max_iter=1000),
    RandomForestClassifier(n_estimators=100),
    SVC(kernel='rbf'),
    MultinomialNB(),
    KNeighborsClassifier(n_neighbors=5),
    GradientBoostingClassifier(),
    XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    LGBMClassifier()
]
model_names = ['Logistic Regression', 'Random Forest', 'SVM', 'Naive Bayes', 'KNN', 'Gradient Boosting', 'XGBoost', 'LightGBM']

results = {}

for model, name in zip(models, model_names):
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_val_scaled)
    
    acc = accuracy_score(y_val, y_pred)
    f1_macro = f1_score(y_val, y_pred, average='macro')  # <-- Macro F1
    cm = confusion_matrix(y_val, y_pred)
    
    results[name] = {
        'Model': model,
        'Accuracy': acc,
        'Macro F1 Score': f1_macro,
        'Confusion Matrix': cm
    }
    
    print(f"{name} -> Accuracy: {acc:.4f}, Macro F1 Score: {f1_macro:.4f}")
    print("Confusion Matrix:\n", cm)
    print("-" * 50)

Logistic Regression -> Accuracy: 0.6921, Macro F1 Score: 0.6428
Confusion Matrix:
 [[670 128]
 [260 202]]
--------------------------------------------------
Random Forest -> Accuracy: 0.7738, Macro F1 Score: 0.7276
Confusion Matrix:
 [[747  51]
 [234 228]]
--------------------------------------------------
SVM -> Accuracy: 0.7127, Macro F1 Score: 0.6469
Confusion Matrix:
 [[721  77]
 [285 177]]
--------------------------------------------------
Naive Bayes -> Accuracy: 0.6063, Macro F1 Score: 0.5335
Confusion Matrix:
 [[631 167]
 [329 133]]
--------------------------------------------------
KNN -> Accuracy: 0.6421, Macro F1 Score: 0.6108
Confusion Matrix:
 [[583 215]
 [236 226]]
--------------------------------------------------


In [None]:
voting_clf = VotingClassifier(estimators=[
    ('lr', models[0]), ('rf', models[1]), ('svm', models[2])], voting='hard')
voting_clf.fit(X_train_scaled, y_train)
y_pred_voting = voting_clf.predict(X_val_scaled)
acc_voting = accuracy_score(y_val, y_pred_voting)
f1_voting = f1_score(y_val, y_pred_voting, average='macro')
cm_voting = confusion_matrix(y_val, y_pred_voting)

results['Voting Classifier'] = {
    'Model': voting_clf,
    'Accuracy': acc_voting,
    'Macro F1 Score': f1_voting,
    'Confusion Matrix': cm_voting
}

stacking_clf = StackingClassifier(
    estimators=[('lr', models[0]), ('rf', models[1]), ('svm', models[2])],
    final_estimator=LogisticRegression()
)
stacking_clf.fit(X_train_scaled, y_train)
y_pred_stacking = stacking_clf.predict(X_val_scaled)
acc_stacking = accuracy_score(y_val, y_pred_stacking)
f1_stacking = f1_score(y_val, y_pred_stacking, average='macro')
cm_stacking = confusion_matrix(y_val, y_pred_stacking)

results['Stacking Classifier'] = {
    'Model': stacking_clf,
    'Accuracy': acc_stacking,
    'Macro F1 Score': f1_stacking,
    'Confusion Matrix': cm_stacking
}

In [None]:
model_labels = list(results.keys())
accuracies = [results[m]['Accuracy'] for m in model_labels]
f1_scores = [results[m]['Macro F1 Score'] for m in model_labels]

fig, ax = plt.subplots(figsize=(12, 6))
x = np.arange(len(model_labels))
bar_width = 0.35

bars1 = ax.bar(x - bar_width/2, accuracies, bar_width, label='Accuracy', color='skyblue')
bars2 = ax.bar(x + bar_width/2, f1_scores, bar_width, label='F1 Score', color='salmon', alpha=0.8)

# Annotate bars with values on top
for bar in bars1:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2, height + 0.01, f'{height:.2f}', ha='center', va='bottom', fontsize=9)

for bar in bars2:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2, height + 0.01, f'{height:.2f}', ha='center', va='bottom', fontsize=9)

ax.set_xlabel('Models')
ax.set_ylabel('Score')
ax.set_title('Accuracy and F1 Score of Models')
ax.set_xticks(x)
ax.set_xticklabels(model_labels, rotation=45, ha='right')
ax.legend()
plt.tight_layout()
plt.grid(True, linestyle='--', alpha=0.3)
plt.show()

# ==========================
# Plot Confusion Matrices
# ==========================
for model_name in model_labels:
    print(f"\nConfusion Matrix for {model_name}:")
    cm = results[model_name]['Confusion Matrix']
    print(cm)
    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f"Confusion Matrix: {model_name}")
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.tight_layout()
    plt.show()