# Importing Libraries and Packages

In [1]:
import sys                                                                       ## System and Utility Packages
import time
import gc

import numpy as np                                                               ## Load and pre-process the data
import pandas as pd

import torch                                                                     ## PyTorch library for tensor operations
import torch.nn as nn                                                            ## Builds classification head on top of BERT
from torch.optim import AdamW                                                    ## Optimizer for training neural networks
from torch.utils.data import DataLoader, Dataset                                 ## Utilities for handling datasets and batching

from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModel      ## Tokenizer to create tokens and model for NLP tasks

from sklearn.utils.class_weight import compute_class_weight                      ## Machine learning utilities to compute weights
from sklearn.metrics import f1_score, classification_report, accuracy_score      ## Evaluate model performance

from tqdm import tqdm                                                            ## Progress tracking

### Loading the data from Kaggle and Instagram

In [35]:
# CSV locations: file_path_1 is the Kaggle hate-speech corpus, file_path_2 the
# Instagram sample.
file_path_1 = "/users/sgamahaw/data/HateSpeechDataset.csv"
file_path_2 = "/users/sgamahaw/data/ig_data.csv"

# Rename the Kaggle columns to 'text' / 'hate_speech', the names the rest of
# the notebook uses for both frames.
column_renames = {'Content': 'text', 'Label': 'hate_speech'}
kaggle_df = pd.read_csv(file_path_1).rename(columns=column_renames)
ig_df = pd.read_csv(file_path_2)

In [None]:
### Data Cleaning
# Keep just the two columns the model needs. The explicit .copy() detaches the
# result from the original frame so the column assignments below operate on an
# independent frame and do not raise SettingWithCopyWarning.
kaggle_df = kaggle_df[['text', 'hate_speech']].copy()

# Keep only rows where Label is 0 or 1: non-numeric labels are coerced to NaN,
# and isin() then rejects both NaN and any numeric value outside {0, 1}.
kaggle_df['hate_speech'] = pd.to_numeric(kaggle_df['hate_speech'], errors='coerce')
kaggle_df = kaggle_df[kaggle_df['hate_speech'].isin([0, 1])].copy()
kaggle_df['hate_speech'] = kaggle_df['hate_speech'].astype(int)

### Compute Class Weights on Training data - Kaggle dataset

In [None]:
# 'balanced' class weights computed from the Kaggle labels.
kaggle_labels = kaggle_df['hate_speech'].values
classes = np.unique(kaggle_labels)
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=kaggle_labels)

# Instagram labels: coerce to numeric, drop rows whose label could not be
# parsed, then store the column as integers.
ig_df['hate_speech'] = pd.to_numeric(ig_df['hate_speech'], errors='coerce')
ig_df = ig_df.dropna(subset=['hate_speech']).astype({'hate_speech': int})

# Tokenizer and Dataloader

In [None]:
class Dataset_Preprocess(Dataset):
    """Map-style dataset that tokenizes the ``text`` column on demand.

    Stores the tokenizer and the dataframe, and converts the ``hate_speech``
    column once into a float tensor of class labels (one value per row, not
    one-hot). Each sample is tokenized lazily in ``__getitem__`` and padded or
    truncated to ``max_len`` tokens, so every item has the same length and can
    be batched by a DataLoader.

    Args:
        dataframe: Frame providing the ``text`` and ``hate_speech`` columns.
        tokenizer: Callable tokenizer whose result exposes ``input_ids``,
            ``attention_mask`` and ``token_type_ids``.
        max_len: Token length every sample is padded/truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        super().__init__()
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe["text"]
        # Kept as float for compatibility with existing consumers; the loss
        # function casts to long before use.
        self.targets = torch.tensor(self.data["hate_speech"].values, dtype=torch.float)
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples (required by DataLoader)."""
        return len(self.text)

    def __getitem__(self, index):
        """Tokenize one row and return its tensors.

        Collapses runs of whitespace in the text, tokenizes it with special
        tokens added, and pads/truncates to ``max_len``.

        Args:
            index: Positional row index (``iloc``-style).

        Returns:
            dict with long tensors ``ids``, ``mask`` and ``token_type_ids``
            built from the tokenizer output, plus ``targets``, the stored
            label for this row.
        """
        # Positional lookup so a non-contiguous dataframe index (e.g. after
        # dropping rows) still lines up with self.targets.
        text = str(self.text.iloc[index])
        text = " ".join(text.split())

        # Calling the tokenizer directly replaces the deprecated encode_plus.
        inputs = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",         # Pad every sample to max_len
            truncation=True,              # Truncate if too long
            return_token_type_ids=True,
        )

        return {
            "ids": torch.tensor(inputs["input_ids"], dtype=torch.long),
            "mask": torch.tensor(inputs["attention_mask"], dtype=torch.long),
            "token_type_ids": torch.tensor(inputs["token_type_ids"], dtype=torch.long),
            "targets": self.targets[index],
        }

## Parameters for Muril/BERT
# Defined before the datasets and DataLoaders below, which read TOKENIZER,
# MAX_LEN and BATCH_SIZE at construction time; defining them afterwards raises
# NameError when the file is run top to bottom.
MODEL_NAME = "google/muril-base-cased"
BATCH_SIZE = 64
MAX_LEN = 128
EPOCHS = 5
LEARNING_RATE = 1e-5
# NOTE(review): truncation is also requested per call inside
# Dataset_Preprocess.__getitem__; the keyword here is kept as-is - confirm it
# has any effect at load time.
TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME, truncation=True)

# Kaggle data is used for training, Instagram data for testing.
training_set = Dataset_Preprocess(kaggle_df, TOKENIZER, MAX_LEN)
testing_set = Dataset_Preprocess(ig_df, TOKENIZER, MAX_LEN)

# Creating dictionaries containing arguments for the DataLoader.
# They're used to control how batches are loaded during training and testing.

train_params = {
    "batch_size": BATCH_SIZE,
    "shuffle": True,
    "num_workers": 0
}

test_params = {
    "batch_size": 1,
    "shuffle": False,
    "num_workers": 0
}

# DataLoader Creation - each DataLoader batches the dictionaries of tensors
# (ids, mask, token_type_ids, targets) produced by Dataset_Preprocess.

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

## Define model architecture

In [None]:
class CNN_Muril(nn.Module):
    """Pretrained encoder followed by a 1-D CNN classification head.

    The encoder named by the module-level ``MODEL_NAME`` produces per-token
    hidden states. A width-3 convolution, ReLU and global max pooling reduce
    them to one 256-dimensional vector per sequence, which dropout and a
    linear layer map to ``n_classes`` raw logits.

    Args:
        n_classes: Number of output classes (size of the logit vector).
    """

    def __init__(self, n_classes):
        super().__init__()
        self.bert = AutoModel.from_pretrained(MODEL_NAME)

        # CNN layer: Conv1D over token embeddings. The input width is read
        # from the encoder config instead of being hard-coded to 768, so a
        # different MODEL_NAME does not break the convolution.
        hidden_size = self.bert.config.hidden_size
        self.conv1 = nn.Conv1d(in_channels=hidden_size, out_channels=256, kernel_size=3, padding=1)
        self.relu = nn.ReLU()
        self.maxpool = nn.AdaptiveMaxPool1d(1)  # Global max pooling

        # Classifier
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(256, n_classes)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return raw (unnormalised) class logits for a batch.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Attention mask passed to the encoder.
            token_type_ids: Segment ids passed to the encoder.

        Returns:
            Tensor of shape (batch_size, n_classes).
        """
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids)

        # Conv1d expects channels before length: (batch_size, hidden, seq_len)
        x = outputs.last_hidden_state.permute(0, 2, 1)

        # Apply CNN + ReLU + Max Pool.
        # NOTE(review): attention_mask is only given to the encoder, so the
        # convolution and pooling run over every position, padding included.
        x = self.conv1(x)    # Shape: (batch_size, 256, seq_len)
        x = self.relu(x)
        x = self.maxpool(x)  # Shape: (batch_size, 256, 1)
        x = x.squeeze(2)     # Shape: (batch_size, 256)

        # Classification head
        x = self.dropout(x)
        x = self.classifier(x)

        return x

## Instantiate the model

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device being used:", device)

# One output logit per distinct label present in the Kaggle data.
num_classes = kaggle_df["hate_speech"].nunique()
model = CNN_Muril(n_classes=num_classes)
model.to(device)

# Convert the class weights to a float tensor created directly on the training
# device (a single transfer; the earlier second .to(device) was a no-op).
class_weights = torch.tensor(class_weights, dtype=torch.float, device=device)

# Model Training

In [None]:
def loss_fn(outputs, targets):
    """Class-weighted cross-entropy between logits and integer class labels.

    ``targets`` is cast to long before use; the per-class weights come from
    the module-level ``class_weights``.
    """
    criterion = torch.nn.CrossEntropyLoss(weight=class_weights)
    class_indices = targets.long()
    return criterion(outputs, class_indices)

# AdamW over every model parameter, using the notebook-wide learning rate.
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

def train(epoch):
    """Run one training epoch over ``training_loader``.

    Uses the module-level ``model``, ``optimizer``, ``loss_fn`` and ``device``.
    Prints the mean per-batch loss for the epoch.

    Args:
        epoch: Epoch number, used only for the progress bar and log message.

    Returns:
        float: Mean per-batch loss, or NaN when the loader yielded no batches.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0
    # Wrap the loader itself rather than enumerate(...): enumerate has no
    # len(), which would leave tqdm without a total or ETA.
    for data in tqdm(training_loader, desc=f"Epoch {epoch}"):
        ids = data["ids"].to(device, dtype=torch.long)
        mask = data["mask"].to(device, dtype=torch.long)
        token_type_ids = data["token_type_ids"].to(device, dtype=torch.long)
        targets = data["targets"].to(device, dtype=torch.float)

        optimizer.zero_grad()
        outputs = model(ids, mask, token_type_ids)
        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    if num_batches == 0:
        # Avoid dividing by zero on an empty loader.
        print(f"Epoch {epoch}: no batches to train on")
        return float("nan")

    avg_loss = total_loss / num_batches
    print(f"Epoch {epoch} average loss: {avg_loss:.4f}")
    return avg_loss

# Model Evaluation

In [None]:
def validation(model, loader):
    """Run ``model`` over ``loader`` without gradients and collect predictions.

    Args:
        model: Module called as ``model(ids, mask, token_type_ids)`` and
            returning per-class logits.
        loader: Iterable of batches (dicts with ``ids``, ``mask``,
            ``token_type_ids`` and ``targets`` tensors).

    Returns:
        tuple: ``(fin_outputs, fin_targets)`` where ``fin_outputs`` is a list
        of per-sample class-probability lists and ``fin_targets`` is the list
        of labels as floats.
    """
    model.eval()
    fin_targets = []
    fin_outputs = []
    with torch.no_grad():
        # Wrap the loader directly so tqdm can show a total when it has len().
        for data in tqdm(loader):
            ids = data["ids"].to(device, dtype=torch.long)
            mask = data["mask"].to(device, dtype=torch.long)
            token_type_ids = data["token_type_ids"].to(device, dtype=torch.long)
            targets = data["targets"].to(device, dtype=torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().numpy().tolist())
            # The model is trained with CrossEntropyLoss, so softmax over the
            # class dimension gives probabilities that sum to 1 per sample.
            # The argmax is the same as with the previous sigmoid.
            probs = torch.softmax(outputs, dim=1)
            fin_outputs.extend(probs.cpu().numpy().tolist())
    return fin_outputs, fin_targets

## Trains the Model, Evaluates It, and Prepares the Classification Report

In [None]:
if __name__ == "__main__":
    # --- Training Timer Start ---
    # perf_counter is monotonic, so the measured duration is not affected by
    # system clock adjustments during a long run.
    train_start_time = time.perf_counter()

    for epoch in range(EPOCHS):
        train(epoch)

    train_end_time = time.perf_counter()
    train_time = train_end_time - train_start_time
    print(f"\nTraining Time: {train_time:.2f} seconds ({train_time/60:.2f} minutes)\n")


    # --- Evaluation Timer Start ---
    test_start_time = time.perf_counter()

    outputs, targets = validation(model, testing_loader)

    test_end_time = time.perf_counter()
    test_time = test_end_time - test_start_time
    print(f"\nEvaluation Time: {test_time:.2f} seconds ({test_time/60:.2f} minutes)\n")

    # Process predictions and print report: the predicted class is the index
    # of the highest score in each row of outputs.
    final_outputs = np.argmax(outputs, axis=1)
    targets = np.array(targets).astype(int)

    # Count matches with the array's own sum instead of the Python builtin,
    # which would iterate element by element.
    num_correct = (final_outputs == targets).sum()
    print(f"Got {num_correct} / {len(final_outputs)} correct")
    print(classification_report(targets, final_outputs))

    # Save classification report to CSV
    report_dict = classification_report(targets, final_outputs, output_dict=True)
    report_df = pd.DataFrame(report_dict).transpose()
    report_df.to_csv("classification_report.csv", index=True)