# Toxic Comment Classification with LSTM Models


## 1. Imports
**Libraries used**

- Core: PyTorch, Pandas, NumPy

- NLP: TorchText

- evaluation: Scikit-learn

- Visualization: Matplotlib

In [2]:
!pip uninstall torchtext -y
!pip uninstall torch -y
!pip install torch==2.2.0 torchtext==0.17.0

[0mFound existing installation: torch 2.5.1+cu121
Uninstalling torch-2.5.1+cu121:
  Successfully uninstalled torch-2.5.1+cu121
Collecting torch==2.2.0
  Downloading torch-2.2.0-cp310-cp310-manylinux1_x86_64.whl.metadata (25 kB)
Collecting torchtext==0.17.0
  Downloading torchtext-0.17.0-cp310-cp310-manylinux1_x86_64.whl.metadata (7.6 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.2.0)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch==2.2.0)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch==2.2.0)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch==2.2.0)
  Downloading nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12

In [4]:
import torchtext

In [3]:
import re
import string
import zipfile

import emoji
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from nltk.corpus import stopwords
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset, random_split, TensorDataset
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from tqdm import tqdm

## 2. Data Loading

**not important for final**

In [18]:
training_path = r"/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip"
with zipfile.ZipFile(training_path) as train_zip:
    with train_zip.open("train.csv") as csv:
        training_data = pd.read_csv(csv)

training_data.head()


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [19]:
sample_path = r"/kaggle/input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv.zip"
with zipfile.ZipFile(sample_path) as z:
    with z.open("sample_submission.csv") as csv:
        sample_data = pd.read_csv(csv)

sample_data.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.5,0.5,0.5,0.5,0.5,0.5
1,0000247867823ef7,0.5,0.5,0.5,0.5,0.5,0.5
2,00013b17ad220c46,0.5,0.5,0.5,0.5,0.5,0.5
3,00017563c3f7919a,0.5,0.5,0.5,0.5,0.5,0.5
4,00017695ad8997eb,0.5,0.5,0.5,0.5,0.5,0.5


In [20]:
test_path = r"/kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv.zip"
with zipfile.ZipFile(test_path) as z:
    with z.open("test.csv") as csv:
        test_data = pd.read_csv(csv)

test_data.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [21]:
test_labels_path = r"/kaggle/input/jigsaw-toxic-comment-classification-challenge/test_labels.csv.zip"
with zipfile.ZipFile(test_labels_path) as z:
    with z.open("test_labels.csv") as csv:
        test_labels_data = pd.read_csv(csv)

test_labels_data.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,-1,-1,-1,-1,-1,-1
1,0000247867823ef7,-1,-1,-1,-1,-1,-1
2,00013b17ad220c46,-1,-1,-1,-1,-1,-1
3,00017563c3f7919a,-1,-1,-1,-1,-1,-1
4,00017695ad8997eb,-1,-1,-1,-1,-1,-1


## 3. Text Preprocessing

In [22]:
punc = string.punctuation
punc.replace('#', '')
punc.replace('!', '')
punc.replace('?', '')
punc = punc + "∞θ÷α•à−β∅³π‘₹´°£€\×™√²—"

chat_words = {
    "AFAIK": "As Far As I Know",
    "AFK": "Away From Keyboard",
    "ASAP": "As Soon As Possible",
    "ATK": "At The Keyboard",
    "ATM": "At The Moment",
    "A3": "Anytime, Anywhere, Anyplace",
    "BAK": "Back At Keyboard",
    "BBL": "Be Back Later",
    "BBS": "Be Back Soon",
    "BFN": "Bye For Now",
    "B4N": "Bye For Now",
    "BRB": "Be Right Back",
    "BRT": "Be Right There",
    "BTW": "By The Way",
    "B4": "Before",
    "B4N": "Bye For Now",
    "CU": "See You",
    "CUL8R": "See You Later",
    "CYA": "See You",
    "FAQ": "Frequently Asked Questions",
    "FC": "Fingers Crossed",
    "FWIW": "For What It's Worth",
    "FYI": "For Your Information",
    "GAL": "Get A Life",
    "GG": "Good Game",
    "GN": "Good Night",
    "GMTA": "Great Minds Think Alike",
    "GR8": "Great!",
    "G9": "Genius",
    "IC": "I See",
    "ICQ": "I Seek you (also a chat program)",
    "ILU": "ILU: I Love You",
    "IMHO": "In My Honest/Humble Opinion",
    "IMO": "In My Opinion",
    "IOW": "In Other Words",
    "IRL": "In Real Life",
    "KISS": "Keep It Simple, Stupid",
    "LDR": "Long Distance Relationship",
    "LMAO": "Laugh My A.. Off",
    "LOL": "Laughing Out Loud",
    "LTNS": "Long Time No See",
    "L8R": "Later",
    "MTE": "My Thoughts Exactly",
    "M8": "Mate",
    "NRN": "No Reply Necessary",
    "OIC": "Oh I See",
    "PITA": "Pain In The A..",
    "PRT": "Party",
    "PRW": "Parents Are Watching",
    "QPSA?": "Que Pasa?",
    "ROFL": "Rolling On The Floor Laughing",
    "ROFLOL": "Rolling On The Floor Laughing Out Loud",
    "ROTFLMAO": "Rolling On The Floor Laughing My A.. Off",
    "SK8": "Skate",
    "STATS": "Your sex and age",
    "ASL": "Age, Sex, Location",
    "THX": "Thank You",
    "TTFN": "Ta-Ta For Now!",
    "TTYL": "Talk To You Later",
    "U": "You",
    "U2": "You Too",
    "U4E": "Yours For Ever",
    "WB": "Welcome Back",
    "WTF": "What The F...",
    "WTG": "Way To Go!",
    "WUF": "Where Are You From?",
    "W8": "Wait...",
    "7K": "Sick:-D Laugher",
    "TFW": "That feeling when",
    "MFW": "My face when",
    "MRW": "My reaction when",
    "IFYP": "I feel your pain",
    "TNTL": "Trying not to laugh",
    "JK": "Just kidding",
    "IDC": "I don't care",
    "ILY": "I love you",
    "IMU": "I miss you",
    "ADIH": "Another day in hell",
    "ZZZ": "Sleeping, bored, tired",
    "WYWH": "Wish you were here",
    "TIME": "Tears in my eyes",
    "BAE": "Before anyone else",
    "FIMH": "Forever in my heart",
    "BSAAW": "Big smile and a wink",
    "BWL": "Bursting with laughter",
    "BFF": "Best friends forever",
    "CSL": "Can't stop laughing"
}


stpwds = stopwords.words('english')

nlp = spacy.load("en_core_web_sm")

time_zone_abbreviations = [
        "UTC", "GMT", "EST", "CST", "PST", "MST",
        "EDT", "CDT", "PDT", "MDT", "CET", "EET",
        "WET", "AEST", "ACST", "AWST", "HST",
        "AKST", "IST", "JST", "KST", "NZST"
    ]

patterns = [
    r'\\[nrtbfv\\]',         # \n, \t ..etc
    '<.*?>',                 # Html tags
    r'https?://\S+|www\.\S+',# Links
    r'\ufeff',               # BOM characters
    r'^[^a-zA-Z0-9]+$',      # Non-alphanumeric tokens
    r'ｗｗｗ．\S+',            # Full-width URLs
    r'[\uf700-\uf7ff]',      # Unicode private-use chars
    r'^[－—…]+$',            # Special punctuation-only tokens
    r'[︵︶]'                # CJK parentheses
]

def preprocess(text):
    for regex in patterns:
        text = re.sub(regex, '', text)
    text = text.translate(str.maketrans(punc, ' ' * len(punc)))
    text = ' '.join(word for word in text.split() if word not in time_zone_abbreviations)
    text = ' '.join(word for word in text.split() if word not in stpwds)
    text = ' '.join(chat_words.get(word.lower(), word) for word in text.split())
    text = text.lower()
    text = emoji.demojize(text)
    text = re.sub(r'\s+', ' ', text).strip()

    return text



## 4. Vocabulary Construction

In [24]:
comments = list(training_data["comment_text"])
train_iter = iter(comments)

tokenizer = get_tokenizer("basic_english")

def yield_tokens(data_iter):
    for text in data_iter:
        cleaned_text = preprocess(text)
        tokens = [
            token for token in tokenizer(cleaned_text)
            if 1 < len(token) < 25
        ]
        yield tokens

# Build vocabulary with size limit
vocab = build_vocab_from_iterator(
    yield_tokens(train_iter),
    specials=["<pad>", "<unk>"],
    max_tokens=30002  # 30K + 2 special tokens for unkown tokens and padding
)
vocab.set_default_index(vocab["<unk>"])
PAD_IDX = vocab['<pad>']

print(f"Final vocabulary size: {len(vocab)}")
print("Sample valid tokens:", [t for t in list(vocab.get_itos())[2:12]])

Final vocabulary size: 30002
Sample valid tokens: ['article', 'the', 'page', 'wikipedia', 'talk', 'you', 'please', 'would', 'one', 'like']


In [25]:
torch.save(vocab, "vocab.pth")

In [26]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def text_pipeline(text):
    return [
        vocab[token] if token in vocab else vocab['<unk>'] 
        for token in tokenizer(text)
    ]

def label_pipeline(labels):
    return torch.FloatTensor(labels)



## 5. Dataset Preparation

**Dataset Structure**:

- Handles variable-length sequences

- Implements length-based padding

- Multi-label output (6 toxicity categories)

In [27]:
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence
import torch

class PaddedDataset(Dataset):
    def __init__(self, df, vocab, max_length=None):
        self.df = df
        self.vocab = vocab
        self.max_length = max_length
        self.label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        text = self.df.iloc[idx]['comment_text']
        labels = self.df.iloc[idx][self.label_cols].values.astype(float)
        
        # Tokenize and numericalize
        tokens = tokenizer(preprocess(text))
        if self.max_length:
            tokens = tokens[:self.max_length]
        numericalized = [self.vocab[token] for token in tokens]
        
        return torch.tensor(numericalized, dtype=torch.long), torch.tensor(labels, dtype=torch.float)


def collate_batch(batch):
    texts, labels = zip(*batch)
    lengths = torch.tensor([len(t) for t in texts])
     # Filter invalid sequences (length <=0)
    valid_mask = lengths > 0
    if not valid_mask.all():
        texts = [t for t, valid in zip(texts, valid_mask) if valid]
        labels = [l for l, valid in zip(labels, valid_mask) if valid]
        lengths = lengths[valid_mask]
    
    # Add fallback for empty batch
    if len(texts) == 0:
        return torch.zeros((1,1), dtype=torch.long), torch.zeros((1,6)), torch.tensor([1])
    # Pad sequences to match longest in batch
    padded_texts = torch.nn.utils.rnn.pad_sequence(
        texts, 
        batch_first=True, 
        padding_value=PAD_IDX
    )
    
    return padded_texts, torch.stack(labels), lengths

In [28]:
BATCH_SIZE = 64
MAX_SEQ_LEN = 256

dataset = PaddedDataset(training_data, vocab, max_length=MAX_SEQ_LEN)
dataloader = DataLoader(
    dataset, 
    batch_size=BATCH_SIZE, 
    shuffle=True, 
    collate_fn=collate_batch,
    pin_memory=True,  # Faster data transfer to GPU
    num_workers=2     # Parallel data loading
)

## 6. Model Architectures

### 6.1 Baseline LSTM

Embedding → LSTM → Linear Layer

In [29]:
class lstm(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, pad_idx, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx = pad_idx)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc1 = nn.Linear(hidden_dim, output_dim)
    def forward(self, text, lengths):
        embedded = self.embedding(text)
        packed_embedded = nn.utils.rnn.pack_padded_sequence(
            embedded,
            lengths.cpu(),
            batch_first=True,
            enforce_sorted = False
        )
        packed_output, (hidden, cell) = self.lstm(packed_embedded)
        out = self.fc1(hidden[-1])
        return torch.sigmoid(out)

### 6.2 Bidirectional LSTM

**Improvements**:

-  Bidirectional
-  processingDropout
-  regularization


In [30]:
class BIDirectional_lstm(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, pad_idx, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx = pad_idx)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc1 = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout =  nn.Dropout(p = 0.3)
    def forward(self, text, lengths):
        embedded = self.embedding(text)
        packed_embedded = nn.utils.rnn.pack_padded_sequence(
            embedded,
            lengths.cpu(),
            batch_first=True,
            enforce_sorted = False
        )
        packed_output, (hidden, cell) = self.lstm(packed_embedded)
        hidden_output = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)
        out = self.fc1(hidden_output)
        out = self.dropout(out)
        return torch.sigmoid(out)

## 7. Training Framework

In [31]:
def train_model(model, train_loader, val_loader, epochs, learning_rate, filename):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    criterion = nn.BCELoss()
    best_val_loss = float('inf')
    model = model.to(device)
    optimizer = optim.AdamW(model.parameters(), lr=learning_rate)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2)
    
    for epoch in range(epochs):
        # Training Phase
        model.train()
        train_loss = 0.0
        correct = 0
        total = 0
        
        for texts, labels, lengths in tqdm(train_loader, desc=f'Epoch {epoch+1}'):
            # Move data to device
            texts, labels = texts.to(device), labels.to(device)
            
            # Zero gradients
            optimizer.zero_grad()
            
            # Forward pass
            outputs = model(texts, lengths)
            loss = criterion(outputs, labels)
            
            # Backward pass and optimize
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # Gradient clipping
            optimizer.step()
            
            # Calculate metrics
            train_loss += loss.item()
            predicted = (outputs > 0.5).float()
            correct += (predicted == labels).all(dim=1).sum().item()
            total += labels.size(0)
        
        # Validation Phase
        model.eval()
        val_loss = 0.0
        val_correct = 0
        val_total = 0
        
        with torch.no_grad():
            for texts, labels, lengths in val_loader:
                texts, labels = texts.to(device), labels.to(device)
                outputs = model(texts, lengths)
                loss = criterion(outputs, labels)
                
                val_loss += loss.item()
                predicted = (outputs > 0.5).float()
                val_correct += (predicted == labels).all(dim=1).sum().item()
                val_total += labels.size(0)
        
        # Epoch Statistics
        train_loss /= len(train_loader)
        train_acc = correct / total
        val_loss /= len(val_loader)
        val_acc = val_correct / val_total
        
        print(f"\nEpoch {epoch+1}/{epochs}")
        print(f"Train Loss: {train_loss:.4f} | Acc: {train_acc:.2%}")
        print(f"Val Loss: {val_loss:.4f} | Acc: {val_acc:.2%}")
        
        # Learning rate scheduling
        scheduler.step(val_loss)
        
        # Save best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), filename + '.pth')
    
    print("Training complete!")

In [32]:

train_df, val_df = train_test_split(training_data.iloc[:, 1:], test_size=0.2)

train_dataset = PaddedDataset(train_df, vocab, max_length=256)
val_dataset = PaddedDataset(val_df, vocab, max_length=256)

train_loader = DataLoader(
    train_dataset,
    batch_size=64,
    shuffle=True,
    collate_fn=collate_batch,
    pin_memory=True,
    num_workers=4
)

val_loader = DataLoader(
    val_dataset,
    batch_size=64,
    collate_fn=collate_batch,
    pin_memory=True,
    num_workers=4
)


    

In [None]:
lstm_model = lstm(
    vocab_size=len(vocab),
    embed_dim=50,
    hidden_dim=256,
    output_dim=6,
    pad_idx=PAD_IDX
)

train_model(lstm_model, train_loader, val_loader, 5, 0.001, "lstm")

Epoch 1: 100%|██████████| 1995/1995 [01:06<00:00, 29.93it/s]



Epoch 1/5
Train Loss: 0.0972 | Acc: 90.21%
Val Loss: 0.0635 | Acc: 91.21%


Epoch 2: 100%|██████████| 1995/1995 [01:07<00:00, 29.77it/s]



Epoch 2/5
Train Loss: 0.0545 | Acc: 91.68%
Val Loss: 0.0531 | Acc: 91.65%


Epoch 3: 100%|██████████| 1995/1995 [01:07<00:00, 29.61it/s]


In [None]:
BIDirectional_model = BIDirectional_lstm(
    vocab_size=len(vocab),
    embed_dim=50,
    hidden_dim=256,
    output_dim=6,
    pad_idx=PAD_IDX
)

train_model(BIDirectional_model, train_loader, val_loader, 7, 0.0005, "bidirctional_lstm")

## 8. additional models

### 8.1 using pretrained embedding

In [None]:
def load_glove_from_file(glove_file):
    word_to_vec = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in tqdm(f, desc="Loading GloVe"):
            parts = line.split()
            word = parts[0]
            vector = np.array([float(val) for val in parts[1:]], dtype=np.float32)
            word_to_vec[word] = vector
    return word_to_vec

# 1. Load your GloVe file
glove_path = r"/kaggle/input/glove-embeddings/glove.6B.100d.txt"  # Update with your path
glove_vectors = load_glove_from_file(glove_path)

# 2. Create embedding matrix aligned with your vocabulary
def create_embedding_matrix(vocab, embedding_dim=100):
    vocab_size = len(vocab)
    weights = torch.zeros(vocab_size, embedding_dim)
    
    for word, idx in vocab.get_stoi().items():
        if word in glove_vectors:
            weights[idx] = torch.tensor(glove_vectors[word])
        elif word == "<pad>":
            weights[idx] = torch.zeros(embedding_dim)  # Pad token
        else:
            # Initialize unknown words randomly
            weights[idx] = torch.randn(embedding_dim) * 0.25
            
    return weights


In [None]:
class BI_lstm_GloVe_model(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, pad_idx, output_dim):
        super().__init__()
        # Initialize with GloVe weights
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)  # Bidirectional

    def forward(self, text, lengths):
        embedded = self.embedding(text)
        packed_embedded = nn.utils.rnn.pack_padded_sequence(
            embedded,
            lengths.cpu(),
            batch_first=True,
            enforce_sorted = False
        )
        packed_output, (hidden, cell) = self.lstm(packed_embedded)
        hidden_output = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)
        out = self.fc(hidden_output)
        return torch.sigmoid(out)

In [None]:
embedding_dim = 100  # matches GloVe dimension
vocab_size = len(vocab)
bi_lstm_glove_model = BI_lstm_GloVe_model(
    vocab_size,
    embedding_dim, 
    hidden_dim = 256,
    pad_idx = PAD_IDX,
    output_dim = 6
)

# Create embedding matrix
weights = create_embedding_matrix(vocab, 100)
# Assign to model
bi_lstm_glove_model.embedding = nn.Embedding.from_pretrained(weights, freeze=False)

In [None]:
train_model(bi_lstm_glove_model, train_loader, val_loader, 10, 0.0005, "bi_lstm_glove")

### 8.2 Additional improvments
**Improvments**:
- stacked bi-lstm
- added basic attention mechanism and normalization layers

**THIS IS IMPORTANT**

In [None]:
class Improved_BI_LSTM_GloVe(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, pad_idx, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        
        # Enhanced Architecture
        self.lstm = nn.LSTM(embed_dim, hidden_dim, 
                           num_layers=2,              # Stacked LSTMs
                           bidirectional=True, 
                           batch_first=True,
                           dropout=0.3)               # Inter-layer dropout
        
        self.attention = nn.Linear(hidden_dim * 2, 1) # Simple attention mechanism
        self.bn1 = nn.BatchNorm1d(hidden_dim * 2)     # Batch normalization
        
        self.fc = nn.Sequential(
            nn.Linear(hidden_dim * 2, hidden_dim),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim),
            nn.Dropout(0.5),                          # Increased dropout
            nn.Linear(hidden_dim, output_dim)
        )
        
        # Initialize with kaiming normal for better convergence
        for layer in [self.attention, *self.fc]:
            if isinstance(layer, nn.Linear):
                nn.init.kaiming_normal_(layer.weight)

    def forward(self, text, lengths):
        # Embedding with dropout
        embedded = F.dropout(self.embedding(text), p=0.2, training=self.training)
        
        # Packed sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        
        # BiLSTM with 2 layers
        packed_output, (hidden, cell) = self.lstm(packed_embedded)
        output, _ = nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=True)
        
        # Attention mechanism
        attention_weights = F.softmax(self.attention(output), dim=1)
        context_vector = torch.sum(attention_weights * output, dim=1)
        
        # Batch norm + FC
        context_vector = self.bn1(context_vector)
        return self.fc(context_vector)

In [None]:
def train_model(model, train_loader, val_loader, epochs, learning_rate, filename):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    criterion = nn.BCEWithLogitsLoss()
    best_val_loss = float('inf')
    model = model.to(device)
    optimizer = torch.optim.AdamW(model.parameters())
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=3)
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    for epoch in range(epochs):
        # Training Phase
        model.train()
        train_loss = 0.0
        correct = 0
        total = 0
        
        for texts, labels, lengths in tqdm(train_loader, desc=f'Epoch {epoch+1}'):
            # Move data to device
            texts, labels = texts.to(device), labels.to(device)
            
            # Zero gradients
            optimizer.zero_grad()
            
            # Forward pass
            outputs = model(texts, lengths)
            loss = criterion(outputs, labels)
            
            # Backward pass and optimize
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # Gradient clipping
            optimizer.step()
            
            # Calculate metrics
            train_loss += loss.item()
            predicted = (outputs > 0.5).float()
            correct += (predicted == labels).all(dim=1).sum().item()
            total += labels.size(0)
        
        # Validation Phase
        model.eval()
        val_loss = 0.0
        val_correct = 0
        val_total = 0
        
        with torch.no_grad():
            for texts, labels, lengths in val_loader:
                texts, labels = texts.to(device), labels.to(device)
                outputs = model(texts, lengths)
                loss = criterion(outputs, labels)
                
                val_loss += loss.item()
                predicted = (outputs > 0.5).float()
                val_correct += (predicted == labels).all(dim=1).sum().item()
                val_total += labels.size(0)
        
        # Epoch Statistics
        train_loss /= len(train_loader)
        train_acc = correct / total
        val_loss /= len(val_loader)
        val_acc = val_correct / val_total
        
        print(f"\nEpoch {epoch+1}/{epochs}")
        print(f"Train Loss: {train_loss:.4f} | Acc: {train_acc:.2%}")
        print(f"Val Loss: {val_loss:.4f} | Acc: {val_acc:.2%}")
        
        # Learning rate scheduling
        scheduler.step(val_loss)
        
        # Save best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), filename + '.pth')
    
    print("Training complete!")

In [None]:
final_model = Improved_BI_LSTM_GloVe(
    vocab_size=len(vocab),
    embed_dim=100,
    hidden_dim=256,
    pad_idx=PAD_IDX,
    output_dim=6
)
final_model.embedding.weight.data.copy_(weights)
final_model.embedding.weight.requires_grad = True
train_model(final_model, train_loader, val_loader, 10, 0.0001, "final")

## 9. Model Evaluation

**Evaluation Metrics**:

- Average class-wise ROC-AUC scores as used in the competition

- Batch-wise processing for memory efficiency

In [None]:
ev_data = pd.concat([test_data,test_labels_data.iloc[:,1:]], axis=1)
# dropping -1 rows, these rows weren't used for evaluation models in the competetion and marked with -1 
ev_data = ev_data[ev_data['toxic']!= -1]
ev_data

In [None]:
eval_dataset = PaddedDataset(ev_data, vocab, max_length=256)
eval_loader = DataLoader(
    eval_dataset,
    batch_size=512,
    collate_fn=collate_batch,
    pin_memory=True,
    num_workers=4
)

In [None]:
def calc_roc(model):
    model.eval()
    all_labels = []
    all_outputs = []
    
    with torch.no_grad():
        for texts, labels, lengths in eval_loader:
            texts, labels = texts.to(device), labels.to(device)
            outputs = model(texts, lengths)
            
            # Store batch results
            all_labels.append(labels.cpu().numpy())
            all_outputs.append(outputs.cpu().numpy())
    
    # Concatenate all batches
    all_labels = np.concatenate(all_labels, axis=0)
    all_outputs = np.concatenate(all_outputs, axis=0)
    
    # Calculate ROC-AUC for each class
    roc_scores = []
    for col in range(6):  # the original evaluation method is to take ROC-AUC scores average for the 6 classed
        if np.sum(all_labels[:, col]) > 0:
            roc = roc_auc_score(all_labels[:, col], all_outputs[:, col])
            roc_scores.append(roc)
    
    # Return average
    return np.mean(roc_scores)

## 10. Results


In [None]:
lstm_model = lstm_model = lstm(
    vocab_size=len(vocab),
    embed_dim=50,
    hidden_dim=256,
    output_dim=6,
    pad_idx=PAD_IDX
).to(device)

bi_lstm_model = BIDirectional_model = BIDirectional_lstm(
    vocab_size=len(vocab),
    embed_dim=50,
    hidden_dim=256,
    output_dim=6,
    pad_idx=PAD_IDX
).to(device)

vocab_size = len(vocab)
bi_lstm_glove_model = BI_lstm_GloVe_model(
    vocab_size,
    100, 
    hidden_dim = 256,
    pad_idx = PAD_IDX,
    output_dim = 6
).to(device)

final_model = Improved_BI_LSTM_GloVe(
    vocab_size=len(vocab),
    embed_dim=100,
    hidden_dim=256,
    pad_idx=PAD_IDX,
    output_dim=6
).to(device)

lstm_model.load_state_dict(torch.load("lstm.pth"))
bi_lstm_model.load_state_dict(torch.load("bidirctional_lstm.pth"))
bi_lstm_glove_model.load_state_dict(torch.load("bi_lstm_glove.pth"))
final_model.load_state_dict(torch.load("final.pth"))

print("lstm model roc-auc ", calc_roc(lstm_model))
print("BiDirectional lstm model roc-auc ", calc_roc(bi_lstm_model))
print("BiDirectional lstm with pretrained embedding model roc-auc ", calc_roc(bi_lstm_glove_model))
print("stacked Bidirectional lstm model with pretrained embedding roc-auc ", calc_roc(final_model))

In [None]:
lstm_model = torch.load("/kaggle/working/lstm.pth")
# bi_lstm_model.load_state_dict(torch.load("bidirctional_lstm.pth"))
# bi_lstm_glove_model.load_state_dict(torch.load("bi_lstm_glove.pth"))
final_model = torch.load("/kaggle/working/final.pth") 


In [None]:
test_df = pd.read_csv("/kaggle/input/rahul2/dataset_instagram-scraper_2025-03-30_09-30-50-381.csv")
test_df

In [None]:
class TestDataset(Dataset):
    def __init__(self, df, vocab, max_length=None):
        self.df = df.reset_index(drop=True)  # Ensure clean indices
        self.vocab = vocab
        self.max_length = max_length
        self.valid_indices = []  # Will track which indices were valid
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        text = str(self.df.iloc[idx]['text'])
        tokens = tokenizer(preprocess(text))
        
        # Track valid (non-empty) samples
        if len(tokens) > 0:
            if idx not in self.valid_indices:
                self.valid_indices.append(idx)
        else:
            return torch.tensor([], dtype=torch.long)  # Return empty for invalid
            
        if self.max_length:
            tokens = tokens[:self.max_length]
        numericalized = [self.vocab[token] for token in tokens]
        return torch.tensor(numericalized, dtype=torch.long)

In [None]:
def collate_test_batch(batch):
    # Track original indices of valid samples
    valid_indices = [i for i, t in enumerate(batch) if len(t) > 0]
    texts = [t for t in batch if len(t) > 0]
    
    if not texts:
        # Return dummy batch with empty indices
        return torch.zeros((1, 1)), torch.tensor([1]), []
        
    lengths = torch.tensor([len(t) for t in texts])
    padded = pad_sequence(texts, batch_first=True, padding_value=PAD_IDX)
    return padded, lengths, valid_indices  # Now returns indices

In [None]:
test_dataset = TestDataset(test_df, vocab, max_length=MAX_SEQ_LEN)
test_loader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,  # Critical for maintaining order!
    collate_fn=collate_test_batch,
    pin_memory=True,
    num_workers=2
)# Create dataset and loader
test_dataset = TestDataset(test_df, vocab, max_length=MAX_SEQ_LEN)
test_loader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collate_test_batch,
    num_workers=2
)

# Get predictions
preds, probs = predict_with_loader(lstm_model, test_loader, device, len(test_df))

# Add to DataFrame
test_df['predicted_class'] = preds
for i, name in enumerate(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']):
    test_df[f'prob_{name}'] = probs[:, i]

# Mark predicted rows (only if you need this)
test_df['was_predicted'] = False
test_df.loc[test_dataset.valid_indices, 'was_predicted'] = True  # Note: test_dataset, not loader

In [None]:
print(test_df.columns.tolist())

In [None]:
test_df

In [None]:
eval_dataset2 = PaddedDataset(ev_data, vocab, max_length=256)
eval_loader = DataLoader(
    eval_dataset,
    batch_size=512,
    collate_fn=collate_batch,
    pin_memory=True,
    num_workers=4
)

In [11]:
test_df = pd.read_csv("/kaggle/input/rahul2/dataset_instagram-scraper_2025-03-30_09-30-50-381.csv")
test_df.rename(columns={'text': 'comment_text'}, inplace=True)

In [17]:
vocab = torch.load("/kaggle/working/vocab.pth")


FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/working/vocab.pth'

In [14]:
import pandas as pd
import torch
import re
import string
import emoji
from nltk.corpus import stopwords
import spacy
from torchtext.data.utils import get_tokenizer
from torch.nn.utils.rnn import pad_sequence
import numpy as np

class ToxicityClassifierPipeline:
    def __init__(self, model, vocab_path="vocab.pth"):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = model.to(self.device)
        self.model.eval()
        self.tokenizer = get_tokenizer("basic_english")
        self.vocab = torch.load(vocab_path)
        self.PAD_IDX = self.vocab['<pad>']
        self.nlp = spacy.load("en_core_web_sm")
        self.stpwds = stopwords.words('english')
        
        # Define preprocessing patterns
        self.punc = string.punctuation.replace('#', '').replace('!', '').replace('?', '') + "∞θ÷α•à−β∅³π‘₹´°£€\\×™√²—"
        self.patterns = [
            r'\\[nrtbfv\\]',         # \n, \t etc
            '<.*?>',                 # HTML tags
            r'https?://\S+|www\.\S+', # Links
            r'\ufeff',               # BOM characters
            r'^[^a-zA-Z0-9]+$',      # Non-alphanumeric tokens
            r'ｗｗｗ．\S+',            # Full-width URLs
            r'[\uf700-\uf7ff]',      # Unicode private-use chars
            r'^[－—…]+$',            # Special punctuation
            r'[︵︶]'                # CJK parentheses
        ]
        
        # Chat words mapping (truncated for brevity)
        self.chat_words = {
            "AFAIK": "As Far As I Know",
            "AFK": "Away From Keyboard",
            # ... include all your chat words mapping
        }
        
        self.time_zone_abbreviations = [
            "UTC", "GMT", "EST", "CST", "PST", "MST",
            "EDT", "CDT", "PDT", "MDT", "CET", "EET",
            "WET", "AEST", "ACST", "AWST", "HST",
            "AKST", "IST", "JST", "KST", "NZST"
        ]

    def preprocess_text(self, text):
        """Apply all preprocessing steps to a single text"""
        if not isinstance(text, str) or not text.strip():
            return ""
            
        # Apply regex patterns
        for regex in self.patterns:
            text = re.sub(regex, '', text)
            
        # Remove punctuation
        text = text.translate(str.maketrans(self.punc, ' ' * len(self.punc)))
        
        # Remove time zones and stopwords
        text = ' '.join(word for word in text.split() 
                       if word not in self.time_zone_abbreviations 
                       and word not in self.stpwds)
        
        # Expand chat words
        text = ' '.join(self.chat_words.get(word.lower(), word) for word in text.split())
        
        # Lowercase and emoji handling
        text = text.lower()
        text = emoji.demojize(text)
        text = re.sub(r'\s+', ' ', text).strip()
        
        return text

    def tokenize_and_numericalize(self, text, max_length=256):
        """Tokenize and convert to numerical tokens"""
        if not text:  # Handle empty text
            return torch.empty(0, dtype=torch.long)
            
        tokens = [token for token in self.tokenizer(text) if 1 < len(token) < 25]
        tokens = tokens[:max_length]
        numericalized = [self.vocab[token] if token in self.vocab else self.vocab['<unk>'] 
                        for token in tokens]
        return torch.tensor(numericalized, dtype=torch.long)

    def predict_toxicity(self, df, text_column='comment_text', batch_size=64):
        """
        Predict toxicity for a DataFrame of texts
        
        Args:
            df: Input DataFrame containing text to classify
            text_column: Name of column containing text
            batch_size: Batch size for prediction
            
        Returns:
            DataFrame with original text and toxicity predictions
        """
        # Create a copy of the original DataFrame to preserve indices
        result_df = df.copy()
        
        # Preprocess all texts and keep track of non-empty texts
        processed_data = []
        valid_indices = []
        
        for idx, text in enumerate(df[text_column]):
            processed = self.preprocess_text(text)
            if processed:  # Only keep non-empty texts
                processed_data.append(processed)
                valid_indices.append(idx)
        
        # If all texts are empty after preprocessing
        if not processed_data:
            # Return all zeros for all predictions
            result_df['toxic'] = 0
            result_df['severe_toxic'] = 0
            result_df['obscene'] = 0
            result_df['threat'] = 0
            result_df['insult'] = 0
            result_df['identity_hate'] = 0
            return result_df
        
        # Tokenize and numericalize only non-empty texts
        tokenized = [self.tokenize_and_numericalize(text) for text in processed_data]
        
        # Create batches only for valid sequences
        batches = []
        for i in range(0, len(tokenized), batch_size):
            batch_texts = tokenized[i:i+batch_size]
            lengths = torch.tensor([len(t) for t in batch_texts])
            
            # Filter out empty sequences in this batch
            valid_mask = lengths > 0
            if not valid_mask.any():
                continue
                
            batch_texts = [t for t, valid in zip(batch_texts, valid_mask) if valid]
            lengths = lengths[valid_mask]
            
            # Pad sequences
            padded = pad_sequence(batch_texts, batch_first=True, padding_value=self.PAD_IDX)
            batches.append((padded, lengths, valid_mask))
        
        # Make predictions
        all_preds = np.zeros((len(df), 6), dtype=int)  # Initialize with zeros
        
        with torch.no_grad():
            current_idx = 0
            for batch, lengths, valid_mask in batches:
                batch = batch.to(self.device)
                outputs = self.model(batch, lengths.to(self.device))
                preds = (outputs > 0.5).int().cpu().numpy()
                
                # Assign predictions to the correct positions
                batch_size = len(preds)
                for i in range(batch_size):
                    if current_idx + i < len(valid_indices):
                        all_preds[valid_indices[current_idx + i]] = preds[i]
                
                current_idx += batch_size
        
        # Add predictions to result DataFrame
        result_df['toxic'] = all_preds[:, 0]
        result_df['severe_toxic'] = all_preds[:, 1]
        result_df['obscene'] = all_preds[:, 2]
        result_df['threat'] = all_preds[:, 3]
        result_df['insult'] = all_preds[:, 4]
        result_df['identity_hate'] = all_preds[:, 5]
        
        return result_df

# Example usage:
if __name__ == "__main__":
    # Load your trained model (example using the final model)
    final_model = Improved_BI_LSTM_GloVe(
        vocab_size=len(vocab),
        embed_dim=100,
        hidden_dim=256,
        pad_idx=PAD_IDX,
        output_dim=6
    )
    final_model.load_state_dict(torch.load("final.pth"))
    
    # Initialize pipeline
    pipeline = ToxicityClassifierPipeline(final_model)
    
    # Example test DataFrame
    test_df = pd.read_csv("/kaggle/input/rahul2/dataset_instagram-scraper_2025-03-30_09-30-50-381.csv")
    test_df.rename(columns={'text': 'comment_text'}, inplace=True)
    
    # Get predictions
    results = pipeline.predict_toxicity(test_df)
    print(results[['comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']])

NameError: name 'vocab' is not defined

In [115]:
results

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,18069096703645733,Why bro in sea always,0,0,0,0,0,0
1,18052302512112156,😂😂😂😂😂😂😂 misericórdia,0,0,0,0,0,0
2,18119491555435268,🙌,0,0,0,0,0,0
3,17884548831142635,🔥,0,0,0,0,0,0
4,18059375443875265,nigga what,1,1,1,0,1,1
5,18097418647542771,Vessel of NBA youngboy,0,0,0,0,0,0
6,18071614537687683,JAJAJAJA,0,0,0,0,0,0
7,17947968803927907,wtf,1,1,1,0,1,0
8,17916781080051191,fuck that bitch,1,1,1,0,1,1
9,18036427937403276,Is this real,0,0,0,0,0,0


In [116]:
final_ans = results.drop('identity_hate',axis = 1)
final_ans

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult
0,18069096703645733,Why bro in sea always,0,0,0,0,0
1,18052302512112156,😂😂😂😂😂😂😂 misericórdia,0,0,0,0,0
2,18119491555435268,🙌,0,0,0,0,0
3,17884548831142635,🔥,0,0,0,0,0
4,18059375443875265,nigga what,1,1,1,0,1
5,18097418647542771,Vessel of NBA youngboy,0,0,0,0,0
6,18071614537687683,JAJAJAJA,0,0,0,0,0
7,17947968803927907,wtf,1,1,1,0,1
8,17916781080051191,fuck that bitch,1,1,1,0,1
9,18036427937403276,Is this real,0,0,0,0,0
