In [None]:
# Colab-only setup: mount Google Drive so that files under /content/drive
# (the dataset CSVs are read from there later in the notebook) are accessible.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# Create the requirements.txt file
# One package per line; googletrans is the only entry with a version pin.
REQUIRED_PACKAGES = [
    "numpy",
    "matplotlib",
    "seaborn",
    "pandas",
    "ipykernel",
    "nltk",
    "wordcloud",
    "scikit-learn",
    "langdetect",
    "googletrans==4.0.0rc1",
    "torch",
    "torchvision",
    "torchaudio",
    "transformers",
    "datasets",
    "accelerate",
    "tqdm",
]

# Leading and trailing newline are kept so the file content is unchanged.
requirements = "\n" + "\n".join(REQUIRED_PACKAGES) + "\n"

with open("requirements.txt", "w") as req_file:
    req_file.write(requirements)

In [None]:
!pip install -r requirements.txt

Collecting langdetect (from -r requirements.txt (line 10))
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/981.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m55.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting googletrans==4.0.0rc1 (from -r requirements.txt (line 11))
  Downloading googletrans-4.0.0rc1.tar.gz (20 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting datasets (from -r requirements.txt (line 16))
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting httpx==0.13.3 (from googletrans==4.0.0rc1->-r requirements.txt (line 11))
  Downloading httpx-0.13.3-py3-none-any.whl.metadata (25 kB)
Collecting hstspreload (from httpx==0.13.3->googletrans==4.0.0rc1->-r requirements.txt (line 11))
  Downloading hstspreload-2025.1.1-py3-none-any.whl.metadata (

In [None]:
import re
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler

from sklearn.metrics import f1_score, classification_report
from transformers import AutoTokenizer

In [None]:
# Single place to change when the dataset folder moves (e.g. outside Colab).
DATA_DIR = '/content/drive/MyDrive/DATASET'

# Raw splits as stored on Drive; the resolved paths are the same as before.
train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
validation_df = pd.read_csv(f'{DATA_DIR}/validation.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')

In [None]:
# Model Configuration
MODEL_NAME = "xlm-roberta-base"  # Hugging Face checkpoint; in this notebook only its tokenizer is loaded
EMBEDDING_DIM = 300              # size of the embedding vectors fed to the GRU
HIDDEN_DIM = 128                 # GRU hidden size per direction (the classifier head sees 2 * HIDDEN_DIM)
NUM_LAYERS = 1                   # number of stacked GRU layers
DROPOUT = 0.3                    # passed to both nn.GRU and the nn.Dropout before the linear head
BATCH_SIZE = 16                  # batch size for the train and validation DataLoaders
NUM_EPOCHS = 10                  # upper bound on training epochs (early stopping may end sooner)
LEARNING_RATE = 2e-3             # Adam learning rate
PATIENCE = 3                     # epochs without validation-F1 improvement before early stopping
MAX_SEQ_LEN = 128                # tokenizer truncation length, special tokens included

In [None]:
# Multi-label targets of the training set. Their count sets the model's output
# size, and 'toxic' must stay first: validation scoring reads logit column 0.
LABEL_COLS = ['toxic', 'abusive', 'vulgar', 'menace', 'offense', 'bigotry']

###### Preprocessing: minimal cleaning for a BERT-like tokenizer — replace newlines with spaces, remove URLs, lowercase, and strip surrounding whitespace.

In [None]:
def clean_text(text):
    """Minimally normalise one feedback text.

    Newlines become spaces, anything starting with ``http`` up to the next
    whitespace is removed, and the result is lower-cased and stripped.

    Args:
        text: Raw cell value. Non-string values are converted with ``str``.
            Missing values (``None`` or float NaN) yield an empty string.

    Returns:
        The cleaned string.
    """
    # NaN is the only float that is not equal to itself. Without this guard a
    # missing cell would become the literal token "nan" (or "none").
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).replace('\n', ' ')
    text = re.sub(r'http\S+', '', text)
    return text.lower().strip()


In [None]:
# Clean the text column of the train and validation splits in place
# (the test split is not touched here).
for frame in (train_df, validation_df):
    cleaned = frame['feedback_text'].apply(clean_text)
    frame['feedback_text'] = cleaned

###### Ensure all label columns are present and fill their missing values with 0. For the validation set, the `lang` code is prepended to `feedback_text` (as a `<lang>` tag) to provide multilingual context.

In [None]:
def preprocess_train(df):
    """Return a training frame holding id, text and integer label columns.

    Every column named in ``LABEL_COLS`` is guaranteed to exist in the result:
    existing columns get their missing values replaced by 0 and are cast to
    int; columns absent from ``df`` are created filled with 0.

    Args:
        df: Raw training DataFrame with at least ``id`` and ``feedback_text``.

    Returns:
        A new DataFrame with columns ``['id', 'feedback_text'] + LABEL_COLS``.
        The input frame is not modified.
    """
    # Work on a copy so the caller's frame is left untouched.
    out = df.copy()
    for col in LABEL_COLS:
        if col in out.columns:
            out[col] = out[col].fillna(0).astype(int)
        else:
            # df.get(col, 0) would return the int 0 here, which has no .fillna.
            out[col] = 0
    # Copy the selection so later column assignments on the result do not
    # raise SettingWithCopyWarning.
    return out[['id', 'feedback_text'] + LABEL_COLS].copy()

In [None]:
def preprocess_val(df):
    """Return a validation frame with id, text, optional language and toxic label.

    Args:
        df: Raw validation DataFrame with ``id``, ``feedback_text`` and
            ``toxic``. ``lang`` is kept when present.

    Returns:
        A new DataFrame with columns ``id``, ``feedback_text``, ``lang`` (only
        if it exists in ``df``) and ``toxic`` cast to int. The input frame is
        not modified.
    """
    columns = ['id', 'feedback_text']
    # Later cells guard on "'lang' in validation_df.columns", so a missing
    # language column is tolerated here instead of raising KeyError.
    if 'lang' in df.columns:
        columns.append('lang')
    columns.append('toxic')

    # Copy the selection so the cast does not write into the caller's frame and
    # later column assignments do not raise SettingWithCopyWarning.
    out = df[columns].copy()
    out['toxic'] = out['toxic'].astype(int)
    return out

# Rebind both names to the reduced frames: the training frame keeps id, text
# and all label columns; the validation frame keeps id, text, lang and toxic.
train_df = preprocess_train(train_df)
validation_df = preprocess_val(validation_df)

In [None]:
# Prefix every validation text with its language tag, e.g. "<tr> some text".
# Vectorised string concatenation replaces the row-wise apply(axis=1): the
# result is the same, it is much faster, and it also works on an empty frame.
# NOTE(review): the training texts get no such prefix in the visible code, so
# the model sees these tags only at evaluation time. Confirm this is intended.
# NOTE(review): not idempotent; re-running this cell prepends the tag again.
if 'lang' in validation_df.columns:
    validation_df['feedback_text'] = (
        "<" + validation_df['lang'].astype(str) + "> " + validation_df['feedback_text']
    )

###### Tokenization: tokenize both datasets (train and validation) with the pretrained tokenizer to prepare the input for the model.

In [None]:
# Pretrained tokenizer; it supplies the token ids, the pad token id and the
# vocabulary size used further down in the notebook.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


def tokenize_text(text):
    """Encode one text as a list of token ids.

    Special tokens are added and the sequence is truncated to at most
    MAX_SEQ_LEN ids. No padding is applied here.
    """
    token_ids = tokenizer.encode(
        text,
        truncation=True,
        max_length=MAX_SEQ_LEN,
        add_special_tokens=True,
    )
    return token_ids


# Store the (unpadded) id lists next to the text, train split first.
for frame in (train_df, validation_df):
    frame['input_ids'] = frame['feedback_text'].apply(tokenize_text)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Tokenizing training data...
Tokenizing validation data...


###### Sequences are padded to the same length for batching

In [None]:
def pad_sequence(seq, max_len, pad_id=None):
    """Return ``seq`` as a new list of exactly ``max_len`` token ids.

    Shorter sequences are right-padded with ``pad_id``. Longer ones are
    truncated, where previously they were returned unchanged and therefore
    broke the fixed-length contract needed for batching.

    Args:
        seq: Sequence of token ids.
        max_len: Target length.
        pad_id: Id used for padding. Defaults to the pad token id of the
            module-level ``tokenizer``.

    Returns:
        A new list of length ``max_len``; ``seq`` itself is not modified.
    """
    if pad_id is None:
        # Resolved at call time so the default follows the loaded tokenizer.
        pad_id = tokenizer.pad_token_id
    clipped = list(seq)[:max_len]
    return clipped + [pad_id] * (max_len - len(clipped))

###### ToxicDataset: the training dataset yields all label columns, whereas the validation dataset yields only the `toxic` label plus `lang` for language-specific analysis. Padding is necessary because batched sequence models (transformer-based or, as here, GRU-based) need fixed-length input.

In [None]:
class ToxicDataset(Dataset):
    """Map-style dataset yielding padded token ids and float labels.

    In training mode each item carries one label per column in ``label_cols``.
    In validation mode each item carries the single ``toxic`` label (shape
    ``[1]``) and the row's ``lang`` value, or ``"unknown"`` when the frame has
    no ``lang`` column.

    Args:
        df: DataFrame with an ``input_ids`` column of token-id lists, plus the
            label columns (training) or ``toxic`` and optionally ``lang``
            (validation).
        label_cols: Label column names read in training mode.
        is_train: Selects training (True) or validation (False) item layout.
    """

    def __init__(self, df, label_cols, is_train=True):
        self.df = df
        self.label_cols = label_cols
        self.is_train = is_train
        # Every item is padded to the longest sequence of this dataset;
        # default=0 keeps construction from raising on an empty frame.
        self.max_len = max((len(ids) for ids in df['input_ids']), default=0)

        # Materialise what __getitem__ needs once. Calling df.iloc[idx] per
        # item builds an object-dtype Series on every access, which is slow
        # inside a DataLoader.
        self._input_ids = df['input_ids'].tolist()
        if is_train:
            self._labels = df[label_cols].to_numpy(dtype=np.float32)
            self._langs = None
        else:
            self._labels = df['toxic'].to_numpy(dtype=np.float32).reshape(-1, 1)
            self._langs = df['lang'].tolist() if 'lang' in df.columns else None

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return a dict with ``input_ids``, ``labels`` and, in validation mode, ``lang``."""
        padded_ids = pad_sequence(self._input_ids[idx], self.max_len)
        item = {
            'input_ids': torch.tensor(padded_ids, dtype=torch.long),
            'labels': torch.tensor(self._labels[idx], dtype=torch.float),
        }
        if not self.is_train:
            item['lang'] = self._langs[idx] if self._langs is not None else "unknown"

        return item

In [None]:
# Training items carry all LABEL_COLS; validation items carry only the toxic
# label plus the language value (is_train=False).
train_dataset = ToxicDataset(train_df, LABEL_COLS, is_train=True)
val_dataset = ToxicDataset(validation_df, ['toxic'], is_train=False)


###### WeightedRandomSampler: handles class imbalance by assigning higher sampling weights to toxic samples, which are the minority class. This is necessary for highly imbalanced datasets.

In [None]:
# Sample each training row with probability inversely proportional to the
# frequency of its 'toxic' class, so both classes are drawn about equally often.
toxic_array = train_df['toxic'].values.astype(int)

# bincount counts every class in one pass; minlength=2 guarantees an entry for
# both 0 and 1 even when one of them is absent.
class_sample_count = np.bincount(toxic_array, minlength=2)

# Clamp to 1 so an absent class does not cause a divide-by-zero warning; its
# weight is never looked up because no sample belongs to it.
weight = 1. / np.maximum(class_sample_count, 1)

# Vectorised lookup of the per-sample weight (float64, i.e. a double tensor).
samples_weight = torch.from_numpy(weight[toxic_array])
sampler = WeightedRandomSampler(samples_weight.double(), len(samples_weight), replacement=True)


In [None]:
# Training batches are drawn through the weighted sampler. The validation
# loader keeps the frame's row order (shuffle=False), so its predictions can be
# written back to validation_df positionally.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, sampler=sampler, num_workers=2)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

###### GRUToxicClassifier: a GRU-based architecture with bidirectional processing. This model was chosen because it is well suited to this task, where sequence context is crucial. Additionally, dropout is added to prevent overfitting, and the model outputs logits that are used with the BCEWithLogitsLoss loss function for training.

In [None]:
class GRUToxicClassifier(nn.Module):
    """Bidirectional GRU over learned token embeddings with a linear multi-label head.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Embedding vector size.
        hidden_dim: GRU hidden size per direction.
        num_layers: Number of stacked GRU layers.
        output_dim: Number of output logits (one per label).
        dropout: Dropout probability before the linear head, and between GRU
            layers when ``num_layers > 1``.
        padding_idx: Token id treated as padding. Defaults to the pad token id
            of the module-level ``tokenizer``.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, output_dim, dropout,
                 padding_idx=None):
        super(GRUToxicClassifier, self).__init__()
        if padding_idx is None:
            padding_idx = tokenizer.pad_token_id
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        # nn.GRU applies dropout only between stacked layers. With a single
        # layer a non-zero value has no effect and just emits a UserWarning.
        inter_layer_dropout = dropout if num_layers > 1 else 0.0
        self.gru = nn.GRU(embedding_dim, hidden_dim, num_layers=num_layers,
                          batch_first=True, bidirectional=True, dropout=inter_layer_dropout)
        self.dropout = nn.Dropout(dropout)

        # Forward and backward hidden states are concatenated, hence * 2.
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, input_ids):
        """Return logits of shape (batch, output_dim) for (batch, seq_len) token ids."""
        embedded = self.embedding(input_ids)
        gru_out, _ = self.gru(embedded)

        # Mean-pool over real tokens only. A plain mean over dim=1 also averages
        # the outputs at padded positions, which dominate short texts once
        # every sequence is padded to the dataset maximum.
        token_mask = (input_ids != self.embedding.padding_idx).unsqueeze(-1).to(gru_out.dtype)
        summed = (gru_out * token_mask).sum(dim=1)
        # clamp avoids division by zero for a row made only of padding.
        token_counts = token_mask.sum(dim=1).clamp(min=1.0)
        pooled = summed / token_counts

        dropped = self.dropout(pooled)
        logits = self.fc(dropped)
        return logits


In [None]:
# The embedding table is sized from the tokenizer vocabulary.
# NOTE(review): vocab_size may not include tokens added after pretraining; none are added in the visible code.
vocab_size = tokenizer.vocab_size
# One output logit per training label.
output_dim = len(LABEL_COLS)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = GRUToxicClassifier(vocab_size, EMBEDDING_DIM, HIDDEN_DIM, NUM_LAYERS, output_dim, DROPOUT)
model.to(device)



GRUToxicClassifier(
  (embedding): Embedding(250002, 300, padding_idx=1)
  (gru): GRU(300, 128, batch_first=True, dropout=0.3, bidirectional=True)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=6, bias=True)
)

###### Positive-class weights are used to compensate for the imbalanced dataset. The loss function is binary cross-entropy with logits (BCEWithLogitsLoss).

In [None]:
# Number of positive rows per label, in LABEL_COLS order.
label_sums = train_df[LABEL_COLS].sum().values
total_samples = len(train_df)

# Per-label pos_weight = negatives / positives. The 1e-5 only prevents a
# division by zero; a label with no positives still gets a very large weight.
# NOTE(review): the train loader already rebalances 'toxic' through a weighted
# sampler, so this weight is applied on top of that - confirm the combined
# effect is intended.
pos_weights = (total_samples - label_sums) / (label_sums + 1e-5)
pos_weights = torch.tensor(pos_weights, dtype=torch.float).to(device)
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weights)
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=1e-5)

In [None]:
def evaluate(model, dataloader):
    """Run the model over a dataloader without tracking gradients.

    The model is switched to eval mode and left in that mode.

    Args:
        model: Module called as ``model(input_ids)``.
        dataloader: Iterable of batches that are dicts with ``'input_ids'``
            and ``'labels'`` tensors.

    Returns:
        Tuple ``(logits, labels)`` of CPU tensors, each the concatenation of
        the per-batch tensors along the first dimension.
    """
    model.eval()
    logit_chunks, label_chunks = [], []
    with torch.no_grad():
        for batch in dataloader:
            batch_labels = batch['labels'].to(device)
            batch_logits = model(batch['input_ids'].to(device))
            logit_chunks.append(batch_logits.cpu())
            label_chunks.append(batch_labels.cpu())
    return torch.cat(logit_chunks), torch.cat(label_chunks)


###### aggregate_toxic: post-processing of the multi-label predictions. To extract the toxic label out of the six labels, this function simply selects the first column. It is also helpful for computing the F1 score and for threshold tuning.

In [None]:
def aggregate_toxic(logits):
    """Return column 0 of a 2-D logits tensor.

    Column 0 is the 'toxic' output, since 'toxic' is the first LABEL_COLS entry.
    """
    return logits[:, 0]

###### The best model (by validation toxic F1) is saved, and early stopping is used to prevent overfitting while efficiently training the GRU-based classifier.

In [None]:
# Start below any reachable F1 so the first epoch always writes a checkpoint.
# With an initial 0 and the strict ">" below, a run whose F1 stays at 0 would
# never save, and the later load would fail or pick up a stale file.
best_val_f1 = float('-inf')
epochs_since_improve = 0
best_model_path = "best_gru_model.pt"

for epoch in range(NUM_EPOCHS):
    # evaluate() leaves the model in eval mode, so re-enable training each epoch.
    model.train()
    epoch_losses = []
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1} training"):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        targets = batch['labels'].to(device)
        logits = model(input_ids)
        loss = criterion(logits, targets)
        loss.backward()
        optimizer.step()
        epoch_losses.append(loss.item())

    avg_loss = np.mean(epoch_losses)

    # Validation provides only the 'toxic' label, so the model is scored on
    # logit column 0 at a fixed 0.5 probability threshold.
    val_logits, val_labels = evaluate(model, val_loader)
    val_toxic_logits = aggregate_toxic(val_logits)
    val_toxic_probs = torch.sigmoid(val_toxic_logits).numpy()
    val_toxic_labels = val_labels[:, 0].numpy().astype(int)

    val_pred = (val_toxic_probs >= 0.5).astype(int)
    val_f1 = f1_score(val_toxic_labels, val_pred)

    print(f"Epoch {epoch+1}: Train Loss={avg_loss:.4f}, Val Toxic F1={val_f1:.4f}")

    if val_f1 > best_val_f1:
        best_val_f1 = val_f1
        epochs_since_improve = 0
        torch.save(model.state_dict(), best_model_path)
        print("  -> New best model saved!")
    else:
        # Stop after PATIENCE consecutive epochs without a new best F1.
        epochs_since_improve += 1
        if epochs_since_improve >= PATIENCE:
            print("Early stopping triggered.")
            break


Epoch 1 training: 100%|██████████| 1468/1468 [00:47<00:00, 30.77it/s]


Epoch 1: Train Loss=1.1687, Val Toxic F1=0.3033
  -> New best model saved!


Epoch 2 training: 100%|██████████| 1468/1468 [00:47<00:00, 31.23it/s]


Epoch 2: Train Loss=0.5155, Val Toxic F1=0.3571
  -> New best model saved!


Epoch 3 training: 100%|██████████| 1468/1468 [00:46<00:00, 31.26it/s]


Epoch 3: Train Loss=0.2621, Val Toxic F1=0.3402


Epoch 4 training: 100%|██████████| 1468/1468 [00:47<00:00, 31.23it/s]


Epoch 4: Train Loss=0.2191, Val Toxic F1=0.3580
  -> New best model saved!


Epoch 5 training: 100%|██████████| 1468/1468 [00:46<00:00, 31.26it/s]


Epoch 5: Train Loss=0.1922, Val Toxic F1=0.2638


Epoch 6 training: 100%|██████████| 1468/1468 [00:47<00:00, 31.20it/s]


Epoch 6: Train Loss=0.1243, Val Toxic F1=0.2581


Epoch 7 training: 100%|██████████| 1468/1468 [00:46<00:00, 31.28it/s]


Epoch 7: Train Loss=0.0978, Val Toxic F1=0.2633
Early stopping triggered.


In [None]:
# map_location lets a checkpoint saved on a GPU runtime be restored on a
# CPU-only runtime (and the other way round) instead of failing while loading.
model.load_state_dict(torch.load(best_model_path, map_location=device))
print("Best model loaded.")

Best model loaded.


###### Threshold optimization: binary classification commonly uses 0.5 as the default decision threshold, but imbalanced datasets benefit from tuning the threshold to maximize the F1 score.

In [None]:
# Score the restored best model on the validation set ('toxic' logit only).
val_logits, val_labels = evaluate(model, val_loader)
val_toxic_logits = aggregate_toxic(val_logits)
val_toxic_probs = torch.sigmoid(val_toxic_logits).numpy()
val_toxic_labels = val_labels[:, 0].numpy().astype(int)

# Candidate thresholds 0.10, 0.11, ..., 0.89. linspace fixes the number of
# points; np.arange with a float step can gain or lose the last element
# through rounding.
threshold_grid = np.linspace(0.10, 0.89, 80)

# Keep the threshold with the highest F1; stays at 0.5 if no candidate scores above 0.
# NOTE(review): the threshold is tuned on the same split that is reported on
# below, so the resulting F1 is an optimistic estimate.
best_thresh, best_thresh_f1 = 0.5, 0
for thresh in threshold_grid:
    val_pred = (val_toxic_probs >= thresh).astype(int)
    f1 = f1_score(val_toxic_labels, val_pred)
    if f1 > best_thresh_f1:
        best_thresh_f1 = f1
        best_thresh = thresh

print(f"Optimal threshold for toxic: {best_thresh:.2f} with F1 {best_thresh_f1:.4f}")


Optimal threshold for toxic: 0.51 with F1 0.3621


In [None]:
# Per-language classification report at the tuned threshold.
if 'lang' in validation_df.columns:

    # Reuse evaluate() instead of a second hand-written inference loop.
    # val_loader is built with shuffle=False, so the predictions line up with
    # the rows of validation_df.
    report_logits, _ = evaluate(model, val_loader)
    report_probs = torch.sigmoid(aggregate_toxic(report_logits)).numpy()
    validation_df['predicted_toxic'] = (report_probs >= best_thresh).astype(int)

    for lang in validation_df['lang'].unique():
        subset = validation_df[validation_df['lang'] == lang]
        print(f"\nLanguage: {lang}")
        print(classification_report(subset['toxic'], subset['predicted_toxic']))



Language: tr
              precision    recall  f1-score   support

           0       0.92      0.97      0.94       271
           1       0.36      0.18      0.24        28

    accuracy                           0.89       299
   macro avg       0.64      0.57      0.59       299
weighted avg       0.87      0.89      0.88       299


Language: es
              precision    recall  f1-score   support

           0       0.87      0.61      0.72       227
           1       0.27      0.62      0.37        52

    accuracy                           0.61       279
   macro avg       0.57      0.61      0.55       279
weighted avg       0.76      0.61      0.66       279


Language: it
              precision    recall  f1-score   support

           0       0.85      0.74      0.79       208
           1       0.33      0.48      0.39        54

    accuracy                           0.69       262
   macro avg       0.59      0.61      0.59       262
weighted avg       0.74      0.6