In [1]:
import gdown
# Google Drive file IDs of the dataset CSVs, keyed by the local file name each is saved under.
files = dict(
    [
        ("mal_full_offensive_train.csv", "1RZ8vMy7SrCl70X7CWDD5LEmwLcYOxSpn"),
        ("mal_full_offensive_dev.csv", "1wT5EM_k-D81sZFTf0JPRHHm-2Lr7nxiR"),
    ]
)

# Download each file into the working directory; quiet=False keeps gdown's progress output.
for file_name in files:
    file_id = files[file_name]
    url = "https://drive.google.com/uc?export=download&id=" + file_id
    gdown.download(url, file_name, quiet=False)

Downloading...
From: https://drive.google.com/uc?export=download&id=1RZ8vMy7SrCl70X7CWDD5LEmwLcYOxSpn
To: /kaggle/working/mal_full_offensive_train.csv
100%|██████████| 2.02M/2.02M [00:00<00:00, 131MB/s]
Downloading...
From: https://drive.google.com/uc?export=download&id=1wT5EM_k-D81sZFTf0JPRHHm-2Lr7nxiR
To: /kaggle/working/mal_full_offensive_dev.csv
100%|██████████| 258k/258k [00:00<00:00, 94.3MB/s]


In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
import pandas as pd
from tqdm import tqdm 
import random
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report


In [3]:
def set_seed(seed):
    """Seed the Python, NumPy and PyTorch (CPU and CUDA) random generators.

    Also sets ``cudnn.deterministic = True`` and ``cudnn.benchmark = False``.

    Args:
        seed: Integer seed handed to every generator.
    """
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)
# set_seed(42)

In [4]:
## hyperparameters

# Runtime: use the GPU when one is visible to PyTorch.
device = "cpu" if not torch.cuda.is_available() else "cuda"

# Data / model.
model_name = "google/muril-base-cased"  # Hugging Face checkpoint for the tokenizer and the encoder
embed_vector_size = 256                 # passed as the tokenizer's max_length in OffensiveDataset
num_classes = 5                         # size of the classifier output layer
num_heads = 4                           # attention heads for HANClassifier

# Optimisation.
num_epochs = 50
lr = 1e-4
lr_step_size = 2             # StepLR: epochs between learning-rate decays
lr_decay = 0.9               # StepLR: multiplicative decay factor (gamma)
early_stopping_patience = 5  # epochs without validation-loss improvement before stopping

In [5]:
train_df = pd.read_csv("mal_full_offensive_train.csv")

# Inverse-frequency class weights for the loss.
# value_counts() sorts by descending frequency, which is NOT the order in which
# OffensiveDataset numbers the classes (distinct labels in first-appearance order,
# reversed). Re-order the counts to that same order so weight i belongs to class i.
class_order = list(train_df.Labels.unique())[::-1]
class_counts = train_df.Labels.value_counts().reindex(class_order)
# float32 matches the dtype of the model outputs the loss is applied to.
class_weights = torch.tensor(1e4 / class_counts.values, dtype=torch.float32)

In [6]:
class OffensiveDataset(Dataset):
    """CSV-backed dataset yielding tokenized text and one-hot label vectors.

    The CSV is read with pandas and must provide a ``Text`` column and a
    ``Labels`` column. Items can be fetched by integer index (one sample) or
    by anything ``Series.iloc`` accepts for multiple rows, e.g. a slice.
    """

    def __init__(self, path, device="cpu", classes=None):
        """Load the CSV and the tokenizer for ``model_name``.

        Args:
            path: Path to a ``.csv`` file.
            device: Unused; kept so existing callers that pass it keep working.
            classes: Optional explicit label order; position in this sequence
                is the one-hot index. Pass the training dataset's ``classes``
                when building an evaluation dataset so both splits agree.
                When omitted, the distinct labels of this CSV are used in
                reverse order of first appearance.
        """
        self._path = path
        assert path.endswith('.csv'), "expected a csv file"
        self.df = pd.read_csv(path)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        if classes is None:
            self.classes = list(self.df.Labels.unique())
            self.classes.reverse()
        else:
            self.classes = list(classes)
        self.nclasses = len(self.classes)
        self.columns = self.df.columns

    def __len__(self) -> int:
        """Return the number of rows in the CSV."""
        return self.df.shape[0]

    def get_label(self, labels):
        """One-hot encode a 1-D array of raw labels.

        Args:
            labels: 1-D ``np.ndarray`` of label values found in ``self.classes``.

        Returns:
            Float tensor of shape ``(len(labels), nclasses)``; a single label
            yields shape ``(nclasses,)``.

        Raises:
            ValueError: If a label is not present in ``self.classes``.
        """
        assert isinstance(labels, np.ndarray)
        assert labels.ndim == 1
        batch_size = labels.shape[0]
        vec = torch.zeros(batch_size, self.nclasses)
        # Explicit long dtype so an empty selection still produces a valid index tensor.
        indices = torch.tensor(
            [self.classes.index(label) for label in labels], dtype=torch.long
        )
        vec[torch.arange(batch_size), indices] = 1
        # Drop only the batch axis (when it has size 1), never the class axis.
        return vec.squeeze(0)

    def __getitem__(self, idx):
        """Return ``(token_tensors, one_hot_labels)`` for ``idx``.

        Args:
            idx: A Python or NumPy integer for one sample, otherwise an
                indexer selecting several rows (e.g. a slice).
        """
        texts = self.df.Text.iloc[idx]
        raw_labels = self.df.Labels.iloc[idx]
        # NumPy integers (e.g. from custom samplers) also select a single row.
        if isinstance(idx, (int, np.integer)):
            sentences = texts
            labels = np.array([raw_labels])
        else:
            sentences = texts.tolist()
            labels = raw_labels.values
        return self.tokenize(sentences, max_length=embed_vector_size), self.get_label(labels)

    def tokenize(self, sentences, max_length=256):
        """Tokenize one sentence or a list of sentences, padded/truncated to ``max_length``.

        Returns:
            Tuple of the tokenizer's output tensors in the order the tokenizer
            returns them, with a size-1 batch axis removed.
        """
        inputs = self.tokenizer(sentences, return_tensors='pt', padding='max_length', truncation=True, max_length=max_length)
        # squeeze(0) removes only the batch axis, so a length-1 sequence axis survives.
        return tuple(value.squeeze(0) for value in inputs.values())

In [7]:
class Embedding(nn.Module):
    """Wraps the pretrained ``model_name`` encoder and emits one vector per input sequence."""

    def __init__(self, device='cpu'):
        """Load the pretrained encoder.

        Args:
            device: Not used by this class.
        """
        super().__init__()
        self.model = AutoModel.from_pretrained(model_name)
        # Width of the vectors returned by forward(); downstream layers size themselves from it.
        self.output_vector_size = self.model.config.hidden_size

    def forward(self, input_ids, token_type_ids, attention_mask):
        """Run the encoder and return the last hidden state at sequence position 0.

        Position 0 is presumably the [CLS] token for this tokenizer (not
        verified here). The result has shape ``(batch, hidden_size)``.
        """
        encoder_out = self.model(
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            input_ids=input_ids,
        )
        hidden_states = encoder_out.last_hidden_state
        return hidden_states[:, 0, :]

In [8]:
class FFNClassifier(nn.Module):
    """Two-layer feed-forward classifier on top of a frozen ``Embedding`` encoder."""

    def __init__(self, num_classes, hidden_size=512, dropout_prob=0.2):
        """Build the classifier head.

        Args:
            num_classes: Number of output classes.
            hidden_size: Width of the hidden layer.
            dropout_prob: Dropout probability applied after the first linear layer.
        """
        super(FFNClassifier, self).__init__()
        self.embed = Embedding()
        self.dropout = nn.Dropout(dropout_prob)
        self.relu = nn.ReLU()
        # Not applied in forward(); call it on the returned logits to get probabilities.
        self.softmax = nn.Softmax(dim=1)
        self.fc1 = nn.Linear(self.embed.output_vector_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, num_classes)
        self.layernorm = nn.LayerNorm(hidden_size)

        # Freeze the encoder: only the head is trained.
        for param in self.embed.parameters():
            param.requires_grad = False

    def forward(self, input_ids, token_type_ids, attention_mask):
        """Return unnormalized class scores (logits) of shape ``(batch, num_classes)``.

        Logits are returned instead of softmax probabilities because
        ``nn.CrossEntropyLoss`` applies log-softmax itself; feeding it
        probabilities would apply softmax twice. The argmax of the logits
        equals the argmax of the probabilities.
        """
        x = self.embed(input_ids, token_type_ids, attention_mask)
        x = self.fc1(x)
        x = self.dropout(x)
        x = self.layernorm(x)
        x = self.relu(x)
        return self.fc2(x)
        

In [9]:
class BiLSTMClassifier(nn.Module):
    """Bidirectional-LSTM classifier on top of a frozen ``Embedding`` encoder."""

    def __init__(self, num_classes, hidden_size=512, dropout_prob=0.2):
        """Build the BiLSTM and the classification head.

        Args:
            num_classes: Number of output classes.
            hidden_size: Hidden size of each LSTM direction.
            dropout_prob: Dropout probability applied after the first linear layer.
        """
        super(BiLSTMClassifier, self).__init__()
        self.embed = Embedding()
        self.bilstm = nn.LSTM(self.embed.output_vector_size, hidden_size, bidirectional=True, batch_first=True)
        self.dropout = nn.Dropout(dropout_prob)
        # The bidirectional LSTM concatenates both directions, hence hidden_size * 2.
        self.fc1 = nn.Linear(hidden_size * 2, 512)
        self.fc2 = nn.Linear(512, num_classes)
        self.relu = nn.ReLU()
        # Not applied in forward(); call it on the returned logits to get probabilities.
        self.softmax = nn.Softmax(dim=1)

        # Freeze the encoder: only the LSTM and the head are trained.
        for param in self.embed.parameters():
            param.requires_grad = False

    def forward(self, input_ids, token_type_ids, attention_mask):
        """Return unnormalized class scores (logits) of shape ``(batch, num_classes)``.

        Logits are returned instead of softmax probabilities because
        ``nn.CrossEntropyLoss`` applies log-softmax itself; feeding it
        probabilities would apply softmax twice. The argmax of the logits
        equals the argmax of the probabilities.
        """
        x = self.embed(input_ids, token_type_ids, attention_mask)
        # The encoder yields one vector per sample; add a length-1 sequence axis for the LSTM.
        x = x.unsqueeze(1)
        lstm_out, _ = self.bilstm(x)
        lstm_out = lstm_out[:, -1, :]
        x = self.fc1(lstm_out)
        x = self.dropout(x)
        x = self.relu(x)
        return self.fc2(x)

In [10]:
class HANClassifier(nn.Module):
    """Two-stage LSTM + multi-head self-attention classifier on a frozen ``Embedding`` encoder."""

    def __init__(self, num_classes, hidden_size=512, dropout_prob=0.2, num_heads=4):
        """Build both LSTM/attention stages and the classification head.

        Args:
            num_classes: Number of output classes.
            hidden_size: Hidden size of the LSTMs and embedding size of the attention layers.
            dropout_prob: Dropout probability applied after the first linear layer.
            num_heads: Number of attention heads in each attention layer.
        """
        super(HANClassifier, self).__init__()
        self.embed = Embedding()

        self.word_rnn = nn.LSTM(self.embed.output_vector_size, hidden_size, batch_first=True)
        self.word_attention = nn.MultiheadAttention(embed_dim=hidden_size, num_heads=num_heads, batch_first=True)

        self.sentence_rnn = nn.LSTM(hidden_size, hidden_size, batch_first=True)
        self.sentence_attention = nn.MultiheadAttention(embed_dim=hidden_size, num_heads=num_heads, batch_first=True)

        self.dropout = nn.Dropout(dropout_prob)
        self.fc1 = nn.Linear(hidden_size, 512)
        self.fc2 = nn.Linear(512, num_classes)
        self.relu = nn.ReLU()
        # Not applied in forward(); call it on the returned logits to get probabilities.
        self.softmax = nn.Softmax(dim=1)

        # Freeze the encoder: only the layers defined above are trained.
        for param in self.embed.parameters():
            param.requires_grad = False

    def forward(self, input_ids, token_type_ids, attention_mask):
        """Return unnormalized class scores (logits) of shape ``(batch, num_classes)``.

        Tensors stay in ``(batch, seq, dim)`` layout throughout, which is what
        the ``batch_first=True`` LSTM and attention layers expect. Swapping the
        first two axes before attention would make each sample attend to the
        other samples of the batch.

        Logits are returned instead of softmax probabilities because
        ``nn.CrossEntropyLoss`` applies log-softmax itself. The argmax of the
        logits equals the argmax of the probabilities.
        """
        x = self.embed(input_ids, token_type_ids, attention_mask)
        # One vector per sample -> length-1 sequence: (batch, 1, dim).
        x = x.unsqueeze(1)

        word_out, _ = self.word_rnn(x)
        word_attended, _ = self.word_attention(word_out, word_out, word_out)
        # Average over the sequence axis -> (batch, hidden_size).
        word_attended = word_attended.mean(dim=1)

        sentence_out, _ = self.sentence_rnn(word_attended.unsqueeze(1))
        sentence_attended, _ = self.sentence_attention(sentence_out, sentence_out, sentence_out)
        sentence_attended = sentence_attended.mean(dim=1)

        x = self.fc1(sentence_attended)
        x = self.dropout(x)
        x = self.relu(x)
        return self.fc2(x)

In [11]:
train_csv = "./mal_full_offensive_train.csv"
test_csv = "./mal_full_offensive_dev.csv"

train_dataset = OffensiveDataset(train_csv, device=device)
test_dataset = OffensiveDataset(test_csv, device=device)

# Each dataset derives its class order from its own CSV, so the same label could
# get a different one-hot index in the two splits. Reuse the training order for
# evaluation so class index i means the same label everywhere.
test_dataset.classes = train_dataset.classes
test_dataset.nclasses = train_dataset.nclasses

# The classifier's output width is fixed by num_classes; fail early on a mismatch.
assert train_dataset.nclasses == num_classes, (
    f"num_classes={num_classes} but the training data has {train_dataset.nclasses} labels"
)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

tokenizer_config.json:   0%|          | 0.00/206 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/113 [00:00<?, ?B/s]

In [12]:
# Peek at the first training batch: how many samples fall into each class index?
for _, labels in train_loader:
    # The one-hot rows are turned back into class indices via the position of their maximum.
    _, encoded = labels.max(dim=1)
    encoded = pd.Series(encoded.numpy())
    class_counts = encoded.value_counts()
    print(class_counts)
    break

4    60
2     4
Name: count, dtype: int64


In [13]:
# Smoke test: push the first ten training samples through an FFN classifier
# and display the output shape (expected: 10 rows, one column per class).
e = FFNClassifier(num_classes=num_classes)
sample_inputs = train_dataset[:10][0]
sample_output = e(*sample_inputs)
sample_output.shape

2025-07-31 10:56:52.792239: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753959413.148575      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753959413.250152      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


pytorch_model.bin:   0%|          | 0.00/953M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/953M [00:00<?, ?B/s]

torch.Size([10, 5])

In [14]:
# Print the tensor shapes of the first training batch as a sanity check.
for (input_ids, token_type_ids, attention_mask), label in train_loader:
    batch_tensors = (input_ids, token_type_ids, attention_mask, label)
    print(*(tensor.shape for tensor in batch_tensors))
    break

torch.Size([64, 256]) torch.Size([64, 256]) torch.Size([64, 256]) torch.Size([64, 5])


In [15]:
# Alternative architectures; swap one in for the BiLSTM below:
# model = FFNClassifier(num_classes=num_classes).to(device)
# model = HANClassifier(num_classes=num_classes, num_heads=num_heads).to(device)
model = BiLSTMClassifier(num_classes=num_classes).to(device)
model = nn.DataParallel(model)

# Class-weighted cross entropy; the weight tensor is moved to the same device as the model.
loss_weights = class_weights.to(device)
criterion = nn.CrossEntropyLoss(weight=loss_weights)

# AdamW with a step-wise learning-rate decay (scheduler.step() is called once per epoch).
optimizer = optim.AdamW(params=model.parameters(), lr=lr)
scheduler = StepLR(optimizer=optimizer, step_size=lr_step_size, gamma=lr_decay)

# Early-stopping bookkeeping.
patience_counter = 0
best_val_loss = float("inf")

In [None]:
# Train for up to num_epochs, validating after every epoch, checkpointing the
# best model by validation loss and stopping early when it stops improving.
for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    for (input_ids, token_type_ids, attention_mask), label in tqdm(train_loader):
        # Every tensor of the batch must be on `device` before the forward pass.
        input_ids = input_ids.to(device)
        token_type_ids = token_type_ids.to(device)
        attention_mask = attention_mask.to(device)
        label = label.to(device)

        optimizer.zero_grad()

        outputs = model(input_ids, token_type_ids, attention_mask)
        # `label` is a one-hot float matrix, which the loss treats as class probabilities.
        loss = criterion(outputs, label)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        # Compare predicted and true class indices (argmax over the class axis).
        _, predicted = torch.max(outputs, 1)
        _, actual = torch.max(label, 1)
        correct += (predicted == actual).sum().item()
        total += label.size(0)

    # Loss is averaged over batches, accuracy over samples.
    train_loss = running_loss / len(train_loader)
    train_accuracy = correct / total

    # ---- validation pass ----
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    y_true = []
    y_pred = []
    with torch.no_grad():
        for (input_ids, token_type_ids, attention_mask), label in tqdm(test_loader):
            input_ids = input_ids.to(device)
            token_type_ids = token_type_ids.to(device)
            attention_mask = attention_mask.to(device)
            label = label.to(device)
            outputs = model(input_ids, token_type_ids, attention_mask)
            loss = criterion(outputs, label)

            val_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            _, actual = torch.max(label, 1)
            correct += (predicted == actual).sum().item()
            # Collected for the (currently disabled) classification report below.
            y_true.extend(actual.cpu().numpy())
            y_pred.extend(predicted.cpu().numpy())
            total += label.size(0)

    val_loss /= len(test_loader)
    val_accuracy = correct / total

    print(f'Epoch {epoch+1}/{num_epochs} - '
          f'Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f} - '
          f'Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}')
    # print("\nClassification Report:")
    # print(classification_report(y_true, y_pred))

    # ---- checkpointing / early stopping on validation loss ----
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
        # Saves the state dict of `model` as it is wrapped at this point.
        torch.save(model.state_dict(), 'best_model.pth')
    else:
        patience_counter += 1
        if patience_counter >= early_stopping_patience:
            print("Early stopping triggered")
            break
    scheduler.step()

 18%|█▊        | 44/251 [00:30<02:16,  1.52it/s]