In [2]:
!pip install --upgrade pip setuptools wheel
!pip install transformers==4.28.0 torch==2.0.0 gradio==3.32.0 pandas scikit-learn



Collecting pip
  Downloading pip-25.1.1-py3-none-any.whl.metadata (3.6 kB)
Collecting setuptools
  Downloading setuptools-80.3.1-py3-none-any.whl.metadata (6.5 kB)
Downloading pip-25.1.1-py3-none-any.whl (1.8 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m1.8/1.8 MB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading setuptools-80.3.1-py3-none-any.whl (1.2 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m1.2/1.2 MB[0m [31m51.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: setuptools, pip
  Attempting uninstall: setuptools
    Found existing installation: setuptools 75.2.0
    Uninstalling setuptools-75.2.0:
      Successfully uninstalled setuptools-75.2.0
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      

Collecting transformers==4.28.0
  Using cached transformers-4.28.0-py3-none-any.whl.metadata (109 kB)
Collecting torch==2.0.0
  Using cached torch-2.0.0-cp311-cp311-manylinux1_x86_64.whl.metadata (24 kB)
Collecting gradio==3.32.0
  Using cached gradio-3.32.0-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.28.0)
  Downloading tokenizers-0.13.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting nvidia-cuda-nvrtc-cu11==11.7.99 (from torch==2.0.0)
  Downloading nvidia_cuda_nvrtc_cu11-11.7.99-2-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu11==11.7.99 (from torch==2.0.0)
  Downloading nvidia_cuda_runtime_cu11-11.7.99-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cuda-cupti-cu11==11.7.101 (from torch==2.0.0)
  Downloading nvidia_cuda_cupti_cu11-11.7.101-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu11==8.5.0.96 (from t

In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report

import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DistilBertTokenizerFast, DistilBertForSequenceClassification


In [2]:
# Load the toxic comments dataset
df = pd.read_csv('train.csv')  # This CSV contains the comments and their toxic labels
print("Total samples:", len(df))
df.head(3)


Total samples: 159571


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0


In [3]:
# Define the label columns and number of labels
LABEL_COLS = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
NUM_LABELS = len(LABEL_COLS)

# Split into train/val/test
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)   # 20% temp for val+test
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)  # split temp into half for val and test

print(f"Train size: {len(train_df)}, Val size: {len(val_df)}, Test size: {len(test_df)}")


Train size: 127656, Val size: 15957, Test size: 15958


In [4]:
# Initialize tokenizers for DistilBERT and DeBERTa
tokenizer_distil = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
tokenizer_deberta = AutoTokenizer.from_pretrained('microsoft/deberta-v3-base')  # uses DeBERTa-V3 tokenizer

MAX_LENGTH = 128

# Tokenize the datasets for each model
def tokenize_batch(text_list, tokenizer):
    return tokenizer(text_list, padding='max_length', truncation=True, max_length=MAX_LENGTH, return_tensors='pt')

# Prepare tokenized datasets
train_encodings_distil = tokenize_batch(list(train_df['comment_text']), tokenizer_distil)
val_encodings_distil   = tokenize_batch(list(val_df['comment_text']), tokenizer_distil)
test_encodings_distil  = tokenize_batch(list(test_df['comment_text']), tokenizer_distil)

train_encodings_deberta = tokenize_batch(list(train_df['comment_text']), tokenizer_deberta)
val_encodings_deberta   = tokenize_batch(list(val_df['comment_text']), tokenizer_deberta)
test_encodings_deberta  = tokenize_batch(list(test_df['comment_text']), tokenizer_deberta)

# Extract labels as tensors
train_labels = torch.tensor(train_df[LABEL_COLS].values, dtype=torch.float32)
val_labels   = torch.tensor(val_df[LABEL_COLS].values, dtype=torch.float32)
test_labels  = torch.tensor(test_df[LABEL_COLS].values, dtype=torch.float32)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



In [5]:
from torch.utils.data import Dataset, DataLoader

class ToxicCommentDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __len__(self):
        return self.labels.shape[0]
    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

# Create dataset instances
train_dataset_distil = ToxicCommentDataset(train_encodings_distil, train_labels)
val_dataset_distil   = ToxicCommentDataset(val_encodings_distil, val_labels)
test_dataset_distil  = ToxicCommentDataset(test_encodings_distil, test_labels)

train_dataset_deberta = ToxicCommentDataset(train_encodings_deberta, train_labels)
val_dataset_deberta   = ToxicCommentDataset(val_encodings_deberta, val_labels)
test_dataset_deberta  = ToxicCommentDataset(test_encodings_deberta, test_labels)

# DataLoaders for training
batch_size = 16  # using small batch for demonstration; can increase on A100 for speed
train_loader_distil = DataLoader(train_dataset_distil, batch_size=batch_size, shuffle=True)
val_loader_distil   = DataLoader(val_dataset_distil, batch_size=batch_size, shuffle=False)

train_loader_deberta = DataLoader(train_dataset_deberta, batch_size=batch_size, shuffle=True)
val_loader_deberta   = DataLoader(val_dataset_deberta, batch_size=batch_size, shuffle=False)


In [6]:
# Load DistilBERT model for sequence classification
model_distil = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=NUM_LABELS,
    problem_type="multi_label_classification"
)
model_distil = model_distil.cuda()  # move to GPU if available

# Prepare optimizer
optim_distil = torch.optim.AdamW(model_distil.parameters(), lr=2e-5)


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
num_epochs = 2
model_distil.train()
for epoch in range(num_epochs):
    total_loss = 0
    for batch in train_loader_distil:
        # Move batch to GPU
        input_ids = batch['input_ids'].cuda()
        attention_mask = batch['attention_mask'].cuda()
        labels = batch['labels'].cuda()
        # Forward pass
        outputs = model_distil(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        # Backpropagation
        optim_distil.zero_grad()
        loss.backward()
        optim_distil.step()
        total_loss += loss.item()
    avg_loss = total_loss / len(train_loader_distil)
    print(f"Epoch {epoch+1}/{num_epochs} - Training loss: {avg_loss:.4f}")


Epoch 1/2 - Training loss: 0.0479
Epoch 2/2 - Training loss: 0.0342


In [8]:
# Evaluate on validation set to get raw logits/probabilities
model_distil.eval()
val_logits = []
val_true = []
with torch.no_grad():
    for batch in val_loader_distil:
        input_ids = batch['input_ids'].cuda()
        attention_mask = batch['attention_mask'].cuda()
        labels = batch['labels'].numpy()
        outputs = model_distil(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits.cpu().numpy()
        val_logits.append(logits)
        val_true.append(labels)
val_logits = np.concatenate(val_logits, axis=0)
val_true = np.concatenate(val_true, axis=0)

# Compute sigmoid probabilities
val_probs = 1 / (1 + np.exp(-val_logits))

# Find best threshold for each label
best_thresholds = []
for i in range(NUM_LABELS):
    y_true = val_true[:, i]
    y_prob = val_probs[:, i]
    best_thr = 0.5
    best_f1 = 0.0
    # Try thresholds between 0 and 1
    for thr in np.linspace(0.0, 1.0, 101):
        y_pred = (y_prob >= thr).astype(int)
        score = f1_score(y_true, y_pred, zero_division=0)
        if score > best_f1:
            best_f1 = score
            best_thr = thr
    best_thresholds.append(best_thr)
    print(f"Label {LABEL_COLS[i]:<12}: best threshold = {best_thr:.2f}, F1 = {best_f1:.3f}")


Label toxic       : best threshold = 0.47, F1 = 0.834
Label severe_toxic: best threshold = 0.19, F1 = 0.515
Label obscene     : best threshold = 0.40, F1 = 0.850
Label threat      : best threshold = 0.30, F1 = 0.610
Label insult      : best threshold = 0.36, F1 = 0.772
Label identity_hate: best threshold = 0.32, F1 = 0.602


In [9]:
# Evaluate on test set using the tuned thresholds
test_logits = []
test_true = []
with torch.no_grad():
    for batch in test_dataset_distil:
        input_ids = batch['input_ids'].unsqueeze(0).cuda()
        attention_mask = batch['attention_mask'].unsqueeze(0).cuda()
        labels = batch['labels'].numpy()
        outputs = model_distil(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits.cpu().numpy()
        test_logits.append(logits[0])
        test_true.append(labels)
test_probs = 1 / (1 + np.exp(-np.array(test_logits)))
test_preds = (test_probs >= np.array(best_thresholds)).astype(int)
print("\nClassification Report for Model I (DistilBERT):\n")
print(classification_report(np.array(test_true), test_preds, target_names=LABEL_COLS, zero_division=0))



Classification Report for Model I (DistilBERT):

               precision    recall  f1-score   support

        toxic       0.87      0.79      0.83      1520
 severe_toxic       0.39      0.72      0.50       162
      obscene       0.82      0.86      0.84       856
       threat       0.47      0.62      0.53        37
       insult       0.71      0.86      0.78       808
identity_hate       0.64      0.40      0.49       138

    micro avg       0.77      0.80      0.78      3521
    macro avg       0.65      0.71      0.66      3521
 weighted avg       0.79      0.80      0.79      3521
  samples avg       0.06      0.07      0.07      3521



In [11]:
def explain_attention_distil(text: str, top_k: int = 10) -> str:
    # Tokenize the input text for DistilBERT
    enc = tokenizer_distil(text, padding='max_length', truncation=True, max_length=MAX_LENGTH, return_tensors='pt')
    input_ids = enc['input_ids'].cuda()
    attention_mask = enc['attention_mask'].cuda()
    # Forward pass with attention outputs
    outputs = model_distil(input_ids=input_ids, attention_mask=attention_mask, output_attentions=True)
    attentions = outputs.attentions  # tuple of (num_layers, batch, num_heads, seq_len, seq_len)
    # Convert tuple to tensor for easier manipulation
    attn_tensor = torch.stack(attentions, dim=0)  # shape: (num_layers, 1, num_heads, S, S)
    # Average over heads and layers (collapse batch of size 1 as well)
    attn_mean_heads = attn_tensor.mean(dim=2)       # average over heads -> (num_layers, 1, S, S)
    attn_mean_layers = attn_mean_heads.mean(dim=0)  # average over layers -> (1, S, S)
    avg_attn = attn_mean_layers[0]                  # shape: (S, S)
    # Get attention from CLS token (position 0) to all tokens
    cls_attn = avg_attn[0]  # (S,)
    cls_attn = cls_attn.detach().cpu().numpy()

    # Convert input_ids to tokens
    tokens = tokenizer_distil.convert_ids_to_tokens(enc['input_ids'][0])
    # Pair tokens with their attention weight, filter out special tokens
    pairs = [(tok, float(cls_attn[i]))
             for i, tok in enumerate(tokens) if tok not in ("[PAD]", "[CLS]", "[SEP]")]
    # Sort by attention weight and take top_k
    topk = sorted(pairs, key=lambda x: x[1], reverse=True)[:top_k]
    # Format the result as lines "token: weight"
    lines = [f"{tok:<15} {weight:.4f}" for tok, weight in topk]
    return "Top Attention Tokens (Model I):\n" + "\n".join(lines)

# Example explanation (to verify the function works, using a sample from val set)
example_text = val_df['comment_text'].iloc[0]
print("Comment:", example_text)
print(explain_attention_distil(example_text, top_k=5))


Comment: yo  

you could at least reply to my messages instead of deleting them.
Top Attention Tokens (Model I):
yo              0.1091
.               0.0540
##eti           0.0272
them            0.0233
my              0.0229


In [12]:
# Load DeBERTa model for sequence classification (Model II)
model_deberta_focal = AutoModelForSequenceClassification.from_pretrained(
    'microsoft/deberta-v3-base',
    num_labels=NUM_LABELS,
    problem_type="multi_label_classification"
)
model_deberta_focal = model_deberta_focal.cuda()

# Define Focal Loss with label smoothing
class FocalLoss(nn.Module):
    def __init__(self, gamma=2.0, smoothing=0.1):
        super(FocalLoss, self).__init__()
        self.gamma = gamma
        self.smoothing = smoothing
    def forward(self, logits, targets):
        # Apply label smoothing: y_ls = y*(1-alpha) + alpha/2 for each label
        if self.smoothing > 0:
            targets = targets * (1 - self.smoothing) + 0.5 * self.smoothing
        # Compute binary cross entropy with logits for each label (no reduction)
        bce = nn.functional.binary_cross_entropy_with_logits(logits, targets, reduction='none')
        # Convert to probability space for focal scaling factor pt
        # p_t = exp(-bce) as given by the focal loss formula
        pt = torch.exp(-bce)
        # Compute focal loss scaling factor
        focal_factor = (1 - pt) ** self.gamma
        # Apply focal factor to BCE loss
        loss = focal_factor * bce
        # Average loss over all samples and labels
        return loss.mean()

criterion_focal = FocalLoss(gamma=2.0, smoothing=0.1)
optim_deberta_focal = torch.optim.AdamW(model_deberta_focal.parameters(), lr=2e-5)


pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
num_epochs = 2
model_deberta_focal.train()
for epoch in range(num_epochs):
    total_loss = 0.0
    for batch in train_loader_deberta:
        input_ids = batch['input_ids'].cuda()
        attention_mask = batch['attention_mask'].cuda()
        labels = batch['labels'].cuda()
        # Forward pass (get logits)
        outputs = model_deberta_focal(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        loss = criterion_focal(logits, labels)
        optim_deberta_focal.zero_grad()
        loss.backward()
        optim_deberta_focal.step()
        total_loss += loss.item()
    avg_loss = total_loss / len(train_loader_deberta)
    print(f"Epoch {epoch+1} - Model II Focal Loss: {avg_loss:.4f}")


Epoch 1 - Model II Focal Loss: 0.0193
Epoch 2 - Model II Focal Loss: 0.0162


In [16]:
# Evaluate Model II on validation set for threshold tuning
from torch.utils.data import DataLoader

test_loader_deberta = DataLoader(test_dataset_deberta, batch_size=16)

model_deberta_focal.eval()
val_logits = []
val_true = []
with torch.no_grad():
    for batch in val_loader_deberta:
        input_ids = batch['input_ids'].cuda()
        attention_mask = batch['attention_mask'].cuda()
        labels = batch['labels'].numpy()
        outputs = model_deberta_focal(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits.cpu().numpy()
        val_logits.append(logits)
        val_true.append(labels)
val_logits = np.concatenate(val_logits, axis=0)
val_true = np.concatenate(val_true, axis=0)
val_probs = 1 / (1 + np.exp(-val_logits))

best_thresholds_model2 = []
for i in range(NUM_LABELS):
    y_true = val_true[:, i]
    y_prob = val_probs[:, i]
    best_thr = 0.5
    best_f1 = 0.0
    for thr in np.linspace(0, 1, 101):
        y_pred = (y_prob >= thr).astype(int)
        score = f1_score(y_true, y_pred, zero_division=0)
        if score > best_f1:
            best_f1 = score
            best_thr = thr
    best_thresholds_model2.append(best_thr)
    print(f"Label {LABEL_COLS[i]:<12}: best threshold = {best_thr:.2f}, F1 = {best_f1:.3f}")


Label toxic       : best threshold = 0.52, F1 = 0.837
Label severe_toxic: best threshold = 0.43, F1 = 0.491
Label obscene     : best threshold = 0.49, F1 = 0.838
Label threat      : best threshold = 0.58, F1 = 0.526
Label insult      : best threshold = 0.46, F1 = 0.756
Label identity_hate: best threshold = 0.42, F1 = 0.577


In [17]:
# Classification report for Model II
test_logits = []
test_true = []
with torch.no_grad():
    for batch in test_loader_deberta:
        input_ids = batch['input_ids'].cuda()
        attention_mask = batch['attention_mask'].cuda()
        labels = batch['labels'].numpy()
        outputs = model_deberta_focal(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits.cpu().numpy()
        test_logits.append(logits)
        test_true.append(labels)
test_logits = np.concatenate(test_logits, axis=0)
test_true = np.concatenate(test_true, axis=0)
test_probs = 1 / (1 + np.exp(-test_logits))
test_preds = (test_probs >= np.array(best_thresholds_model2)).astype(int)
print("\nClassification Report for Model II (DeBERTa + Focal Loss):\n")
print(classification_report(test_true, test_preds, target_names=LABEL_COLS, zero_division=0))



Classification Report for Model II (DeBERTa + Focal Loss):

               precision    recall  f1-score   support

        toxic       0.84      0.83      0.83      1520
 severe_toxic       0.38      0.78      0.51       162
      obscene       0.84      0.81      0.83       856
       threat       0.39      0.57      0.46        37
       insult       0.78      0.77      0.78       808
identity_hate       0.54      0.54      0.54       138

    micro avg       0.77      0.79      0.78      3521
    macro avg       0.63      0.72      0.66      3521
 weighted avg       0.79      0.79      0.79      3521
  samples avg       0.07      0.07      0.07      3521



In [None]:
# Load DeBERTa model for adversarial training (Model III)
model_deberta_adv = AutoModelForSequenceClassification.from_pretrained(
    'microsoft/deberta-v3-base',
    num_labels=NUM_LABELS,
    problem_type="multi_label_classification"
)
model_deberta_adv = model_deberta_adv.cuda()
optim_deberta_adv = torch.optim.AdamW(model_deberta_adv.parameters(), lr=2e-5)

# Training with FGSM adversarial augmentation
epsilon = 0.1  # perturbation magnitude
num_epochs = 2
model_deberta_adv.train()
for epoch in range(num_epochs):
    total_loss = 0.0
    for batch in train_loader_deberta:
        input_ids = batch['input_ids'].cuda()
        attention_mask = batch['attention_mask'].cuda()
        labels = batch['labels'].cuda()
        # Step 1: Forward pass on clean inputs
        # We will manually get embeddings to apply FGSM
        # Get embedding output for input ids
        embeddings = model_deberta_adv.base_model.embeddings.word_embeddings(input_ids)
        embeddings.retain_grad()  # retain grad on embeddings for FGSM
        outputs_clean = model_deberta_adv(inputs_embeds=embeddings, attention_mask=attention_mask)
        logits_clean = outputs_clean.logits
        loss_clean = nn.functional.binary_cross_entropy_with_logits(logits_clean, labels)
        # Step 2: Backpropagate to get gradients w.rt embeddings
        optim_deberta_adv.zero_grad()
        loss_clean.backward(retain_graph=True)  # compute grad, keep graph for second pass
        # Step 3: FGSM perturbation on embeddings
        grad = embeddings.grad.detach()                 # gradient of loss wrt embeddings
        perturbation = epsilon * torch.sign(grad)       # compute perturbation
        embeddings_adv = embeddings + perturbation      # adversarial embeddings
        # Step 4: Forward pass with adversarial embeddings
        outputs_adv = model_deberta_adv(inputs_embeds=embeddings_adv.detach(), attention_mask=attention_mask)
        logits_adv = outputs_adv.logits
        loss_adv = nn.functional.binary_cross_entropy_with_logits(logits_adv, labels)
        # Step 5: Combine losses (we average them to balance importance)
        total_batch_loss = 0.5 * loss_clean + 0.5 * loss_adv
        # Step 6: Backpropagate combined loss and update weights
        optim_deberta_adv.zero_grad()  # clear gradients (note: also cleared embeddings.grad)
        total_batch_loss.backward()
        optim_deberta_adv.step()
        total_loss += total_batch_loss.item()
    avg_loss = total_loss / len(train_loader_deberta)
    print(f"Epoch {epoch+1} - Model III Adv Training Loss: {avg_loss:.4f}")


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Evaluate Model III on validation set for threshold tuning
model_deberta_adv.eval()
val_logits = []
val_true = []
with torch.no_grad():
    for batch in val_loader_deberta:
        input_ids = batch['input_ids'].cuda()
        attention_mask = batch['attention_mask'].cuda()
        labels = batch['labels'].numpy()
        outputs = model_deberta_adv(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits.cpu().numpy()
        val_logits.append(logits)
        val_true.append(labels)
val_logits = np.concatenate(val_logits, axis=0)
val_true = np.concatenate(val_true, axis=0)
val_probs = 1 / (1 + np.exp(-val_logits))

best_thresholds_model3 = []
for i in range(NUM_LABELS):
    y_true = val_true[:, i]
    y_prob = val_probs[:, i]
    best_thr = 0.5
    best_f1 = 0.0
    for thr in np.linspace(0, 1, 101):
        y_pred = (y_prob >= thr).astype(int)
        score = f1_score(y_true, y_pred, zero_division=0)
        if score > best_f1:
            best_f1 = score
            best_thr = thr
    best_thresholds_model3.append(best_thr)
    print(f"Label {LABEL_COLS[i]:<12}: best threshold = {best_thr:.2f}, F1 = {best_f1:.3f}")


In [None]:
# Classification report for Model III
test_logits = []
test_true = []
with torch.no_grad():
    for batch in test_loader_deberta:
        input_ids = batch['input_ids'].cuda()
        attention_mask = batch['attention_mask'].cuda()
        labels = batch['labels'].numpy()
        outputs = model_deberta_adv(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits.cpu().numpy()
        test_logits.append(logits)
        test_true.append(labels)
test_logits = np.concatenate(test_logits, axis=0)
test_true = np.concatenate(test_true, axis=0)
test_probs = 1 / (1 + np.exp(-test_logits))
test_preds = (test_probs >= np.array(best_thresholds_model3)).astype(int)
print("\nClassification Report for Model III (DeBERTa + Adv Training):\n")
print(classification_report(test_true, test_preds, target_names=LABEL_COLS, zero_division=0))


In [None]:
import gradio as gr

# Prepare label names in a nice format
LABEL_DISPLAY = ["Toxic", "Severe Toxic", "Obscene", "Threat", "Insult", "Identity Hate"]

def predict_toxicity(comment, model_choice):
    comment = comment.strip()
    if len(comment) == 0:
        return "Please enter a comment.", ""
    model_choice = model_choice.lower()
    # Choose model and tokenizer based on selection
    if model_choice == "distilbert (model i)":
        tokenizer = tokenizer_distil
        model = model_distil
        thresholds = best_thresholds
    elif model_choice == "deberta + focal loss (model ii)":
        tokenizer = tokenizer_deberta
        model = model_deberta_focal
        thresholds = best_thresholds_model2
    elif model_choice == "deberta + adv training (model iii)":
        tokenizer = tokenizer_deberta
        model = model_deberta_adv
        thresholds = best_thresholds_model3
    else:
        return "Invalid model choice.", ""
    # Tokenize input
    enc = tokenizer(comment, padding='max_length', truncation=True, max_length=MAX_LENGTH, return_tensors='pt')
    input_ids = enc['input_ids'].cuda()
    attention_mask = enc['attention_mask'].cuda()
    # Get model predictions
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, output_attentions=(model_choice.startswith("distilbert")))
        logits = outputs.logits
        probs = torch.sigmoid(logits)[0].cpu().numpy()
    # Determine yes/no using thresholds
    pred_lines = []
    for prob, thr, label in zip(probs, thresholds, LABEL_DISPLAY):
        flag = "YES" if prob >= thr else "no"
        pred_lines.append(f"{label}: {flag:<3} ({prob:.2f})")
    prediction_text = "\n".join(pred_lines)
    # Prepare explanation if DistilBERT
    explanation_text = ""
    if model_choice.startswith("distilbert"):
        # Use the explain_attention_distil function
        explanation_text = explain_attention_distil(comment, top_k=10)
    else:
        explanation_text = "Attention-based explanation is only available for Model I (DistilBERT)."
    return prediction_text, explanation_text

# Set up Gradio interface components
model_options = ["DistilBERT (Model I)", "DeBERTa + Focal Loss (Model II)", "DeBERTa + Adv Training (Model III)"]
with gr.Blocks() as demo:
    gr.Markdown("## Toxic Comment Detector üëæ\nSelect a model and enter a comment to analyze its toxicity. Model I (DistilBERT) also provides an attention-based explanation for its prediction.")
    model_choice = gr.Dropdown(choices=model_options, value=model_options[0], label="Choose Model")
    comment_input = gr.Textbox(lines=4, placeholder="Enter a comment here...", label="Comment")
    predict_button = gr.Button("Predict")
    output_prediction = gr.Textbox(label="Toxicity Predictions")
    output_explanation = gr.Textbox(label="Top Attention Tokens (Model I)", lines=10)
    # Set up the event
    predict_button.click(fn=predict_toxicity, inputs=[comment_input, model_choice], outputs=[output_prediction, output_explanation])

demo.launch(share=False)
