In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    BertTokenizerFast, BertForTokenClassification,
    get_linear_schedule_with_warmup
)
from torch.optim import AdamW
from sklearn.metrics import precision_recall_fscore_support, classification_report
from tqdm.auto import tqdm
import os
import ast


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))
else:
    print("Using CPU")

def log(msg, path="../outputs/logs/train_token_log.txt"):
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "a") as f:
        f.write(msg + "\n")


🔧 CUDA available: True
🚀 GPU: NVIDIA GeForce RTX 3060 Ti


In [3]:
# Load Toxic Spans (character-level) dataset from CSV
df = pd.read_csv("../data/toxic_spans.csv")
df["position"] = df["position"].apply(ast.literal_eval)
# Keep only text and span columns; rename text column to 'text' for convenience.
df = df[["text_of_post", "position"]].rename(columns={"text_of_post": "text"})

# Drop rows with missing values; reset index for clean iteration.
df = df.dropna().reset_index(drop=True)
print("Loaded dataset:", df.shape)

# Hugging Face fast tokenizer to get offset mappings for character-token alignment.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")


✅ Loaded dataset: (11006, 2)




In [4]:
def encode_and_align_labels(texts, spans, tokenizer, max_len=128):
    tokenized = tokenizer(
        texts,
        return_offsets_mapping=True,
        padding="max_length",
        truncation=True,
        max_length=max_len,
        return_tensors="pt",
    )
    labels = []
    for i, offsets in enumerate(tokenized["offset_mapping"]):
        span = set(spans[i])    # gold characters for this sample
        label = []
        for start, end in offsets:
            if start == end:
                label.append(-100)
            else:
                token_span = set(range(start, end))
                label.append(1 if token_span & span else 0)
        labels.append(label)
    tokenized["labels"] = torch.tensor(labels)
    return tokenized


In [5]:
class ToxicSpanDataset(Dataset):
    def __init__(self, df, tokenizer, max_len=128):
        self.texts = df["text"].tolist()
        self.spans = df["position"].tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoded = encode_and_align_labels(
            [self.texts[idx]], [self.spans[idx]], self.tokenizer, self.max_len
        )
        return {
            "input_ids": encoded["input_ids"].squeeze(0),
            "attention_mask": encoded["attention_mask"].squeeze(0),
            "labels": encoded["labels"].squeeze(0),
        }


In [6]:
# Train/val/test split
train_df = df.sample(frac=0.8, random_state=42)
temp_df = df.drop(train_df.index)
val_df = temp_df.sample(frac=0.5, random_state=42)
test_df = temp_df.drop(val_df.index)

# Build datasets and dataloaders.
train_dataset = ToxicSpanDataset(train_df, tokenizer)
val_dataset = ToxicSpanDataset(val_df, tokenizer)
test_dataset = ToxicSpanDataset(test_df, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)
test_loader = DataLoader(test_dataset, batch_size=8)


In [7]:
# Token classification head on top of BERT base cased
model = BertForTokenClassification.from_pretrained("bert-base-cased", num_labels=2)
model.to(device)

# Optimizer/scheduler config
optimizer = AdamW(model.parameters(), lr=2e-5)
num_epochs = 3
num_training_steps = num_epochs * len(train_loader)

lr_scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * num_training_steps),
    num_training_steps=num_training_steps
)

# Mixed precision training utilities
from torch.cuda.amp import autocast, GradScaler
scaler = GradScaler()
best_f1 = 0.0


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = GradScaler()


In [9]:
# Training loop with token-level metrics computed for the train split
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    preds_all, labels_all = [], []

    for batch in tqdm(train_loader, desc=f"🔁 Epoch {epoch + 1}"):
        batch = {k: v.to(device) for k, v in batch.items()}
        with autocast():
            outputs = model(**batch)
            loss = outputs.loss

        # Standard AMP + gradient clipping + scheduler step
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        lr_scheduler.step()
        scaler.update()
        optimizer.zero_grad()

        total_loss += loss.item()

        # Collect token-level predictions/labels for train metrics
        logits = outputs.logits.argmax(dim=-1).detach().cpu().numpy()
        labels = batch["labels"].detach().cpu().numpy()
        mask = labels != -100
        for p, l, m in zip(logits, labels, mask):
            preds_all.extend(p[m])
            labels_all.extend(l[m])

    avg_loss = total_loss / len(train_loader)
    prec, rec, f1, _ = precision_recall_fscore_support(labels_all, preds_all, average="binary", zero_division=0)

    msg = (
        f"Epoch {epoch + 1} - Train Loss: {avg_loss:.4f} | "
        f"Precision: {prec:.4f} | Recall: {rec:.4f} | F1: {f1:.4f}"
    )
    print(msg)
    log(msg)

    # Save per-epoch checkpoint + track best by train F1
    os.makedirs("../outputs/model", exist_ok=True)
    ckpt_path = f"../outputs/model/token_classifier_epoch{epoch + 1}.pt"
    torch.save(model.state_dict(), ckpt_path)
    log(f"Saved: {ckpt_path}")

    if f1 > best_f1:
        best_f1 = f1
        torch.save(model.state_dict(), "../outputs/model/bert_token_classifier.pt")
        log(f"Best model updated at epoch {epoch + 1} with F1: {f1:.4f}")


🔁 Epoch 1:   0%|          | 0/1101 [00:00<?, ?it/s]

  with autocast():


📉 Epoch 1 - Train Loss: 0.0618 | Precision: 0.6338 | Recall: 0.3619 | F1: 0.4607


🔁 Epoch 2:   0%|          | 0/1101 [00:00<?, ?it/s]

  with autocast():


📉 Epoch 2 - Train Loss: 0.0429 | Precision: 0.6724 | Recall: 0.5457 | F1: 0.6025


🔁 Epoch 3:   0%|          | 0/1101 [00:00<?, ?it/s]

  with autocast():


📉 Epoch 3 - Train Loss: 0.0316 | Precision: 0.7215 | Recall: 0.6924 | F1: 0.7067


In [10]:
# Test-time evaluation (token-level)
model.eval()
test_preds, test_labels = [], []
with torch.no_grad():
    for batch in test_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        logits = outputs.logits.argmax(dim=-1).cpu().numpy()
        labels = batch["labels"].cpu().numpy()
        mask = labels != -100
        for p, l, m in zip(logits, labels, mask):
            test_preds.extend(p[m])
            test_labels.extend(l[m])

# Compute token-level precision/recall/F1 for the positive class (label=1) over all tokens.
prec, rec, f1, _ = precision_recall_fscore_support(test_labels, test_preds, average="binary", zero_division=0)

test_msg = (
    "\nTEST SET RESULTS\n"
    f"Precision: {prec:.4f}\n"
    f"Recall:    {rec:.4f}\n"
    f"F1 Score:  {f1:.4f}\n\n"
    "Classification Report:\n"
    f"{classification_report(test_labels, test_preds, digits=4)}"
)

print(test_msg)
log(test_msg)



🎯 TEST SET RESULTS
📌 Precision: 0.5560
📌 Recall:    0.6014
📌 F1 Score:  0.5778

📋 Classification Report:
              precision    recall  f1-score   support

           0     0.9896    0.9875    0.9886     48085
           1     0.5560    0.6014    0.5778      1247

    accuracy                         0.9778     49332
   macro avg     0.7728    0.7945    0.7832     49332
weighted avg     0.9787    0.9778    0.9782     49332

