In [2]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    BertTokenizerFast, BertForSequenceClassification,
    AdamW, get_linear_schedule_with_warmup
)
from sklearn.metrics import accuracy_score, f1_score, classification_report, roc_auc_score
from tqdm.auto import tqdm
import os

# Run on the GPU when PyTorch reports one, otherwise fall back to the CPU
cuda_ok = torch.cuda.is_available()
device = torch.device("cuda") if cuda_ok else torch.device("cpu")
print("CUDA available:", cuda_ok)
if not cuda_ok:
    print("Using CPU")
else:
    print("GPU:", torch.cuda.get_device_name(0))


🔧 CUDA available: True
🚀 GPU: NVIDIA GeForce RTX 3060 Ti


In [4]:
# Load the pre-split CSVs created in Notebook 02 (80/10/10 split with stratification)
def _load_split(csv_path):
    """Read one split CSV and keep only the rows that carry usable text.

    Rows whose ``text`` is missing are dropped *before* the string cast:
    ``astype(str)`` turns NaN into the literal string "nan", which is not
    empty, survives the whitespace filter below and would end up being
    tokenized as if it were a real comment.

    Args:
        csv_path: Path of a CSV file that has a ``text`` column.

    Returns:
        DataFrame with string-typed, non-blank ``text`` values.
    """
    split = pd.read_csv(csv_path).dropna(subset=["text"])
    # Ensure all texts are strings to avoid tokenizer ValueError
    split = split.assign(text=split["text"].astype(str))
    # Remove empty or whitespace-only rows
    return split[split["text"].str.strip().astype(bool)]


train_df = _load_split("../outputs/civil_comments/train.csv")
val_df = _load_split("../outputs/civil_comments/val.csv")
test_df = _load_split("../outputs/civil_comments/test.csv")

print("Train:", train_df.shape, "Val:", val_df.shape, "Test:", test_df.shape)


Train: (1443899, 2) Val: (180487, 2) Test: (180488, 2)


In [5]:
# Load BERT tokenizer (cased version) for consistent tokenization.
# The checkpoint name is the same "bert-base-cased" string the classifier is
# initialized from later in this notebook; keep the two in sync if either changes.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")

class ToxicDataset(Dataset):
    """Map-style dataset that tokenizes one text per lookup.

    Args:
        df: Frame providing a ``text`` column and a ``label`` column.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=...,
            padding=..., max_length=..., return_tensors="pt")`` whose result
            is indexable by ``"input_ids"`` and ``"attention_mask"``.
        max_len: Length every sequence is truncated / padded to.
    """

    def __init__(self, df, tokenizer, max_len=128):
        self.tokenizer = tokenizer
        self.max_len = max_len
        # Plain Python lists so __getitem__ does simple positional lookups
        self.texts = df["text"].tolist()
        self.labels = df["label"].tolist()

    def __len__(self):
        """Number of examples (one per text)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``label`` for example ``idx``."""
        text, target = self.texts[idx], self.labels[idx]
        encoded = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt",
        )
        # The tokenizer is called on a single text; squeeze away the leading
        # dimension so the DataLoader can stack examples itself.
        sample = {
            key: encoded[key].squeeze(0)
            for key in ("input_ids", "attention_mask")
        }
        sample["label"] = torch.tensor(target, dtype=torch.long)
        return sample
# Wrap the preprocessed train and validation frames in ToxicDataset (default max_len)
train_dataset, val_dataset = (
    ToxicDataset(split_df, tokenizer) for split_df in (train_df, val_df)
)




In [6]:
# Initialize BERT sequence classifier (binary)
model = BertForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)
model.to(device)

# Only the training batches are shuffled; validation is read in file order
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

# Optimizer + LR schedule (linear warmup + decay)
# torch.optim.AdamW is used instead of the deprecated transformers.AdamW.
# eps and weight_decay are spelled out to mirror the defaults of that
# deprecated class (torch's own defaults are eps=1e-8, weight_decay=0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
num_epochs = 3
# One scheduler step is budgeted per training batch, so the horizon is in batches
num_training_steps = num_epochs * len(train_loader)

lr_scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * num_training_steps),  # ramp up over the first 10% of steps
    num_training_steps=num_training_steps
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# AMP for faster training on GPU: autocast context manager plus gradient scaler
from torch.cuda.amp import GradScaler, autocast

# Highest validation F1 seen so far; starts at zero
best_f1 = 0.0
scaler = GradScaler()

def log(msg, path="../outputs/logs/training_log.txt"):
    """Append ``msg`` followed by a newline to the log file at ``path``.

    The parent directory is created when it does not exist yet.

    Args:
        msg: Text of the log entry (a ``str``).
        path: Destination file; opened in append mode so earlier entries are kept.
    """
    log_dir = os.path.dirname(path)
    # dirname() is "" for a bare filename and os.makedirs("") raises,
    # so only create a directory when the path actually names one.
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)
    # Explicit UTF-8: non-ASCII messages must not depend on the platform's
    # default text encoding.
    with open(path, "a", encoding="utf-8") as f:
        f.write(msg + "\n")

# Training loop: one pass over train_loader per epoch, then a full validation
# pass; the weights with the best validation F1 so far are written to disk.
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        # Move batch to device and rename 'label' to 'labels' for HF model API
        batch = {k: v.to(device) for k, v in batch.items()}
        batch["labels"] = batch.pop("label")

        # torch.autocast takes the device type explicitly (the torch.cuda.amp
        # variant is deprecated); mixed precision is switched on for CUDA only,
        # so the CPU fallback runs in plain full precision.
        with torch.autocast(device_type=device.type, enabled=(device.type == "cuda")):
            outputs = model(**batch)
            loss = outputs.loss

        # Backprop with gradient scaling, clip, step, schedule
        scaler.scale(loss).backward()
        # Unscale first so the clipping threshold applies to the true gradient norm
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        scale_before = scaler.get_scale()
        scaler.step(optimizer)
        scaler.update()
        # GradScaler skips optimizer.step() when it finds inf/NaN gradients and
        # then lowers its scale. Advance the LR schedule only for steps that
        # were really applied, so it stays aligned with the optimizer.
        if scaler.get_scale() >= scale_before:
            lr_scheduler.step()
        optimizer.zero_grad()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)

    # Validation
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            batch["labels"] = batch.pop("label")

            outputs = model(**batch)
            # Predicted class = index of the larger logit
            preds = outputs.logits.argmax(dim=1).cpu().numpy()
            labels = batch["labels"].cpu().numpy()

            all_preds.extend(preds)
            all_labels.extend(labels)

    # Accuracy and F1 on validation split
    acc = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)

    msg = f"Epoch {epoch+1} - Train Loss: {avg_loss:.4f} | Val Acc: {acc:.4f} | F1: {f1:.4f}"
    print(msg)
    log(msg)

    # Save best model by validation F1
    if f1 > best_f1:
        best_f1 = f1
        os.makedirs("../outputs/model", exist_ok=True)
        torch.save(model.state_dict(), "../outputs/model/bert_toxic_classifier.pt")
        save_msg = f"Best model saved at epoch {epoch+1} with F1: {f1:.4f}"
        print(save_msg)
        log(save_msg)


  scaler = GradScaler()


Epoch 1:   0%|          | 0/90244 [00:00<?, ?it/s]

  with autocast():


Epoch 1 - Train Loss: 0.1669 | Val Acc: 0.9431 | F1: 0.6279
💾 Best model saved at epoch 1 with F1: 0.6279


Epoch 2:   0%|          | 0/90244 [00:00<?, ?it/s]

  with autocast():


Epoch 2 - Train Loss: 0.1560 | Val Acc: 0.9517 | F1: 0.6541
💾 Best model saved at epoch 2 with F1: 0.6541


Epoch 3:   0%|          | 0/90244 [00:00<?, ?it/s]

  with autocast():


Epoch 3 - Train Loss: 0.1378 | Val Acc: 0.9518 | F1: 0.6631
💾 Best model saved at epoch 3 with F1: 0.6631


In [8]:
print("Evaluating on test set...")

# Evaluate the checkpoint that was selected on validation F1 rather than
# whatever weights happen to be in memory (the last epoch, or a freshly
# initialized model if training was not run in this kernel session).
# NOTE(review): a checkpoint left over from an earlier run would be picked up
# too; confirm the file belongs to the run being reported.
best_ckpt_path = "../outputs/model/bert_toxic_classifier.pt"
if os.path.exists(best_ckpt_path):
    model.load_state_dict(torch.load(best_ckpt_path, map_location=device))
else:
    print("Best checkpoint not found; evaluating the weights currently in memory")

# Build test dataset/loader from the held-out split
test_dataset = ToxicDataset(test_df, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=16)

model.eval()

# Collect hard labels (preds), ground-truth labels, and probabilities for ROC–AUC
test_preds, test_labels, test_probs = [], [], []

with torch.no_grad():
    for batch in test_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        # Labels are removed from the batch, so the model only returns logits
        labels = batch.pop("label")
        outputs = model(**batch)
        logits = outputs.logits
        preds = logits.argmax(dim=1).cpu().numpy()
        # Softmax probability of class index 1, used as the ROC-AUC score
        probs = torch.softmax(logits, dim=1)[:, 1].cpu().numpy()

        test_preds.extend(preds)
        test_labels.extend(labels.cpu().numpy())
        test_probs.extend(probs)

# Compute metrics
test_acc = accuracy_score(test_labels, test_preds)
test_f1 = f1_score(test_labels, test_preds)
test_roc_auc = roc_auc_score(test_labels, test_probs)

test_acc_msg = f"Test Accuracy: {test_acc:.4f}"
test_f1_msg = f"Test F1 Score: {test_f1:.4f}"
test_roc_auc_msg = f"Test ROC AUC Score: {test_roc_auc:.4f}"
report_msg = classification_report(test_labels, test_preds, digits=4)

print(test_acc_msg)
print(test_f1_msg)
print(test_roc_auc_msg)
print("\nClassification Report:\n")
print(report_msg)

# Append the same numbers to the training log, followed by a separator line
log(test_acc_msg)
log(test_f1_msg)
log(test_roc_auc_msg)
log("\nClassification Report:\n" + report_msg)
log("-" * 60)

📊 Evaluating on test set...
