In [1]:
import os, sys, math, json, random, time
from dataclasses import dataclass

import numpy as np
import pandas as pd
import torch

from sklearn.model_selection import train_test_split
from sklearn.utils import resample

from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    DataCollatorWithPadding, get_linear_schedule_with_warmup
)

from peft import LoraConfig, get_peft_model

os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Project roots relative to notebooks/
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))
DATA_DIR  = os.path.abspath(os.path.join(os.getcwd(), "..", "data", "processed"))
MODEL_DIR = os.path.abspath(os.path.join(os.getcwd(), "..", "models"))
os.makedirs(MODEL_DIR, exist_ok=True)

def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

## Step 1 — Load, Balance, and Merge Datasets

We load processed_LIAR.csv and processed_FakeNews.csv, balance domain sizes, add a source column, and create an 80/10/10 split that is stratified by both label and source.

In [2]:
liar_path  = os.path.join(DATA_DIR, "processed_LIAR.csv")
news_path  = os.path.join(DATA_DIR, "processed_FakeNews.csv")

assert os.path.exists(liar_path), f"Missing: {liar_path}"
assert os.path.exists(news_path), f"Missing: {news_path}"

liar_df = pd.read_csv(liar_path)
news_df = pd.read_csv(news_path)

# Add 'source' columns
liar_df["source"] = "LIAR"
news_df["source"] = "FakeNews"

# Balance by domain (upsample smaller domain)
target = min(len(liar_df), len(news_df))
liar_bal = resample(liar_df, replace=True, n_samples=target, random_state=42)
news_bal = resample(news_df, replace=True, n_samples=target, random_state=42)

merged = pd.concat([liar_bal, news_bal], ignore_index=True)
merged = merged.sample(frac=1.0, random_state=42).reset_index(drop=True)

display(merged[["source","label"]].value_counts().sort_index())
print("Merged size:", len(merged))

# Stratified split by BOTH label and source (composite key)
merged["strat"] = merged["label"].astype(str) + "_" + merged["source"]
train_df, temp_df = train_test_split(
    merged, test_size=0.2, stratify=merged["strat"], random_state=42
)
val_df, test_df = train_test_split(
    temp_df, test_size=0.5, stratify=temp_df["strat"], random_state=42
)

for name, df in [("train", train_df), ("val", val_df), ("test", test_df)]:
    print(name, df["source"].value_counts().to_dict(), "| label", df["label"].value_counts().to_dict())

# Persist merged splits (useful for reproducibility and external evaluation)
merged_path = os.path.join(DATA_DIR, "merged_LIAR_FakeNews.csv")
train_path  = os.path.join(DATA_DIR, "merged_train.csv")
val_path    = os.path.join(DATA_DIR, "merged_val.csv")
test_path   = os.path.join(DATA_DIR, "merged_test.csv")

merged.drop(columns=["strat"], errors="ignore").to_csv(merged_path, index=False)
train_df.drop(columns=["strat"], errors="ignore").to_csv(train_path, index=False)
val_df.drop(columns=["strat"], errors="ignore").to_csv(val_path, index=False)
test_df.drop(columns=["strat"], errors="ignore").to_csv(test_path, index=False)

print("Saved:", merged_path, train_path, val_path, test_path, sep="\n - ")

source    label
FakeNews  0        6033
          1        6758
LIAR      0        7159
          1        5632
Name: count, dtype: int64

Merged size: 25582
train {'LIAR': 10233, 'FakeNews': 10232} | label {0: 10553, 1: 9912}
val {'LIAR': 1279, 'FakeNews': 1279} | label {0: 1319, 1: 1239}
test {'FakeNews': 1280, 'LIAR': 1279} | label {0: 1320, 1: 1239}
Saved:
 - /gpfs/home/ashwin/FakeNews-Detection/data/processed/merged_LIAR_FakeNews.csv
 - /gpfs/home/ashwin/FakeNews-Detection/data/processed/merged_train.csv
 - /gpfs/home/ashwin/FakeNews-Detection/data/processed/merged_val.csv
 - /gpfs/home/ashwin/FakeNews-Detection/data/processed/merged_test.csv


## Step 2 — Dataset and DataLoader

We optionally add a domain tag to each example. For SHAP/LIME later, we will also save raw text copies alongside tensors when needed, but the explainers will use a HF pipeline that expects raw strings, not tensors.

In [3]:
ADD_DOMAIN_TOKEN = True

class NewsDataset(Dataset):
    def __init__(self, df: pd.DataFrame, tokenizer, max_len=256, add_domain_token=True, return_text=False):
        self.tokenizer = tokenizer
        self.max_len   = max_len
        self.return_text = return_text
        titles = df["title"].astype(str).tolist()
        texts  = df["text"].astype(str).tolist()
        srcs   = df["source"].astype(str).tolist()
        lbls   = df["label"].astype(int).tolist()
        self.labels = lbls
        if add_domain_token:
            self.strings = [f"[{s}] {t} {x}" for s, t, x in zip(srcs, titles, texts)]
        else:
            self.strings = [f"{t} {x}" for t, x in zip(titles, texts)]

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        text = self.strings[idx]
        enc = self.tokenizer(
            text,
            truncation=True, padding="max_length", max_length=self.max_len,
            return_tensors="pt"
        )
        item = {k: v.squeeze(0) for k, v in enc.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        if self.return_text:
            item["raw_text"] = self.strings[idx]
        return item

def build_loaders(train_df, val_df, test_df, tokenizer, batch_size=16, max_len=256, add_domain_token=True):
    collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="longest")
    train_ds = NewsDataset(train_df, tokenizer, max_len, add_domain_token)
    val_ds   = NewsDataset(val_df,   tokenizer, max_len, add_domain_token)
    test_ds  = NewsDataset(test_df,  tokenizer, max_len, add_domain_token)
    train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True,  num_workers=0, collate_fn=collator)
    val_dl   = DataLoader(val_ds,   batch_size=batch_size, shuffle=False, num_workers=0, collate_fn=collator)
    test_dl  = DataLoader(test_ds,  batch_size=batch_size, shuffle=False, num_workers=0, collate_fn=collator)
    return train_dl, val_dl, test_dl

## Step 3 — Model, LoRA, and Label Maps

We load RoBERTa, attach LoRA to attention projections, and define label maps. We’ll save these maps so the HF pipeline can display human-readable labels.

In [4]:
MODEL_NAME = "roberta-base"
NUM_LABELS = 2
id2label = {0: "REAL", 1: "FAKE"}
label2id = {"REAL": 0, "FAKE": 1}

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

base_model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=NUM_LABELS, id2label=id2label, label2id=label2id
)

lora_cfg = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=["query", "value"]
)

model = get_peft_model(base_model, lora_cfg)
model.print_trainable_parameters()
model = model.to(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 589,824 || all params: 125,236,994 || trainable%: 0.4710


## Step 4 — Hyperparameters and Loaders

We define a simple config, build loaders, and create optimizer/scheduler. Mixed precision is enabled automatically later.

In [5]:
@dataclass
class TrainConfig:
    batch_size: int = 16
    max_len: int = 256
    lr: float = 5e-5
    weight_decay: float = 0.01
    epochs: int = 3
    warmup_ratio: float = 0.06
    grad_accum: int = 1
    max_grad_norm: float = 1.0
    ckpt_prefix: str = "roberta_lora_multidomain"

cfg = TrainConfig()

train_dl, val_dl, test_dl = build_loaders(
    train_df, val_df, test_df, tokenizer,
    batch_size=cfg.batch_size, max_len=cfg.max_len,
    add_domain_token=ADD_DOMAIN_TOKEN
)

# Optimizer/scheduler
optim = torch.optim.AdamW(model.parameters(), lr=cfg.lr, weight_decay=cfg.weight_decay)

num_update_steps_per_epoch = math.ceil(len(train_dl) / cfg.grad_accum)
t_total = cfg.epochs * num_update_steps_per_epoch
warmup_steps = int(cfg.warmup_ratio * t_total)
scheduler = get_linear_schedule_with_warmup(
    optimizer=optim,
    num_warmup_steps=warmup_steps,
    num_training_steps=t_total
)

t_total, warmup_steps

(3840, 230)

## Step 5 — Train/Eval Loops with Mixed Precision and Early Stopping

We train with torch.cuda.amp for speed and stability, evaluate on the val set each epoch, keep the best model by val loss, and rotate step checkpoints.

In [6]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from torch import amp
from tqdm.auto import tqdm
import time

def evaluate(model, dl):
    model.eval()
    losses, preds, labels = [], [], []
    with torch.no_grad():
        for batch in tqdm(dl, desc="Evaluating", leave=False):
            batch = {k: v.to(device) for k, v in batch.items() if k != "raw_text"}
            outputs = model(**batch)
            loss = outputs.loss
            logits = outputs.logits
            losses.append(loss.item())
            preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
            labels.extend(batch["labels"].cpu().numpy())
    loss = float(np.mean(losses))
    acc  = accuracy_score(labels, preds)
    f1   = f1_score(labels, preds, zero_division=0)
    pre  = precision_score(labels, preds, zero_division=0)
    rec  = recall_score(labels, preds, zero_division=0)
    return {"loss": loss, "acc": acc, "f1": f1, "precision": pre, "recall": rec}


def save_json(obj, path):
    with open(path, "w") as f:
        json.dump(obj, f, indent=2)


CKPT_BEST_DIR = os.path.join(MODEL_DIR, f"{cfg.ckpt_prefix}_best")
CKPT_STEP_DIR = os.path.join(MODEL_DIR, f"{cfg.ckpt_prefix}_steps")
os.makedirs(CKPT_BEST_DIR, exist_ok=True)
os.makedirs(CKPT_STEP_DIR, exist_ok=True)

scaler = amp.GradScaler('cuda' if torch.cuda.is_available() else 'cpu')

global_step = 0
best_val = float("inf")
history = []

# === Training Loop ===
for epoch in range(1, cfg.epochs + 1):
    model.train()
    running_loss = 0.0
    epoch_start = time.time()

    progress = tqdm(train_dl, desc=f"Epoch {epoch}/{cfg.epochs}", leave=True)
    for step, batch in enumerate(progress, start=1):
        batch = {k: v.to(device) for k, v in batch.items() if k != "raw_text"}

        with amp.autocast('cuda' if torch.cuda.is_available() else 'cpu'):
            outputs = model(**batch)
            loss = outputs.loss / cfg.grad_accum

        scaler.scale(loss).backward()

        if step % cfg.grad_accum == 0:
            scaler.unscale_(optim)
            torch.nn.utils.clip_grad_norm_(model.parameters(), cfg.max_grad_norm)
            scaler.step(optim)
            scaler.update()
            optim.zero_grad()
            scheduler.step()
            global_step += 1

        running_loss += loss.item()
        avg_loss = running_loss / step
        progress.set_postfix(loss=f"{avg_loss:.4f}", lr=scheduler.get_last_lr()[0])

        # Lightweight checkpoint every 1000 steps
        if global_step > 0 and global_step % 1000 == 0:
            step_path = os.path.join(CKPT_STEP_DIR, f"step_{global_step}")
            os.makedirs(step_path, exist_ok=True)
            model.save_pretrained(step_path)
            tokenizer.save_pretrained(step_path)

    # End-of-epoch evaluation
    epoch_time = (time.time() - epoch_start) / 60
    val_metrics = evaluate(model, val_dl)
    row = {"epoch": epoch,
           "train_loss": avg_loss,
           "val_loss": val_metrics["loss"],
           "val_f1": val_metrics["f1"],
           "val_acc": val_metrics["acc"],
           "epoch_time_min": round(epoch_time, 2)}
    history.append(row)

    print(f"Epoch {epoch}: train_loss={avg_loss:.4f} | val_loss={val_metrics['loss']:.4f} | "
          f"val_f1={val_metrics['f1']:.4f} | val_acc={val_metrics['acc']:.4f} | "
          f"time={epoch_time:.2f} min")

    # Save best model by validation loss
    if val_metrics["loss"] < best_val:
        best_val = val_metrics["loss"]
        model.save_pretrained(CKPT_BEST_DIR)
        tokenizer.save_pretrained(CKPT_BEST_DIR)
        save_json({"best_val_loss": best_val, "epoch": epoch},
                  os.path.join(CKPT_BEST_DIR, "metrics.json"))

# Save training history
pd.DataFrame(history).to_csv(
    os.path.join(MODEL_DIR, f"{cfg.ckpt_prefix}_history.csv"),
    index=False
)

Epoch 1/3:   0%|          | 0/1280 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/160 [00:00<?, ?it/s]

Epoch 1: train_loss=0.4209 | val_loss=0.3341 | val_f1=0.7866 | val_acc=0.8151 | time=0.84 min


Epoch 2/3:   0%|          | 0/1280 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/160 [00:00<?, ?it/s]

Epoch 2: train_loss=0.3342 | val_loss=0.3229 | val_f1=0.8004 | val_acc=0.8241 | time=0.83 min


Epoch 3/3:   0%|          | 0/1280 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/160 [00:00<?, ?it/s]

Epoch 3: train_loss=0.3279 | val_loss=0.3219 | val_f1=0.8195 | val_acc=0.8335 | time=0.84 min
