# 4.0 Fine-tuning

This notebook:
- Loads fixed 80/10/10 splits from `data/splits/`
- Tokenizes with a chosen HF tokenizer
- Fine-tunes a transformer classifier (train on train, select using val, report on test)
- Saves:
  - `results/tables/finetune_metrics.csv`
  - `results/preds/finetune_<dataset>_test.jsonl`
  - `results/figures/finetune_<dataset>_cm.png`, `results/figures/finetune_<dataset>_reliability.png`
  - `models/finetune_<dataset>/` (HF model + tokenizer)

In [None]:
import os, json, numpy as np, pandas as pd, matplotlib.pyplot as plt
from pathlib import Path
import evaluate, torch, yaml
from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    DataCollatorWithPadding, TrainingArguments, Trainer, set_seed
)
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, ConfusionMatrixDisplay, brier_score_loss
from sklearn.calibration import calibration_curve

# Disable Weights & Biases auto-logging
os.environ["WANDB_DISABLED"] = "true"

# Config: every path below is resolved against the current working directory.
ROOT = Path.cwd()
# Read the YAML inside a context manager so the file handle is closed
# deterministically rather than whenever the garbage collector gets to it.
with open(ROOT / "configs/finetune_best.yaml", encoding="utf-8") as cfg_file:
    CFG = yaml.safe_load(cfg_file)
# Fall back to seed 42 when the config does not set one.
set_seed(int(CFG.get("seed", 42)))

# Output locations (overridable from the config); created up front so later
# cells can write without checking for existence.
DIR_TABLES = ROOT / CFG.get("output_tables", "results/tables")
DIR_FIGS   = ROOT / CFG.get("output_figures", "results/figures")
DIR_PREDS  = ROOT / CFG.get("output_preds", "results/preds")
DIR_MODELS = ROOT / CFG.get("output_models", "models")
for p in (DIR_TABLES, DIR_FIGS, DIR_PREDS, DIR_MODELS):
    p.mkdir(parents=True, exist_ok=True)
# Input directory holding the split index files; not created here.
DIR_SPLITS = ROOT / "data" / "splits"

print("Model:", CFG["model_name"])
print("Working dir:", ROOT)

In [None]:
def to_df(ds, text_key="text", label_key="label"):
    """Return a two-column DataFrame (``text``, ``label``) built from *ds*.

    ``ds`` is indexed by column name; ``text_key`` and ``label_key`` select
    which of its columns feed the normalised ``text`` / ``label`` columns.
    """
    columns = {
        "text": ds[text_key],
        "label": ds[label_key],
    }
    return pd.DataFrame(columns)

# Load datasets
imdb = load_dataset("imdb")
rt   = load_dataset("rotten_tomatoes")

# Concatenate the published splits in a fixed order so every row has a stable
# positional index.
# NOTE(review): presumably the index files were generated against this exact
# concatenation order - confirm against the notebook that wrote them.
imdb_full = pd.concat([to_df(imdb["train"]), to_df(imdb["test"])], ignore_index=True)
rt_full   = pd.concat([to_df(rt["train"]), to_df(rt["validation"]), to_df(rt["test"])], ignore_index=True)

# Load deterministic split indices. Context managers close the files promptly
# instead of leaking the handles for the lifetime of the kernel.
with open(DIR_SPLITS / "imdb_indices.json", encoding="utf-8") as fh:
    imdb_idx = json.load(fh)
with open(DIR_SPLITS / "rt_indices.json", encoding="utf-8") as fh:
    rt_idx = json.load(fh)

def split(df, idx):
    """Slice *df* into ``(train, val, test)`` frames by positional index.

    ``idx`` maps ``"train"``, ``"val"`` and ``"test"`` to lists of row
    positions; each returned frame has its index reset to ``0..n-1``.
    """
    train_part, val_part, test_part = (
        df.iloc[idx[part]].reset_index(drop=True)
        for part in ("train", "val", "test")
    )
    return train_part, val_part, test_part

# Materialise the train/val/test frames for each corpus.
imdb_tr, imdb_va, imdb_te = split(imdb_full, imdb_idx)
rt_tr, rt_va, rt_te = split(rt_full, rt_idx)

# Report split sizes as (train, val, test).
print("IMDB:", (len(imdb_tr), len(imdb_va), len(imdb_te)))
print("RT  :", (len(rt_tr), len(rt_va), len(rt_te)))

In [None]:
# Tokenizer for the configured checkpoint (fast implementation requested).
tokenizer = AutoTokenizer.from_pretrained(CFG["model_name"], use_fast=True)
# Collator built around that tokenizer; handed to the Trainer as data_collator.
collator = DataCollatorWithPadding(tokenizer=tokenizer)

def build_hf(train_df, val_df, test_df, max_length):
    """Turn three pandas splits into tokenized Hugging Face ``Dataset`` objects.

    Each frame is converted without its pandas index, tokenized in batches
    with truncation to ``max_length`` tokens, and stripped of any column that
    is not a tokenizer output, ``label`` or ``text``.

    Returns:
        ``(train, val, test)`` datasets, in that order.
    """
    wanted = {"input_ids", "attention_mask", "token_type_ids", "label", "text"}

    def encode(batch):
        # No padding here: only truncation is requested at tokenization time.
        return tokenizer(batch["text"], truncation=True, max_length=max_length)

    def prepare(frame):
        dataset = Dataset.from_pandas(frame, preserve_index=False)
        dataset = dataset.map(encode, batched=True)
        extras = [col for col in dataset.column_names if col not in wanted]
        return dataset.remove_columns(extras)

    return prepare(train_df), prepare(val_df), prepare(test_df)

In [None]:
# `evaluate` metric objects consumed by compute_metrics (accuracy first, then F1).
acc_metric, f1_metric = (evaluate.load(metric_id) for metric_id in ("accuracy", "f1"))

def compute_metrics(eval_pred):
    """Compute accuracy, macro-F1 and Brier score from ``(logits, labels)``.

    Logits are converted to probabilities with a softmax over axis 1; the
    predicted class is the arg-max and the Brier score uses the probability
    in column 1.
    """
    logits, labels = eval_pred

    # Subtract the per-row maximum before exponentiating so exp() cannot overflow.
    shifted = logits - logits.max(axis=1, keepdims=True)
    exp_scores = np.exp(shifted)
    probs = exp_scores / exp_scores.sum(axis=1, keepdims=True)
    preds = probs.argmax(axis=1)

    accuracy = acc_metric.compute(predictions=preds, references=labels)["accuracy"]
    macro_f1 = f1_metric.compute(predictions=preds, references=labels, average="macro")["f1"]
    brier = brier_score_loss(labels, probs[:, 1])
    return {"accuracy": accuracy, "macro_f1": macro_f1, "brier": brier}

In [None]:
def run(train_df, val_df, test_df, tag):
    """Fine-tune the configured model on one dataset and score it on its test split.

    Trains on ``train_df``, evaluates on ``val_df`` after every epoch, reloads
    the checkpoint with the best validation macro-F1, and predicts on
    ``test_df``.

    Side effects (all paths derive from the module-level ``DIR_*`` constants):
      - ``finetune_<tag>_test.jsonl``: one JSON object per test example
      - ``finetune_<tag>_cm.png``: confusion matrix
      - ``finetune_<tag>_reliability.png``: reliability (calibration) curve
      - ``finetune_<tag>/``: final model and tokenizer

    Args:
        train_df, val_df, test_df: DataFrames with ``text`` and ``label``
            columns; labels are treated as binary (0 = neg, 1 = pos).
        tag: Short dataset name used in output file and directory names.

    Returns:
        dict with keys ``dataset``, ``model_name``, ``acc``, ``macro_f1``,
        ``brier`` and ``n_test``.
    """
    hf_tr, hf_va, hf_te = build_hf(train_df, val_df, test_df, CFG["max_length"])
    model = AutoModelForSequenceClassification.from_pretrained(CFG["model_name"], num_labels=2)

    out_dir = DIR_MODELS / f"finetune_{tag}"
    args = TrainingArguments(
        output_dir=str(out_dir),
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=float(CFG["lr"]),
        per_device_train_batch_size=int(CFG["batch_size"]),
        per_device_eval_batch_size=int(CFG["batch_size"]),
        num_train_epochs=int(CFG["epochs"]),
        weight_decay=float(CFG["weight_decay"]),
        warmup_ratio=float(CFG["warmup_ratio"]),
        gradient_accumulation_steps=int(CFG.get("gradient_accumulation_steps", 1)),
        # Model selection: keep the epoch with the highest validation macro-F1.
        load_best_model_at_end=True,
        metric_for_best_model="macro_f1",
        greater_is_better=True,
        logging_steps=50,
        # Same fallback as the notebook-level set_seed call, so a config
        # without "seed" does not raise KeyError here.
        seed=int(CFG.get("seed", 42)),
        report_to="none",
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=hf_tr,
        eval_dataset=hf_va,
        processing_class=tokenizer,
        data_collator=collator,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    # Test-set inference: numerically stable softmax over the class axis.
    preds = trainer.predict(hf_te)
    logits = preds.predictions
    probs = np.exp(logits - logits.max(axis=1, keepdims=True))
    probs = probs / probs.sum(axis=1, keepdims=True)
    y_prob = probs[:, 1]  # probability assigned to class 1
    y_pred = probs.argmax(axis=1)
    y_true = np.array(test_df["label"].tolist())

    acc = accuracy_score(y_true, y_pred)
    f1m = f1_score(y_true, y_pred, average="macro")
    brier = brier_score_loss(y_true, y_prob)

    # Save predictions. ensure_ascii=False emits raw non-ASCII characters, so
    # the file must be opened as UTF-8 explicitly; the platform default
    # encoding may be unable to represent them.
    out_preds = DIR_PREDS / f"finetune_{tag}_test.jsonl"
    with open(out_preds, "w", encoding="utf-8") as f:
        for t, y, p, h in zip(test_df["text"], y_true, y_prob, y_pred):
            row = {"text": t, "label": int(y), "prob_pos": float(p), "pred": int(h)}
            f.write(json.dumps(row, ensure_ascii=False) + "\n")

    # Plots
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    ConfusionMatrixDisplay(cm, display_labels=["neg", "pos"]).plot(values_format="d")
    plt.title(f"Fine-tune Confusion Matrix: {tag.upper()}")
    plt.tight_layout()
    plt.savefig(DIR_FIGS / f"finetune_{tag}_cm.png", dpi=150)
    plt.show()

    # calibration_curve returns (prob_true, prob_pred): the empirical positive
    # rate and the mean predicted probability per bin. The predicted
    # probability goes on the x-axis so the curve matches the axis labels.
    prob_true, prob_pred = calibration_curve(y_true, y_prob, n_bins=10, strategy="quantile")
    plt.figure(figsize=(4.8, 4))
    plt.plot([0, 1], [0, 1], linestyle="--")  # perfect-calibration diagonal
    plt.plot(prob_pred, prob_true, marker="o")
    plt.xlabel("Predicted probability (bin avg)")
    plt.ylabel("Empirical positive rate")
    plt.title(f"Reliability Curve: {tag.upper()}")
    plt.tight_layout()
    plt.savefig(DIR_FIGS / f"finetune_{tag}_reliability.png", dpi=150)
    plt.show()

    # Save model/tokenizer
    out_dir.mkdir(parents=True, exist_ok=True)
    tokenizer.save_pretrained(out_dir)
    trainer.model.save_pretrained(out_dir)

    return {"dataset": tag, "model_name": CFG["model_name"],
            "acc": acc, "macro_f1": f1m, "brier": brier, "n_test": len(test_df)}

In [None]:
# Fine-tune and evaluate on each dataset in turn, collecting one metrics row per run.
metrics = []
for splits, tag in (((imdb_tr, imdb_va, imdb_te), "imdb"), ((rt_tr, rt_va, rt_te), "rt")):
    metrics.append(run(*splits, tag))

# Persist the combined metrics table and show it in the notebook.
dst = DIR_TABLES / "finetune_metrics.csv"
dfm = pd.DataFrame(metrics)
dfm.to_csv(dst, index=False)
display(dfm)
print("Saved metrics to:", dst)
print("Artifacts saved under:", DIR_MODELS, DIR_PREDS, DIR_FIGS)