# 3.0 Prompting

This notebook:
- Initializes the Flan-T5-small model and tokenizer for text generation.
- Implements zero-shot prompting — the model classifies each review without seeing any labeled examples.
- Implements few-shot prompting — the model is shown 4 labeled examples (2 positive, 2 negative) sampled from the training split before classifying each review.
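- Evaluates model predictions using:
    - Accuracy, Macro F1, and Brier Score
    - Confusion matrices and reliability (calibration) curves
    - Note: the "probabilities" are hard 0/1 pseudo-probabilities derived from the generated label, so the Brier score equals the error rate and the reliability curves are correspondingly coarse.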

In [84]:
import json
import os
import random
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import yaml
from datasets import load_dataset, concatenate_datasets
from sklearn.calibration import calibration_curve
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, f1_score, brier_score_loss
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import set_seed

In [None]:
# Keep external experiment loggers (e.g., W&B) from attaching to this run
os.environ.update({"WANDB_DISABLED": "true"})

# --- Repo root finder (same pattern as your NB1) ---
def find_repo_root(start: Path) -> Path:
    """Walk upward from ``start`` and return the first directory that looks like the repo root.

    A directory qualifies when it contains a ``configs``, ``src`` or ``.git`` entry.
    If neither ``start`` nor any of its ancestors qualifies, ``start`` itself is returned.
    """
    markers = ("configs", "src", ".git")
    candidates = (start, *start.parents)
    return next(
        (folder for folder in candidates if any((folder / name).exists() for name in markers)),
        start,
    )

ROOT = find_repo_root(Path().resolve())

# Read the shared data config. The context manager closes the file handle
# immediately instead of leaving it open until garbage collection.
with open(ROOT / "configs" / "data.yaml", encoding="utf-8") as cfg_file:
    CFG = yaml.safe_load(cfg_file) or {}  # an empty YAML document parses to None

# Output/input directories: config values are joined onto ROOT, with defaults
# used when a key is absent from the config.
DIR_TABLES = ROOT / CFG.get("output_tables", "results/tables")
DIR_FIGS   = ROOT / CFG.get("output_figures", "results/figures")
DIR_PREDS  = ROOT / CFG.get("output_preds", "results/preds")
DIR_SPLITS = ROOT / CFG.get("output_splits", "data/splits")

for p in (DIR_TABLES, DIR_FIGS, DIR_PREDS, DIR_SPLITS):
    p.mkdir(parents=True, exist_ok=True)

# Seed every RNG in play (set_seed is the transformers helper; the stdlib and
# NumPy generators are seeded explicitly as well).
seed = int(CFG.get("seed", 42))
set_seed(seed)
random.seed(seed)
np.random.seed(seed)

# Device preference: CUDA, then Apple MPS (guarded with hasattr because the
# attribute may be missing on some torch builds), then CPU.
if torch.cuda.is_available():
    device = "cuda"
elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print("Device:", device)

In [None]:
def to_df(ds, text_key="text", label_key="label"):
    """Build a two-column DataFrame from a column-indexable dataset.

    ``ds[text_key]`` and ``ds[label_key]`` supply the values; the resulting
    columns are always named ``text`` and ``label`` regardless of the source keys.
    """
    columns = {}
    columns["text"] = ds[text_key]
    columns["label"] = ds[label_key]
    return pd.DataFrame(columns)

# Load raw datasets (cached)
ds_imdb = load_dataset(CFG.get("dataset_imdb", "imdb"))
ds_rt   = load_dataset(CFG.get("dataset_rotten_tomatoes", "rotten_tomatoes"))

# Recombine official splits into full sets.
# NOTE(review): the stored indices are applied positionally (via .iloc in split),
# so this concatenation order presumably has to match the one used in NB1 — confirm.
imdb_full = pd.concat([to_df(ds_imdb["train"]), to_df(ds_imdb["test"])], ignore_index=True)
rt_full   = pd.concat([to_df(ds_rt["train"]),  to_df(ds_rt["validation"]), to_df(ds_rt["test"])], ignore_index=True)

# Load our unified 80/10/10 indices created in NB1.
# Path.read_text opens and closes the file itself, so no handle is left open
# (the previous json.load(open(...)) form never closed it).
imdb_idx = json.loads((DIR_SPLITS / "imdb_indices.json").read_text(encoding="utf-8"))
rt_idx   = json.loads((DIR_SPLITS / "rt_indices.json").read_text(encoding="utf-8"))

def split(df, idx):
    """Slice ``df`` into (train, val, test) frames using positional indices.

    ``idx`` maps the keys ``"train"``, ``"val"`` and ``"test"`` to lists of row
    positions. Each returned frame has its index reset to 0..n-1.
    """
    return tuple(
        df.iloc[idx[part]].reset_index(drop=True)
        for part in ("train", "val", "test")
    )

# Materialise train/val/test frames for both corpora from the stored indices
imdb_tr, imdb_va, imdb_te = split(imdb_full, imdb_idx)
rt_tr, rt_va, rt_te = split(rt_full, rt_idx)

# Quick sanity check on partition sizes
print("IMDB sizes:", tuple(len(part) for part in (imdb_tr, imdb_va, imdb_te)))
print("RT sizes  :", tuple(len(part) for part in (rt_tr, rt_va, rt_te)))

Using device: cuda
NVIDIA GeForce RTX 3050 Ti Laptop GPU
IMDB test size: 5000
RT test size: 1067


In [None]:
# Checkpoint used for every prompting run (~77M params; fast, reproducible)
MODEL_NAME = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
model = model.to(device)

# Class names; a name's position in LABELS is its integer id
LABELS = ["Negative", "Positive"]
label_to_id = {name: idx for idx, name in enumerate(LABELS)}

# Small sanitizer to keep prompts shortish & clean
def clip_text(t, max_chars=600):
    """Flatten a review to a single trimmed line and cap its length.

    Falsy input (``None``, ``""``) is treated as an empty string. Newlines become
    spaces, surrounding whitespace is stripped, and at most ``max_chars``
    characters are kept.
    """
    raw = t if t else ""
    single_line = raw.replace("\n", " ")
    trimmed = single_line.strip()
    return trimmed[:max_chars]

def build_fewshot_exemplars(train_df, k_pos=2, k_neg=2, seed=seed):
    """Pick labelled demonstration reviews for the few-shot prompt.

    Draws ``k_pos`` rows with label 1 and ``k_neg`` rows with label 0 from
    ``train_df`` (or every available row when a class has fewer than requested),
    clips each text with ``clip_text`` and returns a shuffled list of
    ``(text, "Positive" | "Negative")`` tuples. Both the sampling and the
    shuffle are driven by ``seed``.
    """
    shuffler = np.random.RandomState(seed)

    def _draw(label_value, how_many):
        # Sample only when the class has enough rows; otherwise take them all.
        pool = train_df[train_df["label"] == label_value]
        if len(pool) < how_many:
            return pool
        return pool.sample(n=how_many, random_state=seed)

    positives = _draw(1, k_pos)
    negatives = _draw(0, k_neg)

    exemplars = []
    for review in positives["text"].tolist():
        exemplars.append((clip_text(review), "Positive"))
    for review in negatives["text"].tolist():
        exemplars.append((clip_text(review), "Negative"))

    # Shuffle so the prompt does not always list positives before negatives
    shuffler.shuffle(exemplars)
    return exemplars

def decode_label(text: str):
    """Map raw generated text to ``"Positive"`` or ``"Negative"``.

    Matching is case-insensitive and ignores surrounding whitespace. A leading
    ``pos``/``neg`` wins first; otherwise the words ``positive`` then
    ``negative`` are searched anywhere in the text. Anything unrecognised
    (including empty or ``None``) falls back to ``"Negative"``.
    """
    normalized = (text or "").strip().lower()

    for stem, label in (("pos", "Positive"), ("neg", "Negative")):
        if normalized.startswith(stem):
            return label

    for word, label in (("positive", "Positive"), ("negative", "Negative")):
        if word in normalized:
            return label

    # fallback
    return "Negative"

@torch.inference_mode()
def generate_labels(texts, prefix=None, max_new_tokens=16, batch_size=8):
    """Classify reviews by prompting the seq2seq model and parsing its output.

    Args:
        texts: Sequence of raw review strings (must support ``len`` and slicing).
        prefix: Optional few-shot preamble. When given, each prompt is the
            prefix followed by a "Review: ... / Sentiment:" block for the query;
            otherwise a zero-shot instruction prompt is used.
        max_new_tokens: Generation budget per review.
        batch_size: Number of prompts tokenized and generated together.

    Returns:
        list[str]: One "Positive"/"Negative" label per input text, obtained by
        running ``decode_label`` on the full decoded generation.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A negative step would make the batching loop silently yield nothing.
        raise ValueError("batch_size must be >= 1")
    if len(texts) == 0:
        return []

    def _prompt(review):
        # The query review always sits at the END of the prompt, followed by the cue.
        if prefix:
            return prefix + f"Review: {clip_text(review)}\nSentiment:"
        return f"Classify the sentiment of this review as Positive or Negative:\n\n{clip_text(review)}\n\nSentiment:"

    # With truncation=True and the default right-side truncation, a prompt longer
    # than the tokenizer limit loses its tail — i.e. the query review and the
    # "Sentiment:" cue — which a long few-shot prefix can trigger. Truncating from
    # the left drops the oldest prompt content instead and keeps the query intact.
    # NOTE(review): the limit is presumably 512 tokens for this checkpoint — confirm.
    saved_side = getattr(tokenizer, "truncation_side", "right")
    tokenizer.truncation_side = "left"
    outs = []
    try:
        for start in range(0, len(texts), batch_size):
            batch_prompts = [_prompt(t) for t in texts[start:start + batch_size]]
            enc = tokenizer(batch_prompts, return_tensors="pt", padding=True, truncation=True).to(device)
            gen = model.generate(**enc, max_new_tokens=max_new_tokens)
            decoded = tokenizer.batch_decode(gen, skip_special_tokens=True)
            outs.extend(decode_label(d) for d in decoded)
    finally:
        # Restore the shared tokenizer so other cells see its original setting.
        tokenizer.truncation_side = saved_side
    return outs

def fewshot_prefix(examples):
    """Render the few-shot preamble: an instruction line plus one block per example.

    ``examples`` is an iterable of ``(review_text, label_string)`` pairs. Each
    pair becomes a "Review: ... / Sentiment: <label>" block (review text passed
    through ``clip_text``) followed by a blank line.
    """
    parts = ["Classify the sentiment of this review as Positive or Negative:\n\n"]
    for review, sentiment in examples:
        parts.append(f"Review: {clip_text(review)}\nSentiment: {sentiment}\n\n")
    return "".join(parts)

In [None]:
def probs_from_labels(pred_labels):
    """Turn predicted label strings into hard pseudo-probabilities of the positive class.

    ``"Positive"`` maps to 1.0 and every other value to 0.0. ``.generate()``
    does not yield calibrated class probabilities, so these 0/1 values stand in
    for them wherever a binary probability is required (e.g. the Brier score).
    Returns a float NumPy array with one entry per label.
    """
    return np.fromiter(
        (1.0 if label == "Positive" else 0.0 for label in pred_labels),
        dtype=float,
    )

def eval_and_save(test_df, zs_preds, fs_preds, tag: str):
    """Score zero-shot and few-shot predictions, persist them, and plot diagnostics.

    Args:
        test_df: DataFrame with ``text`` and ``label`` columns; labels are
            treated as binary 0/1 by the metrics and plots below.
        zs_preds: Zero-shot predicted label strings, aligned with ``test_df`` rows.
        fs_preds: Few-shot predicted label strings, aligned with ``test_df`` rows.
        tag: Short dataset identifier used in file names and plot titles.

    Returns:
        dict: Accuracy, macro-F1 and Brier score for both modes, plus the
        dataset tag, ``MODEL_NAME`` and the number of test rows.

    Side effects:
        Writes ``prompting_{tag}_{mode}_test.jsonl`` under ``DIR_PREDS``, saves
        confusion-matrix and reliability PNGs under ``DIR_FIGS``, and shows the
        figures.
    """
    y_true = test_df["label"].to_numpy()
    zs_prob = probs_from_labels(zs_preds)
    fs_prob = probs_from_labels(fs_preds)
    # Probabilities are hard 0/1, so thresholding simply recovers the class id.
    zs_hat  = (zs_prob >= 0.5).astype(int)
    fs_hat  = (fs_prob >= 0.5).astype(int)

    zs_acc = accuracy_score(y_true, zs_hat)
    fs_acc = accuracy_score(y_true, fs_hat)
    zs_f1  = f1_score(y_true, zs_hat, average="macro")
    fs_f1  = f1_score(y_true, fs_hat, average="macro")
    zs_br  = brier_score_loss(y_true, zs_prob)
    fs_br  = brier_score_loss(y_true, fs_prob)

    # Save predictions. ensure_ascii=False writes non-ASCII review text verbatim,
    # so the file is opened as UTF-8 explicitly instead of relying on the
    # platform default encoding (which may be unable to encode such characters).
    for mode, preds, prob in [("zs", zs_preds, zs_prob), ("fs", fs_preds, fs_prob)]:
        out_jsonl = DIR_PREDS / f"prompting_{tag}_{mode}_test.jsonl"
        with open(out_jsonl, "w", encoding="utf-8") as f:
            for t, y, p, l in zip(test_df["text"], y_true, prob, preds):
                f.write(json.dumps({"text": t, "label": int(y), "prob_pos": float(p), "pred_label": l}, ensure_ascii=False) + "\n")

    # Confusion matrices & reliability
    def plot_and_save(y, yhat, yprob, mode):
        cm = confusion_matrix(y, yhat, labels=[0,1])
        ConfusionMatrixDisplay(cm, display_labels=["neg","pos"]).plot(values_format="d")
        plt.title(f"Prompting: {tag.upper()} — {mode.upper()} Confusion Matrix")
        plt.tight_layout()
        plt.savefig(DIR_FIGS / f"prompting_{tag}_cm_{mode}.png", dpi=150); plt.show()

        # calibration_curve returns (fraction of positives, mean predicted prob)
        # in that order; the predicted probability belongs on the x axis to match
        # the axis labels and the diagonal reference line.
        frac_pos, mean_pred = calibration_curve(y, yprob, n_bins=10, strategy="quantile")
        plt.figure(figsize=(4.8, 4))
        plt.plot([0,1],[0,1], linestyle="--")
        plt.plot(mean_pred, frac_pos, marker="o")
        plt.xlabel("Predicted probability (bin avg)")
        plt.ylabel("Empirical positive rate")
        plt.title(f"Prompting: {tag.upper()} — {mode.upper()} Reliability")
        plt.tight_layout()
        plt.savefig(DIR_FIGS / f"prompting_{tag}_reliability_{mode}.png", dpi=150); plt.show()

    plot_and_save(y_true, zs_hat, zs_prob, "zs")
    plot_and_save(y_true, fs_hat, fs_prob, "fs")

    return {
        "dataset": tag,
        "model_name": MODEL_NAME,
        "zs_acc": zs_acc, "zs_macro_f1": zs_f1, "zs_brier": zs_br,
        "fs_acc": fs_acc, "fs_macro_f1": fs_f1, "fs_brier": fs_br,
        "n_test": len(test_df)
    }

In [None]:
metrics = []

# IMDB: sample exemplars from train, then label the test reviews zero-shot and few-shot
imdb_examples = build_fewshot_exemplars(imdb_tr, k_pos=2, k_neg=2, seed=seed)
imdb_test_texts = imdb_te["text"].tolist()
zs_imdb = generate_labels(imdb_test_texts, prefix=None)
imdb_prefix = fewshot_prefix(imdb_examples)
fs_imdb = generate_labels(imdb_test_texts, prefix=imdb_prefix)
metrics.append(eval_and_save(imdb_te, zs_imdb, fs_imdb, "imdb"))

# RT: same procedure on the Rotten Tomatoes split
rt_examples = build_fewshot_exemplars(rt_tr, k_pos=2, k_neg=2, seed=seed)
rt_test_texts = rt_te["text"].tolist()
zs_rt = generate_labels(rt_test_texts, prefix=None)
rt_prefix = fewshot_prefix(rt_examples)
fs_rt = generate_labels(rt_test_texts, prefix=rt_prefix)
metrics.append(eval_and_save(rt_te, zs_rt, fs_rt, "rt"))

# Collect both datasets' metrics into one table, persist it, and show it
dst = DIR_TABLES / "prompting_metrics.csv"
dfm = pd.DataFrame(metrics)
dfm.to_csv(dst, index=False)
display(dfm)
print("Saved metrics to:", dst)
print("Artifacts →", "tables:", DIR_TABLES, "| figs:", DIR_FIGS, "| preds:", DIR_PREDS)