# 3.0 Prompting

This notebook:
- Initializes the Flan-T5-small model and tokenizer for text generation.
- Implements zero-shot prompting — the model classifies reviews without seeing any examples.
- Implements few-shot prompting — the model is given 4 labeled examples (2 positive, 2 negative) before classification; a small ablation over 2, 4, and 6 examples is run first to motivate this choice.
- Evaluates model predictions using:
    - Accuracy, Macro F1, and Brier Score
    - Confusion matrices and reliability (calibration) curves

In [None]:
import os
import json
import random
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, set_seed
from sklearn.metrics import accuracy_score, f1_score, brier_score_loss, confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.calibration import calibration_curve
import matplotlib.pyplot as plt

# Plot defaults shared by every figure produced in this notebook.
plt.style.use("default")
plt.rcParams.update({"figure.figsize": (6, 4)})

# Seed Python's `random`, NumPy, and the transformers `set_seed` helper with
# one value so that repeated runs start from the same RNG state.
SEED = 20
os.environ["PYTHONHASHSEED"] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)
set_seed(SEED)

# Tables and figures are written under <parent of the working dir>/results.
ROOT = Path.cwd().parent
results_root = ROOT / "results"
DIR_TABLES = results_root / "tables"
DIR_FIGS = results_root / "figures"

# Create both output directories up front (no error if they already exist).
for out_dir in (DIR_TABLES, DIR_FIGS):
    out_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Use the GPU when PyTorch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")
if device == "cuda":
    print(torch.cuda.get_device_name(0))

# Names handed to `load_dataset` for the two review corpora.
DATASET_IMDB = "imdb"
DATASET_RT = "rotten_tomatoes"

ds_imdb, ds_rt = load_dataset(DATASET_IMDB), load_dataset(DATASET_RT)

def to_df(ds_split, text_key="text", label_key="label"):
    """Build a two-column DataFrame ("text", "label") from one dataset split.

    Args:
        ds_split: Object indexable by column name (e.g. a dataset split or a
            plain dict of lists).
        text_key: Name of the column holding the review text.
        label_key: Name of the column holding the label.

    Returns:
        A pandas DataFrame whose columns are always named "text" and "label",
        regardless of the source column names.
    """
    columns = {
        "text": ds_split[text_key],
        "label": ds_split[label_key],
    }
    return pd.DataFrame(columns)

# IMDB ships "train" and "test" splits; keep each one and also a concatenation.
imdb_train_df, imdb_test_df = (to_df(ds_imdb[split]) for split in ("train", "test"))
imdb_parts = [imdb_train_df, imdb_test_df]
imdb_full = pd.concat(imdb_parts, ignore_index=True)

# Rotten Tomatoes ships "train", "validation" and "test"; same treatment.
rt_train_df, rt_val_df, rt_test_df = (
    to_df(ds_rt[split]) for split in ("train", "validation", "test")
)
rt_parts = [rt_train_df, rt_val_df, rt_test_df]
rt_full = pd.concat(rt_parts, ignore_index=True)

print(f"IMDB full size: {len(imdb_full)}")
print(f"RT full size: {len(rt_full)}")

In [None]:
# Checkpoint used for both the zero-shot and the few-shot experiments.
model_name = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Move the model to the GPU when one is available.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Running on {device}")
model.to(device)

# Seed used for test-set subsampling and for picking few-shot examples.
# NOTE(review): this is a different value from the notebook-level SEED.
seed = 38

# Instruction placed at the start of every prompt.
task_prefix = "Classify the sentiment of this review as Positive or Negative:\n\n"

def zero_shot_predict(texts, max_length=30):
    """Classify each review with a zero-shot prompt (no in-context examples).

    The prompt is the module-level ``task_prefix`` followed by the review and
    a trailing "Sentiment:" cue. The first whitespace-separated word of the
    generated text decides the label.

    Args:
        texts: Iterable of review strings.
        max_length: Upper bound on newly generated tokens (forwarded to
            ``model.generate`` as ``max_new_tokens``).

    Returns:
        List of ints, one per review: 1 if the first generated word contains
        "positive" (case-insensitive), otherwise 0. An empty generation is
        counted as 0.
    """
    preds = []
    for text in texts:
        prompt = task_prefix + f"Review: {text}\nSentiment:"
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
        outputs = model.generate(**inputs, max_new_tokens=max_length)
        decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
        # The decode can be empty (only special tokens were generated).
        # Indexing [0] on the empty split would raise IndexError and abort the
        # whole run, so fall back to an empty label, which maps to 0.
        words = decoded.split()
        label = words[0].lower().rstrip(".!,") if words else ""
        preds.append(1 if "positive" in label else 0)
    return preds

def few_shot_predict(texts, examples, max_length=30):
    """Classify each review with a few-shot prompt built from labeled examples.

    The prompt is the module-level ``task_prefix``, then one
    "Review: ... / Sentiment: ..." demonstration per entry in ``examples``,
    then the review to classify with a trailing "Sentiment:" cue.

    Args:
        texts: Iterable of review strings to classify.
        examples: Iterable of ``(review_text, label_string)`` pairs used as
            in-context demonstrations, in the order given.
        max_length: Upper bound on newly generated tokens (forwarded to
            ``model.generate`` as ``max_new_tokens``).

    Returns:
        List of ints, one per review: 1 if the first generated word contains
        "positive" (case-insensitive), otherwise 0. An empty generation is
        counted as 0.
    """
    # The demonstration block is identical for every review, so build it once.
    demonstrations = "".join(
        f"Review: {ex_text}\nSentiment: {ex_label}\n\n"
        for ex_text, ex_label in examples
    )
    prefix = task_prefix + demonstrations

    preds = []
    for text in texts:
        prompt = prefix + f"Review: {text}\nSentiment:"
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
        outputs = model.generate(**inputs, max_new_tokens=max_length)
        decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
        # Guard against an empty generation: indexing [0] on the empty split
        # would raise IndexError and abort the whole run.
        words = decoded.split()
        label = words[0].lower() if words else ""
        preds.append(1 if "positive" in label else 0)
    return preds

def sample_few_shot_examples(ds_train, n_examples):
    """Draw a class-balanced, shuffled set of few-shot demonstrations.

    Reseeds NumPy's global RNG with the module-level ``seed`` on every call,
    so the same ``(ds_train, n_examples)`` always yields the same examples.

    Args:
        ds_train: Training split supporting column access (``ds_train["label"]``)
            and row access (``ds_train[i]["text"]``); labels are 1 for
            positive and 0 for negative.
        n_examples: Total number of demonstrations requested. Half are drawn
            from each class using integer division, so an odd value yields
            ``n_examples - 1`` examples.

    Returns:
        List of ``(text, "Positive" | "Negative")`` tuples in shuffled order.

    Raises:
        ValueError: Raised by ``np.random.choice`` if a class has fewer than
            ``n_examples // 2`` rows.
    """
    np.random.seed(seed)
    n_half = n_examples // 2

    # Read only the label column, in a single pass, instead of materialising
    # every full row (text included) twice just to look at its label.
    pos_indices, neg_indices = [], []
    for i, label in enumerate(ds_train["label"]):
        if label == 1:
            pos_indices.append(i)
        elif label == 0:
            neg_indices.append(i)

    # np.random.choice returns NumPy integers; cast to plain ints before row
    # indexing.
    sampled_pos = [int(i) for i in np.random.choice(pos_indices, n_half, replace=False)]
    sampled_neg = [int(i) for i in np.random.choice(neg_indices, n_half, replace=False)]

    examples = [(ds_train[i]["text"], "Positive") for i in sampled_pos]
    examples += [(ds_train[i]["text"], "Negative") for i in sampled_neg]
    # Shuffle so the prompt does not list all positives before all negatives.
    np.random.shuffle(examples)
    return examples

# Sampled test sets for faster computation
sample_size = 100
np.random.seed(seed)

imdb_sample = imdb_test_df.sample(n=sample_size, random_state=seed)
imdb_texts_sample = imdb_sample["text"].tolist()
imdb_labels_sample = imdb_sample["label"].tolist()

rt_sample = rt_test_df.sample(n=sample_size, random_state=seed)
rt_texts_sample = rt_sample["text"].tolist()
rt_labels_sample = rt_sample["label"].tolist()

example_numbers = [2, 4, 6]  # number of examples from train


def _run_few_shot_ablation(dataset_name, tag, train_split, texts, labels, example_counts):
    """Evaluate few-shot prompting on one dataset for several example counts.

    Args:
        dataset_name: Name stored in the result rows and printed in the header.
        tag: Short name used in the per-run metric line.
        train_split: Training split the demonstrations are sampled from.
        texts: Review strings to classify.
        labels: Ground-truth 0/1 labels aligned with ``texts``.
        example_counts: Iterable of demonstration counts to try.

    Returns:
        List of dicts (one per count) with keys "dataset", "example_count",
        "accuracy", "macro_f1" and "brier_score".
    """
    print(f"\n{dataset_name} Few-Shot Ablation")
    rows = []
    for n in example_counts:
        examples = sample_few_shot_examples(train_split, n)
        preds = few_shot_predict(texts, examples)
        acc = accuracy_score(labels, preds)
        f1 = f1_score(labels, preds, average="macro")
        # NOTE: preds are hard 0/1 labels, not probabilities, so this Brier
        # score is simply the misclassification rate (1 - accuracy).
        brier = brier_score_loss(labels, preds)
        print(f"{tag} Test Metrics with {n} examples: Accuracy={acc:.3f}, Macro F1={f1:.3f}, Brier Score={brier:.3f}")
        rows.append({
            "dataset": dataset_name,
            "example_count": n,
            "accuracy": acc,
            "macro_f1": f1,
            "brier_score": brier,
        })
    return rows


# Same procedure for both datasets: IMDB first, then Rotten Tomatoes.
results_list = []
results_list.extend(
    _run_few_shot_ablation(
        "IMDB", "IMDB", ds_imdb["train"],
        imdb_texts_sample, imdb_labels_sample, example_numbers,
    )
)
results_list.extend(
    _run_few_shot_ablation(
        "Rotten Tomatoes", "RT", ds_rt["train"],
        rt_texts_sample, rt_labels_sample, example_numbers,
    )
)

results_df = pd.DataFrame(results_list)
csv_path = DIR_TABLES / "few_shot_ablation_results.csv"
results_df.to_csv(csv_path, index=False)
print(f"\nResults saved to {csv_path}")


In [None]:
# Resolve the device again so this cell does not rely on an earlier cell's
# value, and make sure the model is on it.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Running on {device}")
model.to(device)

# The ablation above favoured 4 demonstrations, so the final few-shot run uses
# 4 balanced examples. Both prompting modes are run on the FULL test sets.
imdb_texts = imdb_test_df["text"].to_list()
rt_texts = rt_test_df["text"].to_list()

# IMDB: sample demonstrations, then zero-shot and few-shot predictions.
imdb_examples_4 = sample_few_shot_examples(ds_imdb["train"], 4)
imdb_zs_preds = zero_shot_predict(imdb_texts)
imdb_fs_preds = few_shot_predict(imdb_texts, imdb_examples_4)

# Rotten Tomatoes: same procedure.
rt_examples_4 = sample_few_shot_examples(ds_rt["train"], 4)
rt_zs_preds = zero_shot_predict(rt_texts)
rt_fs_preds = few_shot_predict(rt_texts, rt_examples_4)


In [None]:
def evaluate_and_plot(true_labels, zs_preds, fs_preds, texts, dataset_name="Dataset"):
    """Score zero-shot and few-shot predictions, save plots, and log metrics.

    For each of the two prediction lists this prints accuracy, macro F1, Brier
    score and a classification report; saves a confusion matrix and a
    reliability curve as PNGs under ``DIR_FIGS``; and prints up to 10
    misclassified reviews. The two metric rows are then appended to
    ``DIR_TABLES / "prompting_metrics.csv"`` (the file is created with a
    header if it does not exist yet).

    Args:
        true_labels: list of ints 0/1
        zs_preds: list of ints 0/1 from zero-shot prompting
        fs_preds: list of ints 0/1 from few-shot prompting
        texts: list of review strings aligned with ``true_labels``
        dataset_name: label used in titles, printed output, file names, and
            the CSV "dataset" column

    Returns:
        List of two dicts (zero-shot first, then few-shot) with keys
        "dataset", "model_name", "acc", "macro_f1", "brier" and "n_test".
    """
    results = []

    def compute_metrics(name, preds):
        """Evaluate one prediction list and append its metric row to ``results``."""
        pred_bin = preds
        # Only hard labels are available, so the "probability" of the positive
        # class is exactly 0.0 or 1.0. The Brier score therefore equals the
        # error rate and the reliability curve has at most two points.
        prob_pos = [float(p) for p in preds]

        acc = accuracy_score(true_labels, pred_bin)
        f1m = f1_score(true_labels, pred_bin, average="macro")
        brier = brier_score_loss(true_labels, prob_pos)

        print(f"\n{name} metrics ({dataset_name}):")
        print(f"Accuracy: {acc:.3f}, Macro F1: {f1m:.3f}, Brier Score: {brier:.3f}")
        # Pin labels explicitly so the two target names always line up, even
        # if only one class occurs in the labels and predictions.
        print(classification_report(true_labels, pred_bin, labels=[0, 1], target_names=["Negative", "Positive"]))

        # File-name stem. NOTE: spaces are replaced in `name` only, so a
        # dataset_name containing spaces yields a file name with spaces.
        stem = f"{name.lower().replace(' ', '_')}_{dataset_name.lower()}"

        # Confusion matrix (saved to disk, not displayed)
        cm = confusion_matrix(true_labels, pred_bin, labels=[0, 1])
        disp = ConfusionMatrixDisplay(cm, display_labels=["Negative", "Positive"])
        disp.plot(values_format="d")
        plt.title(f"{name} Confusion Matrix: {dataset_name}")
        cm_path = DIR_FIGS / f"{stem}_cm.png"
        plt.tight_layout()
        plt.savefig(cm_path, dpi=150)
        plt.close()

        # Reliability curve
        fracs, means = calibration_curve(true_labels, prob_pos, n_bins=10, strategy="quantile")
        plt.figure()
        plt.plot([0, 1], [0, 1], "k--", label="Perfect calibration")
        plt.plot(means, fracs, marker="o", label="Prompted")
        plt.xlabel("Predicted probability (bin avg)")
        plt.ylabel("Empirical positive rate")
        plt.title(f"{name}: {dataset_name} Reliability Curve")
        plt.legend()
        plt.tight_layout()
        rel_path = DIR_FIGS / f"{stem}_reliability.png"
        # Save BEFORE showing: once plt.show() returns the figure may already
        # have been released, and a later savefig would write a blank image.
        plt.savefig(rel_path, dpi=150)
        plt.show()
        plt.close()

        results.append({"dataset": dataset_name, "model_name": name, "acc": acc, "macro_f1": f1m, "brier": brier, "n_test": len(true_labels)})

        print(f"\nMisclassified examples for {name} ({dataset_name}) (up to 10 shown):\n")
        errors = [
            (t, yt, yp, p)
            for t, yt, yp, p in zip(texts, true_labels, pred_bin, prob_pos)
            if yt != yp
        ]

        for i, (t, yt, yp, p) in enumerate(errors[:10], 1):
            true_lbl = "Positive" if yt == 1 else "Negative"
            pred_lbl = "Positive" if yp == 1 else "Negative"
            print(f"Example {i}:")
            print(f"  True label : {true_lbl}")
            print(f"  Pred label : {pred_lbl}")
            print(f"  P(Positive): {p:.3f}")
            # Show at most the first 400 characters, flattened to one line.
            print("  Text:", t[:400].replace("\n", " "))
            if len(t) > 400:
                print("  ...")
            print()

    compute_metrics("Zero-Shot Flan-T5", zs_preds)
    compute_metrics("Few-Shot Flan-T5", fs_preds)

    # Append to the shared metrics table; write the header only when the file
    # is first created. NOTE: re-running this function appends the same rows
    # again, so the CSV can accumulate duplicates across runs.
    csv_path = DIR_TABLES / "prompting_metrics.csv"
    df = pd.DataFrame(results)
    if csv_path.exists():
        df.to_csv(csv_path, mode="a", header=False, index=False)
    else:
        df.to_csv(csv_path, index=False)
    print(f"Saved prompting metrics to {csv_path}")

    return results




# Evaluate on the full IMDB test split.
imdb_true_labels = imdb_test_df["label"].tolist()
imdb_eval_texts = imdb_test_df["text"].tolist()
evaluate_and_plot(
    true_labels=imdb_true_labels,
    zs_preds=imdb_zs_preds,
    fs_preds=imdb_fs_preds,
    texts=imdb_eval_texts,
    dataset_name="IMDB",
)

# Evaluate on the full Rotten Tomatoes test split. This call stays a bare
# expression so the notebook still displays its return value.
rt_true_labels = rt_test_df["label"].tolist()
rt_eval_texts = rt_test_df["text"].tolist()
evaluate_and_plot(
    true_labels=rt_true_labels,
    zs_preds=rt_zs_preds,
    fs_preds=rt_fs_preds,
    texts=rt_eval_texts,
    dataset_name="Rotten Tomatoes",
)
