In [None]:
!pip install transformers datasets accelerate -q

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import load_dataset
from tqdm import tqdm
import torch

def _is_prediction_correct(pred, gold_answers):
    """Return True when ``pred`` counts as a correct answer.

    Args:
        pred: Decoded model output.
        gold_answers: Reference answer strings for the question. An empty
            list means the question has no reference answer (SQuAD v2 uses
            this for unanswerable questions).

    Returns:
        For answerable questions, True if any reference answer occurs as a
        case-insensitive substring of ``pred``. For questions without a
        reference answer, True only if ``pred`` is blank, mirroring the
        official SQuAD v2 convention that "no answer" is predicted by an
        empty string.
    """
    normalized = pred.lower()
    if not gold_answers:
        # Without this branch any([]) is always False, so unanswerable
        # questions could never be scored as correct.
        return not normalized.strip()
    return any(ans.lower() in normalized for ans in gold_answers)


def evaluate_model(model_name, n=100):
    """Score a seq2seq checkpoint on the first ``n`` SQuAD v2 validation rows.

    Each example is turned into a "Question / Context / Answer:" prompt, the
    model generates up to 100 new tokens, and the decoded output is scored
    with a lenient substring match against the reference answers.

    Args:
        model_name: Hugging Face hub id (or local path) accepted by
            ``AutoTokenizer`` / ``AutoModelForSeq2SeqLM``.
        n: Number of validation examples requested.

    Returns:
        Fraction of evaluated examples scored as correct, or ``0.0`` when no
        examples were loaded.
    """
    print(f"\n[INFO] Evaluating model: {model_name}")
    device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

    dataset = load_dataset("squad_v2", split=f"validation[:{n}]")
    # Use the number of rows actually loaded as the denominator: it may
    # differ from the requested n (e.g. n larger than the split, or n == 0).
    total = len(dataset)
    correct = 0

    for example in tqdm(dataset, total=total):
        prompt = (
            f"Question: {example['question']}\n"
            f"Context: {example['context']}\n"
            "Answer:"
        )
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(model.device)
        outputs = model.generate(**inputs, max_new_tokens=100)
        pred = tokenizer.decode(outputs[0], skip_special_tokens=True)
        if _is_prediction_correct(pred, example["answers"]["text"]):
            correct += 1

    # Guard the division so an empty slice reports 0.0 instead of raising.
    acc = correct / total if total else 0.0
    print(f"[RESULT] {model_name} Accuracy = {acc:.3f}")
    return acc

# Checkpoints to compare; each one is loaded and scored in turn.
models = [
    "google/flan-t5-small",
    "google/flan-t5-base",
    "facebook/bart-base",
]

# Map every checkpoint name to the accuracy returned by evaluate_model,
# evaluating them in list order.
results = dict(zip(models, map(evaluate_model, models)))

# Summary table: one "<checkpoint>: <accuracy>" line per model.
print("\n✅ Final Comparison:")
for checkpoint, accuracy in results.items():
    print(f"{checkpoint}: {accuracy:.3f}")
