# 02a — LLM Offline Evaluation (core)

Timebox: 45–60 minutes

Compute EM/F1, simple ROUGE-lite, and rubric-based scoring over toy pairs.



In [None]:
import json
import pandas as pd

from utils.datasets import read_csv, data_path
from utils.metrics import exact_match, f1_score, rouge_lite
from utils.grading import run_checks, assert_equal, assert_true

# Load the toy prediction/reference pairs; the metrics below read the
# "prediction" and "reference" columns of this table.
pairs = read_csv("toy/toy_text_pairs.csv")

# Materialise the (prediction, reference) tuples once so every metric scores
# exactly the same rows in the same order.
pred_ref = list(zip(pairs["prediction"], pairs["reference"]))

pairs["em"] = [exact_match(p, r) for p, r in pred_ref]
pairs["f1"] = [f1_score(p, r) for p, r in pred_ref]
# Each rouge_lite result is spread across the two target columns, one row per
# pair. NOTE(review): presumably (precision, recall) — confirm in utils.metrics.
pairs[["rouge_p", "rouge_r"]] = [
    pd.Series(rouge_lite(p, r)) for p, r in pred_ref
]

# rubric demo: award 5 when EM > 0 or F1 clears the 0.6 threshold, else 0.
# EM is compared against 0 explicitly so both operands of `|` are boolean;
# applying `|` to a raw numeric column fails if exact_match returns floats.
is_correct = (pairs["em"] > 0) | (pairs["f1"] > 0.6)
pairs["rubric_correctness"] = is_correct.astype(int) * 5

em_mean = float(pairs["em"].mean())
f1_mean = float(pairs["f1"].mean())

# Sanity checks: enough rows were loaded and both mean scores lie in [0, 1].
chk1 = lambda: assert_true("has rows", len(pairs) >= 3)
chk2 = lambda: assert_true("em range", 0.0 <= em_mean <= 1.0)
chk3 = lambda: assert_true("f1 range", 0.0 <= f1_mean <= 1.0)
run_checks(chk1, chk2, chk3)

# Last expression of the cell: the notebook renders the first rows.
pairs.head()
