In [1]:

import pandas as pd
import numpy as np
import re
import string
from collections import Counter, defaultdict

# Widen pandas' column display so long text cells are not cut off when printed.
pd.options.display.max_colwidth = 200

DATA_PATH = "../data/gold_standard.csv"   # e.g., "atomic_eval.csv"

df = pd.read_csv(DATA_PATH)
print("Loaded rows:", len(df))
print(df.head())

# Fail fast when the CSV lacks any column the rest of the analysis reads.
required_cols = {"sentence_id", "model_atomic", "gold_atomic"}
missing = required_cols.difference(df.columns)
if missing:
    raise ValueError(f"Missing columns in CSV: {missing}")



def normalize_text(s: str) -> str:
    """Return a canonical form of ``s`` for string comparison.

    The text is lower-cased, every character in ``string.punctuation``
    (ASCII punctuation only) is removed, runs of whitespace are collapsed to
    a single space and the result is trimmed.

    Args:
        s: Text to normalise. Any non-string value (e.g. the NaN pandas
            uses for empty CSV cells) is treated as missing.

    Returns:
        The normalised string, or ``""`` when ``s`` is not a string.
    """
    if not isinstance(s, str):
        return ""
    without_punct = s.lower().translate(str.maketrans("", "", string.punctuation))
    # Trim *after* removing punctuation: stripping first leaves stray spaces
    # behind for inputs such as "hello ." or "( hello )", which would make
    # otherwise identical texts compare as different.
    return re.sub(r"\s+", " ", without_punct).strip()

def tokenize(s: str):
    """Split ``s`` into whitespace-separated tokens after normalising it."""
    normalized = normalize_text(s)
    return normalized.split()

# Sanity check: print the normalised string and the token list for one sample input.
print(normalize_text("The Cat, in THE Hat!!"), tokenize("The Cat, in THE Hat!!"))

def token_f1(pred: str, gold: str):
    """Compute token-overlap precision, recall and F1 for one text pair.

    Both texts are tokenised with ``tokenize``; the overlap is the multiset
    intersection of the two token lists, so repeated tokens only count as
    often as they appear on both sides.

    Returns:
        A ``(precision, recall, f1)`` tuple. Two empty token lists score
        ``(1.0, 1.0, 1.0)``; exactly one empty list scores ``(0.0, 0.0, 0.0)``.
    """
    predicted = tokenize(pred)
    reference = tokenize(gold)

    if not predicted or not reference:
        # Nothing on either side is a perfect match; nothing on one side is a miss.
        score = 1.0 if not predicted and not reference else 0.0
        return score, score, score

    shared = Counter(predicted) & Counter(reference)
    overlap = sum(shared.values())

    # Both lists are non-empty here, so the divisions are safe.
    precision = overlap / len(predicted)
    recall = overlap / len(reference)
    denominator = precision + recall
    f1 = 2 * precision * recall / denominator if denominator else 0.0
    return precision, recall, f1

# Normalised copies of the model and gold texts; exact match compares these.
df["norm_model"] = df["model_atomic"].map(normalize_text)
df["norm_gold"] = df["gold_atomic"].map(normalize_text)

df["exact_match"] = df["norm_model"].eq(df["norm_gold"]).astype(int)

# Row-level token overlap between each model text and the gold text on the same row.
row_scores = [
    token_f1(model_text, gold_text)
    for model_text, gold_text in zip(df["model_atomic"], df["gold_atomic"])
]
prs = [precision for precision, _, _ in row_scores]
rcs = [recall for _, recall, _ in row_scores]
f1s = [f1 for _, _, f1 in row_scores]

df["row_token_precision"] = prs
df["row_token_recall"] = rcs
df["row_token_f1"] = f1s

# Per sentence_id, collect the distinct normalised gold and model texts.
grouped = defaultdict(lambda: {"gold": set(), "model": set()})

for _, row in df.iterrows():
    sid = row["sentence_id"]
    for side, column in (("gold", "gold_atomic"), ("model", "model_atomic")):
        text = normalize_text(row[column])
        # Only touch grouped[sid] for non-empty texts, so a sentence_id whose
        # texts are all empty never gets an entry.
        if text:
            grouped[sid][side].add(text)

def prf_from_sets(model_set, gold_set):
    """
    Set-based precision, recall and F1 of ``model_set`` against ``gold_set``.

    A true positive is an element present in both sets. Two empty sets score
    (1.0, 1.0, 1.0); exactly one empty set scores (0.0, 0.0, 0.0).
    Returns a (precision, recall, f1) tuple.
    """
    n_model = len(model_set)
    n_gold = len(gold_set)

    if not n_model and not n_gold:
        return 1.0, 1.0, 1.0
    if not n_model or not n_gold:
        return 0.0, 0.0, 0.0

    hits = len(model_set & gold_set)
    precision = hits / n_model
    recall = hits / n_gold

    total = precision + recall
    f1 = 2 * precision * recall / total if total else 0.0
    return precision, recall, f1

# One summary row per sentence_id, plus running totals for the micro averages.
rows = []
micro_tp = micro_pred = micro_gold = 0

for sid, sets in grouped.items():
    model_items, gold_items = sets["model"], sets["gold"]
    precision, recall, f1 = prf_from_sets(model_items, gold_items)
    n_shared = len(model_items & gold_items)

    micro_tp += n_shared
    micro_pred += len(model_items)
    micro_gold += len(gold_items)

    rows.append({
        "sentence_id": sid,
        "n_model": len(model_items),
        "n_gold": len(gold_items),
        "set_precision": precision,
        "set_recall": recall,
        "set_f1": f1,
        "true_positives": n_shared,
    })

set_df = pd.DataFrame(rows)
set_df = set_df.sort_values("sentence_id")
set_df = set_df.reset_index(drop=True)

# Macro average: unweighted mean of the per-sentence scores.
macro_p, macro_r, macro_f = (
    set_df[column].mean() for column in ("set_precision", "set_recall", "set_f1")
)

# Micro average: pool the counts over all sentences before dividing.
micro_p = (micro_tp / micro_pred) if micro_pred else 0.0
micro_r = (micro_tp / micro_gold) if micro_gold else 0.0
micro_sum = micro_p + micro_r
micro_f = (2 * micro_p * micro_r / micro_sum) if micro_sum else 0.0

print("\n=== SET-LEVEL METRICS (per sentence_id) ===")
for label, value in (
    ("Macro precision:", macro_p),
    ("Macro recall   :", macro_r),
    ("Macro F1       :", macro_f),
):
    print(label, value)
print()
for label, value in (
    ("Micro precision:", micro_p),
    ("Micro recall   :", micro_r),
    ("Micro F1       :", micro_f),
):
    print(label, value)

set_df.head()


# 5. Inspect worst cases ----------------------------------------------------

# The ten rows with the lowest row-level token F1.
print("\n=== WORST ROW-LEVEL F1 EXAMPLES ===")
row_report_columns = ["sentence_id", "model_atomic", "gold_atomic", "row_token_f1"]
worst_rows = df.sort_values(by="row_token_f1").iloc[:10]
print(worst_rows[row_report_columns])

# The ten sentence_ids with the lowest set-level F1.
print("\n=== WORST SET-LEVEL F1 SENTENCE_IDS ===")
worst_sentences = set_df.sort_values(by="set_f1").iloc[:10]
print(worst_sentences)


# 6. Helper to inspect one sentence_id in detail ----------------------------

def inspect_sentence_id(sid):
    """
    Show all model/gold pairs for a given sentence_id + set-level TP/FP/FN.

    Reads the module-level ``df``. Prints the row-wise pairs, then the
    normalised gold and model sets, their intersection (true positives), the
    gold-only texts (false negatives) and the model-only texts (false
    positives). Every listing is sorted so repeated runs print the same order.

    Args:
        sid: Value to match against the ``sentence_id`` column.

    Returns:
        None. Prints a short message and returns early when no row matches.
    """
    subset = df[df["sentence_id"] == sid].copy()
    if subset.empty:
        print(f"No rows with sentence_id={sid}")
        return

    print(f"\n================ sentence_id = {sid} ================\n")
    print("Row-wise pairs:")
    # NOTE: `display` is not defined in this file; it is presumably supplied by
    # the notebook (IPython) environment.
    display(subset[["model_atomic", "gold_atomic", "row_token_f1"]])

    # Drop texts that normalise to "" (non-string cells, punctuation-only
    # strings) so these sets match the ones the set-level metrics are built
    # from, which also skip empty normalised texts.
    mset = {text for text in map(normalize_text, subset["model_atomic"]) if text}
    gset = {text for text in map(normalize_text, subset["gold_atomic"]) if text}
    inter = mset & gset

    sections = (
        ("\nGold set (normalized):", "G", gset),
        ("\nModel set (normalized):", "M", mset),
        ("\nTrue positives (intersection):", "TP", inter),
        ("\nFalse negatives (gold but not model):", "FN", gset - inter),
        ("\nFalse positives (model but not gold):", "FP", mset - inter),
    )
    for title, tag, items in sections:
        print(title)
        # Sets have no stable iteration order; sort for reproducible output.
        for item in sorted(items):
            print(f"  {tag}:", item)



Loaded rows: 252
  sentence_id  \
0           1   
1           1   
2           1   
3           2   
4           2   

                                                                                             model_atomic  \
0                  Jakobshavn Isbr is a major contributor to the mass balance of the Greenland ice sheet.   
1                                                               Jakobshavn Isbr passing out of the fjord.   
2  Some 10  of all Greenland icebergs some 35 billion tonnes of icebergs calved passing out of the fjord.   
3                                                                       Wright played the part of mufasa.   
4                                          Wright voiced the iguanodon in disney 's cgi film '' dinosaur.   

                                                                                            gold_atomic  
0               Jakobshavn Isbræ is a major contributor to the mass balance of the Greenland ice sheet.  
1            

In [1]:

import pandas as pd
import numpy as np

from rouge_score import rouge_scorer
from bert_score import score as bert_score

# Widen pandas' column display so long text cells are not cut off when printed.
pd.options.display.max_colwidth = 200

csv_path = "atomic_eval_pairs.csv"

df = pd.read_csv(csv_path)

# Both text columns are required by every metric computed below.
expected_cols = {"model_atomic", "gold_atomic"}
missing = expected_cols.difference(df.columns)
if missing:
    raise ValueError(f"CSV is missing columns: {missing}. Columns found: {df.columns.tolist()}")

print(f"Loaded rows: {len(df)}")
print(df.head())

def normalize_text(s):
    """Collapse whitespace runs in ``s`` to single spaces and trim the ends.

    Any non-string input yields an empty string.
    """
    if not isinstance(s, str):
        return ""
    words = s.split()
    return " ".join(words)

# Whitespace-normalised copies of both text columns.
df["model_atomic_norm"] = df["model_atomic"].map(normalize_text)
df["gold_atomic_norm"] = df["gold_atomic"].map(normalize_text)


scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)

rouge1_p, rouge1_r, rouge1_f = [], [], []
rouge2_p, rouge2_r, rouge2_f = [], [], []
rougel_p, rougel_r, rougel_f = [], [], []

# Route each ROUGE variant's scores into its own precision/recall/F1 lists.
score_lists = {
    "rouge1": (rouge1_p, rouge1_r, rouge1_f),
    "rouge2": (rouge2_p, rouge2_r, rouge2_f),
    "rougeL": (rougel_p, rougel_r, rougel_f),
}

for model_text, gold_text in zip(df["model_atomic_norm"], df["gold_atomic_norm"]):
    # The gold text is passed first and the model text second.
    pair_scores = scorer.score(gold_text, model_text)

    for variant, (p_list, r_list, f_list) in score_lists.items():
        result = pair_scores[variant]
        p_list.append(result.precision)
        r_list.append(result.recall)
        f_list.append(result.fmeasure)

# Add per-row ROUGE-L F1 to the dataframe (often the most interpretable)
df["rougeL_f"] = rougel_f

print("\n=== ROUGE (averaged over all pairs) ===")
print(f"ROUGE-1 Precision: {np.mean(rouge1_p):.4f}")
print(f"ROUGE-1 Recall: {np.mean(rouge1_r):.4f}")
print(f"ROUGE-1 F1: {np.mean(rouge1_f):.4f}\n")

print(f"ROUGE-2 Precision: {np.mean(rouge2_p):.4f}")
print(f"ROUGE-2 Recall: {np.mean(rouge2_r):.4f}")
print(f"ROUGE-2 F1: {np.mean(rouge2_f):.4f}\n")

print(f"ROUGE-L Precision: {np.mean(rougel_p):.4f}")
print(f"ROUGE-L Recall: {np.mean(rougel_r):.4f}")
print(f"ROUGE-L F1: {np.mean(rougel_f):.4f}")



# Candidate (model) and reference (gold) texts as plain lists, in row order.
cands = list(df["model_atomic_norm"])
refs = list(df["gold_atomic_norm"])

P, R, F1 = bert_score(cands, refs, lang="en", rescale_with_baseline=True)

# Store the per-pair scores as dataframe columns via each result's .numpy().
for column, values in (("bertscore_P", P), ("bertscore_R", R), ("bertscore_F1", F1)):
    df[column] = values.numpy()

print("\n=== BERTScore (averaged over all pairs) ===")
print(f"BERTScore Precision: {df['bertscore_P'].mean():.4f}")
print(f"BERTScore Recall   : {df['bertscore_R'].mean():.4f}")
print(f"BERTScore F1       : {df['bertscore_F1'].mean():.4f}")



# Persist the dataframe with every per-row metric column added above.
out_path = "atomic_eval_with_rouge_bertscore.csv"
df.to_csv(out_path, index=False)
print(f"\nSaved detailed results to: {out_path}")


ModuleNotFoundError: No module named 'rouge_score'