# Quickstart: Paired Evaluation & Reliability

This notebook mirrors the CLI:
1. Load ratings (control vs rubric)
2. Aggregate medians per prompt
3. Paired Wilcoxon signed-rank test + rank-biserial effect size
4. Krippendorff's alpha (interval)
5. Plot per-prompt deltas


In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from src.eval.metrics import wilcoxon_signed_rank, rank_biserial_from_wilcoxon, krippendorff_alpha_interval

# ROOT is the parent of the current working directory; the ratings CSVs
# are read from its 'data' subdirectory.
ROOT = Path.cwd().parents[0]
DATA = ROOT / 'data'
ctrl, rubr = (
    pd.read_csv(DATA / csv_name)
    for csv_name in ('ratings_control.csv', 'ratings_rubric.csv')
)

def agg_median(df):
    """Return the median 'rating' per 'prompt_id', ordered by prompt_id."""
    ratings_by_prompt = df['rating'].groupby(df['prompt_id'])
    return ratings_by_prompt.median().sort_index()

# Reduce each condition to one median per prompt, then pair on prompt_id.
# The inner join keeps only prompts present in both files, so x[i] and y[i]
# always describe the same prompt even if the two CSVs cover different
# prompt sets (positional pairing would silently misalign them).
med_ctrl, med_rubr = agg_median(ctrl).align(agg_median(rubr), join='inner')
x = med_ctrl.values.astype(float)
y = med_rubr.values.astype(float)

# Paired test and effect size on the aligned per-prompt medians.
w = wilcoxon_signed_rank(x, y)
r_rb = rank_biserial_from_wilcoxon(x, y)
w, r_rb

In [None]:
def alpha(df):
    """Compute Krippendorff's alpha (interval) for one ratings table.

    Builds an items x raters matrix whose rows are the sorted unique
    ``prompt_id`` values and whose columns are the sorted unique
    ``rater_id`` values. Cells with no rating stay NaN; if a
    (prompt, rater) pair appears more than once, the last row wins.

    Args:
        df: DataFrame with 'prompt_id', 'rater_id' and 'rating' columns.

    Returns:
        Whatever ``krippendorff_alpha_interval`` returns for the matrix.
    """
    items = sorted(df['prompt_id'].unique())
    raters = sorted(df['rater_id'].unique())
    M = np.full((len(items), len(raters)), np.nan)
    i_map = {p: i for i, p in enumerate(items)}
    r_map = {r: i for i, r in enumerate(raters)}
    # Walk the three columns in lockstep rather than DataFrame.iterrows():
    # iterrows builds a Series per row (slow) and upcasts mixed-dtype rows,
    # which can turn integer ids into floats before the lookup.
    for prompt, rater, rating in zip(df['prompt_id'], df['rater_id'], df['rating']):
        M[i_map[prompt], r_map[rater]] = float(rating)
    return krippendorff_alpha_interval(M)

# Inter-rater reliability for each condition, control first.
alpha_ctrl, alpha_rubr = (alpha(frame) for frame in (ctrl, rubr))
alpha_ctrl, alpha_rubr

In [None]:
# Per-prompt paired difference (rubric minus control), plotted against a
# 1-based position index with a zero reference line.
d = y - x
fig, ax = plt.subplots(figsize=(6, 4))
ax.axhline(0, linewidth=1)
ax.scatter(range(1, len(d) + 1), d)
ax.set_xlabel('Prompt index')
ax.set_ylabel('Rubric − Control (median rating)')
ax.set_title('Per-prompt paired differences')
fig.tight_layout()
