# 03 - Model Evaluation
## Batch Results, Model Comparison, and Cross-Chapter Analysis

Analyze LLM scoring results from the Anthropic Batch API. Compare models, measure test-retest reliability, and validate across chapters.

**Data source:** Parameterized via `AWESOMEBITS_DB` env var.

**Score files:** Place batch result JSONs in `.scratch/data/` with names like `scores-haiku-run1.json`, `scores-sonnet-run1.json`, etc.

In [None]:
import sys, json, glob
sys.path.insert(0, '.')
from helpers import *
import pandas as pd
import numpy as np
from pathlib import Path
from scipy import stats

# Notebook bootstrap. setup_plotting(), connect() and DB_PATH come from helpers;
# `con` is the handle later cells use for SQL, and `data_dir` (the directory
# containing DB_PATH) is where the score files are looked up.
setup_plotting()
con = connect()
data_dir = Path(DB_PATH).parent
print(f'Data directory: {data_dir}')
print('Score files found:')
score_paths = sorted(data_dir.glob('scores-*.json'))
for score_path in score_paths:
    with score_path.open() as handle:
        entries = json.load(handle)
    # len() counts list items or top-level dict keys, whichever the file holds.
    print(f'  {score_path.name}: {len(entries)} scores')

## Load Scores

Load all available score files and merge with project metadata.

In [None]:
def load_scores(pattern='scores-*.json', directory=None):
    """Load score files into a dict of DataFrames keyed by filename stem.

    Args:
        pattern: Glob pattern selecting the score files to read.
        directory: Folder to search. When omitted, the notebook-level
            ``data_dir`` is used.

    Returns:
        dict mapping each matching file's ``path.stem`` to a DataFrame with
        one row per score entry. Keys are inserted in sorted path order.
    """
    base_dir = data_dir if directory is None else Path(directory)
    results = {}
    for path in sorted(base_dir.glob(pattern)):
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(path, encoding='utf-8') as f:
            scores = json.load(f)
        # Handle both list and dict formats
        if isinstance(scores, dict):
            # Dict format: the key is the project id, the value holds the fields.
            rows = [{'project_id': int(k), **v} for k, v in scores.items()]
        else:
            rows = scores
        results[path.stem] = pd.DataFrame(rows)
    return results

# Read every matching score file once; later cells reuse `score_sets`.
score_sets = load_scores()
print(f'Loaded {len(score_sets)} score sets: {list(score_sets)}')

In [None]:
# Merge scores with labels from DuckDB
projects = con.execute('''
    SELECT p.id as project_id, p.title, p.funded_on, p.hidden_at,
           c.name as chapter_name, c.country
    FROM projects p
    JOIN chapters c ON p.chapter_id = c.id
''').df()
# label_project (from helpers) is applied row-wise; the rest of this notebook
# filters on the values 'funded', 'hidden' and 'unlabeled'.
projects['label'] = projects.apply(label_project, axis=1)

# Every column this cell adds to a score set (all but the join key).
project_cols = [c for c in projects.columns if c != 'project_id']

# Merge each score set with labels
for name, sdf in list(score_sets.items()):
    # Re-running this cell would otherwise merge an already-merged frame and
    # produce suffixed columns (label_x / label_y), which breaks every later
    # `sdf.label` lookup. Drop metadata from a previous run first.
    scores_only = sdf.drop(columns=project_cols, errors='ignore')
    # Left join: scores without a matching project are kept, with NaN metadata.
    merged = scores_only.merge(projects, on='project_id', how='left')
    score_sets[name] = merged
    n_funded = int((merged.label == 'funded').sum())
    n_hidden = int((merged.label == 'hidden').sum())
    print(f'{name}: {len(merged)} total, {n_funded + n_hidden} labeled '
          f'(funded={n_funded}, '
          f'hidden={n_hidden})')

## Score Distributions

In [None]:
# Composite score distribution per model
score_col = 'composite_score'  # adjust if your JSON uses a different key
# Take the column name from the first score set that has a recognizable one:
# prefer a name containing both "score" and "composite", else a known alias.
for name, sdf in score_sets.items():
    candidates = [c for c in sdf.columns if 'score' in c.lower() and 'composite' in c.lower()]
    if not candidates:
        candidates = [c for c in sdf.columns if c in ('composite_score', 'score', 'total_score')]
    if candidates:
        score_col = candidates[0]
        break
print(f'Using score column: {score_col}')

if not score_sets:
    # plt.subplots(1, 0) raises ValueError, so there is nothing to draw.
    print('No score sets loaded; nothing to plot.')
else:
    # One panel per score set; squeeze=False keeps `axes` 2-D even for one panel.
    fig, axes = plt.subplots(1, len(score_sets), figsize=(6 * len(score_sets), 5), squeeze=False)
    for ax, (name, sdf) in zip(axes[0], score_sets.items()):
        if score_col not in sdf.columns:
            ax.set_title(f'{name} (no {score_col})')
            continue
        drew_any = False
        for label, color in [('funded', 'mediumseagreen'), ('hidden', 'salmon'), ('unlabeled', 'lightgray')]:
            subset = sdf[sdf.label == label][score_col].dropna()
            if len(subset):
                # NOTE: range=(0, 1) assumes scores lie in [0, 1]; values outside
                # that interval are not binned.
                ax.hist(subset, bins=20, alpha=0.6, color=color, label=f'{label} (n={len(subset)})', range=(0, 1))
                drew_any = True
        ax.set_title(name)
        ax.set_xlabel('Composite Score')
        if drew_any:
            # Skip the legend on empty panels to avoid matplotlib's
            # "no artists with labels" warning.
            ax.legend()
    plt.suptitle('Score Distributions by Model and Label')
    plt.tight_layout()
    plt.show()

## Classification Metrics

Using composite score thresholds, how well does each model separate funded from hidden?

In [None]:
def evaluate_model(sdf, score_col, threshold=0.5):
    """Score a threshold classifier against the funded/hidden labels.

    Rows whose label is neither 'funded' nor 'hidden', or whose score is
    missing, are ignored. A row is predicted 'funded' when its score is
    greater than or equal to ``threshold``, otherwise 'hidden'.

    Returns a dict with the sample size ``n``, the confusion counts ``tp``,
    ``tn``, ``fp``, ``fn`` ('funded' is the positive class), ``precision``,
    ``recall``, ``f1`` and ``accuracy`` (rounded to 3 decimals, 0 when the
    denominator is 0), and the mean score per class as ``funded_avg`` and
    ``hidden_avg``.
    """
    has_label = sdf.label.isin(['funded', 'hidden'])
    scored = sdf[has_label].dropna(subset=[score_col])

    # Boolean masks for the actual and the predicted class.
    is_funded = scored.label == 'funded'
    is_hidden = scored.label == 'hidden'
    says_funded = scored[score_col] >= threshold
    says_hidden = ~says_funded

    tp = (is_funded & says_funded).sum()
    tn = (is_hidden & says_hidden).sum()
    fp = (is_hidden & says_funded).sum()
    fn = (is_funded & says_hidden).sum()

    total = len(scored)
    predicted_pos = tp + fp
    actual_pos = tp + fn

    precision = tp / predicted_pos if predicted_pos > 0 else 0
    recall = tp / actual_pos if actual_pos > 0 else 0
    pr_sum = precision + recall
    f1 = 2 * precision * recall / pr_sum if pr_sum > 0 else 0
    accuracy = (tp + tn) / total if total > 0 else 0

    funded_mean = scored.loc[is_funded, score_col].mean()
    hidden_mean = scored.loc[is_hidden, score_col].mean()

    metrics = {'n': total, 'tp': tp, 'tn': tn, 'fp': fp, 'fn': fn}
    metrics['precision'] = round(precision, 3)
    metrics['recall'] = round(recall, 3)
    metrics['f1'] = round(f1, 3)
    metrics['accuracy'] = round(accuracy, 3)
    metrics['funded_avg'] = round(funded_mean, 3)
    metrics['hidden_avg'] = round(hidden_mean, 3)
    return metrics

# Evaluate all models at multiple thresholds
thresholds = [0.3, 0.4, 0.5, 0.6, 0.7]
report_cols = ['threshold', 'precision', 'recall', 'f1', 'accuracy', 'fn', 'fp']
for name, sdf in score_sets.items():
    if score_col in sdf.columns:
        print(f'\n=== {name} ===')
        # One row of metrics per cutoff, printed as a plain-text table.
        table = pd.DataFrame([
            {'threshold': cutoff, **evaluate_model(sdf, score_col, threshold=cutoff)}
            for cutoff in thresholds
        ])
        print(table[report_cols].to_string(index=False))
    else:
        print(f'{name}: no {score_col} column')

## Model Comparison

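If multiple models scored the same applications, compare them head-to-head. Only the first two score sets (in sorted filename order) are compared, using the applications present in both.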

In [None]:
if len(score_sets) >= 2:
    names = list(score_sets.keys())
    # Merge first two score sets on project_id
    a_name, b_name = names[0], names[1]
    a = score_sets[a_name][['project_id', score_col, 'label']].rename(columns={score_col: a_name})
    b = score_sets[b_name][['project_id', score_col]].rename(columns={score_col: b_name})
    shared = a.merge(b, on='project_id', how='inner')

    print(f'Comparing {a_name} vs {b_name}: {len(shared)} shared applications')

    # Keep only rows scored by BOTH models. Dropping NaNs from each column
    # independently would misalign the pairs or hand the correlation functions
    # inputs of different lengths.
    comp = shared.dropna(subset=[a_name, b_name]).copy()
    n_missing = len(shared) - len(comp)
    if n_missing:
        print(f'Skipping {n_missing} applications missing a score in one set')

    if len(comp) < 2:
        # The correlation functions need at least two paired observations.
        print('Not enough paired scores to compare.')
    else:
        # Correlation
        r, p = stats.pearsonr(comp[a_name], comp[b_name])
        rho, _ = stats.spearmanr(comp[a_name], comp[b_name])
        print(f'Pearson r={r:.3f} (p={p:.2e})')
        print(f'Spearman rho={rho:.3f}')

        # Score difference stats (first set minus second set)
        comp['diff'] = comp[a_name] - comp[b_name]
        print(f'Mean diff: {comp["diff"].mean():.3f}')
        print(f'Std diff: {comp["diff"].std():.3f}')
        print(f'Max abs diff: {comp["diff"].abs().max():.3f}')

        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
        # Labels outside the map (including NaN from unmatched projects) fall
        # back to gray so the scatter call always receives valid colors.
        colors = comp.label.map(
            {'funded': 'mediumseagreen', 'hidden': 'salmon', 'unlabeled': 'lightgray'}
        ).fillna('lightgray')
        ax1.scatter(comp[a_name], comp[b_name], alpha=0.4, c=colors, s=20)
        # Identity line: points on it received the same score from both sets.
        ax1.plot([0, 1], [0, 1], 'k--', alpha=0.3)
        ax1.set_xlabel(a_name)
        ax1.set_ylabel(b_name)
        ax1.set_title(f'Score Comparison (r={r:.3f})')

        ax2.hist(comp['diff'], bins=40, color='steelblue', alpha=0.7)
        ax2.axvline(0, color='black', linestyle='--', alpha=0.3)
        ax2.set_xlabel('Score Difference')
        ax2.set_title('Score Difference Distribution')
        plt.tight_layout()
        plt.show()
else:
    print('Only one score set available. Submit additional batches for comparison.')

## Test-Retest Reliability

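If the same model was run twice on the same inputs, measure score stability. The cell below looks for score files named `scores-<model>-run<N>.json`, such as `scores-haiku-run1.json` and `scores-haiku-run2.json`. When a model has more than two runs, the lowest- and highest-numbered runs are compared.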

In [None]:
# Find run pairs (same model, different runs)
import re as _re
# The lazy [\w.-]+? lets the model part contain hyphens or dots
# (e.g. scores-claude-3-haiku-run1) while still matching scores-haiku-run1.
run_pattern = _re.compile(r'(scores-[\w.-]+?)-run(\d+)')
run_pairs = {}
for name in score_sets:
    m = run_pattern.match(name)
    if m:
        base, run_num = m.group(1), int(m.group(2))
        run_pairs.setdefault(base, {})[run_num] = name

# Only models with at least two runs can be compared. Filtering up front means
# the help text below is shown when every model has just a single run.
retest_sets = {base: runs for base, runs in run_pairs.items() if len(runs) >= 2}

if retest_sets:
    for base, runs in retest_sets.items():
        # Compare the lowest-numbered run against the highest-numbered one.
        r1_name = runs[min(runs)]
        r2_name = runs[max(runs)]
        r1 = score_sets[r1_name][['project_id', score_col]].rename(columns={score_col: 'run1'})
        r2 = score_sets[r2_name][['project_id', score_col]].rename(columns={score_col: 'run2'})
        # Drop pairs with a missing score: NaN would turn the correlations into
        # NaN and would count as "agreement" in the threshold check below.
        paired = r1.merge(r2, on='project_id').dropna(subset=['run1', 'run2'])

        print(f'\n=== {base} test-retest (n={len(paired)}) ===')
        if len(paired) < 2:
            # The correlation functions need at least two paired observations.
            print('  Not enough paired scores to measure reliability.')
            continue

        r, _ = stats.pearsonr(paired.run1, paired.run2)
        rho, _ = stats.spearmanr(paired.run1, paired.run2)
        signed_diff = paired.run1 - paired.run2
        diff = signed_diff.abs()

        print(f'Pearson r={r:.4f}, Spearman rho={rho:.4f}')
        print(f'Mean abs diff: {diff.mean():.4f}')
        print(f'Max abs diff: {diff.max():.4f}')
        print(f'Std diff: {signed_diff.std():.4f}')
        # Threshold consistency: share of applications that land on the same
        # side of the cutoff in both runs.
        for t in [0.3, 0.5, 0.7]:
            agree = ((paired.run1 >= t) == (paired.run2 >= t)).mean()
            print(f'  Threshold {t}: {agree:.1%} agreement')
else:
    print('No test-retest pairs found. Run the same model twice and name files like:')
    print('  scores-haiku-run1.json, scores-haiku-run2.json')

## Cross-Chapter Analysis

Do scores generalize across chapters, or are patterns locality-specific?

Chapters with enough labeled data for validation (at least 10 funded and at least 10 hidden projects):

In [None]:
# Show which chapters have enough data
# One row per chapter (name, country, inactive flag) with project counts; the
# HAVING clause keeps chapters with at least 10 funded and 10 hidden projects.
chapter_query = '''
    SELECT c.name, c.country, c.inactive_at IS NOT NULL as inactive,
        COUNT(*) as total, COUNT(p.funded_on) as funded, COUNT(p.hidden_at) as hidden
    FROM projects p JOIN chapters c ON p.chapter_id = c.id
    GROUP BY c.name, c.country, inactive
    HAVING COUNT(p.funded_on) >= 10 AND COUNT(p.hidden_at) >= 10
    ORDER BY total DESC
'''
chapter_stats = con.execute(chapter_query).df()
n_chapters = len(chapter_stats)
print(chapter_stats.to_string(index=False))
print(f'\n{n_chapters} chapters have enough labeled data for validation')

In [None]:
# Per-chapter score analysis (if scores include chapter info)
for name, sdf in score_sets.items():
    has_needed_cols = score_col in sdf.columns and 'chapter_name' in sdf.columns
    if not has_needed_cols:
        continue
    print(f'\n=== {name}: per-chapter scores ===')
    # Mean score and row count for every (chapter, label) combination.
    by_chapter_label = sdf.groupby(['chapter_name', 'label'])[score_col]
    summary = by_chapter_label.agg(['mean', 'count']).round(3)
    # Ignore combinations backed by fewer than 5 scores.
    enough = summary[summary['count'] >= 5].reset_index()
    # Pivot for readability: one row per chapter, one column per label.
    pivot = enough.pivot_table(index='chapter_name', columns='label', values='mean')
    if {'funded', 'hidden'} <= set(pivot.columns):
        # Separation = funded mean minus hidden mean; largest gap first.
        pivot['separation'] = (pivot['funded'] - pivot['hidden']).round(3)
        pivot = pivot.sort_values('separation', ascending=False)
    print(pivot.to_string())

## Predicted Quality Ratio

Does the ~28% quality ratio (from Chicago labeled data) hold when scoring unlabeled applications?

In [None]:
# Share of unlabeled applications at or above each score cutoff, per score set.
for name, sdf in score_sets.items():
    if score_col not in sdf.columns:
        continue
    print(f'\n=== {name} ===')
    unlabeled_scores = sdf.loc[sdf.label == 'unlabeled', score_col]
    for cutoff in (0.3, 0.5, 0.7):
        # Missing scores compare as False, so they count as "not above".
        share = (unlabeled_scores >= cutoff).mean()
        print(f'  Threshold {cutoff}: {share:.1%} of unlabeled apps score above')
    # Overall distribution
    print(f'  Unlabeled score stats: mean={unlabeled_scores.mean():.3f}, '
          f'median={unlabeled_scores.median():.3f}')