# Evaluation Analysis — LLM Judge Results

This notebook loads all judge results and analyses:
- Overall accuracy, precision, recall, F1
- Confusion matrix
- Disagreement cases between judges
- Flag frequency analysis
- Per-judge accuracy

In [None]:
import sys
sys.path.insert(0, '..')

import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from collections import Counter

from pipeline.evaluator import Evaluator
from pipeline.sampler import ReceiptSampler

# Notebook-wide plotting defaults: apply the seaborn theme first, then set the
# default figure size (same order as before, so the size setting is applied last).
sns.set_theme(style='whitegrid')
plt.rcParams.update({'figure.figsize': (10, 5)})

## 1. Load Results

In [None]:
# Load the sampled receipts and index their labels by receipt id; this mapping
# is the ground truth every later cell compares judge output against.
sampler = ReceiptSampler()
sample = sampler.load()
ground_truth = dict((r['id'], r['label']) for r in sample)

# Evaluator instance used by all later cells; load_results() is called here
# before any of its analysis methods.
ev = Evaluator()
ev.load_results()

## 2. Evaluation Summary

In [None]:
# Compute the evaluation summary against the ground-truth labels and print it
# as indented JSON. `summary` is reused by the confusion-matrix cell below.
summary = ev.summary(ground_truth)
summary_json = json.dumps(summary, indent=2)
print(summary_json)

## 3. Confusion Matrix Heatmap

In [None]:
# Arrange the summary counts as a 2x2 grid: rows are ground truth, columns are
# predictions. Counts missing from the summary default to 0.
cm = summary['confusion_matrix']
cm_matrix = [
    [cm.get('TP', 0), cm.get('FN', 0)],
    [cm.get('FP', 0), cm.get('TN', 0)],
]

# The TP/FN row is labelled "GT FAKE", i.e. FAKE is treated as the positive
# class. The first column holds TP and FP (predicted positive), so it must be
# labelled "Pred FAKE"; the previous labels had the two columns swapped.
# NOTE(review): confirm Evaluator.summary() counts FAKE as the positive class.
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm_matrix, annot=True, fmt='d', cmap='Blues',
    xticklabels=['Pred FAKE', 'Pred REAL'],
    yticklabels=['GT FAKE', 'GT REAL'],
    ax=ax
)
ax.set_title('Confusion Matrix')
plt.tight_layout()

# savefig does not create missing directories, so make sure the target exists
# on a fresh checkout before writing the figure.
Path('../outputs').mkdir(parents=True, exist_ok=True)
plt.savefig('../outputs/eval_confusion_matrix.png', dpi=150)
plt.show()

## 4. Disagreement Cases

In [None]:
# Print up to five receipts the evaluator reports as disagreement cases,
# alongside the ground-truth label and each judge's verdict.
cases = ev.disagreement_cases(n=5)
for case in cases:
    receipt_id = case['receipt_id']
    # '?' marks receipts that have no entry in the ground-truth mapping.
    truth_label = ground_truth.get(receipt_id, '?')
    print(f"\n--- Receipt: {receipt_id} ---")
    print(f"  Ground truth: {truth_label}")
    print(f"  Final verdict: {case['tally']}")
    for verdict in case.get('judges', []):
        print(f"  [{verdict['judge_name']:20s}] {verdict['label']:10s} ({verdict['confidence']:.0f}%)")
        # Only the first two reasons per judge are shown to keep the output short.
        top_reasons = verdict.get('reasons', [])[:2]
        for reason in top_reasons:
            print(f"    • {reason}")

## 5. Flag Frequency Analysis

In [None]:
# Count how often each flag appears across all loaded results; results without
# an 'all_flags' entry contribute nothing.
all_flags = Counter()
for r in ev._results:
    all_flags.update(r.get('all_flags', []))

if all_flags:
    # most_common() yields (flag, count) pairs sorted by descending count.
    flag_df = pd.DataFrame(all_flags.most_common(), columns=['Flag', 'Count'])
    fig, ax = plt.subplots(figsize=(12, 5))
    sns.barplot(data=flag_df, x='Flag', y='Count', ax=ax)
    # Rotate the existing tick labels in place. Calling set_xticklabels()
    # without first fixing the tick positions triggers matplotlib's
    # "fixed number of ticks" warning.
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    ax.set_title('Flag Frequency Across All Receipts')
    plt.tight_layout()
    # savefig does not create missing directories, so make sure the target
    # exists on a fresh checkout before writing the figure.
    Path('../outputs').mkdir(parents=True, exist_ok=True)
    plt.savefig('../outputs/eval_flag_frequency.png', dpi=150)
    plt.show()
else:
    print("No flags found in results.")

## 6. Per-Judge Accuracy

In [None]:
# Tally, per judge, how many verdicts were issued and how many matched the
# ground-truth label. Receipts without a ground-truth entry are skipped.
judge_stats = {}
for result in ev._results:
    gt = ground_truth.get(result['receipt_id'])
    if gt is not None:
        for j in result.get('judges', []):
            # Create the judge's counters on first sight, then update them.
            tally = judge_stats.setdefault(j['judge_name'], {'correct': 0, 'total': 0})
            tally['total'] += 1
            if j['label'] == gt:
                tally['correct'] += 1

# Report accuracy per judge; the max(..., 1) guard avoids dividing by zero.
for name, stats in judge_stats.items():
    correct, total = stats['correct'], stats['total']
    acc = correct / max(total, 1)
    print(f"  {name:25s}  accuracy: {acc:.1%}  ({correct}/{total})")