# 04 — Publication Plots & Results Tables

This notebook generates the key figures and tables for the CertiRAG paper.
All plots use synthetic/demo data — replace with real eval results for submission.

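## Figures and tables:
1. **Main comparison table** — CertiRAG vs baselines on 3 benchmarks
2. **Ablation table** — Component-level contribution analysis
3. **Threshold sensitivity** — F1 over the (τ_e, τ_c) grid, rendered as a text heatmap
4. **MSE compression** — Evidence reduction analysis
5. **Verifier comparison** — F1, faithfulness, and calibration error (ECE) per verifier
6. **LaTeX export** — Copy-paste-ready source for the main comparison table

Note: the before-vs-after calibration reliability plot is not generated in this notebook.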

In [None]:
# Notebook setup: extend the import path, load dependencies, and fix seeds.
import sys, os
# Prepend the parent of the current working directory to the import path.
# NOTE(review): presumably this is the repository root, so that `certirag`
# resolves when the kernel is started from the notebook folder — confirm.
sys.path.insert(0, os.path.abspath(".."))

import numpy as np
from certirag.utils import set_all_seeds

# NOTE(review): presumably seeds the global RNGs with 42 — confirm in
# certirag.utils. The simulation cells below create their own
# np.random.default_rng(42) generators for their random draws.
set_all_seeds(42)

## 1. Main Results Table

Comparison of CertiRAG against baselines across three benchmarks.
All numbers are synthetic placeholders — replace with actual eval results.

In [None]:
# Synthetic results — structure matches paper format
BENCHMARKS = ["ALCE", "RAGTruth", "AggreFact"]
METHODS = [
    "Vanilla RAG",
    "Self-RAG",
    "CRAG",
    "FActScore",
    "CertiRAG (ours)",
]

# Format: {method: {benchmark: {metric: value}}}
RESULTS = {
    "Vanilla RAG":     {"ALCE": {"P": 0.72, "R": 0.81, "F1": 0.76, "Faith": 0.68},
                        "RAGTruth": {"P": 0.69, "R": 0.78, "F1": 0.73, "Faith": 0.65},
                        "AggreFact": {"P": 0.71, "R": 0.80, "F1": 0.75, "Faith": 0.67}},
    "Self-RAG":        {"ALCE": {"P": 0.78, "R": 0.76, "F1": 0.77, "Faith": 0.74},
                        "RAGTruth": {"P": 0.76, "R": 0.73, "F1": 0.74, "Faith": 0.72},
                        "AggreFact": {"P": 0.77, "R": 0.75, "F1": 0.76, "Faith": 0.73}},
    "CRAG":            {"ALCE": {"P": 0.80, "R": 0.74, "F1": 0.77, "Faith": 0.76},
                        "RAGTruth": {"P": 0.78, "R": 0.71, "F1": 0.74, "Faith": 0.74},
                        "AggreFact": {"P": 0.79, "R": 0.73, "F1": 0.76, "Faith": 0.75}},
    "FActScore":       {"ALCE": {"P": 0.82, "R": 0.70, "F1": 0.76, "Faith": 0.79},
                        "RAGTruth": {"P": 0.80, "R": 0.68, "F1": 0.74, "Faith": 0.77},
                        "AggreFact": {"P": 0.81, "R": 0.69, "F1": 0.75, "Faith": 0.78}},
    "CertiRAG (ours)": {"ALCE": {"P": 0.91, "R": 0.82, "F1": 0.86, "Faith": 0.93},
                        "RAGTruth": {"P": 0.89, "R": 0.80, "F1": 0.84, "Faith": 0.91},
                        "AggreFact": {"P": 0.90, "R": 0.81, "F1": 0.85, "Faith": 0.92}},
}

# (result key, column label) pairs, in display order.
MAIN_TABLE_COLUMNS = [("P", "P"), ("R", "R"), ("F1", "F1"), ("Faith", "Fth")]
# Method whose scores are wrapped in ** markers.
HIGHLIGHT_METHOD = "CertiRAG (ours)"
# Wide enough for a highlighted cell such as "**0.91**", so header and data
# cells share one width and the columns line up.
MAIN_TABLE_CELL_WIDTH = 8


def format_main_table(results, methods, benchmarks):
    """Render the main results as aligned plain-text lines.

    Args:
        results: Nested mapping {method: {benchmark: {metric: value}}} holding
            the metric keys listed in MAIN_TABLE_COLUMNS.
        methods: Method names, in row order.
        benchmarks: Benchmark names, in column-group order.

    Returns:
        A list of strings: the header, a rule of the same length as the
        header, then one row per method. Every line has the same length.
    """
    width = MAIN_TABLE_CELL_WIDTH

    header = f"{'Method':<20}"
    for _ in benchmarks:
        header += " | " + " ".join(f"{label:>{width}}" for _, label in MAIN_TABLE_COLUMNS)
    table_lines = [header, "=" * len(header)]

    for method in methods:
        highlighted = method == HIGHLIGHT_METHOD
        row = f"{method:<20}"
        for bench in benchmarks:
            scores = results[method][bench]
            cells = []
            for key, _ in MAIN_TABLE_COLUMNS:
                text = f"{scores[key]:.2f}"
                if highlighted:
                    text = f"**{text}**"
                cells.append(f"{text:>{width}}")
            row += " | " + " ".join(cells)
        table_lines.append(row)
    return table_lines


# Print a plain-text table (our method's scores are marked with **)
print("Table 1: Main results across three benchmarks (synthetic data)\n")
for table_line in format_main_table(RESULTS, METHODS, BENCHMARKS):
    print(table_line)

## 2. Ablation Study Table

In [None]:
# Synthetic ablation metrics: {configuration: {metric: value}}
ABLATIONS = {
    "Full CertiRAG":        {"F1": 0.86, "Faith": 0.93, "MSE_ratio": 0.34, "Latency_ms": 420},
    "− MSE selection":      {"F1": 0.85, "Faith": 0.91, "MSE_ratio": 1.00, "Latency_ms": 380},
    "− Claim normalisation":{"F1": 0.82, "Faith": 0.89, "MSE_ratio": 0.38, "Latency_ms": 400},
    "− Cross-encoder":      {"F1": 0.83, "Faith": 0.90, "MSE_ratio": 0.36, "Latency_ms": 310},
    "BM25-only retrieval":  {"F1": 0.79, "Faith": 0.87, "MSE_ratio": 0.41, "Latency_ms": 280},
    "Dense-only retrieval": {"F1": 0.81, "Faith": 0.88, "MSE_ratio": 0.39, "Latency_ms": 350},
    "NLI verifier":         {"F1": 0.80, "Faith": 0.86, "MSE_ratio": 0.37, "Latency_ms": 390},
    "LLM-judge verifier":   {"F1": 0.84, "Faith": 0.90, "MSE_ratio": 0.35, "Latency_ms": 1200},
}


def format_ablation_row(config_name, metrics):
    """Format one ablation row; the full system is flagged with a trailing star.

    Latency is rendered with ``.0f`` rather than ``d`` so that measured
    (float) latencies do not raise ValueError; integer inputs print the same.
    """
    marker = " ★" if config_name == "Full CertiRAG" else ""
    return (f"{config_name:<25} {metrics['F1']:>6.2f} {metrics['Faith']:>7.2f} "
            f"{metrics['MSE_ratio']:>6.2f} {metrics['Latency_ms']:>8.0f}{marker}")


print("Table 2: Ablation study on ALCE benchmark (synthetic data)\n")
ablation_header = f"{'Configuration':<25} {'F1':>6} {'Faith':>7} {'MSE↓':>6} {'Lat(ms)':>8}"
print(ablation_header)
# Derive the rule from the header so the two cannot drift apart.
print("=" * len(ablation_header))

for config_name, metrics in ABLATIONS.items():
    print(format_ablation_row(config_name, metrics))

## 3. Threshold sensitivity heatmap (text-based)

In [None]:
# Simulate F1 scores for threshold grid
tau_e_vals = np.arange(0.50, 0.96, 0.05)
tau_c_vals = np.arange(0.40, 0.86, 0.05)

rng = np.random.default_rng(42)


# Generate plausible F1 surface: peaks around τ_e≈0.85, τ_c≈0.70
def f1_surface(te, tc, rng=rng):
    """Return a synthetic F1 value for one (τ_e, τ_c) pair.

    Args:
        te: Value of τ_e.
        tc: Value of τ_c.
        rng: NumPy random generator supplying the noise draw. The default is
            bound when this function is defined (the generator created just
            above), so rebinding the module-level ``rng`` in a later cell
            does not change which generator this function uses.

    Returns:
        0.86 minus quadratic penalties around τ_e=0.85 and τ_c=0.70, minus an
        interaction penalty on |τ_e − τ_c − 0.15|, plus N(0, 0.005) noise,
        clipped to [0.50, 0.90]. One draw is consumed from ``rng`` per call.
    """
    base = 0.86
    # Penalty for deviation from optimal
    penalty_e = -2.0 * (te - 0.85) ** 2
    penalty_c = -1.5 * (tc - 0.70) ** 2
    # Interaction term
    interaction = -0.5 * abs(te - tc - 0.15)
    noise = rng.normal(0, 0.005)
    return np.clip(base + penalty_e + penalty_c + interaction + noise, 0.50, 0.90)


def heat_symbol(f1):
    """Map an F1 value to the unicode block used in the text heatmap."""
    if f1 >= 0.84:
        return "█"
    if f1 >= 0.80:
        return "▓"
    if f1 >= 0.75:
        return "▒"
    return "░"


print("Figure 3: F1 vs (τ_e, τ_c) threshold grid (synthetic)\n")
column_header = f"{'':>8}" + "".join(f" τ_c={tc:.2f}" for tc in tau_c_vals)
print(column_header)
# Rule length follows the header (each column is 9 characters wide).
print("-" * len(column_header))

for te in tau_e_vals:
    row = f"τ_e={te:.2f} "
    # Cells are evaluated left to right, one noise draw each, in grid order.
    for tc in tau_c_vals:
        f1 = f1_surface(te, tc)
        row += f"  {heat_symbol(f1)}{f1:.3f} "
    print(row)

print("\nLegend: █≥0.84  ▓≥0.80  ▒≥0.75  ░<0.75")
# NOTE: this region is stated, not derived from the grid above; revisit it
# when real evaluation results replace the synthetic surface.
print("Optimal region: τ_e∈[0.80,0.90], τ_c∈[0.65,0.75]")

## 4. MSE compression analysis

In [None]:
# Simulate MSE compression ratios per claim
N_CLAIMS = 50
rng = np.random.default_rng(42)

# Evidence items per claim: integers in [3, 14] (upper bound is exclusive).
total_evidence = rng.integers(3, 15, size=N_CLAIMS)
# Keep a Beta(2, 5)-distributed fraction of the evidence, but at least one item.
mse_selected = np.array([max(1, int(t * rng.beta(2, 5))) for t in total_evidence])
compression = mse_selected / total_evidence

print("Figure 4: MSE Evidence Compression\n")
print(f"Claims analysed: {N_CLAIMS}")
print(f"Mean total evidence per claim:  {total_evidence.mean():.1f}")
print(f"Mean MSE selected per claim:    {mse_selected.mean():.1f}")
print(f"Mean compression ratio:         {compression.mean():.3f}")
print(f"Median compression ratio:       {np.median(compression):.3f}")

# Histogram (text-based)
print("\nCompression ratio distribution:")
# Build the edges as k/10 rather than with a float-step arange: accumulating
# 0.1 yields 0.30000000000000004, which would put an exact ratio of 3/10 into
# the [0.2-0.3) bin. Integer division gives edges that compare equal to the
# ratios computed above.
bins = np.arange(11) / 10
hist, _ = np.histogram(compression, bins=bins)
max_count = hist.max()
for lower, upper, count in zip(bins[:-1], bins[1:], hist):
    bar = "█" * int(count / max_count * 40) if max_count > 0 else ""
    print(f"  [{lower:.1f}-{upper:.1f}) {bar} {count}")

print(f"\n→ MSE reduces evidence by {(1-compression.mean())*100:.0f}% on average")
print("  while maintaining verification guarantees (Theorem 1).")

## 5. Verifier comparison across benchmarks

In [None]:
# Synthetic verifier metrics: {verifier: {benchmark: {metric: value}}}
VERIFIER_RESULTS = {
    "MiniCheck": {
        "ALCE": {"F1": 0.86, "Faith": 0.93, "ECE": 0.032},
        "RAGTruth": {"F1": 0.84, "Faith": 0.91, "ECE": 0.035},
        "AggreFact": {"F1": 0.85, "Faith": 0.92, "ECE": 0.029},
    },
    "NLI-DeBERTa": {
        "ALCE": {"F1": 0.80, "Faith": 0.86, "ECE": 0.058},
        "RAGTruth": {"F1": 0.78, "Faith": 0.84, "ECE": 0.062},
        "AggreFact": {"F1": 0.79, "Faith": 0.85, "ECE": 0.055},
    },
    "LLM-Judge": {
        "ALCE": {"F1": 0.84, "Faith": 0.90, "ECE": 0.041},
        "RAGTruth": {"F1": 0.82, "Faith": 0.88, "ECE": 0.045},
        "AggreFact": {"F1": 0.83, "Faith": 0.89, "ECE": 0.038},
    },
}


def verifier_rank_key(metrics):
    """Sort key for picking the best verifier: F1, then Faith, then lower ECE."""
    return (metrics["F1"], metrics["Faith"], -metrics["ECE"])


print("Table 3: Verifier comparison (synthetic data)\n")
for bench in BENCHMARKS:
    # Derive the starred row from the data instead of hard-coding a name, so
    # the marker stays correct once real results replace the placeholders.
    best_verifier = max(
        VERIFIER_RESULTS,
        key=lambda name: verifier_rank_key(VERIFIER_RESULTS[name][bench]),
    )
    print(f"\n--- {bench} ---")
    print(f"{'Verifier':<15} {'F1':>6} {'Faith':>7} {'ECE':>6}")
    print("-" * 36)
    for verifier, results in VERIFIER_RESULTS.items():
        r = results[bench]
        marker = " ★" if verifier == best_verifier else ""
        print(f"{verifier:<15} {r['F1']:>6.2f} {r['Faith']:>7.2f} {r['ECE']:>6.3f}{marker}")

print("\n★ = Best performer")
# NOTE: the summary below is fixed narrative text for the synthetic data;
# re-check it against the starred rows after substituting real results.
print("\nKey insight: MiniCheck achieves highest F1 and faithfulness with")
print("lowest calibration error, validating it as the default verifier.")

## 6. LaTeX table export

Generate copy-paste-ready LaTeX for the paper.

In [None]:
def to_latex_main_table(results=None, methods=None, benchmarks=None):
    """Generate LaTeX for the main results table.

    The output uses rule commands from ``booktabs``, the ``multirow`` command
    and ``resizebox`` (``graphicx``), so the paper preamble must load those
    packages. Method and benchmark names are inserted verbatim (no LaTeX
    escaping).

    Args:
        results: Nested mapping {method: {benchmark: {metric: value}}} with
            metrics "P", "R", "F1" and "Faith". Defaults to the notebook's
            RESULTS.
        methods: Method names in row order. Defaults to METHODS.
        benchmarks: Benchmark names in column-group order. Defaults to
            BENCHMARKS.

    Returns:
        The table source as a single newline-joined string.
    """
    results = RESULTS if results is None else results
    methods = METHODS if methods is None else methods
    benchmarks = BENCHMARKS if benchmarks is None else benchmarks

    metric_keys = ["P", "R", "F1", "Faith"]
    group_size = len(metric_keys)

    lines = [
        r"\begin{table}[t]",
        r"\centering",
        r"\caption{Main results on three hallucination benchmarks.}",
        r"\label{tab:main-results}",
        r"\resizebox{\textwidth}{!}{%",
        r"\begin{tabular}{l" + "c" * group_size * len(benchmarks) + "}",
        r"\toprule",
    ]

    # Header rows
    header1 = r"\multirow{2}{*}{Method}"
    for bench in benchmarks:
        header1 += f" & \\multicolumn{{{group_size}}}{{c}}{{{bench}}}"
    header1 += r" \\"
    lines.append(header1)

    # One partial rule per benchmark group. Column 1 is the method name, so
    # group i spans columns 2+4i .. 5+4i (2-5, 6-9, 10-13, ...).
    lines.append("".join(
        f"\\cmidrule(lr){{{2 + group_size * i}-{1 + group_size * (i + 1)}}}"
        for i in range(len(benchmarks))
    ))

    header2 = r" & P & R & F1 & Faith" * len(benchmarks) + r" \\"
    lines.append(header2)
    lines.append(r"\midrule")

    # Data rows
    for method in methods:
        is_ours = method == "CertiRAG (ours)"
        # For our method, the parenthesised suffix is set in bold.
        row = method.replace("(", "\\textbf{(").replace(")", ")}") if is_ours else method
        for bench in benchmarks:
            scores = results[method][bench]
            for metric in metric_keys:
                val = f"{scores[metric]:.2f}"
                if is_ours:
                    val = f"\\textbf{{{val}}}"
                row += f" & {val}"
        row += r" \\"
        lines.append(row)

    lines.extend([
        r"\bottomrule",
        # Closes both the tabular environment and the \resizebox group.
        r"\end{tabular}}",
        r"\end{table}",
    ])
    return "\n".join(lines)

# Build the table source once, then echo it under a short banner
# (banner, blank line, table).
latex = to_latex_main_table()
print("LaTeX output (copy to paper):\n", latex, sep="\n")

---

## Notes for Paper Submission

1. Replace all synthetic data with actual eval runner output
2. Run `make eval` to generate real metrics on all benchmarks
3. Use `eval/plots.py` for matplotlib/seaborn publication-quality figures
4. Ensure all numbers match between paper text, tables, and figures
5. Include confidence intervals (bootstrap) for final submission