# Evaluation of RAG vs Baseline
This notebook evaluates the question-answering results stored in `results_baseline.csv` and `results_rag.csv`.
It visualises different metrics to compare the baseline system with the RAG implementation.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

sns.set(style="whitegrid")

# Per-question results of the two systems, one CSV file each.
baseline = pd.read_csv(Path('results_baseline.csv'))
rag = pd.read_csv(Path('results_rag.csv'))

# Every score column that the cells below aggregate or plot.
metrics = [
    'precision-1', 'recall-1', 'ROUGE-1',
    'precision-2', 'recall-2', 'ROUGE-2',
    'factual_correctness', 'completeness', 'relevance',
    'justification', 'depth', 'overall_score',
]

for df in (baseline, rag):
    # Force all metric columns to numbers in one pass; cells that cannot be
    # parsed become NaN instead of raising.
    df[metrics] = df[metrics].apply(pd.to_numeric, errors='coerce')
    # The paper identifier is the two characters in front of the first
    # underscore of the question id; ids that do not match yield NaN.
    df['paper'] = df['question_id'].str.extract(r'^(..)_', expand=False)


## Average metrics across all questions

In [None]:
# Mean of every metric over all questions, one column per system.
avg_base = baseline[metrics].mean()
avg_rag = rag[metrics].mean()
avg_df = pd.DataFrame({'Baseline': avg_base, 'RAG': avg_rag})

ax = avg_df.plot(kind='bar', figsize=(12,4))
ax.set_ylabel('Score')
ax.set_title('Average metrics across all questions')

# Write each bar's value on top of it.
for p in ax.patches:
    height = p.get_height()
    if pd.isna(height):
        # A metric without any numeric value has a NaN mean; there is no bar
        # to label, and annotating it would print the text "nan" at a
        # non-finite position.
        continue
    # Heuristic: values up to 1 are shown as percentages, larger values as
    # plain numbers with one decimal.
    label = f"{height:.1%}" if height <= 1 else f"{height:.1f}"
    ax.annotate(label, (p.get_x()+p.get_width()/2., height), ha='center', va='bottom', fontsize=8, rotation=0)

plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

## Pass/fail counts

In [None]:
# Count passed and failed questions per system and plot them side by side.
#
# value_counts() orders its result by frequency and leaves out outcomes that
# never occur. Each count series is therefore reindexed to a fixed
# [False, True] order with missing outcomes filled with 0. Without this the
# 'Fail'/'Pass' labels end up on the wrong columns when passes outnumber fails
# in both files, and assigning the two labels raises when one outcome is
# absent from both files.
#
# NOTE(review): astype(bool) maps NaN and every non-empty string to True;
# confirm that the 'pass' column is read from the CSV as real booleans.
outcome_order = [False, True]
pass_base = baseline['pass'].astype(bool).value_counts().reindex(outcome_order, fill_value=0)
pass_rag = rag['pass'].astype(bool).value_counts().reindex(outcome_order, fill_value=0)

# Rows are the systems, columns the outcomes (in outcome_order).
count_df = pd.DataFrame({'Baseline': pass_base, 'RAG': pass_rag}).T
count_df.columns = ['Fail','Pass']

ax = count_df.plot(kind='bar', stacked=False, figsize=(6,4))
ax.set_ylabel('Count')
ax.set_title('Number of passes and fails')
# Print the count above every bar.
for c in ax.containers:
    ax.bar_label(c, label_type='edge')
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()

## Heatmaps of precision/recall/ROUGE per question

In [None]:
heat_metrics = ['precision-1','recall-1','ROUGE-1','precision-2','recall-2','ROUGE-2']

base_heat = baseline[heat_metrics]
rag_heat = rag[heat_metrics]

# One colour scale for both panels so that equal colours mean equal values.
vmin = min(base_heat.min().min(), rag_heat.min().min())
vmax = max(base_heat.max().max(), rag_heat.max().max())

# One panel per system; each row of a panel is one row of its results table.
fig, axes = plt.subplots(1, 2, figsize=(14,8), sharey=True)
panels = (('Baseline', base_heat), ('RAG', rag_heat))
for ax, (title, data) in zip(axes, panels):
    sns.heatmap(data, ax=ax, vmin=vmin, vmax=vmax, cmap='viridis')
    ax.set_title(title)
    ax.set_xlabel('Metric')
    ax.set_ylabel('Question')

plt.tight_layout()
plt.show()

## Metrics per paper

In [None]:
# One bar chart per paper found in the baseline results, comparing the mean
# of every metric between the two systems.
for paper, b_group in baseline.groupby('paper'):
    # RAG rows of the same paper; empty when the paper is missing there, in
    # which case all RAG means are NaN.
    r_group = rag[rag['paper']==paper]
    avg_b = b_group[metrics].mean()
    avg_r = r_group[metrics].mean()
    df = pd.DataFrame({'Baseline': avg_b, 'RAG': avg_r})
    ax = df.plot(kind='bar', figsize=(12,4))
    ax.set_ylabel('Score')
    ax.set_title(f'Average metrics for paper {paper}')
    # Write each bar's value on top of it.
    for p in ax.patches:
        height = p.get_height()
        if pd.isna(height):
            # No data for this metric/system pair: skip the label instead of
            # printing the text "nan" at a non-finite position.
            continue
        # Heuristic: values up to 1 are shown as percentages, larger values
        # as plain numbers with one decimal.
        label = f"{height:.1%}" if height <= 1 else f"{height:.1f}"
        ax.annotate(label, (p.get_x()+p.get_width()/2., height), ha='center', va='bottom', fontsize=8)
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()


## Correlation between LLM-as-judge metrics and n-gram metrics (RAG results only)

In [None]:
# Scores assigned by the LLM judge.
judge_metrics = ['factual_correctness','completeness','relevance','justification','depth']
# N-gram overlap scores.
n_metrics = ['precision-1','recall-1','ROUGE-1','precision-2','recall-2','ROUGE-2']

# Pairwise correlations of all selected columns, computed on the RAG results.
corr = rag[[*judge_metrics, *n_metrics]].corr()

plt.figure(figsize=(10,8))
ax = sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm')
ax.set_title('Correlation matrix')
plt.tight_layout()
plt.show()

## Additional plots

In [None]:
# Overlay the overall-score histograms (with KDE curves) of both systems.
plt.figure(figsize=(6,4))
score_series = (
    (baseline['overall_score'], 'b', 'Baseline'),
    (rag['overall_score'], 'orange', 'RAG'),
)
for scores, colour, name in score_series:
    sns.histplot(scores, color=colour, label=name, kde=True)
plt.legend()
plt.title('Distribution of overall scores')
plt.tight_layout()
plt.show()