# Evaluation of RAG Results
This notebook explores evaluation results comparing a baseline model to a RAG system.

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path

# Base figure size and the two palette colours shared by the charts below.
FIGSIZE = (10, 5)
COLOR_PRIMARY, COLOR_SECONDARY = '#792EE5', '#17C3B2'
sns.set(style='whitegrid', palette=[COLOR_PRIMARY, COLOR_SECONDARY])

# Score columns read from both result files.
metric_cols = [
    'precision-1', 'recall-1', 'ROUGE-1',
    'precision-2', 'recall-2', 'ROUGE-2',
]

# Load the RAG results first, then the baseline results; the directory is
# relative to the current working directory.
base = Path('Experiment')
rag, baseline = (
    pd.read_csv(base / filename)
    for filename in ('results_rag.csv', 'results_baseline.csv')
)

In [None]:
# Add 'paper' and 'question' columns, in place on both frames, from the first
# and second '_'-separated parts of question_id.
for df in (rag, baseline):
    id_parts = df['question_id'].str.split('_')
    df['paper'], df['question'] = id_parts.str[0], id_parts.str[1]

## General information
Summary statistics about the evaluation data.

In [None]:
# Number of RAG result rows and the distinct papers they belong to.
question_count = len(rag)
papers = rag['paper'].unique()
print('Total questions:', question_count)
print('Papers:', papers)

# Descriptive statistics of the RAG scores; kept as the cell's last expression
# so the notebook renders the table.
summary = rag.loc[:, metric_cols].describe()
summary

## Combine baseline and RAG metrics

In [None]:
# Pair every RAG row with the baseline row of the same question_id. The score
# columns exist on both sides, so they receive the _rag / _bl suffixes.
rag_side = rag[['question_id', 'paper', 'question', *metric_cols]]
baseline_side = baseline[['question_id', *metric_cols]]
merged = pd.merge(rag_side, baseline_side, on='question_id', suffixes=('_rag', '_bl'))

# A positive <metric>_diff means RAG scored higher than the baseline.
for col in metric_cols:
    rag_score, baseline_score = merged[f'{col}_rag'], merged[f'{col}_bl']
    merged[f'{col}_diff'] = rag_score - baseline_score
merged.head()

## Overall improvement heatmap
Shows improvement (RAG - Baseline) per metric and question.

In [None]:
# One heatmap row per question, one column per metric improvement.
heat_columns = [f'{name}_diff' for name in metric_cols]
heat = merged.set_index('question_id')[heat_columns]

# Figure height grows with the number of questions so rows stay readable.
heat_width, heat_height = FIGSIZE[0] * 1.2, len(merged) * 0.4
plt.figure(figsize=(heat_width, heat_height))

# Diverging colours centred on zero separate gains from losses.
diverging = sns.diverging_palette(240, 10, as_cmap=True)
sns.heatmap(heat, annot=True, cmap=diverging, center=0)
plt.title('Metric improvement per question')
plt.show()

## Comparison of all questions for a paper

In [None]:
def plot_paper_questions(df, paper):
    """Draw RAG vs. baseline score bars for each question of one paper.

    Args:
        df: Frame with ``paper`` and ``question`` columns plus a
            ``<metric>_rag`` and ``<metric>_bl`` column for every name in
            ``metric_cols``.
        paper: Value of the ``paper`` column whose rows are plotted.

    Every metric of a question feeds the same bar (one bar per source), so
    each bar shows seaborn's aggregate over the metrics rather than a single
    metric.
    """
    score_columns = [f'{m}_rag' for m in metric_cols] + [f'{m}_bl' for m in metric_cols]
    paper_rows = df.loc[df['paper'] == paper]

    # Long format: one row per (question, score column).
    long_scores = pd.melt(
        paper_rows,
        id_vars=['question'],
        value_vars=score_columns,
        var_name='metric',
        value_name='score',
    )
    # The column suffix tells which system produced the score.
    long_scores['source'] = [
        'RAG' if name.endswith('_rag') else 'Baseline'
        for name in long_scores['metric']
    ]
    # Strip the system suffix so only the metric name remains.
    without_rag = long_scores['metric'].str.replace('_rag', '')
    long_scores['metric'] = without_rag.str.replace('_bl', '')

    width, height = FIGSIZE
    plt.figure(figsize=(width * 1.3, height * 1.2))
    sns.barplot(data=long_scores, x='question', y='score', hue='source')
    plt.title(f'Metrics for all questions in {paper}')
    plt.xticks(rotation=45)
    plt.show()

plot_paper_questions(merged, 'P1')

## Comparison of a question across papers

In [None]:
def plot_question_across_papers(df, question):
    """Draw RAG vs. baseline score bars for one question, grouped by paper.

    Args:
        df: Frame with ``paper`` and ``question`` columns plus a
            ``<metric>_rag`` and ``<metric>_bl`` column for every name in
            ``metric_cols``.
        question: Value of the ``question`` column whose rows are plotted.

    Every metric of a paper feeds the same bar (one bar per source), so each
    bar shows seaborn's aggregate over the metrics rather than a single
    metric.
    """
    rag_columns = [f'{m}_rag' for m in metric_cols]
    baseline_columns = [f'{m}_bl' for m in metric_cols]
    question_rows = df.loc[df['question'] == question]

    # Long format: one row per (paper, score column).
    long_scores = pd.melt(
        question_rows,
        id_vars=['paper'],
        value_vars=rag_columns + baseline_columns,
        var_name='metric',
        value_name='score',
    )
    # The column suffix tells which system produced the score.
    long_scores['source'] = [
        'RAG' if name.endswith('_rag') else 'Baseline'
        for name in long_scores['metric']
    ]
    # Strip the system suffix so only the metric name remains.
    without_rag = long_scores['metric'].str.replace('_rag', '')
    long_scores['metric'] = without_rag.str.replace('_bl', '')

    width, height = FIGSIZE
    plt.figure(figsize=(width * 1.3, height * 1.2))
    sns.barplot(data=long_scores, x='paper', y='score', hue='source')
    plt.title(f'Metrics for question {question} across papers')
    plt.show()

plot_question_across_papers(merged, 'Q1')

## Individual question metrics

In [None]:
def plot_question_metrics(df, qid):
    """Plot the RAG and baseline score of every metric for one question.

    Args:
        df: Frame with a ``question_id`` column plus a ``<metric>_rag`` and
            ``<metric>_bl`` column for every name in ``metric_cols``.
        qid: Value of ``question_id`` to plot. If several rows match, the
            first one is used.

    Raises:
        ValueError: If no row of ``df`` has ``question_id == qid``.
    """
    rows = df[df['question_id'] == qid]
    if rows.empty:
        raise ValueError(f'No row with question_id {qid!r}')

    # Pick the row by position. The filtered row keeps its index label from
    # ``df``, so addressing the scores through a column literally named 0
    # (as a transpose would require) only exists for the row labelled 0.
    rag_scores = rows[[m + '_rag' for m in metric_cols]].iloc[0].tolist()
    baseline_scores = rows[[m + '_bl' for m in metric_cols]].iloc[0].tolist()

    # Tidy frame: one row per (metric, source) with explicitly named columns,
    # so seaborn never has to align an external index with the data.
    scores = pd.DataFrame({
        'metric': metric_cols * 2,
        'score': rag_scores + baseline_scores,
        'source': ['RAG'] * len(metric_cols) + ['Baseline'] * len(metric_cols),
    })

    plt.figure(figsize=FIGSIZE)
    sns.barplot(data=scores, x='metric', y='score', hue='source')
    plt.title(f'Metrics for {qid}')
    plt.xticks(rotation=45)
    plt.ylabel('score')
    plt.show()

# One chart per merged question.
for q in merged['question_id']:
    plot_question_metrics(merged, q)

## Distribution of improvements

In [None]:
# Names of the per-metric improvement columns (<metric>_diff) in `merged`.
diff_cols = [c + '_diff' for c in metric_cols]

# DataFrame.hist opens its own figure, sized by `figsize`, and draws one
# subplot per column. No plt.figure() call precedes it: that would only
# create a second, empty figure that plt.show() then displays.
merged[diff_cols].hist(
    bins=20,
    figsize=(FIGSIZE[0] * 1.5, FIGSIZE[1] * 1.5),
    color=COLOR_PRIMARY,
)
plt.suptitle('Histogram of metric improvements')
plt.show()

## Correlation between improvements

In [None]:
# Pairwise correlation between the per-metric improvement columns.
improvement_corr = merged[diff_cols].corr()

# Both base dimensions are enlarged by the same factor.
plt.figure(figsize=tuple(side * 1.2 for side in FIGSIZE))
corr_cmap = sns.cubehelix_palette(as_cmap=True)
sns.heatmap(improvement_corr, annot=True, cmap=corr_cmap)
plt.title('Correlation of improvements')
plt.show()