# Evaluation Graphics
This notebook compares evaluation metrics from `results_ground_truth.csv` and `results_rag.csv`.
The goal is to visualize how the RAG approach performs across precision, recall and ROUGE metrics for each question and each paper.

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
sns.set(style='whitegrid')

In [None]:
# Read the RAG and ground-truth result tables from the experiment folder
base = Path('Experiment')
rag = pd.read_csv(base.joinpath('results_rag.csv'))
gt = pd.read_csv(base.joinpath('results_ground_truth.csv'))

# Metric columns that the cells below select from both tables
metric_cols = [
    'precision-1', 'recall-1', 'ROUGE-1',
    'precision-2', 'recall-2', 'ROUGE-2',
]

In [None]:
# Extract document and question information
def add_doc_question(df):
    """Return a copy of ``df`` with ``document`` and ``question`` columns added.

    ``question_id`` is split on underscores: the first piece becomes
    ``document`` and the remaining pieces, re-joined with ``'_'``, become
    ``question``. An id without an underscore yields the whole id as
    ``document`` and an empty string as ``question``.

    The input frame is not modified, so re-running a cell that calls this
    function always starts from the same data.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named ``question_id``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding every column of ``df`` plus ``document`` and
        ``question``.
    """
    # Split once and reuse the pieces for both derived columns
    id_parts = df['question_id'].str.split('_')
    # assign() builds a new frame instead of writing into the caller's object
    return df.assign(
        document=id_parts.str[0],
        question=id_parts.str[1:].str.join('_'),
    )

# Tag both result tables with their document / question columns
rag, gt = add_doc_question(rag), add_doc_question(gt)

In [None]:
# Join the RAG and ground-truth scores on question_id; the overlapping metric
# columns receive the '_rag' / '_gt' suffixes
merged = pd.merge(
    rag[['question_id', 'document', 'question'] + metric_cols],
    gt[['question_id'] + metric_cols],
    on='question_id',
    suffixes=('_rag', '_gt'),
)

# One '<metric>_diff' column per metric: RAG score minus ground-truth score
for col in metric_cols:
    merged[f'{col}_diff'] = merged[f'{col}_rag'].sub(merged[f'{col}_gt'])

merged.head()

## Heatmap of Metric Improvements
Each cell shows the improvement (RAG minus Ground Truth) for a metric/question pair.

In [None]:
def plot_improvement_heatmap(df):
    """Draw a question-by-metric heatmap of the ``*_diff`` columns.

    Rows are labelled by ``question_id``; there is one column per entry of
    ``metric_cols`` (with the ``_diff`` suffix). The colour map is centred on
    zero and every cell is annotated with its value.
    """
    diff_cols = [f'{name}_diff' for name in metric_cols]
    heat_data = df.set_index('question_id')[diff_cols]

    # Figure height grows with the number of rows in df
    fig_height = len(df) * 0.4 + 1
    fig, ax = plt.subplots(figsize=(10, fig_height))
    sns.heatmap(heat_data, annot=True, cmap='RdYlGn', center=0, ax=ax)
    ax.set_title('Metric improvement (RAG - Ground Truth)')
    plt.show()


plot_improvement_heatmap(merged)

## Bar Chart by Question
Compare each metric for RAG and Ground Truth for a specific question.

In [None]:
def plot_question_metrics(df, qid):
    """Plot RAG vs. Ground Truth scores for one question as grouped bars.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``question_id`` column and, for every name in
        ``metric_cols``, a ``<name>_rag`` and a ``<name>_gt`` column.
    qid : str
        Value of ``question_id`` to plot. If several rows share the id, the
        first one is used.

    Raises
    ------
    KeyError
        If no row of ``df`` has ``question_id == qid``.
    """
    matches = df.loc[df['question_id'] == qid]
    if matches.empty:
        raise KeyError(f'question_id {qid!r} not found')

    # Take the row by position. Transposing the filtered frame would name the
    # value column after the row's index label, which is 0 only for the first
    # row of df, so a hard-coded y=0 breaks for every other question.
    row = matches.iloc[0]

    n_metrics = len(metric_cols)
    scores = pd.DataFrame({
        'metric': metric_cols * 2,
        'score': [row[f'{m}_rag'] for m in metric_cols]
                 + [row[f'{m}_gt'] for m in metric_cols],
        'source': ['RAG'] * n_metrics + ['Ground Truth'] * n_metrics,
    })

    fig, ax = plt.subplots(figsize=(8, 4))
    sns.barplot(data=scores, x='metric', y='score', hue='source', ax=ax)
    ax.set_title(f'Metrics for {qid}')
    ax.tick_params(axis='x', labelrotation=45)
    ax.set_ylabel('score')
    plt.show()


# Example: plot the first three questions. A plain loop avoids building a
# throwaway list and assigning to `_`, which IPython uses for the last output.
for q in merged['question_id'].unique()[:3]:
    plot_question_metrics(merged, q)

## Mean Metrics per Document
Scores are averaged per paper (P1-P5) and shown side by side for RAG and Ground Truth; each bar pools all six metrics.

In [None]:
def plot_document_means(df):
    """Bar chart of mean scores per document, coloured by source.

    The ``*_rag`` and ``*_gt`` columns are averaged within each ``document``
    and reshaped to long form. Bars are grouped by document and split by
    source only; the plot is not split by metric, so each bar summarises all
    metric means of one document and source.
    """
    score_cols = [f'{m}_rag' for m in metric_cols] + [f'{m}_gt' for m in metric_cols]
    per_document = df.groupby('document')[score_cols].mean().reset_index()
    long_form = per_document.melt(id_vars='document', var_name='metric', value_name='score')

    # Derive the source label from the column suffix before stripping it
    is_rag = long_form['metric'].str.endswith('_rag')
    long_form['source'] = is_rag.map({True: 'RAG', False: 'Ground Truth'})
    long_form['metric'] = long_form['metric'].str.replace('_rag', '').str.replace('_gt', '')

    fig, ax = plt.subplots(figsize=(10, 5))
    sns.barplot(data=long_form, x='document', y='score', hue='source', ax=ax)
    ax.set_title('Average metrics per document')
    plt.show()


plot_document_means(merged)

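## Boxplot of Improvements by Document
Each box shows the distribution of improvements (RAG minus Ground Truth) for one document, pooled over all metrics and questions; the dashed red line marks zero.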

In [None]:
def plot_improvement_boxplot(df):
    """Box plot of the ``*_diff`` values for each document.

    All ``*_diff`` columns are stacked into a single ``improvement`` column,
    so each box pools every metric of one document. A dashed red line is
    drawn at zero.
    """
    long_form = df.melt(
        id_vars='document',
        value_vars=[f'{m}_diff' for m in metric_cols],
        var_name='metric',
        value_name='improvement',
    )

    fig, ax = plt.subplots(figsize=(10, 5))
    sns.boxplot(data=long_form, x='document', y='improvement', ax=ax)
    ax.axhline(0, color='red', linestyle='--')
    ax.set_title('Distribution of improvements per document')
    plt.show()


plot_improvement_boxplot(merged)