# RAG Evaluation

This notebook evaluates the results of our different experiments. Each experiment's results were saved to a CSV file; we load these files and compare the generated answers against the ground truths using the `ragas` evaluation framework.

## Imports

In [37]:
import os
from dotenv import load_dotenv
import pandas as pd
import ast
from ragas import EvaluationDataset, evaluate
from ragas.metrics import (
    context_precision,
    context_recall,
    faithfulness,
    answer_relevancy,
)
from tqdm import tqdm
import openai
import time

## Load CSV files

In [39]:
import ast

def process_retrieved_contexts(row):
    """Normalize one cell of the contexts column into a list.

    Parameters
    ----------
    row : Any
        A single cell value. Handled cases:
        * a list, which is returned unchanged;
        * a string holding a Python literal (typically the repr of a list),
          which is parsed with ``ast.literal_eval``;
        * a string that is not a Python literal, which is kept as one context;
        * a missing value (``None`` or float NaN), which yields no contexts;
        * anything else, which is wrapped in a single-item list.

    Returns
    -------
    list
        The contexts for this row; empty when the cell was missing.
    """
    # Missing cells carry no context. NaN is the only float that is not equal
    # to itself, so `row != row` detects it without extra imports.
    if row is None or (isinstance(row, float) and row != row):
        return []

    if isinstance(row, list):
        return row

    if isinstance(row, str):
        try:
            parsed = ast.literal_eval(row)
        except (ValueError, SyntaxError, TypeError):
            # The text is not a Python literal (e.g. a raw passage); treat the
            # whole string as a single context instead of failing the load.
            return [row]
        return parsed if isinstance(parsed, list) else [parsed]

    # Any other scalar becomes a single-item list.
    return [row]

In [40]:
# Load every experiment CSV from ../data/experiments into a dataframe whose
# columns use the names expected by the evaluation cell further down.
folder_path = "../data/experiments"

# Source column -> evaluation column. The key order also fixes the column order.
column_renames = {
    'question': 'user_input',
    'answer': 'response',
    'contexts': 'retrieved_contexts',
    'ground_truth': 'reference',
}

# One prepared dataframe per experiment, keyed by file name without extension
experiment_dataframes = {}

# os.listdir returns entries in arbitrary order; sorting keeps the experiment
# order (and therefore the printed/evaluated order) reproducible between runs.
for file_name in sorted(os.listdir(folder_path)):
    if not file_name.endswith('.csv'):
        continue

    file_path = os.path.join(folder_path, file_name)
    df = pd.read_csv(file_path)

    # Fail with the offending file name instead of a bare pandas KeyError
    missing_columns = [col for col in column_renames if col not in df.columns]
    if missing_columns:
        raise KeyError(f"{file_path} is missing required columns: {missing_columns}")

    # Keep only the needed columns and rename them
    df = df[list(column_renames)].rename(columns=column_renames)

    # Ensure every row in `retrieved_contexts` is a list
    df['retrieved_contexts'] = df['retrieved_contexts'].apply(process_retrieved_contexts)

    # Store the prepared dataframe under the experiment name
    experiment_name = os.path.splitext(file_name)[0]
    experiment_dataframes[experiment_name] = df

# Check loaded datasets
print(f"Loaded datasets for {len(experiment_dataframes)} experiments.")

for experiment_name, dataset in experiment_dataframes.items():
    print(f"Experiment: {experiment_name}, Number of samples: {len(dataset)}")

Loaded datasets for 2 experiments.
Experiment: recursive_1000_chunksize_100_overlap_ada_002_results, Number of samples: 23
Experiment: recursive_500_chunksize_50_overlap_ada_002_results, Number of samples: 23


In [47]:
from ragas.run_config import RunConfig

# Run a single worker at a time.
# NOTE(review): presumably chosen to stay under API rate limits; confirm.
EVAL_MAX_WORKERS = 1

# A 60 s per-job limit made a large share of jobs fail with TimeoutError in the
# recorded run of the evaluation cell, so allow more time per job. 180 s is the
# ragas RunConfig default (TODO confirm against the installed ragas version).
EVAL_TIMEOUT_SECONDS = 180

my_run_config = RunConfig(max_workers=EVAL_MAX_WORKERS, timeout=EVAL_TIMEOUT_SECONDS)

In [48]:
from ragas.dataset_schema import EvaluationDataset, SingleTurnSample

# RAGAS metrics computed for every experiment
metrics = [context_precision, context_recall, faithfulness, answer_relevancy]

# Evaluation output per experiment, keyed by experiment name
evaluation_results = {}

for experiment_name, eval_dataset in experiment_dataframes.items():
    print(f"Processing experiment: {experiment_name}")

    # Build one single-turn sample per dataframe row
    samples = [
        SingleTurnSample(
            user_input=row['user_input'],
            reference=row['reference'],
            response=row['response'],
            retrieved_contexts=row['retrieved_contexts'],
        )
        for _, row in eval_dataset.iterrows()
    ]

    # Score the experiment's samples and store the result under its name
    evaluation_results[experiment_name] = evaluate(
        dataset=EvaluationDataset(samples=samples),
        metrics=metrics,
        run_config=my_run_config,
    )
    print(f"Completed evaluation for {experiment_name}")

Processing experiment: recursive_1000_chunksize_100_overlap_ada_002_results


Evaluating:   0%|          | 0/92 [00:00<?, ?it/s]

Exception raised in Job[64]: TimeoutError()
Exception raised in Job[65]: TimeoutError()
Exception raised in Job[66]: TimeoutError()
Exception raised in Job[67]: TimeoutError()
Exception raised in Job[68]: TimeoutError()
Exception raised in Job[69]: TimeoutError()
Exception raised in Job[70]: TimeoutError()
Exception raised in Job[71]: TimeoutError()
Exception raised in Job[72]: TimeoutError()
Exception raised in Job[73]: TimeoutError()
Exception raised in Job[74]: TimeoutError()
Exception raised in Job[75]: TimeoutError()
Exception raised in Job[76]: TimeoutError()
Exception raised in Job[77]: TimeoutError()
Exception raised in Job[79]: TimeoutError()
Exception raised in Job[80]: TimeoutError()
Exception raised in Job[81]: TimeoutError()
Exception raised in Job[82]: TimeoutError()
Exception raised in Job[83]: TimeoutError()
Exception raised in Job[84]: TimeoutError()
Exception raised in Job[85]: TimeoutError()
Exception raised in Job[78]: TimeoutError()
Exception raised in Job[86]: Tim

KeyboardInterrupt: 

In [None]:
# Display the evaluation output collected for each experiment (a dict keyed by
# experiment name, filled by the evaluation loop).
evaluation_results