# Evaluation with RAGAS 

## Preliminary
1. The ragas-evaluation package is in the "./use-case-examples/rag-investment-analysis-assistant/libraries/ragas-evaluation" directory. 
2. Run `pip install -e . --quiet` in the RAGAS package directory "./use-case-examples/rag-investment-analysis-assistant/libraries/ragas-evaluation".
3. Install LlamaIndex by running `pip install llama-index==0.9.6.post1`.


## Load packages and tools

In [1]:
import ast
import os

import pandas as pd
from datasets import Dataset
from datasets import load_dataset

In [2]:
from ragas import evaluate

from ragas.metrics.answer_precision import AnswerPrecision, answer_precision
from ragas.metrics.answer_recall import AnswerRecall, answer_recall
from ragas.metrics.answer_correctness import AnswerCorrectness, answer_correctness
from ragas.metrics.answer_relevance import AnswerRelevancy, answer_relevancy
from ragas.metrics.answer_similarity import AnswerSimilarity, answer_similarity
from ragas.metrics.context_precision import (
    ContextPrecision,
    ContextRelevancy,
    context_precision,
)
from ragas.metrics.context_recall import ContextRecall, context_recall
from ragas.metrics.critique import AspectCritique
from ragas.metrics.faithfulness import Faithfulness, faithfulness

## Load and pre-process the samples for evaluation

Load the samples

In [3]:
# Folder that holds the generated results; replace the placeholder with your own path.
result_folder = "path/to/your/result/data/folder"

# os.path.join inserts the path separator when the folder has no trailing slash.
# Plain string concatenation would produce ".../foldersample_results.csv".
result_csv_file = os.path.join(result_folder, "sample_results.csv")

# The first CSV column is used as the DataFrame index.
result_df = pd.read_csv(result_csv_file, index_col=0)

Pre-process the samples. This step depends on the format of the input samples, so adapt it to your own data.

In [4]:
# Parse the serialized "llm_contexts" values back into Python objects (a list is expected).
# ast.literal_eval only accepts Python literals, so unlike eval() it cannot execute
# code embedded in the CSV. Values that are not strings are left untouched, which keeps
# this cell safe to re-run after the column has already been parsed.
result_df["llm_contexts"] = result_df["llm_contexts"].apply(
    lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw
)

# Expose the ground-truth answer under the "ground_truths" column name.
# rename() returns a new frame; it is a no-op when "answer" is no longer present.
result_df = result_df.rename(columns={"answer": "ground_truths"})

# Wrap each ground-truth answer in a list. Values that are already lists are kept as-is
# so that re-running the cell does not nest them a second time.
result_df["ground_truths"] = result_df["ground_truths"].apply(
    lambda truth: truth if isinstance(truth, list) else [truth]
)

# Replace missing LLM answers with a fixed fallback string.
result_df["llm_answer"] = result_df["llm_answer"].fillna(
    value="Unfortunately, I cannot answer this question"
)

# Convert the prepared frame into a Hugging Face Dataset for the evaluation step.
result_ds = Dataset.from_pandas(result_df)

## Run ragas evaluation

In [8]:
%%time
# NOTE: Comment out any metrics you don't want to use
metrics = [
    answer_precision,
    answer_recall,
    answer_correctness,
    answer_similarity,
    # answer_relevancy,
    # faithfulness,
    # context_precision,
    # context_recall, # currently this metric might trigger timeout error raised by bedrock: ValueError: Error raised by bedrock service: Read timeout on endpoint URL: "https://bedrock-runtime.us-east-1.amazonaws.com/model/anthropic.claude-v2/invoke"
]

column_map = {
        "question": "question",
        "contexts": "llm_contexts",
        "answer": "llm_answer",
        "ground_truths": "ground_truths",
    }


# Evaluate
eval_result = evaluate(result_ds, metrics=metrics, column_map=column_map)

evaluating with [answer_precision]


100%|██████████| 1/1 [02:18<00:00, 138.52s/it]


evaluating with [answer_recall]


100%|██████████| 1/1 [01:18<00:00, 78.60s/it]


evaluating with [answer_correctness]


100%|██████████| 1/1 [02:24<00:00, 144.41s/it]


evaluating with [answer_similarity]


100%|██████████| 1/1 [00:02<00:00,  2.34s/it]

CPU times: user 2.26 s, sys: 1.25 s, total: 3.51 s
Wall time: 6min 4s





## Save the evaluation metrics along with the input samples

In [6]:
# Add the fields from the input dataframe to the evaluation result dataframe.
eval_result_df = eval_result.to_pandas()

# Take the score column names from the metrics that were actually evaluated, so this
# cell stays in sync when metrics are commented in or out in the evaluation cell.
# (A hard-coded list raises KeyError for a disabled metric and drops a newly enabled one.)
metrics_keys = [metric.name for metric in metrics]

# Left-join the score columns onto the input samples by index label.
# NOTE(review): assumes eval_result_df carries the same index labels as result_df — confirm.
eval_result_df_new = result_df.merge(
    eval_result_df[metrics_keys],
    how="left",
    left_index=True,
    right_index=True,
)

In [7]:
# Build the output path by appending "_eval" to the input file name (extension removed).
# os.path.splitext strips whatever extension is present instead of assuming that the
# last four characters are ".csv".
result_stem, _ = os.path.splitext(result_csv_file)
eval_result_csv_file = result_stem + "_eval.csv"

# index=False: the DataFrame index is not written to the output file.
eval_result_df_new.to_csv(eval_result_csv_file, index=False)
print(f"Save evaluation results to: {eval_result_csv_file}")

Save evaluation results to: ../data/results/sample_results_eval.csv
