In [2]:
from datasets import Dataset
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
    context_entity_recall
)
from ragas import evaluate
import openai
import pandas as pd
import numpy as np
import os
os.environ['OPENAI_API_KEY'] = 'YOUR_API_KEY'

In [3]:
def BaselineEvaluation(evaluation_dataset):
    ### Evaluation
    print('Running Evaluation...')
    # Convert to Ragas format
    evaluation_dataset = evaluation_dataset[['question', 'answer', 'contexts', 'ground_truth']] 
    evaluation_dataset['contexts'] = evaluation_dataset['contexts'].apply(lambda x: np.array(eval(x)))
    evaluation_dataset = Dataset.from_pandas(evaluation_dataset)
    result = evaluate(
        evaluation_dataset,
        metrics=[
            context_precision,
            faithfulness,
            answer_relevancy,
            context_recall,
            context_entity_recall,
        ],
    )
    print('Evaluation done!')
    print(f'Here are the results:\n{result}')
    return result.to_pandas()

In [4]:
evaluation_dataset = pd.read_csv('final_evaluation_datasets/evaluation_dataset_multi_vector.csv')
evaluation_dataset.dropna(subset=['answer'], inplace=True)

In [6]:
evaluation_dataset.drop(columns=['Unnamed: 0'], inplace=True)
evaluation_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 164 entries, 0 to 179
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   question                164 non-null    object
 1   contexts                164 non-null    object
 2   ground_truth            164 non-null    object
 3   ground_truth_chunk_ids  164 non-null    object
 4   answer                  164 non-null    object
 5   answer_chunk_ids        164 non-null    object
 6   type                    164 non-null    object
dtypes: object(7)
memory usage: 10.2+ KB


In [7]:
# Evaluation (split into 4 chunks due to rate limit)
tmp1 = evaluation_dataset.iloc[0:50, :]
tmp2 = evaluation_dataset.iloc[50:100, :]
tmp3 = evaluation_dataset.iloc[100:150, :]
tmp4 = evaluation_dataset.iloc[150:, :]
BaselineResult1 = BaselineEvaluation(tmp1) 
BaselineResult2 = BaselineEvaluation(tmp2) 
BaselineResult3 = BaselineEvaluation(tmp3) 
BaselineResult4 = BaselineEvaluation(tmp4) 
BaselineResult = pd.concat([BaselineResult1, BaselineResult2, BaselineResult3, BaselineResult4])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  evaluation_dataset['contexts'] = evaluation_dataset['contexts'].apply(lambda x: np.array(eval(x)))


Running Evaluation...


Evaluating: 100%|██████████| 250/250 [03:22<00:00,  1.23it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  evaluation_dataset['contexts'] = evaluation_dataset['contexts'].apply(lambda x: np.array(eval(x)))


Evaluation done!
Here are the results:
{'context_precision': 0.9417, 'faithfulness': 0.7881, 'answer_relevancy': 0.8659, 'context_recall': 1.0000, 'context_entity_recall': 0.2736}
Running Evaluation...


Evaluating: 100%|██████████| 250/250 [04:11<00:00,  1.01s/it]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  evaluation_dataset['contexts'] = evaluation_dataset['contexts'].apply(lambda x: np.array(eval(x)))


Evaluation done!
Here are the results:
{'context_precision': 0.9000, 'faithfulness': 0.6865, 'answer_relevancy': 0.8694, 'context_recall': 0.9393, 'context_entity_recall': 0.2597}
Running Evaluation...


Evaluating: 100%|██████████| 250/250 [05:07<00:00,  1.23s/it]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  evaluation_dataset['contexts'] = evaluation_dataset['contexts'].apply(lambda x: np.array(eval(x)))


Evaluation done!
Here are the results:
{'context_precision': 0.9180, 'faithfulness': 0.7069, 'answer_relevancy': 0.8989, 'context_recall': 0.9767, 'context_entity_recall': 0.2816}
Running Evaluation...


Evaluating: 100%|██████████| 70/70 [03:37<00:00,  3.11s/it]


Evaluation done!
Here are the results:
{'context_precision': 0.5042, 'faithfulness': 0.8442, 'answer_relevancy': 0.7380, 'context_recall': 0.8524, 'context_entity_recall': 0.1341}


In [8]:
BaselineResult.reset_index(inplace=True)
BaselineResult.drop(columns=['index'], inplace=True)
BaselineResult.to_csv('final_evaluation_datasets/multi_vector_result.csv', index = False)

In [9]:
# Results for whole dataset
description = BaselineResult.describe()
total_results = {
    'context_precision': description.loc['mean', 'context_precision'],
    'faithfulness': description.loc['mean', 'faithfulness'],
    'answer_relevancy': description.loc['mean', 'answer_relevancy'],
    'context_recall': description.loc['mean', 'context_recall'],
    'context_entity_recall': description.loc['mean', 'context_entity_recall'],
}
total_results

{'context_precision': 0.8844004064293154,
 'faithfulness': 0.7355908688982317,
 'answer_relevancy': 0.8661014935202744,
 'context_recall': 0.961788617886179,
 'context_entity_recall': 0.2598870899490058}