# Evaluations
Runs through each stage of the RAG agent and evaluates the performance of every stage in turn.

In [1]:
import os
import sys
from pathlib import Path

# Put the parent of the current working directory on the import path
# (presumably so the `src` package imported further down resolves — confirm).
parent_dir = Path.cwd().parent
sys.path.append(str(parent_dir))

# Number of questions to evaluate.
LIMIT = 100
# Collects the headline metrics computed by the evaluation cells below.
results = {}

## Load the data and prep the dataset

In [2]:
import json

# Dataset directory, resolved relative to the current working directory.
data_dir = os.path.join('..', 'ConvFinQA/data/')

# Read the training split inside a context manager so the file handle is
# closed as soon as parsing finishes (the bare `open(...)` left it dangling).
# The encoding is pinned to UTF-8 so the read does not depend on the
# platform's default locale encoding.
with open(os.path.join(data_dir, 'train.json'), encoding='utf-8') as train_file:
    train_data = json.load(train_file)

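Not every record stores its question under a plain `qa` key — some carry several question/answer pairs under `qa_*` keys — so we collect every QA pair into one flat list and skip anything that has no QA entry.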

In [3]:
# Flatten the training records into one entry per question/answer pair.
# A record may hold its QA under 'qa' or under several 'qa_*' keys, so every
# matching key contributes its own entry (all tagged with the record's id).
qa_data = [
    {
        'id': record['id'],
        'question': qa_entry['question'],
        'answer': qa_entry['answer'],
    }
    for record in train_data
    for key, qa_entry in record.items()
    if key == 'qa' or key.startswith('qa_')
]

In [4]:
# Number of raw records, then number of flattened QA pairs, one per line.
print(len(train_data), len(qa_data), sep='\n')

3037
3965


In [5]:
# Display the first five flattened QA records as a quick sanity check.
qa_data[:5]

[{'id': 'Single_JKHY/2009/page_28.pdf-3',
  'question': 'what was the percentage change in the net cash from operating activities from 2008 to 2009',
  'answer': '14.1%'},
 {'id': 'Single_RSG/2008/page_114.pdf-2',
  'question': 'what was the percent of the growth in the revenues from 2007 to 2008',
  'answer': '1.3%'},
 {'id': 'Single_AAPL/2002/page_23.pdf-1',
  'question': 'what was the percentage change in net sales from 2000 to 2001?',
  'answer': '-32%'},
 {'id': 'Single_UPS/2009/page_33.pdf-2',
  'question': 'what was the difference in percentage cumulative return on investment for united parcel service inc . compared to the s&p 500 index for the five year period ended 12/31/09?',
  'answer': '-26.16%'},
 {'id': 'Double_UPS/2009/page_33.pdf',
  'question': 'what is the roi of an investment in ups in 2004 and sold in 2006?',
  'answer': '-8.9%'}]

In [6]:
# Only take the first LIMIT questions.
# NOTE: the slice creates a new list but shares the dict objects with
# `qa_data`, so the per-question fields that later cells write onto these
# records are also visible through `qa_data`.
limit_qa_data = qa_data[:LIMIT]

## Retrieval
First let's take a look at the retrieval stage.

For this I will look at Recall and Mean Reciprocal Rank (MRR). 

Precision is less relevant here as we always retrieve K documents and there's only 1 relevant document for each question.

In [7]:
from src.nodes import retriever_node

In [8]:
# Run the retriever for every question and score the result in place:
#   'retrieved'       -> 1 if the question's source document id was returned, else 0
#   'reciprocal_rank' -> 1 / (1-based position of that document), or 0 on a miss
for data in limit_qa_data:
    retrieved_docs = retriever_node({'question': data['question']})['retrieved_docs']
    data['retrieved_docs'] = retrieved_docs

    candidate_ids = [doc['id'] for doc in retrieved_docs]
    try:
        # list.index is 0-based; a ValueError means the document was not returned.
        position = candidate_ids.index(data['id'])
    except ValueError:
        data['retrieved'] = 0
        data['reciprocal_rank'] = 0
    else:
        data['retrieved'] = 1
        data['reciprocal_rank'] = 1.0 / (position + 1)

In [21]:
# Calculate metrics
# Questions whose source document appeared among the retrieved documents.
retrieved_data = [d for d in limit_qa_data if d['retrieved'] == 1]
num_retrieved = len(retrieved_data)
num_questions = len(limit_qa_data)

# Each average falls back to 0.0 when its denominator is zero (no questions
# evaluated, or no successful retrievals) instead of raising ZeroDivisionError.
retrieval_recall = (
    sum(data['retrieved'] for data in limit_qa_data) / num_questions
    if num_questions else 0.0
)
retrieval_mrr = (
    sum(data['reciprocal_rank'] for data in limit_qa_data) / num_questions
    if num_questions else 0.0
)
# MRR restricted to the questions where retrieval found the document.
retrieval_mrr_retrieved = (
    sum(data['reciprocal_rank'] for data in retrieved_data) / num_retrieved
    if num_retrieved else 0.0
)

print(f"Overall Retrieval Recall: {retrieval_recall:.3f}")
print(f"Overall Retrieval MRR: {retrieval_mrr:.3f}")
print(f"Successful Retrieval MRR: {retrieval_mrr_retrieved:.3f}")

Overall Retrieval Recall: 0.620
Overall Retrieval MRR: 0.268
Successful Retrieval MRR: 0.432


In [10]:
# Record the retrieval-stage metrics in the shared `results` dict.
results.update({
    'overall_retrieval_recall': retrieval_recall,
    'overall_retrieval_mrr': retrieval_mrr,
    'successful_retrieval_mrr': retrieval_mrr_retrieved,
})

## Reranking
Now let's analyse the reranking.

This will also use Recall and MRR.

In [11]:
from src.nodes import reranker_node

import time
from tqdm import tqdm

In [12]:
# Pause between reranker calls, in seconds.
# Using the free tier Cohere API you get 10 requests per minute, hence 6 s.
# Set this to 0 if you have a paid tier (no need to comment code out).
RERANK_SLEEP_SECONDS = 6

# Rerank the previously retrieved documents for every question and score the
# result in place:
#   'reranked'                 -> 1 if the source document id survived reranking, else 0
#   'reranked_reciprocal_rank' -> 1 / (1-based position after reranking), or 0 on a miss
for data in tqdm(limit_qa_data, desc="Reranking Documents"):
    input_dict = {'question': data['question'], 'retrieved_docs': data['retrieved_docs']}
    reranked_docs = reranker_node(input_dict)['reranked_docs']
    data['reranked_docs'] = reranked_docs

    reranked_docs_ids = [doc['id'] for doc in reranked_docs]
    if data['id'] in reranked_docs_ids:
        data['reranked'] = 1
        rank = reranked_docs_ids.index(data['id']) + 1  # +1 because index starts at 0
        data['reranked_reciprocal_rank'] = 1.0 / rank
    else:
        data['reranked'] = 0
        data['reranked_reciprocal_rank'] = 0

    # Throttle only when a delay is configured.
    if RERANK_SLEEP_SECONDS > 0:
        time.sleep(RERANK_SLEEP_SECONDS)

Reranking Documents: 100%|██████████| 100/100 [10:41<00:00,  6.41s/it]


In [17]:
# Calculate metrics
# Questions whose source document is still present after reranking.
reranked_data = [d for d in limit_qa_data if d['reranked'] == 1]
num_reranked = len(reranked_data)
num_questions = len(limit_qa_data)

# Each average falls back to 0.0 when its denominator is zero instead of
# raising ZeroDivisionError (empty evaluation set, nothing retrieved, or
# nothing kept by the reranker).
reranking_recall = (
    sum(data['reranked'] for data in limit_qa_data) / num_questions
    if num_questions else 0.0
)
reranking_mrr = (
    sum(data['reranked_reciprocal_rank'] for data in limit_qa_data) / num_questions
    if num_questions else 0.0
)
# Recall of the reranker measured only over questions the retriever got right;
# `retrieved_data` / `num_retrieved` come from the retrieval metrics cell.
reranked_recall_retrieved = (
    sum(data['reranked'] for data in retrieved_data) / num_retrieved
    if num_retrieved else 0.0
)
# MRR restricted to the questions where the document survived reranking.
reranking_mrr_retrieved = (
    sum(data['reranked_reciprocal_rank'] for data in reranked_data) / num_reranked
    if num_reranked else 0.0
)

print(f"Overall Reranking Recall: {reranking_recall:.3f}")
print(f"Overall Reranking MRR: {reranking_mrr:.3f}")
print(f"Successful Reranking Recall: {reranked_recall_retrieved:.3f}")
print(f"Successful Reranking MRR: {reranking_mrr_retrieved:.3f}")

Overall Reranking Recall: 0.540
Overall Reranking MRR: 0.427
Successful Reranking Recall: 0.871
Successful Reranking MRR: 0.790


In [20]:
# Persist the evaluated questions (with the fields added by the cells above)
# as indented JSON in the current working directory.
output_path = 'limit_qa_data.json'
serialized = json.dumps(limit_qa_data, indent=2)
with open(output_path, 'w') as out_file:
    out_file.write(serialized)

In [None]:
# for tomorrow:
# 1. check over previous work and make sure it makes sense
# 2. run the LLM correctness test, both with and without retrieval_mrr
# 3. Add metadata filtering and test effect on retrieval
# 4. Write report
# 5. Update readme

## LLM Correctness
In other RAG systems where the answer is expected as a paragraph or may use multiple sources, LLM faithfulness can be a good metric. In our case with ConvFinQA, the answer is expected as a single number, and therefore a more binary metric is more appropriate.

In [22]:
# List the gold answers, one per line, to see which formats
# (percentages, plain numbers, values with units) must be handled.
gold_answers = [item['answer'] for item in limit_qa_data]
for gold_answer in gold_answers:
    print(gold_answer)

14.1%
1.3%
-32%
-26.16%
-8.9%
-26.16%
70.1%
15.6%
15.7%
16%
22.99%
12
3
-19
2.4%
56.6%
2.6
11%
158.82%
11.1%
16.7%
14.7%
.43
11.2%
40.3
594840
700%
30%
-1.2%
350824 thousand
60.3
165%
67%
83.6%
13.25%
-57%
7020
-2%
377000
31.14%
-13.4%
1.79
-31.7%
18%
98.2%
$ 110774.5 million
1.5%
-489.2
6.1%
5805209
31.7%
15.16%
11.55%
32%
24.4%
-76.8%
7.61%
17%
-16.6%
-14.9%
49%
3085000
22.86%
47.4%
-11.7%
-1281
69%
1905.4
67%
45.3%
58%
5%
21%
8.1%
359.67%
-6.4%
645
89.14%
808.5%
45%
25.4%
9.9%
12950000
39.9%
-34.4%
-11.3%
-78.8%
2.3%
25%
7%
3.1%
14.2%
64.6%
2484034
3%
97.8%
66%
48%
84%
17%
