In [1]:
import os
import sys

current_dir = os.getcwd()
path_to_main = os.path.dirname(current_dir)

sys.path.append(path_to_main)
os.chdir(path_to_main)

from main import Driver

In [2]:
import json

file_path = "../train.json"
with open(file_path, "r") as file:
    data = json.load(file)


all_organizations = list(set([item['filename'].split('/')[0] for item in data]))

question_answers = {}
for organization in all_organizations:
    for item in data:
        if item['filename'].split('/')[0] == organization:

            if item.get("qa"):
                question = f"""{organization}. {item["qa"]["question"]}"""
                answer = item["qa"]["answer"]
            else:
                question = f"""{organization}. {item["qa_0"]["question"]}"""
                answer = item["qa_0"]["answer"]
            
            question_answers[question] = answer


In [3]:
## These questions and their correct answers are already present in the train.json file. 
## We can query these questions to the system and see if it is able to generate answers for them correctly or not
## Based on this we can calculate the accuracy of the system on this dataset. 

In [4]:
len(question_answers)

2683

In [5]:
# Running all of them would cost a lot, hence I am taking a random sample of 100 queries and will compare their results. 

import random

random.seed(42)
random_questions = dict(random.sample(list(question_answers.items()), 100))
random_questions


{'AAPL. what was the percentage change in rent expense under operating leases from 2011 to 2012?': '44%',
 'SLB. what is the ratio of the total costs of shares purchased from 2008 to 2009 in dollars': '3.6',
 'ETR. what are the nuclear fuel expenses as a percentage of the decrease in net revenue from 2012 to 2013?': '38.5%',
 'UNP. as of december 31 , 2013 what was the percent of the total operating non-cancelable lease terms in excess of one year due in 2015': '11.7%',
 'PNC. does a .5% ( .5 % ) decrease in expected long-term return on assets have a greater effect on pension expense than a .5% ( .5 % ) increase in compensation rate?': 'yes',
 'GS. what is the percentage change in the net unrecognized tax benefit in 2011 compare to 2010?': '18.8%',
 'MRO. by how much did the wti crude oil benchmark increase from 2009 to 2011?': '53.2%',
 'ECL. how many square feet are leased by the company?': '222000',
 'WRK. what was the percentage change in the segment income': '4.84%',
 'RE. in 2008

In [6]:
driver = Driver()

In [7]:
import math

# Assuming random_questions is a dictionary
updated_questions = {}

for question, correct_answer in random_questions.items():
    try:
        if type(correct_answer) == str:
            correct_answer_float = float(correct_answer.rstrip("%"))
        else:
            correct_answer_float = float(correct_answer)
    except:
        print("Quesion skipped due to bad data")
        continue

    # run the main flow
    result_from_system = driver.main_flow(question)

    # Store the result in a new dictionary
    updated_questions[question] = {"correct_answer": correct_answer_float,
                                   "system_answer": result_from_system['answer'],
                                   "message_from_system": result_from_system['message'] }


Quesion skipped due to bad data
Quesion skipped due to bad data
Quesion skipped due to bad data
Quesion skipped due to bad data
Quesion skipped due to bad data
Quesion skipped due to bad data


In [8]:
# updated_questions

In [9]:
import math

total_questions = 0
success_count = 0

for question in updated_questions:
    correct_answer = updated_questions[question]['correct_answer']
    system_answer = updated_questions[question]['system_answer']

    if system_answer is None:
        # This means either retrieval failure, or question was not mathematical, etc..
        # skipping these scenarios as we are independently checking if the system was able to generate correct resposnse
        # Only keeping those messages in the eval which actually indicate system failure 
        if updated_questions[question]['message_from_system'] in ("Not able to find answer in the provided context", "No steps produced", "Problem in executor"):
            total_questions+=1
    else:
        total_questions+=1
        success = math.isclose(round(correct_answer, 1) , round(system_answer, 1), abs_tol=1)
        if success:
            success_count +=1
    
    # print(system_answer, correct_answer, success)
        

In [10]:
accuracy = success_count/total_questions
accuracy

0.4090909090909091