In [2]:
import tatqa_utils
import pandas as pd

In [3]:
devdf = pd.read_json('dataset_raw/tatqa_dataset_dev.json')

In [4]:
import pandas as pd
import pyreadstat
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
import utils
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
import numpy as np
from progress.bar import Bar
%load_ext autoreload
%autoreload 2

In [5]:
import os
with open('dataset_raw/openai.api.key', 'r') as filek: 
    openai_key = filek.read()
os.environ["OPENAI_API_KEY"] =  openai_key 

In [6]:
from langchain.globals import set_llm_cache
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAI
from langchain_community.cache import SQLiteCache

llm = ChatOpenAI(temperature=0)
#llm = OpenAI(temperature=0) 
set_llm_cache(SQLiteCache(database_path=".langchain.db"))

In [7]:
import re 
def remove_thousand_separators(text):
    # This regular expression matches numbers with commas as thousand separators
    return re.sub(r'(?<=\d),(?=\d{3})', '', text)
    
def transform_elements(list_of_lists, transform_func):    
    return [
        [transform_func(item) for item in sublist]
        for sublist in list_of_lists
    ]
    
def replace_bracketed_numbers(text):
    return re.sub(r"\((\d+)\)", lambda x: f"-{x.group(1)}", text)

In [31]:
def gen_code(llm, question, table):    
    prompt = f"Generate a Python code that can answer the following question using the following table! The code must contain only one function called 'run', and no wrapping class. The function must return numeric float results with accuracy to two decimal places and format (value, scale). Scale usually is 'thousand','million' or empty string, but sometimes it is percent. Do not write explanation, just code.\nQuestion: {question} \n Table: {table}"
    prompt = remove_thousand_separators(prompt)
    #prompt = replace_bracketed_numbers(prompt)
    prompt = prompt.replace('$','') 
    res = llm.invoke(prompt)
    code = res.content.replace('```python','').replace('```','')
    return (prompt, code)

def exec_code(code, table):  
    table = remove_thousand_separators(f"{table}")
    #table = replace_bracketed_numbers(table)
    table = table.replace('$','')
    #code = code.replace("* 1000000","").replace("*1000000","").replace("/ 1000000","").replace("/1000000","").replace("/ 1000","").replace("/1000","")
    #code = code.replace('abs(','(')
    try: 
        loc = locals()   
        if not "run()" in code:
            exec(code + f"\nr = run({table})\n", globals(), loc)
        else: 
            exec(code + "\nr = run()\n", globals(), loc)
        return loc['r']
    except Exception as e:
            s = '[Error]'+ str(e)
            print(s)
            return (s,'')
#table = "[['', '', 'Years Ended September 30,', ''], ['', '2019', '2018', '2017'], ['Fixed Price', '$  1,452.4', '$  1,146.2', '$  1,036.9'], ['Other', '44.1', '56.7', '70.8'], ['Total sales', '$1,496.5', '$1,202.9', '$1,107.7']]"
#question = 'What is the change in Fixed Price in 2019 from 2018?'
table = "[['', 'December 31,', ''], ['', '2019', '2018'], ['Trade accounts receivable, net, noncurrent (Note 2)', '$26,496', '$15,948'], ['Equity method investments (Note 1)', '9,254', '9,702'], ['Net deferred tax assets, noncurrent (Note 20)', '6,774', '5,797'], ['Rent and other deposits', '6,106', '5,687'], ['Value added tax receivables, net, noncurrent', '592', '519'], ['Other', '6,723', '5,711'], ['', '$55,945', '$43,364']]"
#table = remove_thousand_separators(table)
print(table)
question = 'What was the percentage change in Value added tax receivables, net, noncurrent in 2019 from 2018?'
( prompt, code) = gen_code(llm, question, table)
res = exec_code(code, table)
print(code, res)

[['', 'December 31,', ''], ['', '2019', '2018'], ['Trade accounts receivable, net, noncurrent (Note 2)', '$26,496', '$15,948'], ['Equity method investments (Note 1)', '9,254', '9,702'], ['Net deferred tax assets, noncurrent (Note 20)', '6,774', '5,797'], ['Rent and other deposits', '6,106', '5,687'], ['Value added tax receivables, net, noncurrent', '592', '519'], ['Other', '6,723', '5,711'], ['', '$55,945', '$43,364']]
def run():
    current_year = 2019
    previous_year = 2018
    current_value = 592
    previous_value = 519

    percentage_change = ((current_value - previous_value) / previous_value) * 100

    return round(percentage_change, 2), 'percent' (14.07, 'percent')


In [32]:
from tqdm.notebook import tqdm as log_progress

res = []
for i, item in log_progress(devdf.iterrows()):
    table = item['table']['table']
    #table = transform_elements(table, remove_thousand_separators)
    #table = [remove_thousand_separators(cell) for row in table for cell in row]
    #table = [cell.replace('$', '') for row in table for cell in row]
    #print (table)
    for q in item['questions']:        
        if q['answer_type'] == 'arithmetic' and 'table' in q['answer_from']:             
            ( prompt, code)  = gen_code(llm, q['question'], table)
            r = exec_code(code, table)
            err =  None
            if r is None:
                (pred_value, pred_unit) = ("", "")
            else:                
                if len(r) == 2:
                    (pred_value, pred_scale) = r    
                    if isinstance(pred_value, tuple) and len(pred_value) == 2:
                        print('$$$$')
                        (pred_value, pred_scale) = pred_value 
                    
                    if pred_unit == "%" or pred_unit == "percentage"  :
                        pred_unit = 'percent'
                    if pred_unit not in ["", 'thousand', 'million', 'billion', 'percent']:
                        print('Invalid ', pred_unit)
                        pred_unit = ""    
                    if isinstance(pred_value, str):
                        print("string")
                        if  pred_value.startswith('[Error]'):
                            (pred_value, pred_unit) = ("", "")
                            err,_ = r                                                        
                        
                elif len(r) == 1:
                    (pred_value, pred_unit) = (r, "")        
                else:
                    (pred_value, pred_unit) = ("", "")        
                
            #res.append({"table":table, "q":q, "pred":pred, "code": code})
            res.append(({"answer_type":q["answer_type"], "answer": q["answer"], 'scale': q["scale"]}, pred_value, pred_unit, q, code, prompt, item['table'], err))


0it [00:00, ?it/s]

(87.31, 'percent')
(3680.0, 'million')
(1.9000000000000001, '')


KeyboardInterrupt: 

In [35]:
res

[({'answer_type': 'arithmetic', 'answer': -12.6, 'scale': 'million'},
  -12.6,
  '',
  {'uid': 'eb787966-fa02-401f-bfaf-ccabf3828b23',
   'order': 5,
   'question': 'What is the change in Other in 2019 from 2018?',
   'answer': -12.6,
   'derivation': '44.1-56.7',
   'answer_type': 'arithmetic',
   'answer_from': 'table-text',
   'rel_paragraphs': ['2'],
   'req_comparison': False,
   'scale': 'million'},
  "def run():\n    other_2019 = float(table[3][1])\n    other_2018 = float(table[3][2])\n    \n    change = other_2019 - other_2018\n    \n    if abs(change) >= 1000000:\n        return (round(change/1000000, 2), 'million')\n    elif abs(change) >= 1000:\n        return (round(change/1000, 2), 'thousand')\n    else:\n        return (round(change, 2), '')",
  "Generate a Python code that can answer the following question using the following table! The code must contain only one function called 'run', and no wrapping class. The function must return numeric float results with accuracy to

In [33]:
from tatqa_metric import TaTQAEmAndF1

metrics = TaTQAEmAndF1()

for ans, pred, pred_scale, _,_, _,_,_ in res:
    metrics(ans, pred, pred_scale)
pred_em, pred_f1, scale_score, op_score = metrics.get_overall_metric(reset=False)
print( pred_em, pred_f1, scale_score)

0.02857142857142857 0.02857142857142857 0.11428571428571428


In [34]:
[m for m in metrics._details if m['f1']<0.2]

[{'answer_type': 'arithmetic',
  'answer': -12.6,
  'scale': 'million',
  'pred': [-12.6],
  'pred_scale': '',
  'em': 0.0,
  'f1': 0.0,
  'pred_span': None,
  'gold_span': None,
  'span_em': 0,
  'span_f1': 0},
 {'answer_type': 'arithmetic',
  'answer': -22.22,
  'scale': 'percent',
  'pred': [-22.22],
  'pred_scale': '',
  'em': 0.0,
  'f1': 0.0,
  'pred_span': None,
  'gold_span': None,
  'span_em': 0,
  'span_f1': 0},
 {'answer_type': 'arithmetic',
  'answer': -94,
  'scale': 'million',
  'pred': [-0.09],
  'pred_scale': '',
  'em': 0.0,
  'f1': 0.0,
  'pred_span': None,
  'gold_span': None,
  'span_em': 0,
  'span_f1': 0},
 {'answer_type': 'arithmetic',
  'answer': -12.14,
  'scale': 'percent',
  'pred': [-12.14],
  'pred_scale': '',
  'em': 0.0,
  'f1': 0.0,
  'pred_span': None,
  'gold_span': None,
  'span_em': 0,
  'span_f1': 0},
 {'answer_type': 'arithmetic',
  'answer': 2.1,
  'scale': 'percent',
  'pred': [2.1],
  'pred_scale': '',
  'em': 0.0,
  'f1': 0.0,
  'pred_span': No

In [15]:
metrics._details

[]

In [30]:
good = 0
for ans, pred, pred_scale, q,code, _,table,err in res:
    llimit = ans['answer']*0.99
    ulimit = ans['answer']*1.01
    
    if pred  is None or isinstance(pred, tuple) or isinstance(pred, str):
        #if ans['scale'] != pred_scale:
        #    print ('***********', q['uid'], 'pred: ',pred, 'scale:',pred_scale, ' <> ' , ans['answer'], ans['scale'])
       
        #type error
        #if isinstance(pred, str):
        #    print (pred, pred_scale, ' -- ' , ans['answer'])
        #if isinstance(pred, tuple):
        #    print (pred, pred_scale, ' == ' , ans['answer'], ans['scale'])
        #print('===============================================================================')
        print('{',pred, '}', ans['answer'], ans['scale'], err)
        continue
    
    #pred = round(pred, 2)
    
    if (pred > 0 and llimit < pred and pred < ulimit) or (pred < 0 and llimit > pred and pred > ulimit) or pred==ulimit or pred == llimit:
        good = good + 1
        if ans['scale'] == pred_scale and  pred_scale == 'thousand':
            print ('***********', q['uid'], 'pred: ',pred, 'scale:',pred_scale, ' == ' , ans['answer'], ans['scale'])
        
        #if ans['scale'] != pred_scale:
        #    print ('***********', q['uid'], 'pred: ',pred, 'scale:',pred_scale, ' <> ' , ans['answer'], ans['scale'])
        #if  ans['scale'] == 'thousand':
        #    print (q['uid'], pred, pred_scale, ' == ' , ans['answer'], ans['scale'])
          
    else:
        #value error
        #  if ans['scale'] != pred_scale:
        #    print ('***********', q['uid'], 'pred: ',pred, 'scale:',pred_scale, ' <> ' , ans['answer'], ans['scale'])
        #if ans['answer'] != pred and ans['scale'] == pred_scale and pred_scale != "":
        #    print ('***********', q['uid'], 'pred: ',pred, 'scale:',pred_scale, ' <> ' , ans['answer'], ans['scale'])
       
        None
        #print (q['uid'], pred, pred_scale, ' <> ' , ans['answer'], ans['scale'])
        #print(q['question'])
        #print(table['table'])
        #print(code)
        
    #if isinstance(pred, str):
    #    print (pred, pred_scale, ' -- ' , ans['answer'])
    #if isinstance(pred, tuple):
    #    print (pred, pred_scale, ' == ' , ans['answer'], ans['scale'])
        
print (good/len(res))

{  } -361  [Error]could not convert string to float: '$2,664'
{  } -1647 thousand [Error]could not convert string to float: '(5637)'
{ 58.71% } 58.71 percent None
{ 0.62 } 0.62 percent None
{ 3887.40 } 2988 thousand None
{ 92.27 } 92.27 percent None
{ 0.70 } 0.7000000000000001 million None
{  } 29.17 percent None
{ 83.33% } 83.33 percent None
{ 82.05% } 82.05 percent None
{  } 8500 thousand None
{  } 14.68 percent [Error]could not convert string to float: '$1,562,474'
{  } 79.05 percent [Error]could not convert string to float: '1,235,173'
0.7004405286343612


In [126]:

def parse_scale(scale):
    if scale.lower() == "thousands" or "000" in scale:
        return "thousand"
    if scale.lower() == "millions" or scale.lower() == "£m" or scale.lower() == "$m" or scale.lower() == "m":
        return "million"
    return scale.lower()



In [137]:
res_scale_para = {}
for idx, item in log_progress(devdf.iterrows()): 
    #item = devdf.iloc[2]
    table = item['table']['table']
    pars = [p['text'] for p in item['paragraphs']]
    prompt = f"The context contains some paragraphs that describe a table. Search for text that explicitly states the scale of the values in the table. Scale is usally million, thousand, billion or none, if no specific scale found. Return just the scale, no other words.\n Paragraphs: {pars}"
    scale = llm.invoke(prompt).content
    scale = parse_scale(scale)
    res_scale_para[item['table']['uid']] = scale

0it [00:00, ?it/s]

In [130]:
res_scale_table = {}
for idx, item in log_progress(devdf.iterrows()): 
    #item = devdf.iloc[2]
    table = item['table']['table']
    pars = [p['text'] for p in item['paragraphs']]
    prompt = f"The context contains a table. Search for the scale of the values in table. Scale can be million, thousand, billion or None, if no specific scale found. Return just the scale, no other words.\n Table: {table}"
    scale = llm.invoke(prompt).content
    scale = parse_scale(scale)
    res_scale_table[item['table']['uid']] = scale

0it [00:00, ?it/s]

In [155]:
res_scale = {}
for idx, item in log_progress(devdf.iterrows()): 
    #item = devdf.iloc[2]
    table = item['table']['table']
    pars = [p['text'] for p in item['paragraphs']]
    prompt = f"The context contains some paragraphs that describe a table. Search for text that explicitly states the scale of the values in the table. Scale can be abbreviated in tables. in response use scale  million, thousand, billion or none, if no scale contains exactly in the text. Do NOT try to infer it from values.  \nParagraphs: {pars} \nTable: {table}"
    scale = llm.invoke(prompt).content
    scale = parse_scale(scale)
    res_scale[item['table']['uid']] = scale

0it [00:00, ?it/s]

In [156]:
res = []
good = 0
pct = 0
cnt = 0
for idx, item in devdf.iterrows():
    for q in item['questions']:        
        if q['answer_type'] == 'arithmetic' and 'table' in q['answer_from']: 
            if not item['table']['uid'] in res_scale:
                continue
            if q['scale'] in res_scale[item['table']['uid']]:
                good = good + 1
            elif q['scale'] != 'percent':
                print(q['uid'], q['scale'], res_scale[item['table']['uid']])
            else:
                pct = pct +1
            cnt = cnt+1
print(  (cnt -good-pct)/cnt, good, pct, cnt, cnt -good-pct)

191c3926-7356-4ab8-a8f9-41e7b7c8a492 thousand million
bd2d81eb-46fc-4e62-908d-aebfccf46246 thousand million
010df393-a62e-408d-888a-045a4e435e6a thousand million
29aed76e-30b8-4d76-be8c-b7a58731646b million none
4a253bf4-b9fc-4ec3-b99b-d2744646a296 million none
1238d807-aa57-48a3-93b6-591873788625 thousand none
ef9d4839-5277-4614-bb73-f902fd8a38b6 thousand none
1e20b997-cb3b-4bc5-93e6-b45dbe945fb5 thousand none
a4308d65-606f-4563-94e1-5ff5c049051a million none
9d2bd743-f600-4e40-86dd-ebd18cbbe311 million none
7c199dce-cde4-4685-93cb-f7f8f7ee1e5a thousand none
f12806ca-aa4c-4fe6-84fc-10cefca76a9e thousand none
440e65d7-7c5c-4a28-871c-a828fa9860f3 thousand scale: million
f1d5753b-05a9-4752-b03a-493e97804fb7 thousand scale: million
a366ea30-5ba3-4f62-b027-7fd4c0b7d6af thousand scale: million
07148f97-1ced-46e4-a141-5cdffe6e6528 million none
88795fae-3c5b-48d4-ae2d-57fe04b15c31 million none
ed9e3e80-6941-4888-9e40-bbd1cb092485 million none
28743883-69b5-4cd3-a303-5ac1b2ef01af thousand scal

In [152]:
res = []
good = 0
pct = 0
cnt = 0
for idx, item in devdf.iterrows():
    for q in item['questions']:        
        if q['answer_type'] == 'arithmetic' and 'table' in q['answer_from']: 
            if q['scale'] == res_scale_para[item['table']['uid']]:
                good = good + 1
            elif q['scale'] != 'percent':
                print(q['uid'], q['scale'], res_scale_para[item['table']['uid']])
            else:
                pct = pct +1
            cnt = cnt+1
print(  (cnt -good-pct)/cnt, good, pct, cnt, cnt -good-pct)

b2786c1a-37de-4120-b03c-32bf5c81f157 million none
ba6783f3-8207-419a-b407-3f688682caef  million
263d03ec-83d2-48df-8376-1a72167798f7  million
78fc6d55-c20c-4f71-99fe-bc40a16e61d0  million
a983501d-2eec-486d-9661-e520c7c8af5e thousand million
64c902c6-f426-4432-84b3-c10b3065716f thousand million
5d9b397d-16bb-4463-8f9a-b85507704a8d million none
6100c476-160a-4f1e-bfc1-a16f4cc18b52 million billion
a81f1322-e74f-4e3c-a6cf-4b8d25d01cf5 million billion
65ec782c-691e-45df-b541-caecb85154ff million none
4f7c8e0a-6ae9-40ee-bd16-06db6f96eaf1  million
bbe335ff-414f-48b3-8126-5a2c7c505de3 million billion
ef274d2b-fbd6-4e9f-95f9-79b37827d91c million billion
191c3926-7356-4ab8-a8f9-41e7b7c8a492 thousand million
bd2d81eb-46fc-4e62-908d-aebfccf46246 thousand million
010df393-a62e-408d-888a-045a4e435e6a thousand million
fa43e8cd-7fea-4738-85ee-f61ae8529f96 million none
2067daa1-9905-456b-bcbf-42bc66b47259 million none
bfce0375-cbd0-4e25-9b82-0424058918f1 thousand million
73693527-ed4b-4d07-941e-0e6540

In [162]:
res_scale_pct = {}
for idx, item in log_progress(devdf.iterrows()): 
    #item = devdf.iloc[2]
    table = item['table']['table']
    pars = [p['text'] for p in item['paragraphs']]
    for q in item['questions']:        
        if q['answer_type'] == 'arithmetic' and 'table' in q['answer_from']: 
            prompt = f"Classify the following question. Is it about percentage calculation?  Please answer with one word, yes or no.  \nQuestion: {q['question']}"
            scale = llm.invoke(prompt).content
            res_scale_pct[q['uid']] = {'percent':scale, 'golden_scale': q['scale']}

0it [00:00, ?it/s]

In [173]:
res = []
good = 0
pct = 0
cnt = 0
for idx, item in devdf.iterrows():
    for q in item['questions']: 
        if not q['uid'] in res_scale_pct:
            continue
        if q['answer_type'] == 'arithmetic' and 'table' in q['answer_from']: 
            if( q['scale'] == 'percent' and res_scale_pct[q['uid']]['percent'] == 'Yes')or  (q['scale'] != 'percent' and res_scale_pct[q['uid']]['percent'] == 'No') :
                good = good + 1
            else:
                None
                #print(q['scale'], res_scale_pct[q['uid']] )
            cnt = cnt + 1
print(  (cnt -good-pct)/cnt, good, pct, cnt, cnt -good-pct)


0.10968660968660969 625 0 702 77


In [163]:
res_scale_pct

{'eb787966-fa02-401f-bfaf-ccabf3828b23': {'percent': 'No',
  'golden_scale': 'million'},
 '05b670d3-5b19-438c-873f-9bf6de29c69e': {'percent': 'Yes',
  'golden_scale': 'percent'},
 'b2786c1a-37de-4120-b03c-32bf5c81f157': {'percent': 'No',
  'golden_scale': 'million'},
 'fe11f001-3bfe-4089-8108-412676f0a780': {'percent': 'Yes',
  'golden_scale': 'percent'},
 '5103aed0-b4e8-4fae-bf78-e2c9f4ba84cf': {'percent': 'No',
  'golden_scale': 'percent'},
 '4dc8be43-d8d9-4b08-9ffd-9c19012361ce': {'percent': 'Yes',
  'golden_scale': 'percent'},
 '6c44a1a8-0785-43a0-90ab-7e21df2c57d9': {'percent': 'No',
  'golden_scale': 'percent'},
 'a0414f81-8dc2-44b2-a441-2c9d9c805c4d': {'percent': 'No',
  'golden_scale': 'million'},
 'bf7abd62-d9cd-48d2-8826-1457684019a3': {'percent': 'No',
  'golden_scale': 'million'},
 '4d259081-6da6-44bd-8830-e4de0031744c': {'percent': 'No',
  'golden_scale': 'million'},
 'bed1fce2-69cb-4d1e-a34a-01950a1770bd': {'percent': 'Yes',
  'golden_scale': 'percent'},
 '348d031d-73ab-4

In [134]:
res = []
good = 0
pct = 0
cnt = 0
for idx, item in devdf.iterrows():
    for q in item['questions']:        
        if q['answer_type'] == 'arithmetic' and 'table' in q['answer_from']: 
            if q['scale'] == res_scale_table[item['table']['uid']]:
                good = good + 1
            elif q['scale'] != 'percent':
                print(q['uid'],'\t',  q['scale'], res_scale_table[item['table']['uid']],'\t',  res_scale_para[item['table']['uid']])
            else:
                pct = pct +1
            cnt = cnt+1
print(  (cnt -good-pct)/cnt, good, pct, cnt, cnt -good-pct)

eb787966-fa02-401f-bfaf-ccabf3828b23 	 million none 	 million
ba6783f3-8207-419a-b407-3f688682caef 	  million 	 million
263d03ec-83d2-48df-8376-1a72167798f7 	  million 	 million
78fc6d55-c20c-4f71-99fe-bc40a16e61d0 	  million 	 million
35d602ae-9131-4291-a30c-49a40f32bbe4 	 thousand million 	 thousand
4f7c8e0a-6ae9-40ee-bd16-06db6f96eaf1 	  million 	 million
191c3926-7356-4ab8-a8f9-41e7b7c8a492 	 thousand none 	 million
bd2d81eb-46fc-4e62-908d-aebfccf46246 	 thousand none 	 million
010df393-a62e-408d-888a-045a4e435e6a 	 thousand none 	 million
096680e3-c65a-49cc-9339-55abdafd4a38 	 thousand million 	 thousand
b7ac671c-e199-4a3c-af2e-c33c6e90924a 	 thousand million 	 thousand
73693527-ed4b-4d07-941e-0e654095a43d 	  million 	 million
03de2d3b-2e3d-425f-8f59-7dfca536488e 	  million 	 1
1238d807-aa57-48a3-93b6-591873788625 	 thousand none 	 million
ef9d4839-5277-4614-bb73-f902fd8a38b6 	 thousand none 	 million
1e20b997-cb3b-4bc5-93e6-b45dbe945fb5 	 thousand million 	 percent
d2969c34-708e-

In [79]:
item = devdf.iloc[0]
table = item['table']['table']
pars = [p['text'] for p in item['paragraphs']]
question = item['questions'][4]['question']
( prompt, code) = gen_code(llm, question, table, pars)
res = exec_code(code, table)
print(code, res)

(-12.6, '')
def run():
    table = [['', '', 'Years Ended September 30,', ''], ['', '2019', '2018', '2017'], ['Fixed Price', '  1452.4', '  1146.2', '  1036.9'], ['Other', '44.1', '56.7', '70.8'], ['Total sales', '1496.5', '1202.9', '1107.7']]
    
    sales_2019 = float(table[3][1])
    sales_2018 = float(table[3][2])
    
    change = sales_2019 - sales_2018
    
    return round(change, 2), ''


# Test the function
print(run()) (-12.6, '')


In [None]:
def gen_code(llm, question, table):    
    prompt = f"Generate a Python function that can answer the following question using the following table! The code must contain only one function called 'run', and no wrapping class. The function must return numeric float results with accuracy to two decimal places and format (value, scale). Scale usually is 'thousand','million' or empty string, but sometimes it is percent. Do not write explanation, just code.\nQuestion: {question} \n Table: {table}"
    prompt = remove_thousand_separators(prompt)
    #prompt = replace_bracketed_numbers(prompt)
    prompt = prompt.replace('$','') 
    res = llm.invoke(prompt)
    code = res.content.replace('```python','').replace('```','')
    return (prompt, code)

def exec_code(code, table):  
    table = remove_thousand_separators(f"{table}")
    #table = replace_bracketed_numbers(table)
    table = table.replace('$','')
    #code = code.replace("* 1000000","").replace("*1000000","").replace("/ 1000000","").replace("/1000000","").replace("/ 1000","").replace("/1000","")
    #code = code.replace('abs(','(')
    try: 
        loc = locals()   
        if not "run()" in code:
            exec(code + f"\nr = run({table})\n", globals(), loc)
        else: 
            exec(code + "\nr = run()\n", globals(), loc)
        return loc['r']
    except Exception as e:
            s = '[Error]'+ str(e)
            print(s)
            return (s,'')
#table = "[['', '', 'Years Ended September 30,', ''], ['', '2019', '2018', '2017'], ['Fixed Price', '$  1,452.4', '$  1,146.2', '$  1,036.9'], ['Other', '44.1', '56.7', '70.8'], ['Total sales', '$1,496.5', '$1,202.9', '$1,107.7']]"
#question = 'What is the change in Fixed Price in 2019 from 2018?'
table = "[['', 'December 31,', ''], ['', '2019', '2018'], ['Trade accounts receivable, net, noncurrent (Note 2)', '$26,496', '$15,948'], ['Equity method investments (Note 1)', '9,254', '9,702'], ['Net deferred tax assets, noncurrent (Note 20)', '6,774', '5,797'], ['Rent and other deposits', '6,106', '5,687'], ['Value added tax receivables, net, noncurrent', '592', '519'], ['Other', '6,723', '5,711'], ['', '$55,945', '$43,364']]"
#table = remove_thousand_separators(table)
print(table)
question = 'What was the percentage change in Value added tax receivables, net, noncurrent in 2019 from 2018?'
( prompt, code) = gen_code(llm, question, table)
res = exec_code(code, table)
print(code, res)

In [60]:
question

'What is the change in Other in 2019 from 2018?'

In [61]:
from tqdm.notebook import tqdm as log_progress

res = []
for i, item in log_progress(devdf.iterrows()):
    table = item['table']['table']
    pars = [p['text'] for p in item['paragraphs']]
    #table = transform_elements(table, remove_thousand_separators)
    #table = [remove_thousand_separators(cell) for row in table for cell in row]
    #table = [cell.replace('$', '') for row in table for cell in row]
    #print (table)
    for q in item['questions']:        
        if q['answer_type'] == 'arithmetic' and 'table' in q['answer_from']:             
            ( prompt, code)  = gen_code(llm, q['question'], table, pars)
            r = exec_code(code, table)
            err =  None
            if r is None:
                (pred_value, pred_unit) = ("", "")
            else:                
                if len(r) == 2:
                    (pred_value, pred_unit) = r    
                    if pred_unit == "%":
                        pred_unit = 'percent'
                    if pred_unit == "€m" or pred_unit == "£m":
                        pred_unit = 'million'
                    if pred_unit not in ["", 'thousand', 'million', 'billion', 'percent']:
                        print('Invalid ', pred_unit)
                        pred_unit = ""                        
                    if isinstance(pred_value, tuple) and len(pred_value) == 2:
                         (pred_value, pred_unit) = pred_value 
                    if isinstance(pred_value, str):
                        print("string")
                        if  pred_value.startswith('[Error]'):
                            (pred_value, pred_unit) = ("", "")
                            err,_ = r                                                        
                        
                elif len(r) == 1:
                    (pred_value, pred_unit) = (r, "")        
                else:
                    (pred_value, pred_unit) = ("", "")        
                
            #res.append({"table":table, "q":q, "pred":pred, "code": code})
            res.append(({"answer_type":q["answer_type"], "answer": q["answer"], 'scale': q["scale"]}, pred_value, pred_unit, q, code, prompt, item['table'], err))


0it [00:00, ?it/s]

[Error]run() missing 1 required positional argument: 'paragraphs'
string
(3.1, 'percent')


KeyboardInterrupt: 

In [67]:
from tatqa_metric import TaTQAEmAndF1

metrics = TaTQAEmAndF1()

for ans, pred, pred_scale, _,_, _,_,_ in res:
    metrics(ans, pred, pred_scale)
pred_em, pred_f1, scale_score, op_score = metrics.get_overall_metric(reset=False)
print( pred_em, pred_f1, scale_score)

0.5384615384615384 0.5384615384615384 0.7884615384615384


In [69]:
[m for m in metrics._details if m['f1']<0.2]

[{'answer_type': 'arithmetic',
  'answer': 172,
  'scale': 'million',
  'pred': [178.67],
  'pred_scale': 'million',
  'em': 0.0,
  'f1': 0.0,
  'pred_span': None,
  'gold_span': None,
  'span_em': 0,
  'span_f1': 0},
 {'answer_type': 'arithmetic',
  'answer': 50.5,
  'scale': 'million',
  'pred': [57.0],
  'pred_scale': 'million',
  'em': 0.0,
  'f1': 0.0,
  'pred_span': None,
  'gold_span': None,
  'span_em': 0,
  'span_f1': 0},
 {'answer_type': 'arithmetic',
  'answer': 121.5,
  'scale': 'million',
  'pred': [109],
  'pred_scale': 'million',
  'em': 0.0,
  'f1': 0.0,
  'pred_span': None,
  'gold_span': None,
  'span_em': 0,
  'span_f1': 0},
 {'answer_type': 'arithmetic',
  'answer': 25.1,
  'scale': 'percent',
  'pred': [-25.1],
  'pred_scale': 'percent',
  'em': 0.0,
  'f1': 0.0,
  'pred_span': None,
  'gold_span': None,
  'span_em': 0,
  'span_f1': 0},
 {'answer_type': 'arithmetic',
  'answer': 2.93,
  'scale': '',
  'pred': [2.93],
  'pred_scale': 'thousand',
  'em': 0.0,
  'f1':

In [63]:
good = 0
for ans, pred, pred_scale, q,code, _,table,err in res:
    llimit = ans['answer']*0.99
    ulimit = ans['answer']*1.01
    
    if pred  is None or isinstance(pred, tuple) or isinstance(pred, str):
        #if ans['scale'] != pred_scale:
        #    print ('***********', q['uid'], 'pred: ',pred, 'scale:',pred_scale, ' <> ' , ans['answer'], ans['scale'])
       
        #type error
        #if isinstance(pred, str):
        #    print (pred, pred_scale, ' -- ' , ans['answer'])
        #if isinstance(pred, tuple):
        #    print (pred, pred_scale, ' == ' , ans['answer'], ans['scale'])
        #print('===============================================================================')
        print('{',pred, '}', ans['answer'], ans['scale'], err)
        continue
    
    #pred = round(pred, 2)
    
    if (pred > 0 and llimit < pred and pred < ulimit) or (pred < 0 and llimit > pred and pred > ulimit) or pred==ulimit or pred == llimit:
        good = good + 1
        if ans['scale'] == pred_scale and  pred_scale == 'thousand':
            print ('***********', q['uid'], 'pred: ',pred, 'scale:',pred_scale, ' == ' , ans['answer'], ans['scale'])
        
        #if ans['scale'] != pred_scale:
        #    print ('***********', q['uid'], 'pred: ',pred, 'scale:',pred_scale, ' <> ' , ans['answer'], ans['scale'])
        #if  ans['scale'] == 'thousand':
        #    print (q['uid'], pred, pred_scale, ' == ' , ans['answer'], ans['scale'])
          
    else:
        #value error
        #  if ans['scale'] != pred_scale:
        #    print ('***********', q['uid'], 'pred: ',pred, 'scale:',pred_scale, ' <> ' , ans['answer'], ans['scale'])
        #if ans['answer'] != pred and ans['scale'] == pred_scale and pred_scale != "":
        #    print ('***********', q['uid'], 'pred: ',pred, 'scale:',pred_scale, ' <> ' , ans['answer'], ans['scale'])
       
        None
        #print (q['uid'], pred, pred_scale, ' <> ' , ans['answer'], ans['scale'])
        #print(q['question'])
        #print(table['table'])
        #print(code)
        
    #if isinstance(pred, str):
    #    print (pred, pred_scale, ' -- ' , ans['answer'])
    #if isinstance(pred, tuple):
    #    print (pred, pred_scale, ' == ' , ans['answer'], ans['scale'])
        
print (good/len(res))

*********** 35d602ae-9131-4291-a30c-49a40f32bbe4 pred:  73.0 scale: thousand  ==  73 thousand
{  } 35796 million [Error]run() missing 1 required positional argument: 'paragraphs'
*********** 191c3926-7356-4ab8-a8f9-41e7b7c8a492 pred:  64509 scale: thousand  ==  64509 thousand
*********** 096680e3-c65a-49cc-9339-55abdafd4a38 pred:  70056 scale: thousand  ==  70056 thousand
*********** b7ac671c-e199-4a3c-af2e-c33c6e90924a pred:  67210 scale: thousand  ==  67210 thousand
0.6730769230769231


In [51]:
res


[({'answer_type': 'arithmetic', 'answer': -12.6, 'scale': 'million'},
  -12.6,
  'million',
  {'uid': 'eb787966-fa02-401f-bfaf-ccabf3828b23',
   'order': 5,
   'question': 'What is the change in Other in 2019 from 2018?',
   'answer': -12.6,
   'derivation': '44.1-56.7',
   'answer_type': 'arithmetic',
   'answer_from': 'table-text',
   'rel_paragraphs': ['2'],
   'req_comparison': False,
   'scale': 'million'},
  "def run():\n    value = -12.6\n    scale = 'million'\n    return (value, scale)",
  "Generate a Python code that can answer the following question using the following table! The code must contain only one function called 'run', and no wrapping class. The function must return numeric float results with accuracy to two decimal places and format (value, scale). Do not write explanation, just code.\nQuestion: {'uid': 'eb787966-fa02-401f-bfaf-ccabf3828b23', 'order': 5, 'question': 'What is the change in Other in 2019 from 2018?', 'answer': -12.6, 'derivation': '44.1-56.7', 'answe

In [81]:
good, len(res)

(135, 182)

[['$ million', '2019', '2018', 'Change (%)'], ['Order intake1', '532.0', '470.0', '13.2'], ['Revenue', '503.6', '476.9', '5.6'], ['Gross profit', '368.6', '344.5', '7.0'], ['Gross margin (%)', '73.2', '72.2', '1.0'], ['Adjusted operating costs2', '275.7', '267.4', '3.1'], ['Adjusted operating profit2', '92.9', '77.1', '20.5'], ['Adjusted operating margin3 (%)', '18.4', '16.2', '2.2'], ['Reported operating profit', '88.6', '57.5', '54.1'], ['Effective tax rate4 (%)', '13.0', '15.4', '(2.4)'], ['Reported profit before tax', '89.6', '61.2', '46.4'], ['Adjusted basic earnings per share5 (cents)', '13.40', '10.86', '23.4'], ['Basic earnings per share (cents)', '12.79', '9.14', '39.9'], ['Free cash flow6', '100.1', '50.9', '96.7'], ['Closing cash', '183.2', '121.6', '50.7'], ['Final dividend per share7 (cents)', '3.45', '2.73', '26.4']]
3.45 3.45


(0.72, 'million')

In [157]:
closing_cash_2019

NameError: name 'closing_cash_2019' is not defined