In [2]:
# Load environment variables through python-dotenv before the rest of the
# notebook runs; the value shown under the cell is the call's return value.
# NOTE(review): no visible cell reads an environment variable — confirm this
# is still needed (e.g. by the bellem imports).
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
import json
import pandas as pd
from pathlib import Path
from copy import deepcopy
from functools import partial

from bellem.qa.ablation import answer_question_standard, answer_question_cte
from bellem.utils import set_seed, jprint
from bellem.musique.singlehop import benchmark as benchmark_single
from bellem.musique.multihop import benchmark as benchmark_multi

# Fix the seed through bellem's helper (presumably seeds the RNGs bellem
# relies on — confirm in bellem.utils). The cells visible in this notebook
# only load and aggregate saved reports.
set_seed(89)

In [4]:
# Show floats with thousands separators and three decimals in every rendered frame.
pd.set_option("display.float_format", "{:,.3f}".format)

In [5]:
# Hook tqdm into pandas (tqdm documents this as enabling `progress_apply`).
# NOTE(review): no visible cell uses a progress bar — confirm this is still needed.
from tqdm.auto import tqdm
tqdm.pandas()

In [7]:
# Collect every per-run report stored in the current directory. The paths are
# sorted because glob order is filesystem-dependent, which would otherwise make
# the row order of `report_df` vary between machines and re-runs.
report_paths = sorted(Path(".").glob("our-method-report-*.jsonl"))
if not report_paths:
    # Without this guard pd.concat([]) fails with an opaque
    # "No objects to concatenate" error.
    raise FileNotFoundError("No files matching 'our-method-report-*.jsonl' in the current directory")
dfs = [pd.read_json(report_path, orient='records', lines=True) for report_path in report_paths]
report_df = pd.concat(dfs, ignore_index=True)
report_df

Unnamed: 0,qdecomp,context,retrieval,top_k,qa,run,exact_match,f1
0,False,paragraphs,bm25,3,standard,1,0.210,0.305
1,False,paragraphs,bm25,5,standard,1,0.230,0.339
2,False,paragraphs,bm25,10,standard,1,0.260,0.383
3,False,paragraphs,semantic,3,standard,1,0.260,0.356
4,False,paragraphs,semantic,5,standard,1,0.270,0.385
...,...,...,...,...,...,...,...,...
123,True,triplets,semantic,3,standard,1,0.430,0.514
124,True,triplets,semantic,5,standard,1,0.450,0.529
125,True,triplets,semantic,10,standard,1,0.450,0.534
126,True,triplets,dummy,20,standard,1,0.440,0.530


In [5]:
# Peek at one specific run's report. It is bound to its own name on purpose:
# assigning it to `report_df` would replace the concatenation of all
# "our-method-report-*.jsonl" files that the later cells aggregate, so a
# top-to-bottom run would silently average a single run only.
single_run_report_df = pd.read_json("./our-method-report-20240814-052359.jsonl", orient='records', lines=True)
single_run_report_df.head()

Unnamed: 0,qdecomp,context,retrieval,top_k,qa,run,exact_match,f1
0,False,paragraphs,bm25,3,standard,1,0.19,0.287
1,False,paragraphs,semantic,3,standard,1,0.28,0.372
2,False,paragraphs,bm25,5,standard,1,0.25,0.358
3,False,paragraphs,semantic,5,standard,1,0.27,0.393
4,False,paragraphs,bm25,10,standard,1,0.26,0.386


In [8]:
# Results entered by hand for the graph-search retrieval configurations.
# NOTE(review): the provenance of these numbers is not recorded in the notebook.
manual_columns = ["qdecomp", "context", "retrieval", "top_k", "qa", "exact_match", "f1", "run"]
manual_rows = [
    (True, "triplets", "graph-search", 10, "standard", 0.440, 0.519, 1),
    (True, "paragraphs+triplets", "graph-search", 10, "standard", 0.510, 0.604, 1),
]
manual_df = pd.DataFrame.from_records(
    [dict(zip(manual_columns, row)) for row in manual_rows]
)
manual_df.head()

Unnamed: 0,qdecomp,context,retrieval,top_k,qa,exact_match,f1,run
0,True,triplets,graph-search,10,standard,0.44,0.519,1
1,True,paragraphs+triplets,graph-search,10,standard,0.51,0.604,1


In [9]:
# Stack the loaded reports and the hand-entered rows into one frame with a
# fresh 0..n-1 index.
report_parts = [report_df, manual_df]
all_report_df = pd.concat(report_parts, axis=0, ignore_index=True)
all_report_df

Unnamed: 0,qdecomp,context,retrieval,top_k,qa,run,exact_match,f1
0,False,paragraphs,bm25,3,standard,1,0.210,0.305
1,False,paragraphs,bm25,5,standard,1,0.230,0.339
2,False,paragraphs,bm25,10,standard,1,0.260,0.383
3,False,paragraphs,semantic,3,standard,1,0.260,0.356
4,False,paragraphs,semantic,5,standard,1,0.270,0.385
...,...,...,...,...,...,...,...,...
125,True,triplets,semantic,10,standard,1,0.450,0.534
126,True,triplets,dummy,20,standard,1,0.440,0.530
127,True,triplets,perfect,2,standard,1,0.550,0.634
128,True,triplets,graph-search,10,standard,1,0.440,0.519


In [10]:
# Switch to presentation-ready column labels for the exported tables.
# The renamed frame is rebound instead of using inplace=True, so the cell
# yields a new object rather than mutating the frame in place.
display_names = {
    'qdecomp': 'Q-decomp',
    'context': 'Context',
    'retrieval': 'Retrieval',
    'top_k': 'top-k',
    'qa': 'QA Prompting',
    'run': 'Run',
    'exact_match': "EM",
    'f1': "F1",
}
all_report_df = all_report_df.rename(columns=display_names)

In [11]:
# Average the metric columns over runs for each experimental configuration;
# 'Run' is dropped first so it is not averaged along with the metrics.
config_keys = ['Q-decomp', 'Context', 'QA Prompting', 'Retrieval', 'top-k']
metrics_per_run = all_report_df.drop(columns=['Run'])
avg_report_df = metrics_per_run.groupby(config_keys).mean()
avg_report_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,EM,F1
Q-decomp,Context,QA Prompting,Retrieval,top-k,Unnamed: 5_level_1,Unnamed: 6_level_1
False,paragraphs,cte,bm25,3,0.180,0.269
False,paragraphs,cte,bm25,5,0.235,0.318
False,paragraphs,cte,bm25,10,0.295,0.401
False,paragraphs,cte,dummy,20,0.445,0.533
False,paragraphs,cte,perfect,2,0.620,0.724
...,...,...,...,...,...,...
True,triplets,standard,graph-search,10,0.440,0.519
True,triplets,standard,perfect,2,0.550,0.634
True,triplets,standard,semantic,3,0.435,0.519
True,triplets,standard,semantic,5,0.440,0.521


In [13]:
# Rank configurations by averaged EM, ascending (highest values at the bottom).
avg_report_df.sort_values(by='EM')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,EM,F1
Q-decomp,Context,QA Prompting,Retrieval,top-k,Unnamed: 5_level_1,Unnamed: 6_level_1
False,paragraphs,cte,bm25,3,0.180,0.269
False,paragraphs+triplets,standard,semantic,5,0.190,0.326
False,paragraphs,standard,bm25,3,0.205,0.296
False,triplets,standard,bm25,3,0.210,0.338
False,paragraphs+triplets,standard,bm25,3,0.210,0.334
False,...,...,...,...,...,...
False,paragraphs,standard,perfect,2,0.575,0.672
True,paragraphs+triplets,standard,dummy,20,0.585,0.677
True,paragraphs,standard,perfect,2,0.610,0.695
True,paragraphs,cte,perfect,2,0.610,0.704


In [14]:
# Rank configurations by averaged F1, ascending (highest values at the bottom).
avg_report_df.sort_values(by='F1')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,EM,F1
Q-decomp,Context,QA Prompting,Retrieval,top-k,Unnamed: 5_level_1,Unnamed: 6_level_1
False,paragraphs,cte,bm25,3,0.180,0.269
False,paragraphs,standard,bm25,3,0.205,0.296
False,paragraphs,cte,bm25,5,0.235,0.318
False,paragraphs+triplets,standard,semantic,5,0.190,0.326
False,paragraphs+triplets,standard,bm25,3,0.210,0.334
...,...,...,...,...,...,...
True,paragraphs+triplets,standard,dummy,20,0.585,0.677
True,paragraphs+triplets,standard,perfect,2,0.570,0.685
True,paragraphs,standard,perfect,2,0.610,0.695
True,paragraphs,cte,perfect,2,0.610,0.704


In [18]:
# Export the per-configuration averages as a LaTeX table.
agg_tex_path = Path("experiment-results-agg.tex")
with agg_tex_path.open('w') as tex_file:
    tex_file.write(avg_report_df.to_latex())

In [22]:
# Export every individual run as a LaTeX table, ordered by configuration and
# then by run; the positional index is omitted from the table.
detail_sort_keys = ['Q-decomp', 'Context', 'QA Prompting', 'Retrieval', 'top-k', 'Run']
with open("experiment-results-detailed.tex", 'w') as tex_file:
    detailed_df = all_report_df.sort_values(detail_sort_keys)
    tex_file.write(detailed_df.to_latex(index=False))