In [1]:
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
import json
import pandas as pd
from pathlib import Path
from copy import deepcopy
from functools import partial

from bellek.qa.ablation import answer_question_standard, answer_question_cte
from bellek.utils import set_seed, jprint
from bellek.musique.singlehop import benchmark as benchmark_single
from bellek.musique.multihop import benchmark as benchmark_multi

set_seed(89)



In [3]:
pd.options.display.float_format = '{:,.3f}'.format

In [4]:
from tqdm.auto import tqdm
tqdm.pandas()

In [5]:
report_df = pd.read_json("./our-method-report-20240726-001458.jsonl", orient='records', lines=True)
report_df.head()

Unnamed: 0,qdecomp,context,retrieval,top_k,qa,run,exact_match,f1
0,False,paragraphs,bm25,3,standard,1,0.19,0.287
1,False,paragraphs,semantic,3,standard,1,0.28,0.372
2,False,paragraphs,bm25,5,standard,1,0.25,0.358
3,False,paragraphs,semantic,5,standard,1,0.27,0.393
4,False,paragraphs,bm25,10,standard,1,0.26,0.386


In [6]:
manual_df = pd.DataFrame.from_records([
    {"qdecomp": True, "context": "triplets", "retrieval": "graph-search", "top_k": 10, "qa": "standard", "exact_match": 0.440, "f1": 0.519, "run": 1},
{"qdecomp": True, "context": "paragraphs+triplets", "retrieval": "graph-search", "top_k": 10, "qa": "standard", "exact_match": 0.510, "f1": 0.604, "run": 1}
])
manual_df.head()

Unnamed: 0,qdecomp,context,retrieval,top_k,qa,exact_match,f1,run
0,True,triplets,graph-search,10,standard,0.44,0.519,1
1,True,paragraphs+triplets,graph-search,10,standard,0.51,0.604,1


In [7]:
all_report_df = pd.concat([report_df, manual_df], axis=0).reset_index(drop=True)
all_report_df

Unnamed: 0,qdecomp,context,retrieval,top_k,qa,run,exact_match,f1
0,False,paragraphs,bm25,3,standard,1,0.190,0.287
1,False,paragraphs,semantic,3,standard,1,0.280,0.372
2,False,paragraphs,bm25,5,standard,1,0.250,0.358
3,False,paragraphs,semantic,5,standard,1,0.270,0.393
4,False,paragraphs,bm25,10,standard,1,0.260,0.386
...,...,...,...,...,...,...,...,...
85,True,triplets,semantic,9,standard,2,0.400,0.495
86,True,triplets,bm25,15,standard,2,0.430,0.526
87,True,triplets,semantic,15,standard,2,0.450,0.531
88,True,triplets,graph-search,10,standard,1,0.440,0.519


In [8]:
all_report_df.rename(
    columns={
        'qdecomp': 'Q-decomp',
        'context': 'Context',
        'retrieval': 'Retrieval',
        'top_k': 'top-k',
        'qa': 'QA Prompting',
        'run': 'Run',
        'exact_match': "EM",
        'f1': "F1",
    }, 
    inplace=True,
)

In [9]:
avg_report_df = all_report_df.drop(columns=['Run']).groupby(['Q-decomp', 'Context', 'QA Prompting', 'Retrieval', 'top-k']).mean()
avg_report_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,EM,F1
Q-decomp,Context,QA Prompting,Retrieval,top-k,Unnamed: 5_level_1,Unnamed: 6_level_1
False,paragraphs,cte,bm25,3,0.17,0.269
False,paragraphs,cte,bm25,5,0.2,0.308
False,paragraphs,cte,bm25,10,0.28,0.388
False,paragraphs,cte,semantic,3,0.27,0.378
False,paragraphs,cte,semantic,5,0.275,0.373
False,paragraphs,cte,semantic,10,0.31,0.403
False,paragraphs,standard,bm25,3,0.19,0.293
False,paragraphs,standard,bm25,5,0.25,0.355
False,paragraphs,standard,bm25,10,0.265,0.391
False,paragraphs,standard,semantic,3,0.29,0.38


In [12]:
print(avg_report_df.to_latex())

\begin{tabular}{lllllrr}
\toprule
 &  &  &  &  & EM & F1 \\
Q-decomp & Context & QA Prompting & Retrieval & top-k &  &  \\
\midrule
\multirow[t]{22}{*}{False} & \multirow[t]{12}{*}{paragraphs} & \multirow[t]{6}{*}{cte} & \multirow[t]{3}{*}{bm25} & 3 & 0.170000 & 0.268718 \\
 &  &  &  & 5 & 0.200000 & 0.308480 \\
 &  &  &  & 10 & 0.280000 & 0.388278 \\
\cline{4-7}
 &  &  & \multirow[t]{3}{*}{semantic} & 3 & 0.270000 & 0.378173 \\
 &  &  &  & 5 & 0.275000 & 0.373259 \\
 &  &  &  & 10 & 0.310000 & 0.402842 \\
\cline{3-7} \cline{4-7}
 &  & \multirow[t]{6}{*}{standard} & \multirow[t]{3}{*}{bm25} & 3 & 0.190000 & 0.292683 \\
 &  &  &  & 5 & 0.250000 & 0.355292 \\
 &  &  &  & 10 & 0.265000 & 0.391366 \\
\cline{4-7}
 &  &  & \multirow[t]{3}{*}{semantic} & 3 & 0.290000 & 0.379906 \\
 &  &  &  & 5 & 0.275000 & 0.391969 \\
 &  &  &  & 10 & 0.285000 & 0.419140 \\
\cline{2-7} \cline{3-7} \cline{4-7}
 & \multirow[t]{6}{*}{paragraphs+triplets} & \multirow[t]{6}{*}{standard} & \multirow[t]{3}{*}{bm25}

In [13]:
avg_report_df.sort_values('EM')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,EM,F1
Q-decomp,Context,QA Prompting,Retrieval,top-k,Unnamed: 5_level_1,Unnamed: 6_level_1
False,paragraphs,cte,bm25,3,0.17,0.269
False,paragraphs+triplets,standard,semantic,5,0.19,0.326
False,paragraphs+triplets,standard,semantic,3,0.19,0.326
False,paragraphs,standard,bm25,3,0.19,0.293
False,paragraphs+triplets,standard,semantic,10,0.195,0.326
False,paragraphs,cte,bm25,5,0.2,0.308
False,triplets,standard,semantic,15,0.205,0.321
False,paragraphs+triplets,standard,bm25,10,0.22,0.366
False,paragraphs+triplets,standard,bm25,5,0.22,0.371
False,paragraphs+triplets,standard,bm25,3,0.22,0.368


In [14]:
avg_report_df.sort_values('F1')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,EM,F1
Q-decomp,Context,QA Prompting,Retrieval,top-k,Unnamed: 5_level_1,Unnamed: 6_level_1
False,paragraphs,cte,bm25,3,0.17,0.269
False,paragraphs,standard,bm25,3,0.19,0.293
False,paragraphs,cte,bm25,5,0.2,0.308
False,triplets,standard,semantic,15,0.205,0.321
False,paragraphs+triplets,standard,semantic,10,0.195,0.326
False,paragraphs+triplets,standard,semantic,5,0.19,0.326
False,paragraphs+triplets,standard,semantic,3,0.19,0.326
False,triplets,standard,bm25,25,0.23,0.345
False,triplets,standard,bm25,15,0.225,0.351
False,paragraphs,standard,bm25,5,0.25,0.355


In [18]:
with open("experiment-results-agg.tex", 'w') as f:
    f.write(avg_report_df.to_latex())

In [22]:
with open("experiment-results-detailed.tex", 'w') as f:
    f.write(all_report_df.sort_values(['Q-decomp', 'Context', 'QA Prompting', 'Retrieval', 'top-k', 'Run']).to_latex(index=False))