In [1]:
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
import json
import pandas as pd
from pathlib import Path
from copy import deepcopy
from functools import partial

from bellek.qa.ablation import answer_question_standard, answer_question_cte
from bellek.utils import set_seed, jprint
from bellek.musique.singlehop import benchmark as benchmark_single
from bellek.musique.multihop import benchmark as benchmark_multi

set_seed(89)



In [3]:
pd.options.display.float_format = '{:,.3f}'.format

In [4]:
from tqdm.auto import tqdm
tqdm.pandas()

In [6]:
report_df = pd.read_json("./our-method-report-20240726-001458.jsonl", orient='records', lines=True)
report_df.head()

Unnamed: 0,qdecomp,context,retrieval,top_k,qa,run,exact_match,f1
0,False,paragraphs,bm25,3,standard,1,0.19,0.287
1,False,paragraphs,semantic,3,standard,1,0.28,0.372
2,False,paragraphs,bm25,5,standard,1,0.25,0.358
3,False,paragraphs,semantic,5,standard,1,0.27,0.393
4,False,paragraphs,bm25,10,standard,1,0.26,0.386


In [10]:
manual_df = pd.DataFrame.from_records([
    {"qdecomp": True, "context": "triplets", "retrieval": "graph-search", "top_k": 10, "qa": "standard", "exact_match": 0.440, "f1": 0.519, "run": 1},
{"qdecomp": True, "context": "paragraphs+triplets", "retrieval": "graph-search", "top_k": 10, "qa": "standard", "exact_match": 0.510, "f1": 0.604, "run": 1}
])
manual_df.head()

Unnamed: 0,qdecomp,context,retrieval,top_k,qa,exact_match,f1,run
0,True,triplets,graph-search,10,standard,0.44,0.519,1
1,True,paragraphs+triplets,graph-search,10,standard,0.51,0.604,1


In [14]:
all_report_df = pd.concat([report_df, manual_df], axis=0).reset_index(drop=True)
all_report_df

Unnamed: 0,qdecomp,context,retrieval,top_k,qa,run,exact_match,f1
0,False,paragraphs,bm25,3,standard,1,0.190,0.287
1,False,paragraphs,semantic,3,standard,1,0.280,0.372
2,False,paragraphs,bm25,5,standard,1,0.250,0.358
3,False,paragraphs,semantic,5,standard,1,0.270,0.393
4,False,paragraphs,bm25,10,standard,1,0.260,0.386
...,...,...,...,...,...,...,...,...
85,True,triplets,semantic,9,standard,2,0.400,0.495
86,True,triplets,bm25,15,standard,2,0.430,0.526
87,True,triplets,semantic,15,standard,2,0.450,0.531
88,True,triplets,graph-search,10,standard,1,0.440,0.519


In [15]:
avg_report_df = all_report_df.drop(columns=['run']).groupby(['qdecomp', 'context', 'retrieval', 'top_k', 'qa']).mean().sort_values("exact_match")
avg_report_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,exact_match,f1
qdecomp,context,retrieval,top_k,qa,Unnamed: 5_level_1,Unnamed: 6_level_1
False,paragraphs,bm25,3,cte,0.17,0.269
False,paragraphs,bm25,3,standard,0.19,0.293
False,paragraphs+triplets,semantic,5,standard,0.19,0.326
False,paragraphs+triplets,semantic,3,standard,0.19,0.326
False,paragraphs+triplets,semantic,10,standard,0.195,0.326
False,paragraphs,bm25,5,cte,0.2,0.308
False,triplets,semantic,15,standard,0.205,0.321
False,paragraphs+triplets,bm25,10,standard,0.22,0.366
False,paragraphs+triplets,bm25,5,standard,0.22,0.371
False,paragraphs+triplets,bm25,3,standard,0.22,0.368


In [19]:
print(avg_report_df.to_latex())

\begin{tabular}{lllllrr}
\toprule
 &  &  &  &  & exact_match & f1 \\
qdecomp & context & retrieval & top_k & qa &  &  \\
\midrule
\multirow[t]{22}{*}{False} & \multirow[t]{2}{*}{paragraphs} & \multirow[t]{2}{*}{bm25} & \multirow[t]{2}{*}{3} & cte & 0.170000 & 0.268718 \\
 &  &  &  & standard & 0.190000 & 0.292683 \\
\cline{2-7} \cline{3-7} \cline{4-7}
 & \multirow[t]{3}{*}{paragraphs+triplets} & \multirow[t]{3}{*}{semantic} & 5 & standard & 0.190000 & 0.326358 \\
\cline{4-7}
 &  &  & 3 & standard & 0.190000 & 0.326358 \\
\cline{4-7}
 &  &  & 10 & standard & 0.195000 & 0.325525 \\
\cline{2-7} \cline{3-7} \cline{4-7}
 & paragraphs & bm25 & 5 & cte & 0.200000 & 0.308480 \\
\cline{2-7} \cline{3-7} \cline{4-7}
 & triplets & semantic & 15 & standard & 0.205000 & 0.321007 \\
\cline{2-7} \cline{3-7} \cline{4-7}
 & \multirow[t]{3}{*}{paragraphs+triplets} & \multirow[t]{3}{*}{bm25} & 10 & standard & 0.220000 & 0.366213 \\
\cline{4-7}
 &  &  & 5 & standard & 0.220000 & 0.370713 \\
\cline{4-7}
 & 