In [1]:
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment (python-dotenv).
load_dotenv()

True

In [2]:
import json
import pandas as pd
from pathlib import Path
from copy import deepcopy
from functools import partial

from bellek.utils import set_seed, jprint

SEED = 89  # seed value handed to bellek's set_seed helper
set_seed(SEED)

In [3]:
# Render floats in displayed frames with thousands separators and three decimals.
pd.set_option('display.float_format', '{:,.3f}'.format)

In [4]:
from tqdm.auto import tqdm
# Register tqdm's pandas integration (adds the progress_* methods such as progress_apply).
tqdm.pandas()

In [5]:
def load_result_dataframe(pattern: str = "our-method-report-*.jsonl") -> pd.DataFrame:
    """Load every JSON-lines report matching ``pattern`` into a single frame.

    Each matching file in the current working directory is treated as one run
    and tagged with a 1-based ``run`` column. Files are processed in sorted
    name order so that a given file always receives the same run number; the
    order in which ``Path.glob`` yields entries depends on the filesystem.

    Args:
        pattern: Glob pattern, resolved relative to the current working directory.

    Returns:
        The per-file frames concatenated row-wise with a fresh integer index.

    Raises:
        ValueError: If no file matches ``pattern``.
    """
    report_paths = sorted(Path(".").glob(pattern))
    if not report_paths:
        # pd.concat would raise ValueError here as well, but without naming the pattern.
        raise ValueError(f"No result files matching {pattern!r} found in {Path.cwd()}")

    # assign() returns a new frame, so the freshly parsed frames are not mutated in a loop.
    run_frames = [
        pd.read_json(path, orient='records', lines=True).assign(run=run)
        for run, path in enumerate(report_paths, start=1)
    ]
    return pd.concat(run_frames, ignore_index=True)

In [6]:
# Read every report file matched by the loader's default pattern; each file is tagged as one run.
raw_df = load_result_dataframe()
raw_df

Unnamed: 0,qdecomp,context,retrieval,top_k,qa,run,exact_match,f1
0,False,paragraphs,Sparse,3,Standard,1,0.200,0.304
1,False,paragraphs,Sparse,5,Standard,1,0.150,0.244
2,False,paragraphs,Sparse,10,Standard,1,0.180,0.248
3,False,paragraphs,Dense,3,Standard,1,0.250,0.324
4,False,paragraphs,Dense,5,Standard,1,0.200,0.290
...,...,...,...,...,...,...,...,...
187,True,triplets,Dense,3,Standard,3,0.410,0.492
188,True,triplets,Dense,5,Standard,3,0.430,0.503
189,True,triplets,Dense,10,Standard,3,0.410,0.498
190,True,triplets,Dummy,20,Standard,3,0.270,0.339


In [7]:
# Map the raw report field names to the column labels used throughout the
# rest of the analysis and in the exported LaTeX tables.
# 'qa' must become 'QA Prompting' (not 'Prompting'): the sort and group-by
# cells further down look the column up under that exact label, so any other
# name makes a fresh top-to-bottom run fail with a KeyError.
# Reassigning instead of inplace=True keeps the cell safe to re-run.
raw_df = raw_df.rename(
    columns={
        'qdecomp': 'Q-decomp',
        'context': 'Context',
        'retrieval': 'Retrieval',
        'top_k': 'top-k',
        'qa': 'QA Prompting',
        'run': 'Run',
        'exact_match': "EM",
        'f1': "F1",
    },
)

In [29]:
# Keep only rows whose context is 'paragraphs' and whose retriever is Dense or Sparse.
is_paragraph_context = raw_df['Context'].isin(['paragraphs'])
is_dense_or_sparse = raw_df['Retrieval'].isin(['Dense', 'Sparse'])
mask = is_paragraph_context & is_dense_or_sparse

# Work on a copy so later edits never write back into raw_df.
df = raw_df.loc[mask].copy()
df

Unnamed: 0,Q-decomp,Context,Retrieval,top-k,QA Prompting,Run,EM,F1
0,False,paragraphs,Sparse,3,Standard,1,0.200,0.304
1,False,paragraphs,Sparse,5,Standard,1,0.150,0.244
2,False,paragraphs,Sparse,10,Standard,1,0.180,0.248
3,False,paragraphs,Dense,3,Standard,1,0.250,0.324
4,False,paragraphs,Dense,5,Standard,1,0.200,0.290
...,...,...,...,...,...,...,...,...
153,True,paragraphs,Sparse,5,CTE,3,0.520,0.625
154,True,paragraphs,Sparse,10,CTE,3,0.510,0.599
155,True,paragraphs,Dense,3,CTE,3,0.530,0.628
156,True,paragraphs,Dense,5,CTE,3,0.500,0.612


In [34]:
# Export the filtered per-run results as a LaTeX table, ordered by configuration and then run.
detail_sort_keys = ['Q-decomp', 'Context', 'QA Prompting', 'Retrieval', 'top-k', 'Run']
with open("experiment-results-cte-detailed.tex", 'w') as f:
    ordered_runs = df.sort_values(detail_sort_keys)
    f.write(ordered_runs.to_latex(index=False, float_format='%.3f'))

In [31]:
# Average the remaining columns over runs for each configuration.
# 'Context' and 'Run' are dropped first so they are neither grouped on nor averaged.
config_keys = ['Q-decomp', 'QA Prompting', 'Retrieval', 'top-k']
per_run_scores = df.drop(columns=['Context', 'Run'])
agg_df = per_run_scores.groupby(config_keys).mean()
agg_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,EM,F1
Q-decomp,QA Prompting,Retrieval,top-k,Unnamed: 4_level_1,Unnamed: 5_level_1
False,CTE,Dense,3,0.307,0.404
False,CTE,Dense,5,0.34,0.438
False,CTE,Dense,10,0.337,0.476
False,CTE,Sparse,3,0.173,0.279
False,CTE,Sparse,5,0.283,0.385
False,CTE,Sparse,10,0.337,0.432
False,Standard,Dense,3,0.243,0.314
False,Standard,Dense,5,0.21,0.292
False,Standard,Dense,10,0.153,0.204
False,Standard,Sparse,3,0.203,0.295


In [23]:
# Print the aggregated table as LaTeX source with three-decimal floats.
agg_latex = agg_df.to_latex(float_format='%.3f')
print(agg_latex)

\begin{tabular}{llllrr}
\toprule
 &  &  &  & EM & F1 \\
Q-decomp & QA Prompting & Retrieval & top-k &  &  \\
\midrule
\multirow[t]{12}{*}{False} & \multirow[t]{6}{*}{CTE} & \multirow[t]{3}{*}{Dense} & 3 & 0.307 & 0.404 \\
 &  &  & 5 & 0.340 & 0.438 \\
 &  &  & 10 & 0.337 & 0.476 \\
\cline{3-6}
 &  & \multirow[t]{3}{*}{Sparse} & 3 & 0.173 & 0.279 \\
 &  &  & 5 & 0.283 & 0.385 \\
 &  &  & 10 & 0.337 & 0.432 \\
\cline{2-6} \cline{3-6}
 & \multirow[t]{6}{*}{Standard} & \multirow[t]{3}{*}{Dense} & 3 & 0.243 & 0.314 \\
 &  &  & 5 & 0.210 & 0.292 \\
 &  &  & 10 & 0.153 & 0.204 \\
\cline{3-6}
 &  & \multirow[t]{3}{*}{Sparse} & 3 & 0.203 & 0.295 \\
 &  &  & 5 & 0.163 & 0.253 \\
 &  &  & 10 & 0.173 & 0.243 \\
\cline{1-6} \cline{2-6} \cline{3-6}
\multirow[t]{12}{*}{True} & \multirow[t]{6}{*}{CTE} & \multirow[t]{3}{*}{Dense} & 3 & 0.523 & 0.624 \\
 &  &  & 5 & 0.503 & 0.610 \\
 &  &  & 10 & 0.500 & 0.612 \\
\cline{3-6}
 &  & \multirow[t]{3}{*}{Sparse} & 3 & 0.510 & 0.615 \\
 &  &  & 5 & 0.517 & 0.

In [24]:
# Rank the configurations from highest to lowest mean F1.
agg_df.sort_values(by='F1', ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,EM,F1
Q-decomp,QA Prompting,Retrieval,top-k,Unnamed: 4_level_1,Unnamed: 5_level_1
True,CTE,Dense,3,0.523,0.624
True,CTE,Sparse,5,0.517,0.619
True,CTE,Sparse,3,0.51,0.615
True,CTE,Dense,10,0.5,0.612
True,CTE,Dense,5,0.503,0.61
True,CTE,Sparse,10,0.493,0.585
True,Standard,Dense,3,0.47,0.565
True,Standard,Sparse,3,0.433,0.498
True,Standard,Dense,5,0.407,0.481
False,CTE,Dense,10,0.337,0.476
