In [1]:
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
import json
import pandas as pd
from pathlib import Path
from copy import deepcopy
from functools import partial

from bellek.utils import set_seed, jprint

set_seed(89)

In [3]:
pd.options.display.float_format = '{:,.3f}'.format

In [4]:
from tqdm.auto import tqdm
tqdm.pandas()

In [5]:
def load_result_dataframe(pattern: str = "our-method-report-20240821-*.jsonl") -> pd.DataFrame:
    dfs = [pd.read_json(filename, orient='records', lines=True) for filename in Path(".").glob(pattern)]

    for run, df in enumerate(dfs):
        df["run"] = run + 1

    return pd.concat(dfs, ignore_index=True) 

In [6]:
raw_df = load_result_dataframe()
raw_df

Unnamed: 0,qdecomp,context,retrieval,top_k,qa,run,exact_match,f1
0,False,paragraphs,Sparse,3,Standard,1,0.180,0.274
1,False,paragraphs,Sparse,5,Standard,1,0.160,0.266
2,False,paragraphs,Sparse,10,Standard,1,0.080,0.138
3,False,paragraphs,Dense,3,Standard,1,0.240,0.319
4,False,paragraphs,Dense,5,Standard,1,0.180,0.257
...,...,...,...,...,...,...,...,...
67,True,paragraphs,Sparse,5,CTE,3,0.490,0.610
68,True,paragraphs,Sparse,10,CTE,3,0.510,0.600
69,True,paragraphs,Dense,3,CTE,3,0.560,0.661
70,True,paragraphs,Dense,5,CTE,3,0.520,0.620


In [7]:
raw_df.rename(
    columns={
        'qdecomp': 'Q-decomp',
        'context': 'Context',
        'retrieval': 'Retrieval',
        'top_k': 'Top-k',
        'qa': 'Prompting',
        'run': 'Run',
        'exact_match': "EM",
        'f1': "F1",
    }, 
    inplace=True,
)

In [8]:
df = raw_df.drop(columns=["Context"])

In [9]:
agg_df = df.drop(columns=['Run']).groupby(['Q-decomp', 'Prompting', 'Retrieval', 'Top-k']).mean()
agg_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,EM,F1
Q-decomp,Prompting,Retrieval,Top-k,Unnamed: 4_level_1,Unnamed: 5_level_1
False,CTE,Dense,3,0.26,0.369
False,CTE,Dense,5,0.31,0.408
False,CTE,Dense,10,0.347,0.477
False,CTE,Sparse,3,0.17,0.278
False,CTE,Sparse,5,0.277,0.384
False,CTE,Sparse,10,0.347,0.473
False,Standard,Dense,3,0.233,0.316
False,Standard,Dense,5,0.18,0.259
False,Standard,Dense,10,0.137,0.194
False,Standard,Sparse,3,0.19,0.283


In [10]:
with open("experiment-results-detailed.tex", 'w') as f:
    f.write(df.sort_values(['Q-decomp', 'Prompting', 'Retrieval', 'Top-k', 'Run']).to_latex(index=False, float_format='%.3f'))

In [11]:
with open("experiment-results-agg.tex", 'w') as f:
    f.write(agg_df.sort_values(['Q-decomp', 'Prompting', 'Retrieval', 'Top-k']).to_latex(float_format='%.3f'))

## Explore

In [12]:
df = agg_df.reset_index()
df.head()

Unnamed: 0,Q-decomp,Prompting,Retrieval,Top-k,EM,F1
0,False,CTE,Dense,3,0.26,0.369
1,False,CTE,Dense,5,0.31,0.408
2,False,CTE,Dense,10,0.347,0.477
3,False,CTE,Sparse,3,0.17,0.278
4,False,CTE,Sparse,5,0.277,0.384


### Inspect

In [13]:
qd_mask = df['Q-decomp']
context_mask = df['Context'] == 'paragraphs'
retrieval_mask = df['Retrieval'].isin(['Dense', 'Sparse'])
df.loc[context_mask & retrieval_mask ].sort_values('F1', ascending=False)

KeyError: 'Context'

In [None]:
df.loc[context_mask & retrieval_mask ].sort_values('F1', ascending=False).set_index(['Q-decomp', 'Context', 'QA Prompting', 'Retrieval', 'top-k'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,EM,F1
Q-decomp,Context,QA Prompting,Retrieval,top-k,Unnamed: 5_level_1,Unnamed: 6_level_1
True,triplets,Standard,Dense,3,0.433,0.509
True,triplets,Standard,Dense,10,0.413,0.503
True,triplets,Standard,Dense,5,0.42,0.499
False,triplets,Standard,Dense,5,0.223,0.351
False,triplets,Standard,Dense,3,0.22,0.346
False,triplets,Standard,Dense,10,0.187,0.308


In [None]:
qd_mask = df['Q-decomp']
context_mask = df['Context'] == 'paragraphs'
retrieval_mask = df['Retrieval'].isin(['Perfect'])
df.loc[context_mask & retrieval_mask ].sort_values('F1', ascending=False)

Unnamed: 0,Q-decomp,Context,QA Prompting,Retrieval,top-k,EM,F1
4,False,paragraphs,CTE,Perfect,2,0.627,0.736
36,True,paragraphs,CTE,Perfect,2,0.587,0.696
44,True,paragraphs,Standard,Perfect,2,0.513,0.62
12,False,paragraphs,Standard,Perfect,2,0.507,0.616


In [None]:
agg_df.loc[idx[:, 'paragraphs', :, :]]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,EM,F1
Q-decomp,QA Prompting,Retrieval,top-k,Unnamed: 4_level_1,Unnamed: 5_level_1
False,CTE,Dense,3,0.307,0.404
False,CTE,Dense,5,0.34,0.438
False,CTE,Dense,10,0.337,0.476
False,CTE,Dummy,20,0.517,0.599
False,CTE,Perfect,2,0.627,0.736
False,CTE,Sparse,3,0.173,0.279
False,CTE,Sparse,5,0.283,0.385
False,CTE,Sparse,10,0.337,0.432
False,Standard,Dense,3,0.243,0.314
False,Standard,Dense,5,0.21,0.292


In [None]:
idx = pd.IndexSlice
print(agg_df.loc[idx[:, 'paragraphs', :, :]].to_latex(float_format='%.3f'))

\begin{tabular}{llllrr}
\toprule
 &  &  &  & EM & F1 \\
Q-decomp & QA Prompting & Retrieval & top-k &  &  \\
\midrule
\multirow[t]{16}{*}{False} & \multirow[t]{8}{*}{CTE} & \multirow[t]{3}{*}{Dense} & 3 & 0.307 & 0.404 \\
 &  &  & 5 & 0.340 & 0.438 \\
 &  &  & 10 & 0.337 & 0.476 \\
\cline{3-6}
 &  & Dummy & 20 & 0.517 & 0.599 \\
\cline{3-6}
 &  & Perfect & 2 & 0.627 & 0.736 \\
\cline{3-6}
 &  & \multirow[t]{3}{*}{Sparse} & 3 & 0.173 & 0.279 \\
 &  &  & 5 & 0.283 & 0.385 \\
 &  &  & 10 & 0.337 & 0.432 \\
\cline{2-6} \cline{3-6}
 & \multirow[t]{8}{*}{Standard} & \multirow[t]{3}{*}{Dense} & 3 & 0.243 & 0.314 \\
 &  &  & 5 & 0.210 & 0.292 \\
 &  &  & 10 & 0.153 & 0.204 \\
\cline{3-6}
 &  & Dummy & 20 & 0.127 & 0.167 \\
\cline{3-6}
 &  & Perfect & 2 & 0.507 & 0.616 \\
\cline{3-6}
 &  & \multirow[t]{3}{*}{Sparse} & 3 & 0.203 & 0.295 \\
 &  &  & 5 & 0.163 & 0.253 \\
 &  &  & 10 & 0.173 & 0.243 \\
\cline{1-6} \cline{2-6} \cline{3-6}
\multirow[t]{16}{*}{True} & \multirow[t]{8}{*}{CTE} & \multir