In [1]:
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
import json
import pandas as pd
from pathlib import Path
from copy import deepcopy
from functools import partial

from bellek.utils import set_seed, jprint

set_seed(89)

In [3]:
pd.options.display.float_format = '{:,.3f}'.format

In [4]:
from tqdm.auto import tqdm
tqdm.pandas()

In [5]:
def load_result_dataframe(pattern: str = "our-method-report-*.jsonl") -> pd.DataFrame:
    dfs = [pd.read_json(filename, orient='records', lines=True) for filename in Path(".").glob(pattern)]

    for run, df in enumerate(dfs):
        df["run"] = run + 1

    return pd.concat(dfs, ignore_index=True) 

In [6]:
raw_df = load_result_dataframe()
raw_df

Unnamed: 0,qdecomp,context,retrieval,top_k,qa,run,exact_match,f1
0,False,paragraphs,Sparse,3,Standard,1,0.200,0.304
1,False,paragraphs,Sparse,5,Standard,1,0.150,0.244
2,False,paragraphs,Sparse,10,Standard,1,0.180,0.248
3,False,paragraphs,Dense,3,Standard,1,0.250,0.324
4,False,paragraphs,Dense,5,Standard,1,0.200,0.290
...,...,...,...,...,...,...,...,...
187,True,triplets,Dense,3,Standard,3,0.410,0.492
188,True,triplets,Dense,5,Standard,3,0.430,0.503
189,True,triplets,Dense,10,Standard,3,0.410,0.498
190,True,triplets,Dummy,20,Standard,3,0.270,0.339


In [7]:
raw_df.rename(
    columns={
        'qdecomp': 'Q-decomp',
        'context': 'Context',
        'retrieval': 'Retrieval',
        'top_k': 'top-k',
        'qa': 'QA Prompting',
        'run': 'Run',
        'exact_match': "EM",
        'f1': "F1",
    }, 
    inplace=True,
)

In [8]:
agg_df = raw_df.drop(columns=['Run']).groupby(['Q-decomp', 'Context', 'QA Prompting', 'Retrieval', 'top-k']).mean()
agg_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,EM,F1
Q-decomp,Context,QA Prompting,Retrieval,top-k,Unnamed: 5_level_1,Unnamed: 6_level_1
False,paragraphs,CTE,Dense,3,0.307,0.404
False,paragraphs,CTE,Dense,5,0.340,0.438
False,paragraphs,CTE,Dense,10,0.337,0.476
False,paragraphs,CTE,Dummy,20,0.517,0.599
False,paragraphs,CTE,Perfect,2,0.627,0.736
...,...,...,...,...,...,...
True,triplets,Standard,Dummy,20,0.297,0.367
True,triplets,Standard,Perfect,2,0.510,0.618
True,triplets,Standard,Sparse,3,0.400,0.491
True,triplets,Standard,Sparse,5,0.433,0.540


In [9]:
agg_df.sort_values('F1', ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,EM,F1
Q-decomp,Context,QA Prompting,Retrieval,top-k,Unnamed: 5_level_1,Unnamed: 6_level_1
False,paragraphs,CTE,Perfect,2,0.627,0.736
True,paragraphs,CTE,Perfect,2,0.587,0.696
True,paragraphs+triplets,Standard,Perfect,2,0.553,0.664
False,paragraphs+triplets,Standard,Perfect,2,0.517,0.652
False,triplets,Standard,Perfect,2,0.520,0.644
False,...,...,...,...,...,...
False,paragraphs+triplets,Standard,Dummy,20,0.143,0.230
True,paragraphs,Standard,Dummy,20,0.177,0.221
False,triplets,Standard,Dummy,20,0.137,0.218
False,paragraphs,Standard,Dense,10,0.153,0.204


In [10]:
with open("experiment-results-detailed.tex", 'w') as f:
    f.write(raw_df.sort_values(['Q-decomp', 'Context', 'QA Prompting', 'Retrieval', 'top-k', 'Run']).to_latex(index=False, float_format='%.3f'))

In [11]:
with open("experiment-results-agg.tex", 'w') as f:
    f.write(agg_df.sort_values(['Q-decomp', 'Context', 'QA Prompting', 'Retrieval', 'top-k']).to_latex(index=False, float_format='%.3f'))

## Explore

In [12]:
df = agg_df.reset_index()
df.head()

Unnamed: 0,Q-decomp,Context,QA Prompting,Retrieval,top-k,EM,F1
0,False,paragraphs,CTE,Dense,3,0.307,0.404
1,False,paragraphs,CTE,Dense,5,0.34,0.438
2,False,paragraphs,CTE,Dense,10,0.337,0.476
3,False,paragraphs,CTE,Dummy,20,0.517,0.599
4,False,paragraphs,CTE,Perfect,2,0.627,0.736


### KGQA (only triplets)

Using a pre-constructed KG to answer questions.

In [13]:
qd_mask = df['Q-decomp']
context_mask = df['Context'] == 'triplets'
retrieval_mask = df['Retrieval'].isin(['Sparse', 'Dense'])
mini_df = df.loc[context_mask & retrieval_mask ].sort_values('F1', ascending=False)
print(mini_df.to_latex(index=False, float_format='%.3f'))
mini_df

\begin{tabular}{rlllrrr}
\toprule
Q-decomp & Context & QA Prompting & Retrieval & top-k & EM & F1 \\
\midrule
True & triplets & Standard & Sparse & 5 & 0.433 & 0.540 \\
True & triplets & Standard & Dense & 3 & 0.433 & 0.509 \\
True & triplets & Standard & Dense & 10 & 0.413 & 0.503 \\
True & triplets & Standard & Dense & 5 & 0.420 & 0.499 \\
True & triplets & Standard & Sparse & 3 & 0.400 & 0.491 \\
True & triplets & Standard & Sparse & 10 & 0.340 & 0.410 \\
False & triplets & Standard & Sparse & 5 & 0.240 & 0.366 \\
False & triplets & Standard & Dense & 5 & 0.223 & 0.351 \\
False & triplets & Standard & Dense & 3 & 0.220 & 0.346 \\
False & triplets & Standard & Sparse & 3 & 0.213 & 0.340 \\
False & triplets & Standard & Dense & 10 & 0.187 & 0.308 \\
False & triplets & Standard & Sparse & 10 & 0.170 & 0.269 \\
\bottomrule
\end{tabular}



Unnamed: 0,Q-decomp,Context,QA Prompting,Retrieval,top-k,EM,F1
62,True,triplets,Standard,Sparse,5,0.433,0.54
56,True,triplets,Standard,Dense,3,0.433,0.509
58,True,triplets,Standard,Dense,10,0.413,0.503
57,True,triplets,Standard,Dense,5,0.42,0.499
61,True,triplets,Standard,Sparse,3,0.4,0.491
63,True,triplets,Standard,Sparse,10,0.34,0.41
30,False,triplets,Standard,Sparse,5,0.24,0.366
25,False,triplets,Standard,Dense,5,0.223,0.351
24,False,triplets,Standard,Dense,3,0.22,0.346
29,False,triplets,Standard,Sparse,3,0.213,0.34


In [14]:
qd_mask = df['Q-decomp']
context_mask = df['Context'] == 'triplets'
retrieval_mask = df['Retrieval'].isin(['Dummy', 'Perfect'])
mini_df = df.loc[context_mask & retrieval_mask ].sort_values('F1', ascending=False)
print(mini_df.to_latex(index=False, float_format='%.3f'))
mini_df

\begin{tabular}{rlllrrr}
\toprule
Q-decomp & Context & QA Prompting & Retrieval & top-k & EM & F1 \\
\midrule
False & triplets & Standard & Perfect & 2 & 0.520 & 0.644 \\
True & triplets & Standard & Perfect & 2 & 0.510 & 0.618 \\
True & triplets & Standard & Dummy & 20 & 0.297 & 0.367 \\
False & triplets & Standard & Dummy & 20 & 0.137 & 0.218 \\
\bottomrule
\end{tabular}



Unnamed: 0,Q-decomp,Context,QA Prompting,Retrieval,top-k,EM,F1
28,False,triplets,Standard,Perfect,2,0.52,0.644
60,True,triplets,Standard,Perfect,2,0.51,0.618
59,True,triplets,Standard,Dummy,20,0.297,0.367
27,False,triplets,Standard,Dummy,20,0.137,0.218


### QA with CTE prompting

In [15]:
qd_mask = df['Q-decomp']
context_mask = df['Context'] == 'paragraphs'
retrieval_mask = df['Retrieval'].isin(['Dense', 'Sparse'])
df.loc[context_mask & retrieval_mask ].sort_values('F1', ascending=False)

Unnamed: 0,Q-decomp,Context,QA Prompting,Retrieval,top-k,EM,F1
32,True,paragraphs,CTE,Dense,3,0.523,0.624
38,True,paragraphs,CTE,Sparse,5,0.517,0.619
37,True,paragraphs,CTE,Sparse,3,0.51,0.615
34,True,paragraphs,CTE,Dense,10,0.5,0.612
33,True,paragraphs,CTE,Dense,5,0.503,0.61
39,True,paragraphs,CTE,Sparse,10,0.493,0.585
40,True,paragraphs,Standard,Dense,3,0.47,0.565
45,True,paragraphs,Standard,Sparse,3,0.433,0.498
41,True,paragraphs,Standard,Dense,5,0.407,0.481
2,False,paragraphs,CTE,Dense,10,0.337,0.476


In [22]:
df.loc[context_mask & retrieval_mask ].sort_values('F1', ascending=False).set_index(['Q-decomp', 'Context', 'QA Prompting', 'Retrieval', 'top-k'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,EM,F1
Q-decomp,Context,QA Prompting,Retrieval,top-k,Unnamed: 5_level_1,Unnamed: 6_level_1
True,triplets,Standard,Dense,3,0.433,0.509
True,triplets,Standard,Dense,10,0.413,0.503
True,triplets,Standard,Dense,5,0.42,0.499
False,triplets,Standard,Dense,5,0.223,0.351
False,triplets,Standard,Dense,3,0.22,0.346
False,triplets,Standard,Dense,10,0.187,0.308


In [16]:
qd_mask = df['Q-decomp']
context_mask = df['Context'] == 'paragraphs'
retrieval_mask = df['Retrieval'].isin(['Perfect'])
df.loc[context_mask & retrieval_mask ].sort_values('F1', ascending=False)

Unnamed: 0,Q-decomp,Context,QA Prompting,Retrieval,top-k,EM,F1
4,False,paragraphs,CTE,Perfect,2,0.627,0.736
36,True,paragraphs,CTE,Perfect,2,0.587,0.696
44,True,paragraphs,Standard,Perfect,2,0.513,0.62
12,False,paragraphs,Standard,Perfect,2,0.507,0.616


In [43]:
agg_df.loc[idx[:, 'paragraphs', :, :]]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,EM,F1
Q-decomp,QA Prompting,Retrieval,top-k,Unnamed: 4_level_1,Unnamed: 5_level_1
False,CTE,Dense,3,0.307,0.404
False,CTE,Dense,5,0.34,0.438
False,CTE,Dense,10,0.337,0.476
False,CTE,Dummy,20,0.517,0.599
False,CTE,Perfect,2,0.627,0.736
False,CTE,Sparse,3,0.173,0.279
False,CTE,Sparse,5,0.283,0.385
False,CTE,Sparse,10,0.337,0.432
False,Standard,Dense,3,0.243,0.314
False,Standard,Dense,5,0.21,0.292


In [40]:
idx = pd.IndexSlice
print(agg_df.loc[idx[:, 'paragraphs', :, :]].to_latex(float_format='%.3f'))

\begin{tabular}{llllrr}
\toprule
 &  &  &  & EM & F1 \\
Q-decomp & QA Prompting & Retrieval & top-k &  &  \\
\midrule
\multirow[t]{16}{*}{False} & \multirow[t]{8}{*}{CTE} & \multirow[t]{3}{*}{Dense} & 3 & 0.307 & 0.404 \\
 &  &  & 5 & 0.340 & 0.438 \\
 &  &  & 10 & 0.337 & 0.476 \\
\cline{3-6}
 &  & Dummy & 20 & 0.517 & 0.599 \\
\cline{3-6}
 &  & Perfect & 2 & 0.627 & 0.736 \\
\cline{3-6}
 &  & \multirow[t]{3}{*}{Sparse} & 3 & 0.173 & 0.279 \\
 &  &  & 5 & 0.283 & 0.385 \\
 &  &  & 10 & 0.337 & 0.432 \\
\cline{2-6} \cline{3-6}
 & \multirow[t]{8}{*}{Standard} & \multirow[t]{3}{*}{Dense} & 3 & 0.243 & 0.314 \\
 &  &  & 5 & 0.210 & 0.292 \\
 &  &  & 10 & 0.153 & 0.204 \\
\cline{3-6}
 &  & Dummy & 20 & 0.127 & 0.167 \\
\cline{3-6}
 &  & Perfect & 2 & 0.507 & 0.616 \\
\cline{3-6}
 &  & \multirow[t]{3}{*}{Sparse} & 3 & 0.203 & 0.295 \\
 &  &  & 5 & 0.163 & 0.253 \\
 &  &  & 10 & 0.173 & 0.243 \\
\cline{1-6} \cline{2-6} \cline{3-6}
\multirow[t]{16}{*}{True} & \multirow[t]{8}{*}{CTE} & \multir

### Rest

In [17]:
qd_mask = df['Q-decomp']
context_mask = df['Context'] == 'paragraphs+triplets'
retrieval_mask = df['Retrieval'].isin(['Dense', 'Sparse'])
df.loc[context_mask & retrieval_mask ].sort_values('F1', ascending=False)

Unnamed: 0,Q-decomp,Context,QA Prompting,Retrieval,top-k,EM,F1
48,True,paragraphs+triplets,Standard,Dense,3,0.5,0.613
53,True,paragraphs+triplets,Standard,Sparse,3,0.467,0.577
50,True,paragraphs+triplets,Standard,Dense,10,0.463,0.562
55,True,paragraphs+triplets,Standard,Sparse,10,0.473,0.561
49,True,paragraphs+triplets,Standard,Dense,5,0.45,0.557
54,True,paragraphs+triplets,Standard,Sparse,5,0.43,0.513
16,False,paragraphs+triplets,Standard,Dense,3,0.247,0.365
22,False,paragraphs+triplets,Standard,Sparse,5,0.183,0.292
18,False,paragraphs+triplets,Standard,Dense,10,0.153,0.291
21,False,paragraphs+triplets,Standard,Sparse,3,0.19,0.289


In [18]:
qd_mask = df['Q-decomp']
context_mask = df['Context'] == 'triplets'
retrieval_mask = df['Retrieval'].isin(['Dummy', 'Dense', 'Sparse', 'Perfect'])
df.loc[qd_mask & context_mask & retrieval_mask ].sort_values('F1', ascending=False)

Unnamed: 0,Q-decomp,Context,QA Prompting,Retrieval,top-k,EM,F1
60,True,triplets,Standard,Perfect,2,0.51,0.618
62,True,triplets,Standard,Sparse,5,0.433,0.54
56,True,triplets,Standard,Dense,3,0.433,0.509
58,True,triplets,Standard,Dense,10,0.413,0.503
57,True,triplets,Standard,Dense,5,0.42,0.499
61,True,triplets,Standard,Sparse,3,0.4,0.491
63,True,triplets,Standard,Sparse,10,0.34,0.41
59,True,triplets,Standard,Dummy,20,0.297,0.367


In [19]:
qd_mask = df['Q-decomp']
retrieval_mask = df['Retrieval'].isin(['Dense'])
df.loc[qd_mask & retrieval_mask ].sort_values('F1', ascending=False)

Unnamed: 0,Q-decomp,Context,QA Prompting,Retrieval,top-k,EM,F1
32,True,paragraphs,CTE,Dense,3,0.523,0.624
48,True,paragraphs+triplets,Standard,Dense,3,0.5,0.613
34,True,paragraphs,CTE,Dense,10,0.5,0.612
33,True,paragraphs,CTE,Dense,5,0.503,0.61
40,True,paragraphs,Standard,Dense,3,0.47,0.565
50,True,paragraphs+triplets,Standard,Dense,10,0.463,0.562
49,True,paragraphs+triplets,Standard,Dense,5,0.45,0.557
56,True,triplets,Standard,Dense,3,0.433,0.509
58,True,triplets,Standard,Dense,10,0.413,0.503
57,True,triplets,Standard,Dense,5,0.42,0.499


In [20]:
avg_report_df.index

NameError: name 'avg_report_df' is not defined

In [None]:
avg_report_df.loc[True].sort_values('F1', ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,EM,F1
Context,QA Prompting,Retrieval,top-k,Unnamed: 4_level_1,Unnamed: 5_level_1
paragraphs,CTE,Perfect,2,0.59,0.701
paragraphs+triplets,Standard,Perfect,2,0.56,0.667
paragraphs,CTE,Dense,3,0.525,0.625
triplets,Standard,Perfect,2,0.51,0.624
paragraphs,Standard,Perfect,2,0.51,0.621
paragraphs,CTE,Sparse,5,0.51,0.618
paragraphs,CTE,Dense,10,0.5,0.616
paragraphs,CTE,Sparse,3,0.51,0.615
paragraphs,CTE,Dummy,20,0.525,0.614
paragraphs+triplets,Standard,Dense,3,0.495,0.61
