In [1]:
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
import json
import pandas as pd
from pathlib import Path
from copy import deepcopy
from functools import partial

from bellek.utils import set_seed, jprint

set_seed(89)

In [3]:
pd.options.display.float_format = '{:,.3f}'.format

In [4]:
from tqdm.auto import tqdm
tqdm.pandas()

In [5]:
def load_result_dataframe(pattern: str = "our-method-kgqa-report-20240819*.jsonl") -> pd.DataFrame:
    dfs = [pd.read_json(filename, orient='records', lines=True) for filename in Path("./data").glob(pattern)]

    # for run, df in enumerate(dfs):
    #     df["run"] = run + 1

    return pd.concat(dfs, ignore_index=True) 

In [6]:
raw_df = load_result_dataframe()
raw_df.rename(
    columns={
        'qdecomp': 'Q-decomp',
        'context': 'Context',
        'retrieval': 'Retrieval',
        'top_k': 'Top-k',
        'qa': 'QA Prompting',
        'run': 'Run',
        'exact_match': "EM",
        'f1': "F1",
    }, 
    inplace=True,
)
raw_df

Unnamed: 0,Q-decomp,Context,Retrieval,Top-k,QA Prompting,Run,EM,F1
0,False,Triplets,Sparse,30,Standard,1,0.270,0.407
1,False,Triplets,Sparse,40,Standard,1,0.240,0.378
2,False,Triplets,Sparse,50,Standard,1,0.240,0.365
3,False,Triplets,Dense,30,Standard,1,0.210,0.339
4,False,Triplets,Dense,40,Standard,1,0.210,0.361
...,...,...,...,...,...,...,...,...
115,True,Triplets,Dense,100,Standard,1,0.410,0.508
116,False,Triplets,Sparse,100,Standard,2,0.220,0.345
117,False,Triplets,Dense,100,Standard,2,0.190,0.315
118,True,Triplets,Sparse,100,Standard,2,0.360,0.449


In [7]:
mask = (raw_df['Retrieval'].isin(['Dense', 'Sparse'])) & raw_df['QA Prompting'].isin(['Standard'])
df = raw_df.loc[mask].copy().drop(columns=['Context', 'QA Prompting'])
# df = raw_df.copy().drop(columns=['Context', 'QA Prompting'])

In [8]:
with open("experiment-results-detailed-kgqa.tex", 'w') as f:
    f.write(df.sort_values(['Q-decomp', 'Retrieval', 'Top-k', 'Run']).to_latex(index=False, float_format='%.3f'))

In [9]:
agg_df = df.drop(columns=['Run']).groupby(['Q-decomp', 'Retrieval', 'Top-k']).agg(['min', 'mean', 'max', 'std'])
agg_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,EM,EM,EM,EM,F1,F1,F1,F1
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,min,mean,max,std,min,mean,max,std
Q-decomp,Retrieval,Top-k,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
False,Dense,5,0.2,0.203,0.21,0.006,0.312,0.314,0.316,0.002
False,Dense,10,0.15,0.157,0.16,0.006,0.263,0.268,0.274,0.006
False,Dense,15,0.18,0.19,0.2,0.01,0.297,0.312,0.329,0.016
False,Dense,20,0.21,0.213,0.22,0.006,0.329,0.334,0.34,0.006
False,Dense,30,0.21,0.21,0.21,0.0,0.333,0.338,0.343,0.005
False,Dense,40,0.21,0.22,0.23,0.01,0.357,0.364,0.375,0.01
False,Dense,50,0.21,0.223,0.24,0.015,0.333,0.349,0.368,0.018
False,Dense,70,0.19,0.2,0.21,0.01,0.324,0.328,0.332,0.004
False,Dense,100,0.19,0.21,0.22,0.017,0.315,0.335,0.346,0.017
False,Sparse,5,0.21,0.22,0.23,0.01,0.32,0.326,0.331,0.006


In [19]:
mask = (raw_df['Retrieval'].isin(['Dense', 'Sparse'])) & (raw_df['QA Prompting'].isin(['Standard'])) 
# & (raw_df['Top-k'].isin([10, 20, 40, 70, 100]))
kgqa_df = raw_df.loc[mask].copy().sort_values(['Q-decomp', 'Retrieval', 'Top-k', 'Run'])

In [20]:
with open("table-kgqa-all.tex", 'w') as f:
    f.write(kgqa_df.drop(columns=['Context', 'QA Prompting']).to_latex(index=False, float_format='%.3f'))

kgqa_df

Unnamed: 0,Q-decomp,Context,Retrieval,Top-k,QA Prompting,Run,EM,F1
60,False,Triplets,Dense,5,Standard,1,0.210,0.315
80,False,Triplets,Dense,5,Standard,2,0.200,0.316
40,False,Triplets,Dense,5,Standard,3,0.200,0.312
61,False,Triplets,Dense,10,Standard,1,0.160,0.268
81,False,Triplets,Dense,10,Standard,2,0.150,0.263
...,...,...,...,...,...,...,...,...
102,True,Triplets,Sparse,70,Standard,2,0.340,0.421
106,True,Triplets,Sparse,70,Standard,3,0.350,0.425
114,True,Triplets,Sparse,100,Standard,1,0.380,0.462
118,True,Triplets,Sparse,100,Standard,2,0.360,0.449


In [11]:
kgqa_agg_df = kgqa_df.drop(columns=['Context', 'QA Prompting']).drop(columns=['Run']).groupby(['Q-decomp', 'Retrieval', 'Top-k']).mean().sort_values(['Q-decomp', 'Retrieval', 'Top-k'])

with open("table-kgqa-agg.tex", 'w') as f:
    f.write(kgqa_agg_df.to_latex(float_format='%.3f'))

kgqa_agg_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,EM,F1
Q-decomp,Retrieval,Top-k,Unnamed: 3_level_1,Unnamed: 4_level_1
False,Dense,5,0.203,0.314
False,Dense,10,0.157,0.268
False,Dense,15,0.19,0.312
False,Dense,20,0.213,0.334
False,Dense,30,0.21,0.338
False,Dense,40,0.22,0.364
False,Dense,50,0.223,0.349
False,Dense,70,0.2,0.328
False,Dense,100,0.21,0.335
False,Sparse,5,0.22,0.326


## Inspect

In [11]:
agg_df.sort_values('F1', ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,EM,F1
Q-decomp,Retrieval,Top-k,Unnamed: 3_level_1,Unnamed: 4_level_1
True,Dense,50,0.447,0.529
True,Dense,20,0.43,0.514
True,Dense,30,0.437,0.511
True,Sparse,30,0.407,0.505
True,Sparse,20,0.397,0.502
True,Sparse,40,0.4,0.5
True,Dense,15,0.417,0.493
True,Dense,40,0.417,0.489
True,Sparse,15,0.37,0.472
True,Dense,10,0.393,0.471


In [12]:
idf = agg_df.reset_index()
idf.head()

Unnamed: 0,Q-decomp,Retrieval,Top-k,EM,F1
0,False,Dense,5,0.203,0.314
1,False,Dense,10,0.157,0.268
2,False,Dense,15,0.19,0.312
3,False,Dense,20,0.213,0.334
4,False,Dense,30,0.21,0.338


In [13]:
qd_mask = idf['Q-decomp']
retrieval_mask = idf['Retrieval'].isin(['Dummy', 'Sparse', 'Dense', 'Perfect'])
idf.loc[retrieval_mask].sort_values('F1', ascending=False)

Unnamed: 0,Q-decomp,Retrieval,Top-k,EM,F1
20,True,Dense,50,0.447,0.529
17,True,Dense,20,0.43,0.514
18,True,Dense,30,0.437,0.511
25,True,Sparse,30,0.407,0.505
24,True,Sparse,20,0.397,0.502
26,True,Sparse,40,0.4,0.5
16,True,Dense,15,0.417,0.493
19,True,Dense,40,0.417,0.489
23,True,Sparse,15,0.37,0.472
15,True,Dense,10,0.393,0.471
