In [1]:
from dotenv import load_dotenv
# Populate the process environment from a .env file (if one is found) before
# the project modules are imported in the following cells.
# NOTE(review): presumably this supplies credentials for the QA backends — confirm.
load_dotenv()

True

In [2]:
import json
import pandas as pd
from pathlib import Path
from copy import deepcopy
from functools import partial

from bellek.musique.qa import answer_question_standard, answer_question_cot, answer_question_cot_fs, answer_question_cte
from bellek.utils import set_seed, jprint
from bellek.musique.singlehop import benchmark

# Seed via the project helper so the experiment is as repeatable as possible.
# NOTE(review): the recorded per-run scores further down still differ between
# runs, so the QA calls are presumably not fully deterministic — confirm what
# set_seed actually covers.
set_seed(89)

In [3]:
from tqdm.auto import tqdm
# Register tqdm's pandas integration (the progress_apply family of methods).
# NOTE(review): presumably used inside `benchmark` for the per-row progress bars — confirm.
tqdm.pandas()

In [4]:
# Show floats in rendered tables with thousands separators and three decimals.
pd.set_option('display.float_format', '{:,.3f}'.format)

In [5]:
def perfect_retrieval_func(docs, query):
    """Oracle retriever: keep only the paragraphs flagged as supporting.

    Args:
        docs: iterable of paragraph dicts; each must carry an 'is_supporting' key.
        query: unused here; accepted so the signature matches what the caller
            passes to a retrieval function.

    Returns:
        A new list with the paragraphs whose 'is_supporting' value is truthy,
        in their original order.
    """
    supporting = []
    for paragraph in docs:
        if paragraph['is_supporting']:
            supporting.append(paragraph)
    return supporting

In [6]:
# Number of times each QA technique is benchmarked; the per-run scores are
# aggregated (min/mean/max/std) in the report cells.
N_RUNS = 3

In [7]:
from bellek.musique.constants import ABLATION_RECORD_IDS

# Load the validation split and keep only the ablation records, in the order
# given by ABLATION_RECORD_IDS, with a fresh 0..n-1 index.
df = (
    pd.read_json('../../data/generated/musique-common/base-dataset-validation.jsonl', orient='records', lines=True)
    .set_index('id', drop=False)
    .loc[ABLATION_RECORD_IDS]
    .copy()
    .reset_index(drop=True)
)
# For a quick smoke test, subsample here:
# df = df.sample(10)

print(df.shape)
df.head()

(100, 8)


Unnamed: 0,id,paragraphs,question,question_decomposition,answer,answer_aliases,answerable,answers
0,2hop__575188_342798,"[{'idx': 0, 'title': 'Liliana Mumy', 'paragrap...",Who is the child of Mahmoud Mirza's father?,"[{'id': 575188, 'question': 'Mahmoud Mirza >> ...",Ahmad Shah Qajar,[Ahmad Shah Qajar],True,[Ahmad Shah Qajar]
1,2hop__731584_700117,"[{'idx': 0, 'title': 'KAPE', 'paragraph_text':...",In which county is the city to which KKVU is l...,"[{'id': 731584, 'question': 'KKVU >> licensed ...",Berrien County,[Berrien County],True,[Berrien County]
2,2hop__690412_526810,"[{'idx': 0, 'title': 'Cabramatta Creek', 'para...",For what river does the river on which Pa Sak ...,"[{'id': 690412, 'question': 'Pa Sak Jolasid Da...",Chao Phraya River,[Chao Phraya River],True,[Chao Phraya River]
3,2hop__263638_69048,"[{'idx': 0, 'title': 'Michael J. Barron', 'par...",Who is the Chief Judge of the Tebesa Nemine's ...,"[{'id': 263638, 'question': 'Tebesa Nemine >> ...",Honorable Justice Abiodun Smith,[Honorable Justice Abiodun Smith],True,[Honorable Justice Abiodun Smith]
4,2hop__142842_68489,"[{'idx': 0, 'title': 'Perfect Night: Live in L...",Who did the performer of Night Rocker play on ...,"[{'id': 142842, 'question': 'Which performer r...",Snapper Foster,[Snapper Foster],True,[Snapper Foster]


In [8]:
# One record per (QA technique, run): the benchmark scores plus the
# experiment settings that produced them.
results = []

qa_techniques = [
    ("standard", answer_question_standard),
    ("cot-zs", answer_question_cot),
    ("cot-fs", answer_question_cot_fs),
    ("cte", answer_question_cte),
]

for qa_technique, qa_func in tqdm(qa_techniques):
    for run in range(1, N_RUNS + 1):
        # Only the scores are kept; the first element returned by benchmark is discarded.
        _, scores = benchmark(df, qa_func, perfect_retrieval_func)
        record = dict(scores)
        # Settings are written after the scores so they win on any key clash.
        record.update(
            retrieval="groundtruth",
            context="paragraphs",
            qa=qa_technique,
            run=run,
        )
        results.append(record)

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

# Report

In [None]:
# Tabulate the collected run records: experiment settings first, then metrics.
report_df = pd.DataFrame.from_records(
    results,
    columns=['context', 'retrieval', 'qa', 'run', 'exact_match', 'f1'],
)
report_df

Unnamed: 0,context,retrieval,qa,run,exact_match,f1
0,paragraphs,groundtruth,standard,1,0.5,0.612
1,paragraphs,groundtruth,standard,2,0.54,0.646
2,paragraphs,groundtruth,standard,3,0.49,0.601
3,paragraphs,groundtruth,cot-zs,1,0.52,0.642
4,paragraphs,groundtruth,cot-zs,2,0.54,0.641
5,paragraphs,groundtruth,cot-zs,3,0.54,0.677
6,paragraphs,groundtruth,cot-fs,1,0.59,0.67
7,paragraphs,groundtruth,cot-fs,2,0.59,0.677
8,paragraphs,groundtruth,cot-fs,3,0.56,0.66
9,paragraphs,groundtruth,cte,1,0.56,0.702


In [None]:
from datetime import datetime, timezone

# Persist the per-run report under a UTC-timestamped name so repeated
# executions never overwrite an earlier result file.
# datetime.utcnow() is deprecated since Python 3.12; an aware UTC "now"
# produces the same formatted timestamp.
suffix = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")
report_df.to_json(f'./ablation-prompting-technique-{suffix}.jsonl', orient='records', lines=True)

In [None]:
# Spread of each metric across runs, per QA technique, listed in the order
# the techniques were benchmarked.
(
    report_df
    .drop(columns=['context', 'retrieval', 'run'])
    .groupby(['qa'])
    .agg(['min', 'mean', 'max', 'std'])
    .loc[['standard', 'cot-zs', 'cot-fs', 'cte']]
)

Unnamed: 0_level_0,exact_match,exact_match,exact_match,exact_match,f1,f1,f1,f1
Unnamed: 0_level_1,min,mean,max,std,min,mean,max,std
qa,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
standard,0.49,0.51,0.54,0.026,0.601,0.619,0.646,0.023
cot-zs,0.52,0.533,0.54,0.012,0.641,0.654,0.677,0.021
cot-fs,0.56,0.58,0.59,0.017,0.66,0.669,0.677,0.009
cte,0.55,0.56,0.57,0.01,0.687,0.699,0.707,0.011


In [None]:
# Mean of each metric across runs, per QA technique, in benchmark order.
(
    report_df
    .drop(columns=['context', 'retrieval', 'run'])
    .groupby(['qa'])
    .mean()
    .loc[['standard', 'cot-zs', 'cot-fs', 'cte']]
)

Unnamed: 0_level_0,exact_match,f1
qa,Unnamed: 1_level_1,Unnamed: 2_level_1
standard,0.51,0.619
cot-zs,0.533,0.654
cot-fs,0.58,0.669
cte,0.56,0.699


## Inspect

In [None]:
# Render the per-run report as a LaTeX tabular (no index column, three decimals).
report_df.to_latex(
    float_format='%.3f',
    index=False,
)

'\\begin{tabular}{lllrrr}\n\\toprule\ncontext & retrieval & qa & run & exact_match & f1 \\\\\n\\midrule\nparagraphs & groundtruth & standard & 1 & 0.500 & 0.612 \\\\\nparagraphs & groundtruth & standard & 2 & 0.540 & 0.646 \\\\\nparagraphs & groundtruth & standard & 3 & 0.490 & 0.601 \\\\\nparagraphs & groundtruth & cot-zs & 1 & 0.520 & 0.642 \\\\\nparagraphs & groundtruth & cot-zs & 2 & 0.540 & 0.641 \\\\\nparagraphs & groundtruth & cot-zs & 3 & 0.540 & 0.677 \\\\\nparagraphs & groundtruth & cot-fs & 1 & 0.590 & 0.670 \\\\\nparagraphs & groundtruth & cot-fs & 2 & 0.590 & 0.677 \\\\\nparagraphs & groundtruth & cot-fs & 3 & 0.560 & 0.660 \\\\\nparagraphs & groundtruth & cte & 1 & 0.560 & 0.702 \\\\\nparagraphs & groundtruth & cte & 2 & 0.570 & 0.707 \\\\\nparagraphs & groundtruth & cte & 3 & 0.550 & 0.687 \\\\\n\\bottomrule\n\\end{tabular}\n'