In [24]:
from dotenv import load_dotenv
load_dotenv()

True

In [25]:
import json
import pandas as pd
from pathlib import Path
from copy import deepcopy
from functools import partial

from bellek.qa.ablation import answer_question_standard, answer_question_cot, answer_question_cot_fs, answer_question_cte, answer_question_cte_cot
from bellek.utils import set_seed, jprint
from bellek.musique.singlehop import benchmark

set_seed(89)

In [None]:
from tqdm.auto import tqdm
tqdm.pandas()

In [39]:
# Render floats in displayed frames with thousands separators and three decimals.
pd.set_option('display.float_format', '{:,.3f}'.format)

In [26]:
N_RUNS = 3  # how many times each QA strategy is benchmarked (range(1, N_RUNS+1) in the loops below)
SAMPLE_SIZE = 100  # number of rows kept from the merged dataset via df.head(SAMPLE_SIZE)

In [28]:
from bellek.musique.constants import ABLATION_RECORD_IDS

# Load the evaluation records and keep the rows whose id is listed in
# ABLATION_RECORD_IDS (label-based selection, so rows come back in list order).
df = (
    pd.read_json('../../data/generated/musique-evaluation/dataset.jsonl', orient='records', lines=True)
    .set_index('id', drop=False)
    .loc[ABLATION_RECORD_IDS]
    .copy()
    .reset_index(drop=True)
)

# Replace the dataset's own `question` / `question_decomposition` columns with the
# ones from question-decomposition.jsonl, matched on `id`, then cap the sample size.
qd_df = pd.read_json('../../data/generated/musique-evaluation/question-decomposition.jsonl', orient='records', lines=True)
df = pd.merge(
    df.drop(columns=['question', 'question_decomposition']),
    qd_df,
    on='id',
    suffixes=('', ''),
).head(SAMPLE_SIZE)

print(df.shape)
df.head()

(100, 8)


Unnamed: 0,id,paragraphs,answer,answer_aliases,answerable,answers,question,question_decomposition
0,2hop__575188_342798,"[{'idx': 0, 'title': 'Liliana Mumy', 'paragrap...",Ahmad Shah Qajar,[Ahmad Shah Qajar],True,[Ahmad Shah Qajar],Who is the child of Mahmoud Mirza's father?,"[{'id': 575188, 'question': 'Who is Mahmoud Mi..."
1,2hop__731584_700117,"[{'idx': 0, 'title': 'KAPE', 'paragraph_text':...",Berrien County,[Berrien County],True,[Berrien County],In which county is the city to which KKVU is l...,"[{'id': 731584, 'question': 'To which city is ..."
2,2hop__690412_526810,"[{'idx': 0, 'title': 'Cabramatta Creek', 'para...",Chao Phraya River,[Chao Phraya River],True,[Chao Phraya River],For what river does the river on which Pa Sak ...,"[{'id': 690412, 'question': 'On which river is..."
3,2hop__263638_69048,"[{'idx': 0, 'title': 'Michael J. Barron', 'par...",Honorable Justice Abiodun Smith,[Honorable Justice Abiodun Smith],True,[Honorable Justice Abiodun Smith],Who is the Chief Judge of the Tebesa Nemine's ...,"[{'id': 263638, 'question': 'Where was Tebesa ..."
4,2hop__142842_68489,"[{'idx': 0, 'title': 'Perfect Night: Live in L...",Snapper Foster,[Snapper Foster],True,[Snapper Foster],Who did the performer of Night Rocker play on ...,"[{'id': 142842, 'question': 'Who performed Nig..."


In [29]:
def perfect_retrieval_func(docs, query):
    """Oracle retriever: keep only the documents flagged as supporting.

    Args:
        docs: Iterable of mappings, each carrying an ``is_supporting`` entry.
        query: Ignored; the ``is_supporting`` flag alone decides what is kept.

    Returns:
        A new list with the elements of ``docs`` whose ``is_supporting`` value
        is truthy, in their original order.

    Raises:
        KeyError: If a document has no ``is_supporting`` entry.
    """
    return [doc for doc in docs if doc['is_supporting']]

In [30]:
results = []  # one score dict per benchmark run; the cells below append to it and the Report section tabulates it

In [31]:
# Benchmark `answer_question_standard` N_RUNS times with ground-truth retrieval.
# Rows recorded by an earlier execution of this cell are dropped first, so re-running
# the cell replaces its runs in `results` instead of double-counting them in the report.
results = [r for r in results if r["qa"] != "standard"]
for i in range(1, N_RUNS+1):
    # NOTE(review): ignore_errors=True presumably lets failed examples pass silently — confirm how they are scored.
    df_standard, scores = benchmark(df, answer_question_standard, perfect_retrieval_func, ignore_errors=True)
    results.append({**scores, "retrieval": "groundtruth", "context": "paragraphs", "qa": "standard", "run": i})
    jprint(scores)

  0%|          | 0/100 [00:00<?, ?it/s]

{
  "exact_match": 0.57,
  "f1": 0.6653958916900092,
  "fuzzy_match": 0.7
}


  0%|          | 0/100 [00:00<?, ?it/s]

{
  "exact_match": 0.58,
  "f1": 0.6738557847381377,
  "fuzzy_match": 0.71
}


  0%|          | 0/100 [00:00<?, ?it/s]

{
  "exact_match": 0.56,
  "f1": 0.6608860877684406,
  "fuzzy_match": 0.7
}


In [32]:
# Benchmark `answer_question_cot` (reported as "cot-zs") N_RUNS times with ground-truth retrieval.
# Rows recorded by an earlier execution of this cell are dropped first, so re-running
# the cell replaces its runs in `results` instead of double-counting them in the report.
results = [r for r in results if r["qa"] != "cot-zs"]
for i in range(1, N_RUNS+1):
    # NOTE(review): ignore_errors=True presumably lets failed examples pass silently — confirm how they are scored.
    df_cot, scores = benchmark(df, answer_question_cot, perfect_retrieval_func, ignore_errors=True)
    results.append({**scores, "retrieval": "groundtruth", "context": "paragraphs", "qa": "cot-zs", "run": i})
    jprint(scores)

  0%|          | 0/100 [00:00<?, ?it/s]

{
  "exact_match": 0.47,
  "f1": 0.6037142857142856,
  "fuzzy_match": 0.61
}


  0%|          | 0/100 [00:00<?, ?it/s]

{
  "exact_match": 0.46,
  "f1": 0.6016428571428571,
  "fuzzy_match": 0.6
}


  0%|          | 0/100 [00:00<?, ?it/s]

{
  "exact_match": 0.45,
  "f1": 0.6001785714285713,
  "fuzzy_match": 0.62
}


In [33]:
# Benchmark `answer_question_cot_fs` (reported as "cot-fs") N_RUNS times with ground-truth retrieval.
# Rows recorded by an earlier execution of this cell are dropped first, so re-running
# the cell replaces its runs in `results` instead of double-counting them in the report.
results = [r for r in results if r["qa"] != "cot-fs"]
for i in range(1, N_RUNS+1):
    # NOTE(review): ignore_errors=True presumably lets failed examples pass silently — confirm how they are scored.
    df_cot_fs, scores = benchmark(df, answer_question_cot_fs, perfect_retrieval_func, ignore_errors=True)
    results.append({**scores, "retrieval": "groundtruth", "context": "paragraphs", "qa": "cot-fs", "run": i})
    jprint(scores)

  0%|          | 0/100 [00:00<?, ?it/s]

{
  "exact_match": 0.5,
  "f1": 0.6384376091744514,
  "fuzzy_match": 0.67
}


  0%|          | 0/100 [00:00<?, ?it/s]

{
  "exact_match": 0.51,
  "f1": 0.6531378533746954,
  "fuzzy_match": 0.7
}


  0%|          | 0/100 [00:00<?, ?it/s]

{
  "exact_match": 0.52,
  "f1": 0.6674578111946532,
  "fuzzy_match": 0.71
}


In [34]:
# Benchmark `answer_question_cte` N_RUNS times with ground-truth retrieval.
# Rows recorded by an earlier execution of this cell are dropped first, so re-running
# the cell replaces its runs in `results` instead of double-counting them in the report.
results = [r for r in results if r["qa"] != "cte"]
for i in range(1, N_RUNS+1):
    # NOTE(review): ignore_errors=True presumably lets failed examples pass silently — confirm how they are scored.
    df_cte, scores = benchmark(df, answer_question_cte, perfect_retrieval_func, ignore_errors=True)
    results.append({**scores, "retrieval": "groundtruth", "context": "paragraphs", "qa": "cte", "run": i})
    jprint(scores)

  0%|          | 0/100 [00:00<?, ?it/s]

{
  "exact_match": 0.6,
  "f1": 0.7112052859421278,
  "fuzzy_match": 0.73
}


  0%|          | 0/100 [00:00<?, ?it/s]

{
  "exact_match": 0.62,
  "f1": 0.7364293755779824,
  "fuzzy_match": 0.75
}


  0%|          | 0/100 [00:00<?, ?it/s]

{
  "exact_match": 0.61,
  "f1": 0.7255243188184365,
  "fuzzy_match": 0.74
}


In [35]:
# NOTE(review): the `answer_question_cte_cot` ablation below is commented out, so no
# "cte+cot-fs" rows reach `results` and the report omits that strategy. Re-enable it or
# delete this cell.
# for i in range(1, N_RUNS+1):
#     df_cte_cot, scores = benchmark(df, answer_question_cte_cot, perfect_retrieval_func, ignore_errors=True)
#     results.append({**scores, "retrieval": "groundtruth", "context": "paragraphs", "qa": "cte+cot-fs", "run": i})
#     jprint(scores)

# Report

In [36]:
# One row per recorded run, restricted to the columns shown in the report.
# NOTE(review): the scores also contain `fuzzy_match`, which is not selected here — confirm that is intended.
report_columns = ['context', 'retrieval', 'qa', 'run', 'exact_match', 'f1']
report_df = pd.DataFrame(results, columns=report_columns)
report_df

Unnamed: 0,context,retrieval,qa,run,exact_match,f1
0,paragraphs,groundtruth,standard,1,0.57,0.665396
1,paragraphs,groundtruth,standard,2,0.58,0.673856
2,paragraphs,groundtruth,standard,3,0.56,0.660886
3,paragraphs,groundtruth,cot-zs,1,0.47,0.603714
4,paragraphs,groundtruth,cot-zs,2,0.46,0.601643
5,paragraphs,groundtruth,cot-zs,3,0.45,0.600179
6,paragraphs,groundtruth,cot-fs,1,0.5,0.638438
7,paragraphs,groundtruth,cot-fs,2,0.51,0.653138
8,paragraphs,groundtruth,cot-fs,3,0.52,0.667458
9,paragraphs,groundtruth,cte,1,0.6,0.711205


In [40]:
# Average each metric over the runs of a QA strategy, then list the strategies in a fixed order.
(
    report_df
    .groupby('qa')[['exact_match', 'f1']]
    .mean()
    .loc[['standard', 'cot-zs', 'cot-fs', 'cte']]
)

Unnamed: 0_level_0,exact_match,f1
qa,Unnamed: 1_level_1,Unnamed: 2_level_1
standard,0.57,0.667
cot-zs,0.46,0.602
cot-fs,0.51,0.653
cte,0.61,0.724


## Inspect

In [38]:
# Show one example from the last `answer_question_cot` run: the question, the
# gold answers, then the answer and reasoning stored on the first hop.
i = 2
row = df_cot.iloc[i]
hop = row['raw_output']['hops'][0]
llm_output = hop['llm_output']
print(row['question'], row['answers'], llm_output.answer, llm_output.reasoning, sep='\n')

For what river does the river on which Pa Sak Jolasid Dam is located serve as the mouth?
['Chao Phraya River']
Chao Phraya River
The Pa Sak Jolasid Dam impounds the Pa Sak River. The Pa Sak River is one of the principal tributaries of the Chao Phraya River in Thailand. Therefore, the river on which the Pa Sak Jolasid Dam is located serves as a tributary to the Chao Phraya River.
