In [1]:
from dotenv import load_dotenv
# Load settings from a .env file into the environment (python-dotenv).
# As the last expression in the cell, the call's boolean return value is displayed.
load_dotenv()

True

In [2]:
import json
import pandas as pd
from pathlib import Path
from copy import deepcopy
from functools import partial

from bellek.qa.ablation import answer_question, answer_question_with_reasoning, answer_question_with_triplets, answer_question_reasoning_with_triplets
from bellek.utils import set_seed, jprint
from bellek.musique.singlehop import benchmark

# Seed value handed to bellek's set_seed helper.
# NOTE(review): presumably this seeds the RNGs used downstream — confirm in bellek.utils.
SEED = 89
set_seed(SEED)



In [3]:
from tqdm.auto import tqdm
# Register tqdm's pandas integration (adds the `progress_apply` family to pandas objects).
tqdm.pandas()

In [4]:
from bellek.musique.constants import ABLATION_RECORD_IDS

# Number of examples kept for this quick ablation run.
N_EXAMPLES = 10

# Stage 1: read the evaluation records and select the ablation subset by record id.
dataset_df = pd.read_json('../../data/generated/musique-evaluation/dataset.jsonl', orient='records', lines=True)
ablation_df = (
    dataset_df
    .set_index('id', drop=False)
    .loc[ABLATION_RECORD_IDS]
    .copy()
    .reset_index(drop=True)
)

# Stage 2: replace the original question / decomposition columns with the ones
# stored in the separate question-decomposition file, joined on the record id.
qd_df = pd.read_json('../../data/generated/musique-evaluation/question-decomposition.jsonl', orient='records', lines=True)
df = pd.merge(
    ablation_df.drop(columns=['question', 'question_decomposition']),
    qd_df,
    on='id',
    # NOTE(review): with both suffixes empty, pandas should refuse to merge frames
    # that share a non-key column instead of renaming them — confirm this is intended.
    suffixes=('', ''),
).head(N_EXAMPLES)

print(df.shape)
df.head()

(10, 8)


Unnamed: 0,id,paragraphs,answer,answer_aliases,answerable,answers,question,question_decomposition
0,2hop__575188_342798,"[{'idx': 0, 'title': 'Liliana Mumy', 'paragrap...",Ahmad Shah Qajar,[Ahmad Shah Qajar],True,[Ahmad Shah Qajar],Who is the child of Mahmoud Mirza's father?,"[{'id': 575188, 'question': 'Who is Mahmoud Mi..."
1,2hop__731584_700117,"[{'idx': 0, 'title': 'KAPE', 'paragraph_text':...",Berrien County,[Berrien County],True,[Berrien County],In which county is the city to which KKVU is l...,"[{'id': 731584, 'question': 'To which city is ..."
2,2hop__690412_526810,"[{'idx': 0, 'title': 'Cabramatta Creek', 'para...",Chao Phraya River,[Chao Phraya River],True,[Chao Phraya River],For what river does the river on which Pa Sak ...,"[{'id': 690412, 'question': 'On which river is..."
3,2hop__263638_69048,"[{'idx': 0, 'title': 'Michael J. Barron', 'par...",Honorable Justice Abiodun Smith,[Honorable Justice Abiodun Smith],True,[Honorable Justice Abiodun Smith],Who is the Chief Judge of the Tebesa Nemine's ...,"[{'id': 263638, 'question': 'Where was Tebesa ..."
4,2hop__142842_68489,"[{'idx': 0, 'title': 'Perfect Night: Live in L...",Snapper Foster,[Snapper Foster],True,[Snapper Foster],Who did the performer of Night Rocker play on ...,"[{'id': 142842, 'question': 'Who performed Nig..."


In [5]:
def perfect_retrieval_func(docs, query):
    """Return only the documents flagged as supporting, in their original order.

    Args:
        docs: iterable of mappings, each carrying an 'is_supporting' entry.
        query: accepted for interface compatibility with retrieval functions; unused.

    Returns:
        A list with every doc whose 'is_supporting' value is truthy.
    """
    supporting = []
    for doc in docs:
        if doc['is_supporting']:
            supporting.append(doc)
    return supporting

In [6]:
# How many times each QA configuration is benchmarked (used as the loop bound in the benchmark cells).
N_RUNS = 1
# Accumulator: one dict per benchmark run, holding the scores plus the
# retrieval / context / qa / run labels; turned into the report table at the end.
results = []

In [7]:
# Benchmark the direct-answer QA function, retrieving only the supporting paragraphs.
setting = {"retrieval": "groundtruth", "context": "paragraphs", "qa": "direct"}

# Keep the cell idempotent: re-executing it must not append a second set of
# "run 1..N_RUNS" rows, so drop whatever an earlier execution stored for this setting.
results[:] = [
    record for record in results
    if any(record.get(key) != value for key, value in setting.items())
]

for i in range(1, N_RUNS+1):
    # df_standard holds the first element returned by benchmark from the latest run
    df_standard, scores = benchmark(df, answer_question, perfect_retrieval_func, ignore_errors=True)
    results.append({**scores, **setting, "run": i})
    jprint(scores)

  0%|          | 0/10 [00:00<?, ?it/s]

{
  "exact_match": 0.6,
  "f1": 0.6666666666666666,
  "fuzzy_match": 0.7
}


In [8]:
# Benchmark the reasoning (chain-of-thought) QA function, retrieving only the supporting paragraphs.
setting = {"retrieval": "groundtruth", "context": "paragraphs", "qa": "reasoning"}

# Keep the cell idempotent: re-executing it must not append a second set of
# "run 1..N_RUNS" rows, so drop whatever an earlier execution stored for this setting.
results[:] = [
    record for record in results
    if any(record.get(key) != value for key, value in setting.items())
]

for i in range(1, N_RUNS+1):
    # df_cot (per-example results of the latest run) is inspected in the following cell
    df_cot, scores = benchmark(df, answer_question_with_reasoning, perfect_retrieval_func, ignore_errors=True)
    results.append({**scores, **setting, "run": i})
    jprint(scores)

  0%|          | 0/10 [00:00<?, ?it/s]

{
  "exact_match": 0.5,
  "f1": 0.7126984126984127,
  "fuzzy_match": 0.8
}


In [9]:
# Spot-check a single reasoning example: print whether it was an exact match and
# its question, then the question and LLM output recorded for each hop.
EXAMPLE_POS = 2
row = df_cot.iloc[EXAMPLE_POS]
for column in ('exact_match', 'question'):
    print(row[column])
hops = row['raw_output']['hops']
for hop in hops:
    for field in ('question', 'llm_output'):
        print(hop[field])

True
For what river does the river on which Pa Sak Jolasid Dam is located serve as the mouth?
For what river does the river on which Pa Sak Jolasid Dam is located serve as the mouth?
reasoning='The Pa Sak Jolasid Dam impounds the Pa Sak River. The Pa Sak River is one of the principal tributaries of the Chao Phraya River in Thailand. Therefore, the river on which the Pa Sak Jolasid Dam is located serves as a tributary to the Chao Phraya River.' answer='Chao Phraya River'


In [10]:
# Benchmark the triplet-based QA function, retrieving only the supporting paragraphs.
setting = {"retrieval": "groundtruth", "context": "paragraphs", "qa": "triplets"}

# Keep the cell idempotent: re-executing it must not append a second set of
# "run 1..N_RUNS" rows, so drop whatever an earlier execution stored for this setting.
results[:] = [
    record for record in results
    if any(record.get(key) != value for key, value in setting.items())
]

for i in range(1, N_RUNS+1):
    # Only the aggregate scores are needed here; the first returned element is discarded
    _, scores = benchmark(df, answer_question_with_triplets, perfect_retrieval_func, ignore_errors=True)
    results.append({**scores, **setting, "run": i})
    jprint(scores)

  0%|          | 0/10 [00:00<?, ?it/s]

In [None]:
# Benchmark the reasoning-with-triplets QA function, retrieving only the supporting paragraphs.
setting = {"retrieval": "groundtruth", "context": "paragraphs", "qa": "reasoning+triplets"}

# Keep the cell idempotent: re-executing it must not append a second set of
# "run 1..N_RUNS" rows, so drop whatever an earlier execution stored for this setting.
results[:] = [
    record for record in results
    if any(record.get(key) != value for key, value in setting.items())
]

for i in range(1, N_RUNS+1):
    # Only the aggregate scores are needed here; the first returned element is discarded
    _, scores = benchmark(df, answer_question_reasoning_with_triplets, perfect_retrieval_func, ignore_errors=True)
    results.append({**scores, **setting, "run": i})
    jprint(scores)

  0%|          | 0/10 [00:00<?, ?it/s]

{
  "exact_match": 0.7,
  "f1": 0.7571428571428571,
  "fuzzy_match": 0.8
}


# Report

In [None]:
# Columns shown in the report: the configuration labels, the run number, and two of the metrics.
REPORT_COLUMNS = ['context', 'retrieval', 'qa', 'run', 'exact_match', 'f1']

# One row per accumulated benchmark run.
report_df = pd.DataFrame.from_records(
    results,
    columns=REPORT_COLUMNS,
)
report_df

Unnamed: 0,context,retrieval,qa,run,exact_match,f1
0,paragraphs,groundtruth,direct,1,0.6,0.6
1,paragraphs,groundtruth,reasoning,1,0.5,0.696032
2,paragraphs,groundtruth,triplets,1,0.7,0.807143
3,paragraphs,groundtruth,reasoning+triplets,1,0.7,0.757143


In [None]:
# Average the metric columns over runs for each QA strategy.
per_run_metrics = report_df.drop(columns=['context', 'retrieval', 'run'])
per_run_metrics.groupby('qa').mean()

Unnamed: 0_level_0,exact_match,f1
qa,Unnamed: 1_level_1,Unnamed: 2_level_1
direct,0.6,0.6
reasoning,0.5,0.696032
reasoning+triplets,0.7,0.757143
triplets,0.7,0.807143
