In [14]:
# Load variables from a .env file into the process environment (python-dotenv).
# NOTE(review): presumably this supplies credentials for the QA backends used below — confirm.
from dotenv import load_dotenv
load_dotenv()

True

In [15]:
import json
import pandas as pd
from pathlib import Path
from copy import deepcopy
from functools import partial

from bellek.qa.jerxrm import answer_question, answer_question_with_reasoning, answer_question_with_triplets, answer_question_reasoning_with_triplets
from bellek.utils import set_seed, jprint
from bellek.musique.singlehop import benchmark

# Fix the seed through the project helper so repeated notebook runs start from the same state.
# NOTE(review): which RNGs set_seed covers is defined in bellek.utils and is not visible here.
set_seed(89)

In [16]:
# tqdm.auto selects the notebook or console progress bar depending on the frontend.
from tqdm.auto import tqdm
# Register tqdm's progress_* methods (e.g. progress_apply) on pandas objects.
tqdm.pandas()

In [17]:
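# NOTE(review): this appears to be the one-off snippet that sampled the 100 ablation
# record ids; the ids are now imported from bellek.musique.constants in the next cell.
# Kept for provenance only — re-running it would draw a different sample.
# from datetime import datetime
# import random
# random.seed(datetime.now().timestamp())
# df = pd.read_json('../../data/generated/musique-evaluation/dataset.jsonl', orient='records', lines=True)
# ABLATION_RECORD_IDS = random.sample(df['id'].tolist(), 100)
# ABLATION_RECORD_IDS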

In [18]:
from bellek.musique.constants import ABLATION_RECORD_IDS

# Restrict the evaluation dataset to the fixed ablation subset, in the order of
# ABLATION_RECORD_IDS (.loc raises KeyError if any id is missing from the file).
dataset_df = pd.read_json('../../data/generated/musique-evaluation/dataset.jsonl', orient='records', lines=True)
ablation_df = dataset_df.set_index('id', drop=False).loc[ABLATION_RECORD_IDS].copy().reset_index(drop=True)

# Replace the original question / question_decomposition columns with the ones
# stored in the question-decomposition file, matched on record id.
qd_df = pd.read_json('../../data/generated/musique-evaluation/question-decomposition.jsonl', orient='records', lines=True)
df = pd.merge(
    ablation_df.drop(columns=['question', 'question_decomposition']),
    qd_df,
    on='id',
    # Empty suffixes are kept on purpose: if any non-key column still overlapped,
    # pandas would raise instead of silently renaming it.
    suffixes=('', ''),
)

# pd.merge defaults to an inner join, so a record with no decomposition (or a
# duplicated one) would silently change the sample size. Fail loudly instead.
assert len(df) == len(ablation_df), (
    f"merge changed the number of ablation records: {len(ablation_df)} -> {len(df)}"
)
# df = df.head(10)

print(df.shape)
df.head()

(100, 8)


Unnamed: 0,id,paragraphs,answer,answer_aliases,answerable,answers,question,question_decomposition
0,2hop__575188_342798,"[{'idx': 0, 'title': 'Liliana Mumy', 'paragrap...",Ahmad Shah Qajar,[Ahmad Shah Qajar],True,[Ahmad Shah Qajar],Who is the child of Mahmoud Mirza's father?,"[{'id': 575188, 'question': 'Who is Mahmoud Mi..."
1,2hop__731584_700117,"[{'idx': 0, 'title': 'KAPE', 'paragraph_text':...",Berrien County,[Berrien County],True,[Berrien County],In which county is the city to which KKVU is l...,"[{'id': 731584, 'question': 'To which city is ..."
2,2hop__690412_526810,"[{'idx': 0, 'title': 'Cabramatta Creek', 'para...",Chao Phraya River,[Chao Phraya River],True,[Chao Phraya River],For what river does the river on which Pa Sak ...,"[{'id': 690412, 'question': 'On which river is..."
3,2hop__263638_69048,"[{'idx': 0, 'title': 'Michael J. Barron', 'par...",Honorable Justice Abiodun Smith,[Honorable Justice Abiodun Smith],True,[Honorable Justice Abiodun Smith],Who is the Chief Judge of the Tebesa Nemine's ...,"[{'id': 263638, 'question': 'Where was Tebesa ..."
4,2hop__142842_68489,"[{'idx': 0, 'title': 'Perfect Night: Live in L...",Snapper Foster,[Snapper Foster],True,[Snapper Foster],Who did the performer of Night Rocker play on ...,"[{'id': 142842, 'question': 'Who performed Nig..."


In [19]:
def perfect_retrieval_func(docs, query):
    """Oracle retriever: keep only the documents flagged as supporting.

    Args:
        docs: iterable of documents; each must support ``doc['is_supporting']``.
        query: unused. Accepted so the two-argument ``(docs, query)`` signature
            of the original callable is preserved.

    Returns:
        A new list holding the documents whose ``'is_supporting'`` value is
        truthy, in their original order.

    Raises:
        KeyError: if a document has no ``'is_supporting'`` key.
    """
    return [doc for doc in docs if doc['is_supporting']]

In [20]:
# Number of times each QA strategy is benchmarked.
N_RUNS = 3
# Accumulates one score record per (strategy, run); the benchmark cells below append to it.
# Re-running a benchmark cell without re-running this one appends duplicate records.
results = []

In [21]:
# Direct QA over the ground-truth supporting paragraphs, repeated N_RUNS times.
for i in range(1, N_RUNS + 1):
    _, scores = benchmark(df, answer_question, perfect_retrieval_func, ignore_errors=True)
    jprint(scores)
    # Tag a copy of the scores with the experiment settings before recording it.
    record = {**scores}
    record.update({"retrieval": "groundtruth", "context": "paragraphs", "qa": "direct", "run": i})
    results.append(record)

  0%|          | 0/100 [00:00<?, ?it/s]

Failed to answer the question 2hop__472083_7298
Failed to parse model output. You may need to update your prompt to encourage the model to return a specific type.
{
  "exact_match": 0.57,
  "f1": 0.6749593478417008,
  "fuzzy_match": 0.7
}


  0%|          | 0/100 [00:00<?, ?it/s]

{
  "exact_match": 0.55,
  "f1": 0.6667688716512246,
  "fuzzy_match": 0.7
}


  0%|          | 0/100 [00:00<?, ?it/s]

Failed to answer the question 2hop__472083_7298
Failed to parse model output. You may need to update your prompt to encourage the model to return a specific type.
{
  "exact_match": 0.55,
  "f1": 0.6471022049845578,
  "fuzzy_match": 0.67
}


In [22]:
# QA with reasoning over the ground-truth supporting paragraphs, repeated N_RUNS times.
for i in range(1, N_RUNS + 1):
    _, scores = benchmark(df, answer_question_with_reasoning, perfect_retrieval_func, ignore_errors=True)
    jprint(scores)
    # Tag a copy of the scores with the experiment settings before recording it.
    record = {**scores}
    record.update({"retrieval": "groundtruth", "context": "paragraphs", "qa": "reasoning", "run": i})
    results.append(record)

  0%|          | 0/100 [00:00<?, ?it/s]

{
  "exact_match": 0.55,
  "f1": 0.6532875457875457,
  "fuzzy_match": 0.65
}


  0%|          | 0/100 [00:00<?, ?it/s]

{
  "exact_match": 0.5,
  "f1": 0.6333449687567334,
  "fuzzy_match": 0.65
}


  0%|          | 0/100 [00:00<?, ?it/s]

{
  "exact_match": 0.49,
  "f1": 0.617964016375781,
  "fuzzy_match": 0.63
}


In [23]:
# QA with triplets over the ground-truth supporting paragraphs, repeated N_RUNS times.
for i in range(1, N_RUNS + 1):
    _, scores = benchmark(df, answer_question_with_triplets, perfect_retrieval_func, ignore_errors=True)
    jprint(scores)
    # Tag a copy of the scores with the experiment settings before recording it.
    record = {**scores}
    record.update({"retrieval": "groundtruth", "context": "paragraphs", "qa": "triplets", "run": i})
    results.append(record)

  0%|          | 0/100 [00:00<?, ?it/s]

{
  "exact_match": 0.61,
  "f1": 0.6997908496732025,
  "fuzzy_match": 0.73
}


  0%|          | 0/100 [00:00<?, ?it/s]

{
  "exact_match": 0.59,
  "f1": 0.6947630718954246,
  "fuzzy_match": 0.72
}


  0%|          | 0/100 [00:00<?, ?it/s]

{
  "exact_match": 0.59,
  "f1": 0.6872896825396824,
  "fuzzy_match": 0.7
}


In [24]:
# QA with reasoning + triplets over the ground-truth supporting paragraphs, repeated N_RUNS times.
for i in range(1, N_RUNS + 1):
    _, scores = benchmark(df, answer_question_reasoning_with_triplets, perfect_retrieval_func, ignore_errors=True)
    jprint(scores)
    # Tag a copy of the scores with the experiment settings before recording it.
    record = {**scores}
    record.update({"retrieval": "groundtruth", "context": "paragraphs", "qa": "reasoning+triplets", "run": i})
    results.append(record)

  0%|          | 0/100 [00:00<?, ?it/s]

{
  "exact_match": 0.62,
  "f1": 0.7267449684818105,
  "fuzzy_match": 0.74
}


  0%|          | 0/100 [00:00<?, ?it/s]

{
  "exact_match": 0.59,
  "f1": 0.7006508785332316,
  "fuzzy_match": 0.7
}


  0%|          | 0/100 [00:00<?, ?it/s]

{
  "exact_match": 0.56,
  "f1": 0.6883247602071133,
  "fuzzy_match": 0.7
}


# Report

In [25]:
# Tabulate every recorded run; only the columns listed here are kept, in this order.
REPORT_COLUMNS = ['context', 'retrieval', 'qa', 'run', 'exact_match', 'f1']
report_df = pd.DataFrame.from_records(results, columns=REPORT_COLUMNS)
report_df

Unnamed: 0,context,retrieval,qa,run,exact_match,f1
0,paragraphs,groundtruth,direct,1,0.57,0.674959
1,paragraphs,groundtruth,direct,2,0.55,0.666769
2,paragraphs,groundtruth,direct,3,0.55,0.647102
3,paragraphs,groundtruth,reasoning,1,0.55,0.653288
4,paragraphs,groundtruth,reasoning,2,0.5,0.633345
5,paragraphs,groundtruth,reasoning,3,0.49,0.617964
6,paragraphs,groundtruth,triplets,1,0.61,0.699791
7,paragraphs,groundtruth,triplets,2,0.59,0.694763
8,paragraphs,groundtruth,triplets,3,0.59,0.68729
9,paragraphs,groundtruth,reasoning+triplets,1,0.62,0.726745


In [26]:
# Average exact-match and F1 across the runs of each QA strategy.
qa_scores = report_df.drop(columns=['context', 'retrieval', 'run'])
qa_scores.groupby(['qa']).mean()

Unnamed: 0_level_0,exact_match,f1
qa,Unnamed: 1_level_1,Unnamed: 2_level_1
direct,0.556667,0.662943
reasoning,0.513333,0.634866
reasoning+triplets,0.59,0.70524
triplets,0.596667,0.693948


In [27]:
# Per-run scores without the context / retrieval columns.
report_df.drop(['context', 'retrieval'], axis='columns')

Unnamed: 0,qa,run,exact_match,f1
0,direct,1,0.57,0.674959
1,direct,2,0.55,0.666769
2,direct,3,0.55,0.647102
3,reasoning,1,0.55,0.653288
4,reasoning,2,0.5,0.633345
5,reasoning,3,0.49,0.617964
6,triplets,1,0.61,0.699791
7,triplets,2,0.59,0.694763
8,triplets,3,0.59,0.68729
9,reasoning+triplets,1,0.62,0.726745
