In [1]:
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
import json
import pandas as pd
from pathlib import Path
from copy import deepcopy
from functools import partial

from bellem.qa.ablation import answer_question, answer_question_cot
from bellem.utils import set_seed, jprint
from bellem.musique.multihop import benchmark

set_seed(89)

In [3]:
from tqdm.auto import tqdm
tqdm.pandas()

In [4]:
from bellem.musique.constants import ABLATION_RECORD_IDS

df = pd.read_json('../../data/generated/musique-evaluation/dataset.jsonl', orient='records', lines=True)
df = df.set_index('id', drop=False).loc[ABLATION_RECORD_IDS].copy().reset_index(drop=True)
qd_df = pd.read_json('../../data/generated/musique-evaluation/question-decomposition.jsonl', orient='records', lines=True)
df = pd.merge(df.drop(columns=['question', 'question_decomposition']), qd_df, on='id', suffixes=('', ''))
# df = df.head(10)

print(df.shape)
df.head()

(100, 8)


Unnamed: 0,id,paragraphs,answer,answer_aliases,answerable,answers,question,question_decomposition
0,2hop__575188_342798,"[{'idx': 0, 'title': 'Liliana Mumy', 'paragrap...",Ahmad Shah Qajar,[Ahmad Shah Qajar],True,[Ahmad Shah Qajar],Who is the child of Mahmoud Mirza's father?,"[{'id': 575188, 'question': 'Who is Mahmoud Mi..."
1,2hop__731584_700117,"[{'idx': 0, 'title': 'KAPE', 'paragraph_text':...",Berrien County,[Berrien County],True,[Berrien County],In which county is the city to which KKVU is l...,"[{'id': 731584, 'question': 'To which city is ..."
2,2hop__690412_526810,"[{'idx': 0, 'title': 'Cabramatta Creek', 'para...",Chao Phraya River,[Chao Phraya River],True,[Chao Phraya River],For what river does the river on which Pa Sak ...,"[{'id': 690412, 'question': 'On which river is..."
3,2hop__263638_69048,"[{'idx': 0, 'title': 'Michael J. Barron', 'par...",Honorable Justice Abiodun Smith,[Honorable Justice Abiodun Smith],True,[Honorable Justice Abiodun Smith],Who is the Chief Judge of the Tebesa Nemine's ...,"[{'id': 263638, 'question': 'Where was Tebesa ..."
4,2hop__142842_68489,"[{'idx': 0, 'title': 'Perfect Night: Live in L...",Snapper Foster,[Snapper Foster],True,[Snapper Foster],Who did the performer of Night Rocker play on ...,"[{'id': 142842, 'question': 'Who performed Nig..."


In [5]:
jerx_file = Path("../../data/raw/musique-evaluation/jerx-inferences/llama3-base.jsonl")
jerx_df = pd.read_json(jerx_file, lines=True)

jerx_mapping = {(row['id'], row['paragraph_idx']): row['generation'] for _, row in jerx_df.iterrows()}

def extract_triplets(example: dict):
    example["triplets_str"] = [jerx_mapping[(example['id'], p['idx'])].strip() for p in example['paragraphs']]
    return example


In [6]:
def enhance_paragraphs(row):
    paragraphs_with_triplets = []
    for p in row['paragraphs']:
        p = deepcopy(p)
        triplets_str = str(jerx_mapping[(row['id'], p['idx'])])
        p['paragraph_text'] = '\n'.join([p['paragraph_text'], "# Entity-relation-entity triplets", triplets_str])
        paragraphs_with_triplets.append(p)
    row['paragraphs'] = paragraphs_with_triplets
    return row

df_paragraph_triplets = df.apply(enhance_paragraphs, axis=1) 
df_paragraph_triplets.head()
print(df_paragraph_triplets.iloc[0]['paragraphs'][2]['paragraph_text'])

Mirza Mehdy Ispahani (also known as Sadri Ispahani) (1923–2004), son of Mirza Ahmad Ispahani, was Chairman of M.M. Ispahani from 1949 till 2004. Mirza Ali Behrouze Ispahani, son of Mirza Mehdy Ispahani was elected as the Chairman of M.M. Ispahani in 2004.
# Entity-relation-entity triplets
Mirza Mehdy Ispahani | occupation | Chairman of M.M. Ispahani
Mirza Mehdy Ispahani | tenure | 1949-2004
Mirza Mehdy Ispahani | father | Mirza Ahmad Ispahani
Mirza Mehdy Ispahani | alternative name | Sadri Ispahani
Mirza Ali Behrouze Ispahani | occupation | Chairman of M.M. Ispahani
Mirza Ali Behrouze Ispahani | father | Mirza Mehdy Ispahani


In [7]:
def replace_paragraphs(row):
    paragraphs_with_triplets = []
    for p in row['paragraphs']:
        p = deepcopy(p) 
        triplets_str = str(jerx_mapping[(row['id'], p['idx'])])
        p['paragraph_text'] = '\n'.join(["# Entity-relation-entity triplets", triplets_str])
        paragraphs_with_triplets.append(p)
    row['paragraphs'] = paragraphs_with_triplets
    return row

df_only_triplets = df.apply(replace_paragraphs, axis=1) 
df_only_triplets.head()
print(df_only_triplets.iloc[0]['paragraphs'][2]['paragraph_text'])

# Entity-relation-entity triplets
Mirza Mehdy Ispahani | occupation | Chairman of M.M. Ispahani
Mirza Mehdy Ispahani | tenure | 1949-2004
Mirza Mehdy Ispahani | father | Mirza Ahmad Ispahani
Mirza Mehdy Ispahani | alternative name | Sadri Ispahani
Mirza Ali Behrouze Ispahani | occupation | Chairman of M.M. Ispahani
Mirza Ali Behrouze Ispahani | father | Mirza Mehdy Ispahani


In [8]:
perfect_retrieval_func = lambda docs, query: [doc for doc in docs if doc['is_supporting']]

In [9]:
N_RUNS = 3
results = []

In [10]:
for i in range(1, N_RUNS+1):
    _, scores = benchmark(df, answer_question, perfect_retrieval_func, ignore_errors=True)
    results.append({**scores, "retrieval": "groundtruth", "context": "paragraphs", "qa": "direct", "run": i})
    jprint(scores)

  0%|          | 0/100 [00:00<?, ?it/s]

{
  "exact_match": 0.62,
  "f1": 0.7017070707070707,
  "fuzzy_match": 0.73
}


  0%|          | 0/100 [00:00<?, ?it/s]

{
  "exact_match": 0.63,
  "f1": 0.7033737373737374,
  "fuzzy_match": 0.74
}


  0%|          | 0/100 [00:00<?, ?it/s]

{
  "exact_match": 0.6,
  "f1": 0.6817070707070707,
  "fuzzy_match": 0.72
}


In [11]:
for i in range(1, N_RUNS+1):
    _, scores = benchmark(df_only_triplets, answer_question, perfect_retrieval_func, ignore_errors=True)
    results.append({**scores, "retrieval": "groundtruth", "context": "triplets", "qa": "direct", "run": i})
    jprint(scores)

  0%|          | 0/100 [00:00<?, ?it/s]

Failed to answer the question 2hop__472083_7298
Failed to parse model output. You may need to update your prompt to encourage the model to return a specific type.
{
  "exact_match": 0.52,
  "f1": 0.6471587301587302,
  "fuzzy_match": 0.67
}


  0%|          | 0/100 [00:00<?, ?it/s]

Failed to answer the question 2hop__472083_7298
Failed to parse model output. You may need to update your prompt to encourage the model to return a specific type.
{
  "exact_match": 0.54,
  "f1": 0.659047619047619,
  "fuzzy_match": 0.7
}


  0%|          | 0/100 [00:00<?, ?it/s]

Failed to answer the question 2hop__472083_7298
Failed to parse model output. You may need to update your prompt to encourage the model to return a specific type.
{
  "exact_match": 0.52,
  "f1": 0.6563809523809523,
  "fuzzy_match": 0.69
}


In [12]:
for i in range(1, N_RUNS+1):
    _, scores = benchmark(df_paragraph_triplets, answer_question, perfect_retrieval_func, ignore_errors=True)
    results.append({**scores, "retrieval": "groundtruth", "context": "triplets+paragraphs", "qa": "direct", "run": i})
    jprint(scores)

  0%|          | 0/100 [00:00<?, ?it/s]

Failed to answer the question 2hop__472083_7298
Failed to parse model output. You may need to update your prompt to encourage the model to return a specific type.
{
  "exact_match": 0.58,
  "f1": 0.677002553002553,
  "fuzzy_match": 0.69
}


  0%|          | 0/100 [00:00<?, ?it/s]

Failed to answer the question 2hop__472083_7298
Failed to parse model output. You may need to update your prompt to encourage the model to return a specific type.
{
  "exact_match": 0.59,
  "f1": 0.6924946164946164,
  "fuzzy_match": 0.7
}


  0%|          | 0/100 [00:00<?, ?it/s]

Failed to answer the question 2hop__472083_7298
Failed to parse model output. You may need to update your prompt to encourage the model to return a specific type.
{
  "exact_match": 0.59,
  "f1": 0.6885410145410146,
  "fuzzy_match": 0.7
}


# Report

In [13]:
report_df = pd.DataFrame.from_records(results, columns=['context', 'retrieval', 'qa', 'run', 'exact_match', 'f1'])
report_df.drop(columns=['retrieval', 'qa'])

Unnamed: 0,context,run,exact_match,f1
0,paragraphs,1,0.62,0.701707
1,paragraphs,2,0.63,0.703374
2,paragraphs,3,0.6,0.681707
3,triplets,1,0.52,0.647159
4,triplets,2,0.54,0.659048
5,triplets,3,0.52,0.656381
6,triplets+paragraphs,1,0.58,0.677003
7,triplets+paragraphs,2,0.59,0.692495
8,triplets+paragraphs,3,0.59,0.688541


In [14]:
report_df.drop(columns=['retrieval', 'qa']).groupby(['context']).mean()

Unnamed: 0_level_0,run,exact_match,f1
context,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
paragraphs,2.0,0.616667,0.695596
triplets,2.0,0.526667,0.654196
triplets+paragraphs,2.0,0.586667,0.686013
