In [14]:
from dotenv import load_dotenv
load_dotenv()

True

In [15]:
import json
import pandas as pd
from pathlib import Path
from copy import deepcopy
from functools import partial

from bellem.qa.ablation import answer_question_cot, answer_question_cte, answer_question_cte_cot
from bellem.utils import set_seed, jprint
from bellem.musique.multihop import benchmark

set_seed(89)

In [16]:
from tqdm.auto import tqdm
tqdm.pandas()

In [17]:
def silence(exc_cls):
    def decorator(func):
        def wrapper(*args, **kwargs):
            try:
                return func(*args, **kwargs)
            except exc_cls as e:
                return None
        return wrapper
    return decorator

In [18]:
df = pd.read_json('../../data/generated/musique-evaluation/dataset.jsonl', orient='records', lines=True).iloc[:400]
qd_df = pd.read_json('../../data/generated/musique-evaluation/question-decomposition.jsonl', orient='records', lines=True)
df = pd.merge(df.drop(columns=['question', 'question_decomposition']), qd_df, on='id', suffixes=('', ''))
print(df.shape)
df.head()

(400, 8)


Unnamed: 0,id,paragraphs,answer,answer_aliases,answerable,answers,question,question_decomposition
0,2hop__131818_161450,"[{'idx': 0, 'title': 'Maria Carrillo High Scho...",in the north-east of the country south of the ...,"[Caspian Sea, in the north-east of the country...",True,"[Caspian Sea, in the north-east of the country...",Where is the Voshmgir District located?,"[{'id': 131818, 'question': 'In which province..."
1,2hop__444265_82341,"[{'idx': 0, 'title': 'Ocala, Florida', 'paragr...",in Northern Florida,"[in Northern Florida, Northern Florida]",True,"[in Northern Florida, Northern Florida]",In what part of Florida is Tom Denney's birthp...,"[{'id': 444265, 'question': 'Where was Tom Den..."
2,2hop__711946_269414,"[{'idx': 0, 'title': 'Wild Thing (Tone Lōc son...",Kill Rock Stars,[Kill Rock Stars],True,[Kill Rock Stars],What record label is the performer who release...,"[{'id': 711946, 'question': 'Who released All ..."
3,2hop__311931_417706,"[{'idx': 0, 'title': 'The Main Attraction (alb...",Attic Records,"[Attic, Attic Records]",True,"[Attic, Attic Records]",What record label does the performer of Emotio...,"[{'id': 311931, 'question': 'Who is the perfor..."
4,2hop__809785_606637,"[{'idx': 0, 'title': 'The Main Attraction (alb...",Secret City Records,[Secret City Records],True,[Secret City Records],What record label does the performer of Advent...,"[{'id': 809785, 'question': 'Who is the perfor..."


In [19]:
perfect_retrieval_func = lambda docs, query: [doc for doc in docs if doc['is_supporting']]

In [20]:
results = []

In [21]:
_, scores = benchmark(df, answer_question_cot, perfect_retrieval_func, ignore_errors=True)
results.append({**scores, "retrieval": "groundtruth", "qa": "reasoning", "context": "paragraphs"})
jprint(scores)

  0%|          | 0/400 [00:00<?, ?it/s]

{
  "exact_match": 0.6075,
  "f1": 0.7327885137738082,
  "fuzzy_match": 0.755
}


In [22]:
_, scores = benchmark(df, answer_question_cte, perfect_retrieval_func, ignore_errors=True)
results.append({**scores, "retrieval": "groundtruth", "qa": "triplets", "context": "paragraphs"})
jprint(scores)

  0%|          | 0/400 [00:00<?, ?it/s]

Failed to answer the question 2hop__468778_88165
BaseModel.__init__() takes 1 positional argument but 2 were given
{
  "exact_match": 0.62,
  "f1": 0.7416570027195032,
  "fuzzy_match": 0.77
}


In [23]:
_, scores = benchmark(df, answer_question_cte_cot, perfect_retrieval_func, ignore_errors=True)
results.append({**scores, "retrieval": "groundtruth", "qa": "reasoning+triplets", "context": "paragraphs"})
jprint(scores)

  0%|          | 0/400 [00:00<?, ?it/s]

Failed to answer the question 2hop__142621_225454
Failed to parse model output. You may need to update your prompt to encourage the model to return a specific type.
{
  "exact_match": 0.585,
  "f1": 0.7069332097640924,
  "fuzzy_match": 0.74
}


# Report

In [24]:
report_df = pd.DataFrame.from_records(results, columns=['context', 'retrieval', 'qa', 'exact_match', 'fuzzy_match', 'f1'])
report_df

Unnamed: 0,context,retrieval,qa,exact_match,fuzzy_match,f1
0,paragraphs,groundtruth,reasoning,0.6075,0.755,0.732789
1,paragraphs,groundtruth,triplets,0.62,0.77,0.741657
2,paragraphs,groundtruth,reasoning+triplets,0.585,0.74,0.706933


In [25]:
print(report_df[report_df['qa']=='reasoning'].drop(columns=['qa']).to_markdown(index=False))

| context    | retrieval   |   exact_match |   fuzzy_match |       f1 |
|:-----------|:------------|--------------:|--------------:|---------:|
| paragraphs | groundtruth |        0.6075 |         0.755 | 0.732789 |


In [26]:
print(report_df[report_df['qa']=='jerx-reasoning'].drop(columns=['qa']).to_markdown(index=False))

| context   | retrieval   | exact_match   | fuzzy_match   | f1   |
|-----------|-------------|---------------|---------------|------|
