In [1]:
# Load variables from a local .env file into the process environment before
# any other cell runs.
# NOTE(review): presumably this provides the OpenAI credentials used by the
# magentic chat model further down — confirm.
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
import json
import pandas as pd
from pathlib import Path
from copy import deepcopy
from functools import partial
import magentic

from bellek.qa.ablation import answer_question_standard, answer_question_cot, answer_question_cot_fs, answer_question_cte, answer_question_cte_cot
from bellek.utils import set_seed, jprint
from bellek.musique.singlehop import benchmark

# Seed via bellek's `set_seed` helper.
# NOTE(review): which RNGs the helper seeds is not visible from this notebook,
# and it presumably cannot make the remote model's sampling deterministic — confirm.
set_seed(89)

In [3]:
# Enable tqdm's pandas integration (the `progress_apply` family of methods).
# NOTE(review): no `progress_*` call is visible in this notebook; presumably
# the bellek benchmark relies on it — confirm.
from tqdm.auto import tqdm
tqdm.pandas()

In [4]:
# Render floats with a thousands separator and three decimals in displayed frames.
pd.set_option('display.float_format', '{:,.3f}'.format)

In [5]:
# Experiment size knobs.
N_RUNS: int = 1  # repetitions of every (temperature, prompting) configuration
SAMPLE_SIZE: int = 100  # number of dataset rows kept for the benchmark

In [6]:
from bellek.musique.constants import ABLATION_RECORD_IDS

# Evaluation records restricted to the ablation ids. `.loc` with a list keeps
# the order of ABLATION_RECORD_IDS and raises KeyError if an id is missing.
df = (
    pd.read_json('../../data/generated/musique-evaluation/dataset.jsonl', orient='records', lines=True)
    .set_index('id', drop=False)
    .loc[ABLATION_RECORD_IDS]
    .copy()
    .reset_index(drop=True)
)

# Question text and decomposition come from a separate file.
qd_df = pd.read_json('../../data/generated/musique-evaluation/question-decomposition.jsonl', orient='records', lines=True)

# Replace the dataset's own question columns with the ones from `qd_df`.
# With both suffixes empty, any remaining overlapping non-key column makes the
# merge raise instead of being silently renamed.
df = (
    df
    .drop(columns=['question', 'question_decomposition'])
    .merge(qd_df, on='id', suffixes=('', ''))
    .head(SAMPLE_SIZE)
)

print(df.shape)
df.head()

(100, 8)


Unnamed: 0,id,paragraphs,answer,answer_aliases,answerable,answers,question,question_decomposition
0,2hop__575188_342798,"[{'idx': 0, 'title': 'Liliana Mumy', 'paragrap...",Ahmad Shah Qajar,[Ahmad Shah Qajar],True,[Ahmad Shah Qajar],Who is the child of Mahmoud Mirza's father?,"[{'id': 575188, 'question': 'Who is Mahmoud Mi..."
1,2hop__731584_700117,"[{'idx': 0, 'title': 'KAPE', 'paragraph_text':...",Berrien County,[Berrien County],True,[Berrien County],In which county is the city to which KKVU is l...,"[{'id': 731584, 'question': 'To which city is ..."
2,2hop__690412_526810,"[{'idx': 0, 'title': 'Cabramatta Creek', 'para...",Chao Phraya River,[Chao Phraya River],True,[Chao Phraya River],For what river does the river on which Pa Sak ...,"[{'id': 690412, 'question': 'On which river is..."
3,2hop__263638_69048,"[{'idx': 0, 'title': 'Michael J. Barron', 'par...",Honorable Justice Abiodun Smith,[Honorable Justice Abiodun Smith],True,[Honorable Justice Abiodun Smith],Who is the Chief Judge of the Tebesa Nemine's ...,"[{'id': 263638, 'question': 'Where was Tebesa ..."
4,2hop__142842_68489,"[{'idx': 0, 'title': 'Perfect Night: Live in L...",Snapper Foster,[Snapper Foster],True,[Snapper Foster],Who did the performer of Night Rocker play on ...,"[{'id': 142842, 'question': 'Who performed Nig..."


In [7]:
def perfect_retrieval_func(docs, query):
    """Oracle retriever: keep only the documents flagged as supporting.

    Args:
        docs: Iterable of dict-like documents, each carrying an
            ``is_supporting`` key.
        query: Unused; kept so the callable has the same two-argument
            ``(docs, query)`` shape as before.

    Returns:
        A list of the documents whose ``is_supporting`` value is truthy,
        in their input order.

    Raises:
        KeyError: If a document has no ``is_supporting`` key.
    """
    return [doc for doc in docs if doc['is_supporting']]

In [8]:
# Accumulator holding one score record per benchmark run.
results = list()

In [9]:
# Start from an empty accumulator so that re-running this cell replaces the
# previous records instead of appending duplicates to them.
results = []

# Sweep the sampling temperature for each prompting strategy, always feeding
# the supporting paragraphs selected by `perfect_retrieval_func`.
for temperature in [0.0, 0.1, 0.3, 0.5, 0.7, 1.0, 1.5, 2.0]:
    # Everything inside this block runs with gpt-3.5-turbo at the given temperature.
    with magentic.OpenaiChatModel("gpt-3.5-turbo", temperature=temperature):
        for qa_prompting, qa_func in [('standard', answer_question_standard), ('cte', answer_question_cte)]:
            for i in range(1, N_RUNS+1):
                # ignore_errors=True: per-question failures (e.g. unparseable model
                # output at high temperatures) are logged and the run continues.
                # NOTE(review): how failed questions are scored is not visible here — confirm.
                _, scores = benchmark(df, qa_func, perfect_retrieval_func, ignore_errors=True)
                results.append({**scores, "retrieval": "groundtruth", "context": "paragraphs", "qa": qa_prompting, "temperature": temperature, "run": i})
                jprint(scores)

  0%|          | 0/100 [00:00<?, ?it/s]

{
  "exact_match": 0.58,
  "f1": 0.674219421101774,
  "fuzzy_match": 0.71
}


  0%|          | 0/100 [00:00<?, ?it/s]

{
  "exact_match": 0.59,
  "f1": 0.7086811815635343,
  "fuzzy_match": 0.73
}


  0%|          | 0/100 [00:00<?, ?it/s]

{
  "exact_match": 0.58,
  "f1": 0.6840289449112977,
  "fuzzy_match": 0.73
}


  0%|          | 0/100 [00:00<?, ?it/s]

{
  "exact_match": 0.62,
  "f1": 0.7233478482302012,
  "fuzzy_match": 0.76
}


  0%|          | 0/100 [00:00<?, ?it/s]

{
  "exact_match": 0.56,
  "f1": 0.6769910938734466,
  "fuzzy_match": 0.71
}


  0%|          | 0/100 [00:00<?, ?it/s]

{
  "exact_match": 0.63,
  "f1": 0.7329337068160596,
  "fuzzy_match": 0.75
}


  0%|          | 0/100 [00:00<?, ?it/s]

{
  "exact_match": 0.54,
  "f1": 0.6589593478417008,
  "fuzzy_match": 0.7
}


  0%|          | 0/100 [00:00<?, ?it/s]

{
  "exact_match": 0.59,
  "f1": 0.7163625730994151,
  "fuzzy_match": 0.75
}


  0%|          | 0/100 [00:00<?, ?it/s]

{
  "exact_match": 0.53,
  "f1": 0.6506003734827265,
  "fuzzy_match": 0.69
}


  0%|          | 0/100 [00:00<?, ?it/s]

{
  "exact_match": 0.62,
  "f1": 0.7178716577540107,
  "fuzzy_match": 0.74
}


  0%|          | 0/100 [00:00<?, ?it/s]

{
  "exact_match": 0.58,
  "f1": 0.6700765639589167,
  "fuzzy_match": 0.71
}


  0%|          | 0/100 [00:00<?, ?it/s]

{
  "exact_match": 0.61,
  "f1": 0.7176811815635346,
  "fuzzy_match": 0.74
}


  0%|          | 0/100 [00:00<?, ?it/s]

{
  "exact_match": 0.53,
  "f1": 0.6512796092796093,
  "fuzzy_match": 0.66
}


  0%|          | 0/100 [00:00<?, ?it/s]

{
  "exact_match": 0.56,
  "f1": 0.6821718020541551,
  "fuzzy_match": 0.68
}


  0%|          | 0/100 [00:00<?, ?it/s]

Failed to answer the question 2hop__575188_342798
Failed to parse model output. You may need to update your prompt to encourage the model to return a specific type.
Failed to answer the question 2hop__95970_456836
Failed to parse model output. You may need to update your prompt to encourage the model to return a specific type.
Failed to answer the question 2hop__280451_84616
Failed to parse model output. You may need to update your prompt to encourage the model to return a specific type.
Failed to answer the question 2hop__543853_124498
Failed to parse model output. You may need to update your prompt to encourage the model to return a specific type.
Failed to answer the question 2hop__785711_63853
Failed to parse model output. You may need to update your prompt to encourage the model to return a specific type.
Failed to answer the question 2hop__126306_396277
Failed to parse model output. You may need to update your prompt to encourage the model to return a specific type.
Failed to ans

  0%|          | 0/100 [00:00<?, ?it/s]

Failed to answer the question 2hop__142842_68489
Failed to parse model output. You may need to update your prompt to encourage the model to return a specific type.
Failed to answer the question 2hop__189094_612080
Failed to parse model output. You may need to update your prompt to encourage the model to return a specific type.
Failed to answer the question 2hop__819974_129669
Failed to parse model output. You may need to update your prompt to encourage the model to return a specific type.
Failed to answer the question 2hop__852657_155922
Failed to parse model output. You may need to update your prompt to encourage the model to return a specific type.
Failed to answer the question 2hop__128420_375952
Failed to parse model output. You may need to update your prompt to encourage the model to return a specific type.
Failed to answer the question 2hop__144303_483189
Failed to parse model output. You may need to update your prompt to encourage the model to return a specific type.
Failed to a

# Report

In [10]:
# Tabulate the collected run records. Only the listed columns are kept, so
# other score keys (e.g. `fuzzy_match`) are left out of the report.
report_df = pd.DataFrame.from_records(
    results,
    columns=['context', 'retrieval', 'qa', 'temperature', 'run', 'exact_match', 'f1'],
)
report_df

Unnamed: 0,context,retrieval,qa,temperature,run,exact_match,f1
0,paragraphs,groundtruth,standard,0.0,1,0.58,0.674
1,paragraphs,groundtruth,cte,0.0,1,0.59,0.709
2,paragraphs,groundtruth,standard,0.1,1,0.58,0.684
3,paragraphs,groundtruth,cte,0.1,1,0.62,0.723
4,paragraphs,groundtruth,standard,0.3,1,0.56,0.677
5,paragraphs,groundtruth,cte,0.3,1,0.63,0.733
6,paragraphs,groundtruth,standard,0.5,1,0.54,0.659
7,paragraphs,groundtruth,cte,0.5,1,0.59,0.716
8,paragraphs,groundtruth,standard,0.7,1,0.53,0.651
9,paragraphs,groundtruth,cte,0.7,1,0.62,0.718


In [11]:
from datetime import datetime, timezone

# Stamp the file name with the current UTC time (second resolution) so each
# run writes to its own file. `datetime.utcnow()` is deprecated since
# Python 3.12; an aware UTC "now" produces the same formatted string.
suffix = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")
report_df.to_json(f'./ablation-temperature-{suffix}.jsonl', orient='records', lines=True)

In [12]:
# Mean exact-match / F1 per prompting strategy and temperature, averaged over runs.
(
    report_df
    .drop(columns=['context', 'retrieval', 'run'])
    .groupby(['qa', 'temperature'])
    .mean()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,exact_match,f1
qa,temperature,Unnamed: 2_level_1,Unnamed: 3_level_1
cte,0.0,0.59,0.709
cte,0.1,0.62,0.723
cte,0.3,0.63,0.733
cte,0.5,0.59,0.716
cte,0.7,0.62,0.718
cte,1.0,0.61,0.718
cte,1.5,0.56,0.682
cte,2.0,0.23,0.308
standard,0.0,0.58,0.674
standard,0.1,0.58,0.684


In [13]:
# Same per-(qa, temperature) mean table, emitted as LaTeX source.
print(
    report_df
    .drop(columns=['context', 'retrieval', 'run'])
    .groupby(['qa', 'temperature'])
    .mean()
    .to_latex()
)

\begin{tabular}{llrr}
\toprule
 &  & exact_match & f1 \\
qa & temperature &  &  \\
\midrule
\multirow[t]{8}{*}{cte} & 0.000000 & 0.590000 & 0.708681 \\
 & 0.100000 & 0.620000 & 0.723348 \\
 & 0.300000 & 0.630000 & 0.732934 \\
 & 0.500000 & 0.590000 & 0.716363 \\
 & 0.700000 & 0.620000 & 0.717872 \\
 & 1.000000 & 0.610000 & 0.717681 \\
 & 1.500000 & 0.560000 & 0.682172 \\
 & 2.000000 & 0.230000 & 0.307787 \\
\cline{1-4}
\multirow[t]{8}{*}{standard} & 0.000000 & 0.580000 & 0.674219 \\
 & 0.100000 & 0.580000 & 0.684029 \\
 & 0.300000 & 0.560000 & 0.676991 \\
 & 0.500000 & 0.540000 & 0.658959 \\
 & 0.700000 & 0.530000 & 0.650600 \\
 & 1.000000 & 0.580000 & 0.670077 \\
 & 1.500000 & 0.530000 & 0.651280 \\
 & 2.000000 & 0.400000 & 0.500630 \\
\cline{1-4}
\bottomrule
\end{tabular}

