In [1]:
from dotenv import load_dotenv
# Load variables from a .env file into the process environment (python-dotenv).
# NOTE(review): presumably this supplies the API credentials used by the LLM calls below — confirm.
load_dotenv()

True

In [2]:
import json
import pandas as pd
from pathlib import Path
from copy import deepcopy
from functools import partial
import magentic

from bellem.musique.qa import answer_question_standard, answer_question_cot, answer_question_cot_fs, answer_question_cte
from bellem.utils import set_seed, jprint
from bellem.musique.singlehop import benchmark

# Fix the seed through the project helper before any sampling happens.
# NOTE(review): presumably this seeds local RNGs only; completions sampled by a
# remote LLM at temperature > 0 are likely not made deterministic by it — confirm.
set_seed(89)

In [3]:
from tqdm.auto import tqdm
# Register tqdm's progress-bar variants (e.g. `progress_apply`) on pandas objects.
tqdm.pandas()

In [4]:
# Render floats in displayed DataFrames with thousands separators and three decimals.
pd.options.display.float_format = '{:,.3f}'.format

In [5]:
def perfect_retrieval_func(docs, query):
    """Return the documents flagged as supporting, in their original order.

    Args:
        docs: Iterable of mappings, each with an ``'is_supporting'`` entry.
        query: Accepted but ignored; selection depends only on the flag.

    Returns:
        A new list holding the documents whose ``'is_supporting'`` value is truthy.

    Raises:
        KeyError: If a document has no ``'is_supporting'`` key.
    """
    supporting_docs = []
    for candidate in docs:
        if candidate['is_supporting']:
            supporting_docs.append(candidate)
    return supporting_docs

In [6]:
# Number of repeated benchmark runs per (temperature, QA technique) setting.
N_RUNS = 3

In [7]:
from bellem.musique.constants import ABLATION_RECORD_IDS

# Read the evaluation set (JSON Lines) and keep only the records whose id is
# listed in ABLATION_RECORD_IDS. The 'id' column is kept (drop=False) and the
# final frame gets a fresh 0..n-1 index.
df = (
    pd.read_json('../../data/generated/musique-evaluation/dataset.jsonl', orient='records', lines=True)
    .set_index('id', drop=False)
    .loc[ABLATION_RECORD_IDS]
    .copy()
    .reset_index(drop=True)
)
# Uncomment for a quick smoke test on a small random subset.
# df = df.sample(10)

print(df.shape)
df.head()

(100, 8)


Unnamed: 0,id,paragraphs,question,question_decomposition,answer,answer_aliases,answerable,answers
0,2hop__575188_342798,"[{'idx': 0, 'title': 'Liliana Mumy', 'paragrap...",Who is the child of Mahmoud Mirza's father?,"[{'id': 575188, 'question': 'Mahmoud Mirza >> ...",Ahmad Shah Qajar,[Ahmad Shah Qajar],True,[Ahmad Shah Qajar]
1,2hop__731584_700117,"[{'idx': 0, 'title': 'KAPE', 'paragraph_text':...",In which county is the city to which KKVU is l...,"[{'id': 731584, 'question': 'KKVU >> licensed ...",Berrien County,[Berrien County],True,[Berrien County]
2,2hop__690412_526810,"[{'idx': 0, 'title': 'Cabramatta Creek', 'para...",For what river does the river on which Pa Sak ...,"[{'id': 690412, 'question': 'Pa Sak Jolasid Da...",Chao Phraya River,[Chao Phraya River],True,[Chao Phraya River]
3,2hop__263638_69048,"[{'idx': 0, 'title': 'Michael J. Barron', 'par...",Who is the Chief Judge of the Tebesa Nemine's ...,"[{'id': 263638, 'question': 'Tebesa Nemine >> ...",Honorable Justice Abiodun Smith,[Honorable Justice Abiodun Smith],True,[Honorable Justice Abiodun Smith]
4,2hop__142842_68489,"[{'idx': 0, 'title': 'Perfect Night: Live in L...",Who did the performer of Night Rocker play on ...,"[{'id': 142842, 'question': 'Which performer r...",Snapper Foster,[Snapper Foster],True,[Snapper Foster]


In [8]:
results = []

# Sweep grid: sampling temperatures and (label, QA function) pairs to benchmark.
temperatures = [2.0]
qa_techniques = [("cte", answer_question_cte)]

for temperature in tqdm(temperatures, desc="temperature"):
    completion_kwargs = {"temperature": temperature}
    for qa_technique, base_qa_func in qa_techniques:
        # Bind the temperature under a separate name so the loop variable is not rebound.
        qa_func = partial(base_qa_func, completion_kwargs=completion_kwargs)
        for run in range(1, N_RUNS + 1):
            # Metadata identifying this run; merged after the scores so it always wins on key clashes.
            run_info = {
                "retrieval": "groundtruth",
                "context": "paragraphs",
                "qa": qa_technique,
                "temperature": temperature,
                "run": run,
            }
            try:
                _, scores = benchmark(df, qa_func, perfect_retrieval_func, ignore_errors=False)
            except Exception as exc:
                # A single failed run (e.g. the provider's 500 "model produced invalid
                # content" error seen at high temperature) must not abort the remaining
                # configurations. Log it loudly and record the run without metrics so the
                # gap stays visible in the report instead of silently disappearing.
                print(f"[failed] qa={qa_technique} temperature={temperature} run={run}: {exc!r}")
                results.append({**run_info, "error": repr(exc)})
                continue
            results.append({**scores, **run_info})

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

In [8]:
# NOTE(review): disabled full sweep (8 temperatures x 2 QA techniques x N_RUNS runs),
# kept for reference. The output recorded for this cell ends in an
# InternalServerError raised by the API, so the sweep did not run to completion.
# results = []

# for temperature in tqdm([0.0, 0.1, 0.3, 0.5, 0.7, 1.0, 1.5, 2.0]):
#     completion_kwargs = {"temperature": temperature}
#     for qa_technique, qa_func in [("standard", answer_question_standard), ("cte", answer_question_cte)]:
#         qa_func = partial(qa_func, completion_kwargs=completion_kwargs)
#         for run in range(1, N_RUNS + 1):
#             _, scores = benchmark(df, qa_func, perfect_retrieval_func, ignore_errors=False)
#             results.append(
#                 {
#                     **scores,
#                     "retrieval": "groundtruth",
#                     "context": "paragraphs",
#                     "qa": qa_technique,
#                     "temperature": temperature,
#                     "run": run,
#                 }
#             )

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

InternalServerError: Error code: 500 - {'error': {'message': 'The model produced invalid content. Consider modifying your prompt if you are seeing this error persistently.', 'type': 'model_error', 'param': None, 'code': None}}

# Report

In [None]:
# Columns surfaced in the report, in display order; any other keys present in
# the collected records are left out of the frame.
report_columns = ['context', 'retrieval', 'qa', 'temperature', 'run', 'exact_match', 'f1']
report_df = pd.DataFrame.from_records(results, columns=report_columns)
report_df

Unnamed: 0,context,retrieval,qa,temperature,run,exact_match,f1
0,paragraphs,groundtruth,standard,0.0,1,0.49,0.597
1,paragraphs,groundtruth,standard,0.0,2,0.49,0.595
2,paragraphs,groundtruth,standard,0.0,3,0.52,0.621
3,paragraphs,groundtruth,cte,0.0,1,0.57,0.691
4,paragraphs,groundtruth,cte,0.0,2,0.56,0.692
5,paragraphs,groundtruth,cte,0.0,3,0.55,0.681
6,paragraphs,groundtruth,standard,0.1,1,0.5,0.615
7,paragraphs,groundtruth,standard,0.1,2,0.48,0.597
8,paragraphs,groundtruth,standard,0.1,3,0.53,0.643
9,paragraphs,groundtruth,cte,0.1,1,0.6,0.718


In [None]:
from datetime import datetime, timezone

# Tag the output file with the current UTC time (one-second resolution) so each
# export goes to its own file. `datetime.utcnow()` is deprecated since Python 3.12;
# a timezone-aware "now" in UTC formats to exactly the same string.
suffix = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")
report_df.to_json(f'./ablation-temperature-{suffix}.jsonl', orient='records', lines=True)

In [None]:
# Spread of each metric across the repeated runs of every (qa, temperature) setting.
# The constant/bookkeeping columns are dropped first so only the metrics are aggregated.
runs_by_setting = report_df.drop(columns=['context', 'retrieval', 'run']).groupby(['qa', 'temperature'])
runs_by_setting.agg(['min', 'mean', 'max', 'std'])

Unnamed: 0_level_0,Unnamed: 1_level_0,exact_match,exact_match,exact_match,exact_match,f1,f1,f1,f1
Unnamed: 0_level_1,Unnamed: 1_level_1,min,mean,max,std,min,mean,max,std
qa,temperature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
cte,0.0,0.55,0.56,0.57,0.01,0.681,0.688,0.692,0.006
cte,0.1,0.55,0.573,0.6,0.025,0.678,0.699,0.718,0.02
cte,0.3,0.55,0.56,0.57,0.01,0.672,0.677,0.682,0.005
cte,0.5,0.55,0.563,0.58,0.015,0.676,0.685,0.695,0.01
cte,0.7,0.57,0.57,0.57,0.0,0.685,0.692,0.699,0.007
cte,1.0,0.56,0.573,0.59,0.015,0.673,0.684,0.692,0.01
cte,1.5,0.52,0.533,0.55,0.015,0.64,0.646,0.652,0.006
standard,0.0,0.49,0.5,0.52,0.017,0.595,0.604,0.621,0.015
standard,0.1,0.48,0.503,0.53,0.025,0.597,0.619,0.643,0.023
standard,0.3,0.49,0.507,0.52,0.015,0.609,0.623,0.639,0.015


In [None]:
# Mean of each metric per (qa, temperature) setting, printed as a LaTeX table.
mean_scores = (
    report_df
    .drop(columns=['context', 'retrieval', 'run'])
    .groupby(['qa', 'temperature'])
    .mean()
)
print(mean_scores.to_latex())

\begin{tabular}{llrr}
\toprule
 &  & exact_match & f1 \\
qa & temperature &  &  \\
\midrule
\multirow[t]{7}{*}{cte} & 0.000000 & 0.560000 & 0.687859 \\
 & 0.100000 & 0.573333 & 0.698589 \\
 & 0.300000 & 0.560000 & 0.676589 \\
 & 0.500000 & 0.563333 & 0.685137 \\
 & 0.700000 & 0.570000 & 0.691570 \\
 & 1.000000 & 0.573333 & 0.684375 \\
 & 1.500000 & 0.533333 & 0.646284 \\
\cline{1-4}
\multirow[t]{8}{*}{standard} & 0.000000 & 0.500000 & 0.604231 \\
 & 0.100000 & 0.503333 & 0.618818 \\
 & 0.300000 & 0.506667 & 0.622858 \\
 & 0.500000 & 0.513333 & 0.625697 \\
 & 0.700000 & 0.490000 & 0.601731 \\
 & 1.000000 & 0.503333 & 0.611642 \\
 & 1.500000 & 0.476667 & 0.590835 \\
 & 2.000000 & 0.320000 & 0.406635 \\
\cline{1-4}
\bottomrule
\end{tabular}

