In [1]:
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
import json
import pandas as pd
from pathlib import Path
from copy import deepcopy
from functools import partial
import magentic

from bellek.qa.ablation import answer_question_standard, answer_question_cot, answer_question_cot_fs, answer_question_cte, answer_question_cte_cot
from bellek.utils import set_seed, jprint
from bellek.musique.singlehop import benchmark

set_seed(89)

In [3]:
from tqdm.auto import tqdm
tqdm.pandas()

In [4]:
pd.options.display.float_format = '{:,.3f}'.format

In [5]:
N_RUNS = 3
SAMPLE_SIZE = 100

In [6]:
from bellek.musique.constants import ABLATION_RECORD_IDS

df = pd.read_json('../../data/generated/musique-evaluation/dataset.jsonl', orient='records', lines=True)
df = df.set_index('id', drop=False).loc[ABLATION_RECORD_IDS].copy().reset_index(drop=True)
qd_df = pd.read_json('../../data/generated/musique-evaluation/question-decomposition.jsonl', orient='records', lines=True)
df = pd.merge(df.drop(columns=['question', 'question_decomposition']), qd_df, on='id', suffixes=('', ''))
df = df.head(SAMPLE_SIZE)

print(df.shape)
df.head()

(100, 8)


Unnamed: 0,id,paragraphs,answer,answer_aliases,answerable,answers,question,question_decomposition
0,2hop__575188_342798,"[{'idx': 0, 'title': 'Liliana Mumy', 'paragrap...",Ahmad Shah Qajar,[Ahmad Shah Qajar],True,[Ahmad Shah Qajar],Who is the child of Mahmoud Mirza's father?,"[{'id': 575188, 'question': 'Who is Mahmoud Mi..."
1,2hop__731584_700117,"[{'idx': 0, 'title': 'KAPE', 'paragraph_text':...",Berrien County,[Berrien County],True,[Berrien County],In which county is the city to which KKVU is l...,"[{'id': 731584, 'question': 'To which city is ..."
2,2hop__690412_526810,"[{'idx': 0, 'title': 'Cabramatta Creek', 'para...",Chao Phraya River,[Chao Phraya River],True,[Chao Phraya River],For what river does the river on which Pa Sak ...,"[{'id': 690412, 'question': 'On which river is..."
3,2hop__263638_69048,"[{'idx': 0, 'title': 'Michael J. Barron', 'par...",Honorable Justice Abiodun Smith,[Honorable Justice Abiodun Smith],True,[Honorable Justice Abiodun Smith],Who is the Chief Judge of the Tebesa Nemine's ...,"[{'id': 263638, 'question': 'Where was Tebesa ..."
4,2hop__142842_68489,"[{'idx': 0, 'title': 'Perfect Night: Live in L...",Snapper Foster,[Snapper Foster],True,[Snapper Foster],Who did the performer of Night Rocker play on ...,"[{'id': 142842, 'question': 'Who performed Nig..."


In [7]:
perfect_retrieval_func = lambda docs, query: [doc for doc in docs if doc['is_supporting']]

In [8]:
results = []

In [14]:
for temperature in [0.0, 0.1, 0.3, 0.5, 0.7, 0.9]:
    with magentic.OpenaiChatModel("gpt-3.5-turbo", temperature=temperature):
        for qa_prompting, qa_func in [('standard', answer_question_standard), ('cte', answer_question_cte)]:
            for i in range(1, N_RUNS+1):
                _, scores = benchmark(df, qa_func, perfect_retrieval_func, ignore_errors=True)
                results.append({**scores, "retrieval": "groundtruth", "context": "paragraphs", "qa": qa_prompting, "temperature": temperature, "run": i})
                jprint(scores)

  0%|          | 0/100 [00:00<?, ?it/s]

{
  "exact_match": 0.55,
  "f1": 0.6496479925303454,
  "fuzzy_match": 0.68
}


  0%|          | 0/100 [00:00<?, ?it/s]

{
  "exact_match": 0.57,
  "f1": 0.6637432306255835,
  "fuzzy_match": 0.7
}


  0%|          | 0/100 [00:00<?, ?it/s]

{
  "exact_match": 0.56,
  "f1": 0.663219421101774,
  "fuzzy_match": 0.69
}


  0%|          | 0/100 [00:00<?, ?it/s]

{
  "exact_match": 0.62,
  "f1": 0.733314659197012,
  "fuzzy_match": 0.77
}


  0%|          | 0/100 [00:00<?, ?it/s]

{
  "exact_match": 0.64,
  "f1": 0.7391720969089388,
  "fuzzy_match": 0.76
}


  0%|          | 0/100 [00:00<?, ?it/s]

{
  "exact_match": 0.63,
  "f1": 0.727347848230201,
  "fuzzy_match": 0.75
}


# Report

In [15]:
report_df = pd.DataFrame.from_records(results, columns=['context', 'retrieval', 'qa', 'temperature', 'run', 'exact_match', 'f1'])
report_df

Unnamed: 0,context,retrieval,qa,temperature,run,exact_match,f1
0,paragraphs,groundtruth,standard,0.1,1,0.56,0.669
1,paragraphs,groundtruth,standard,0.1,2,0.58,0.675
2,paragraphs,groundtruth,standard,0.1,3,0.58,0.674
3,paragraphs,groundtruth,cte,0.1,1,0.63,0.738
4,paragraphs,groundtruth,cte,0.1,2,0.62,0.722
5,paragraphs,groundtruth,cte,0.1,3,0.62,0.729
6,paragraphs,groundtruth,standard,0.3,1,0.56,0.666
7,paragraphs,groundtruth,standard,0.3,2,0.58,0.679
8,paragraphs,groundtruth,standard,0.3,3,0.56,0.665
9,paragraphs,groundtruth,cte,0.3,1,0.61,0.719


In [16]:
from datetime import datetime
suffix = datetime.utcnow().strftime("%Y%m%d-%H%M%S")
report_df.to_json(f'./ablation-temperature-{suffix}.jsonl', orient='records', lines=True)

In [17]:
report_df.drop(columns=['context', 'retrieval', 'run']).groupby(['qa']).mean().loc[['standard', 'cte']]

Unnamed: 0_level_0,temperature,exact_match,f1
qa,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
standard,0.417,0.566,0.666
cte,0.417,0.627,0.731


In [18]:
report_df.drop(columns=['context', 'retrieval', 'run']).groupby(['qa', 'temperature']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,exact_match,f1
qa,temperature,Unnamed: 2_level_1,Unnamed: 3_level_1
cte,0.0,0.63,0.733
cte,0.1,0.623,0.729
cte,0.3,0.63,0.736
cte,0.5,0.623,0.732
cte,0.7,0.627,0.726
cte,0.9,0.63,0.733
standard,0.0,0.56,0.659
standard,0.1,0.573,0.673
standard,0.3,0.567,0.67
standard,0.5,0.567,0.663


In [20]:
print(report_df.drop(columns=['context', 'retrieval', 'run']).groupby(['qa', 'temperature']).mean().to_latex())

\begin{tabular}{llrr}
\toprule
 &  & exact_match & f1 \\
qa & temperature &  &  \\
\midrule
\multirow[t]{6}{*}{cte} & 0.000000 & 0.630000 & 0.733278 \\
 & 0.100000 & 0.623333 & 0.729475 \\
 & 0.300000 & 0.630000 & 0.735729 \\
 & 0.500000 & 0.623333 & 0.731715 \\
 & 0.700000 & 0.626667 & 0.725516 \\
 & 0.900000 & 0.630000 & 0.732523 \\
\cline{1-4}
\multirow[t]{6}{*}{standard} & 0.000000 & 0.560000 & 0.658870 \\
 & 0.100000 & 0.573333 & 0.672759 \\
 & 0.300000 & 0.566667 & 0.670046 \\
 & 0.500000 & 0.566667 & 0.663426 \\
 & 0.700000 & 0.566667 & 0.667299 \\
 & 0.900000 & 0.560000 & 0.664886 \\
\cline{1-4}
\bottomrule
\end{tabular}



## Inspect

In [13]:
i = 2
row = df_cot.iloc[i]
hop = row['raw_output']['hops'][0]
print(row['question'])
print(row['answers'])
print(hop['llm_output'].answer)
print(hop['llm_output'].reasoning)

NameError: name 'df_cot' is not defined