In [1]:
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment before any
# other cell runs. NOTE(review): presumably this provides the credentials used by
# the QA functions benchmarked below — confirm which variables are required.
load_dotenv()

True

In [2]:
import json
import pandas as pd
from pathlib import Path
from copy import deepcopy
from functools import partial

from bellem.qa.ablation import answer_question_standard, answer_question_cot, answer_question_cot_fs, answer_question_cte, answer_question_cte_cot
from bellem.utils import set_seed, jprint
from bellem.musique.singlehop import benchmark

# Fix the seed through the project helper so repeated runs are comparable.
# NOTE(review): which RNGs `set_seed` covers is not visible here — confirm.
set_seed(89)



In [3]:
from tqdm.auto import tqdm
# Register tqdm's pandas integration (adds the `progress_apply` family of methods).
tqdm.pandas()

In [4]:
# Render floats in displayed frames with thousands separators and three decimals.
pd.set_option('display.float_format', '{:,.3f}'.format)

In [5]:
# Number of times each prompting technique is benchmarked (loop bound of the benchmark cells).
N_RUNS = 1
# Number of records kept from the merged dataset (passed to `df.head`).
SAMPLE_SIZE = 10

In [6]:
from bellem.musique.constants import ABLATION_RECORD_IDS

# Question decompositions, keyed by record `id`.
qd_df = pd.read_json('../../data/generated/musique-evaluation/question-decomposition.jsonl', orient='records', lines=True)

# Evaluation records restricted to (and ordered by) the ablation record ids.
ablation_df = (
    pd.read_json('../../data/generated/musique-evaluation/dataset.jsonl', orient='records', lines=True)
    .set_index('id', drop=False)
    .loc[ABLATION_RECORD_IDS]
    .copy()
    .reset_index(drop=True)
)

# Replace the original question columns with the ones from the decomposition
# file, then keep only the first SAMPLE_SIZE records.
df = (
    ablation_df
    .drop(columns=['question', 'question_decomposition'])
    .merge(qd_df, on='id', suffixes=('', ''))
    .head(SAMPLE_SIZE)
)

print(df.shape)
df.head()

(10, 8)


Unnamed: 0,id,paragraphs,answer,answer_aliases,answerable,answers,question,question_decomposition
0,2hop__575188_342798,"[{'idx': 0, 'title': 'Liliana Mumy', 'paragrap...",Ahmad Shah Qajar,[Ahmad Shah Qajar],True,[Ahmad Shah Qajar],Who is the child of Mahmoud Mirza's father?,"[{'id': 575188, 'question': 'Who is Mahmoud Mi..."
1,2hop__731584_700117,"[{'idx': 0, 'title': 'KAPE', 'paragraph_text':...",Berrien County,[Berrien County],True,[Berrien County],In which county is the city to which KKVU is l...,"[{'id': 731584, 'question': 'To which city is ..."
2,2hop__690412_526810,"[{'idx': 0, 'title': 'Cabramatta Creek', 'para...",Chao Phraya River,[Chao Phraya River],True,[Chao Phraya River],For what river does the river on which Pa Sak ...,"[{'id': 690412, 'question': 'On which river is..."
3,2hop__263638_69048,"[{'idx': 0, 'title': 'Michael J. Barron', 'par...",Honorable Justice Abiodun Smith,[Honorable Justice Abiodun Smith],True,[Honorable Justice Abiodun Smith],Who is the Chief Judge of the Tebesa Nemine's ...,"[{'id': 263638, 'question': 'Where was Tebesa ..."
4,2hop__142842_68489,"[{'idx': 0, 'title': 'Perfect Night: Live in L...",Snapper Foster,[Snapper Foster],True,[Snapper Foster],Who did the performer of Night Rocker play on ...,"[{'id': 142842, 'question': 'Who performed Nig..."


In [7]:
def perfect_retrieval_func(docs, query):
    """Return only the documents flagged as supporting ("oracle" retrieval).

    Args:
        docs: Iterable of mappings, each with an ``'is_supporting'`` entry.
        query: Unused; accepted so the function matches the retrieval-function
            call signature ``(docs, query)`` used by the benchmark cells.

    Returns:
        A list with the documents whose ``'is_supporting'`` value is truthy,
        in their original order.

    Raises:
        KeyError: If a document has no ``'is_supporting'`` entry.
    """
    return [doc for doc in docs if doc['is_supporting']]

In [8]:
# Accumulates one score record per (technique, run). The benchmark cells below
# append to this list, so re-run this cell first when re-running them;
# otherwise the report will contain duplicate rows.
results = []

In [9]:
# Standard prompting over the ground-truth supporting paragraphs.
standard_meta = {"retrieval": "groundtruth", "context": "paragraphs", "qa": "standard"}
for run_no in range(1, N_RUNS + 1):
    # `benchmark` yields the per-record frame and the aggregate scores.
    # NOTE(review): `ignore_errors=True` presumably skips failing records — confirm.
    df_standard, scores = benchmark(
        df,
        answer_question_standard,
        perfect_retrieval_func,
        ignore_errors=True,
    )
    record = {**scores, **standard_meta, "run": run_no}
    results.append(record)
    jprint(scores)

  0%|          | 0/10 [00:00<?, ?it/s]

{
  "exact_match": 0.6,
  "f1": 0.6666666666666666,
  "fuzzy_match": 0.7
}


In [10]:
# Zero-shot chain-of-thought prompting over the ground-truth supporting paragraphs.
cot_zs_meta = {"retrieval": "groundtruth", "context": "paragraphs", "qa": "cot-zs"}
for run_no in range(1, N_RUNS + 1):
    # `benchmark` yields the per-record frame and the aggregate scores.
    # NOTE(review): `ignore_errors=True` presumably skips failing records — confirm.
    df_cot, scores = benchmark(
        df,
        answer_question_cot,
        perfect_retrieval_func,
        ignore_errors=True,
    )
    record = {**scores, **cot_zs_meta, "run": run_no}
    results.append(record)
    jprint(scores)

  0%|          | 0/10 [00:00<?, ?it/s]

{
  "exact_match": 0.6,
  "f1": 0.7960317460317461,
  "fuzzy_match": 0.8
}


In [11]:
# Few-shot chain-of-thought prompting over the ground-truth supporting paragraphs.
cot_fs_meta = {"retrieval": "groundtruth", "context": "paragraphs", "qa": "cot-fs"}
for run_no in range(1, N_RUNS + 1):
    # `benchmark` yields the per-record frame and the aggregate scores.
    # NOTE(review): `ignore_errors=True` presumably skips failing records — confirm.
    df_cot_fs, scores = benchmark(
        df,
        answer_question_cot_fs,
        perfect_retrieval_func,
        ignore_errors=True,
    )
    record = {**scores, **cot_fs_meta, "run": run_no}
    results.append(record)
    jprint(scores)

  0%|          | 0/10 [00:00<?, ?it/s]

{
  "exact_match": 0.5,
  "f1": 0.7238095238095238,
  "fuzzy_match": 0.9
}


In [12]:
# CTE prompting over the ground-truth supporting paragraphs.
cte_meta = {"retrieval": "groundtruth", "context": "paragraphs", "qa": "cte"}
for run_no in range(1, N_RUNS + 1):
    # `benchmark` yields the per-record frame and the aggregate scores.
    # NOTE(review): `ignore_errors=True` presumably skips failing records — confirm.
    df_cte, scores = benchmark(
        df,
        answer_question_cte,
        perfect_retrieval_func,
        ignore_errors=True,
    )
    record = {**scores, **cte_meta, "run": run_no}
    results.append(record)
    jprint(scores)

  0%|          | 0/10 [00:00<?, ?it/s]

{
  "exact_match": 0.9,
  "f1": 0.9571428571428571,
  "fuzzy_match": 1.0
}


# Report

In [14]:
# One row per (technique, run), restricted to the columns used in the report.
# NOTE(review): the printed scores also contain 'fuzzy_match', which is not
# selected here — confirm that leaving it out is intentional.
report_columns = ['context', 'retrieval', 'qa', 'run', 'exact_match', 'f1']
report_df = pd.DataFrame.from_records(results, columns=report_columns)
report_df

Unnamed: 0,context,retrieval,qa,run,exact_match,f1
0,paragraphs,groundtruth,standard,1,0.6,0.667
1,paragraphs,groundtruth,cot-zs,1,0.6,0.796
2,paragraphs,groundtruth,cot-fs,1,0.5,0.724
3,paragraphs,groundtruth,cte,1,0.9,0.957


In [15]:
from datetime import datetime, timezone

# Timestamp the report file in UTC. `datetime.utcnow()` is deprecated since
# Python 3.12; the timezone-aware call below formats to the same string.
suffix = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")
# Persist the report as JSON Lines, one record per (technique, run) row.
report_df.to_json(f'./comparison-prompting-technique-{suffix}.jsonl', orient='records', lines=True)

## Inspect: CTE vs. CoT on a single example

In [28]:
# Define the inspected example in this cell so it runs on a fresh kernel; it
# used to rely on `cte_example` leaking from a cell that appears further down.
# NOTE(review): row 1 is the KKVU question, which matches the context this cell
# displayed — confirm this is the example intended here.
cte_example = df_cte.iloc[1]
# Context passed to the first hop of the CTE run for that example.
context = cte_example['raw_output']['hops'][0]['context']
context

'# KKVU\nKKVU (104.5 FM, "U 104.5") is a commercial radio station licensed to Stevensville, Montana, serving the Missoula, Montana area, owned by Simmons Media Ventures, LLC, through licensee Missoula Broadcasting Company, LLC. KKVU airs an Adult Top 40 music format.\n# Stevensville, Michigan\nStevensville is a village in Berrien County in the U.S. state of Michigan. The village lies within Lincoln Township. The population was 1,142 at the 2010 census.'

In [31]:
i = 2

# Same positional row in the source frame and in the CTE / CoT benchmark outputs.
example = df.iloc[i]
cte_example = df_cte.iloc[i]
cot_example = df_cot.iloc[i]

# Only the first hop of each raw output is inspected.
cte_hop = cte_example['raw_output']['hops'][0]
cot_hop = cot_example['raw_output']['hops'][0]
context = cte_hop['context']


def _print_banner(title):
    """Print `title` framed above and below by an 80-character '=' rule."""
    rule = "=" * 80
    print(rule)
    print(title)
    print(rule)


# Question, gold answers and the context given to the model.
for heading, body in (
    ("# Question", example['question']),
    ("# Answers", example['answers']),
    ("# Supporting paragraphs", context),
):
    print(heading)
    print(body)

_print_banner("CTE Prompting")
print(cte_hop['llm_output'].triplets)
print(cte_hop['llm_output'].answer)

_print_banner("COT Prompting")
print(cot_hop['llm_output'].reasoning)
print(cot_hop['llm_output'].answer)

# Question
For what river does the river on which Pa Sak Jolasid Dam is located serve as the mouth?
# Answers
['Chao Phraya River']
# Supporting paragraphs
# Pa Sak Jolasid Dam
The Pa Sak Jolasid Dam or Pa Sak Cholasit Dam (, ) impounds the Pa Sak River at Ban Kaeng Suea Ten, Tambon Nong Bua, Phatthana Nikhom District, Lopburi Province, Thailand. It is the biggest reservoir in central Thailand.
# List of tributaries of the Chao Phraya River
The principal tributaries of the Chao Phraya River of Thailand are the Pa Sak River, the Sakae Krang River, the Nan River (along with its principal confluent the Yom River), the Ping River (with its principal confluent the Wang River), and the Tha Chin River. Each of these tributaries (and the Chao Phraya itself) is further tributed by additional minor tributaries often referred to as "khwae". All of the tributaries, including the lesser khwae, form an extensive tree-like pattern, with branches flowing through nearly every province in central and no

In [15]:
# Mean scores per prompting technique, shown in a fixed presentation order.
qa_order = ['standard', 'cot-zs', 'cot-fs', 'cte']
(
    report_df
    .drop(columns=['context', 'retrieval', 'run'])
    .groupby(['qa'])
    .mean()
    .loc[qa_order]
)

Unnamed: 0_level_0,exact_match,f1
qa,Unnamed: 1_level_1,Unnamed: 2_level_1
standard,0.577,0.674
cot-zs,0.437,0.582
cot-fs,0.503,0.64
cte,0.63,0.736


## Inspect: CoT answer and reasoning for one example

In [16]:
i = 2
row = df_cot.iloc[i]
# Only the first hop of the raw output is inspected.
hop = row['raw_output']['hops'][0]
llm_output = hop['llm_output']

# Question, gold answers, then the model's answer and its reasoning.
for value in (row['question'], row['answers'], llm_output.answer, llm_output.reasoning):
    print(value)

For what river does the river on which Pa Sak Jolasid Dam is located serve as the mouth?
['Chao Phraya River']
Chao Phraya River
The Pa Sak Jolasid Dam impounds the Pa Sak River. The Pa Sak River is one of the principal tributaries of the Chao Phraya River in Thailand. Therefore, the river on which the Pa Sak Jolasid Dam is located serves as a tributary to the Chao Phraya River.
