In [30]:
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment
# (python-dotenv). NOTE(review): presumably supplies the API credentials
# used by the QA backend — confirm which variables are required.
load_dotenv()

True

In [31]:
import json
import pandas as pd
from pathlib import Path
from copy import deepcopy
from functools import partial

from bellem.qa.ablation import answer_question_standard
from bellem.utils import set_seed, jprint
from bellem.musique.multihop import benchmark

# Fix the seed for this run. NOTE(review): `set_seed` comes from bellem.utils;
# which RNGs it seeds is not visible from this notebook — confirm.
set_seed(89)

In [32]:
from tqdm.auto import tqdm
# Register tqdm's progress-bar variants (`progress_apply`, ...) on pandas objects.
tqdm.pandas()

In [33]:
# Presumably the number of benchmark repetitions.
# NOTE(review): not referenced by any cell visible here — confirm it is still needed.
N_RUNS = 1
# Number of records kept from the ablation subset (applied via `df.head(SAMPLE_SIZE)`).
SAMPLE_SIZE = 20

In [34]:
from bellem.musique.constants import ABLATION_RECORD_IDS

# Load the evaluation dataset, select the ablation records by id (rows come
# back in the order listed in ABLATION_RECORD_IDS), restore a positional
# index and keep only the first SAMPLE_SIZE records.
df = (
    pd.read_json('../../data/generated/musique-evaluation/dataset.jsonl', orient='records', lines=True)
    .set_index('id', drop=False)
    .loc[ABLATION_RECORD_IDS]
    .reset_index(drop=True)
    .head(SAMPLE_SIZE)
)

print(len(df))
df.head()

20


Unnamed: 0,id,paragraphs,question,question_decomposition,answer,answer_aliases,answerable,answers
0,2hop__575188_342798,"[{'idx': 0, 'title': 'Liliana Mumy', 'paragrap...",Who is the child of Mahmoud Mirza's father?,"[{'id': 575188, 'question': 'Mahmoud Mirza >> ...",Ahmad Shah Qajar,[Ahmad Shah Qajar],True,[Ahmad Shah Qajar]
1,2hop__731584_700117,"[{'idx': 0, 'title': 'KAPE', 'paragraph_text':...",In which county is the city to which KKVU is l...,"[{'id': 731584, 'question': 'KKVU >> licensed ...",Berrien County,[Berrien County],True,[Berrien County]
2,2hop__690412_526810,"[{'idx': 0, 'title': 'Cabramatta Creek', 'para...",For what river does the river on which Pa Sak ...,"[{'id': 690412, 'question': 'Pa Sak Jolasid Da...",Chao Phraya River,[Chao Phraya River],True,[Chao Phraya River]
3,2hop__263638_69048,"[{'idx': 0, 'title': 'Michael J. Barron', 'par...",Who is the Chief Judge of the Tebesa Nemine's ...,"[{'id': 263638, 'question': 'Tebesa Nemine >> ...",Honorable Justice Abiodun Smith,[Honorable Justice Abiodun Smith],True,[Honorable Justice Abiodun Smith]
4,2hop__142842_68489,"[{'idx': 0, 'title': 'Perfect Night: Live in L...",Who did the performer of Night Rocker play on ...,"[{'id': 142842, 'question': 'Which performer r...",Snapper Foster,[Snapper Foster],True,[Snapper Foster]


In [35]:
# Swap the original question / decomposition columns for the ones stored in
# the question-decomposition file, matching records on `id`.
qd_df = pd.read_json(
    '../../data/generated/musique-evaluation/question-decomposition.jsonl',
    orient='records',
    lines=True,
)
df = (
    df
    .drop(columns=['question', 'question_decomposition'])
    .merge(qd_df, on='id', suffixes=('', ''))
)
print(df.shape)
df.head()

(20, 8)


Unnamed: 0,id,paragraphs,answer,answer_aliases,answerable,answers,question,question_decomposition
0,2hop__575188_342798,"[{'idx': 0, 'title': 'Liliana Mumy', 'paragrap...",Ahmad Shah Qajar,[Ahmad Shah Qajar],True,[Ahmad Shah Qajar],Who is the child of Mahmoud Mirza's father?,"[{'id': 575188, 'question': 'Who is Mahmoud Mi..."
1,2hop__731584_700117,"[{'idx': 0, 'title': 'KAPE', 'paragraph_text':...",Berrien County,[Berrien County],True,[Berrien County],In which county is the city to which KKVU is l...,"[{'id': 731584, 'question': 'To which city is ..."
2,2hop__690412_526810,"[{'idx': 0, 'title': 'Cabramatta Creek', 'para...",Chao Phraya River,[Chao Phraya River],True,[Chao Phraya River],For what river does the river on which Pa Sak ...,"[{'id': 690412, 'question': 'On which river is..."
3,2hop__263638_69048,"[{'idx': 0, 'title': 'Michael J. Barron', 'par...",Honorable Justice Abiodun Smith,[Honorable Justice Abiodun Smith],True,[Honorable Justice Abiodun Smith],Who is the Chief Judge of the Tebesa Nemine's ...,"[{'id': 263638, 'question': 'Where was Tebesa ..."
4,2hop__142842_68489,"[{'idx': 0, 'title': 'Perfect Night: Live in L...",Snapper Foster,[Snapper Foster],True,[Snapper Foster],Who did the performer of Night Rocker play on ...,"[{'id': 142842, 'question': 'Who performed Nig..."


In [36]:
# Load the JERX generations: one row per (question id, paragraph_idx) pair,
# with the generated triplets in the `generation` column.
# NOTE(review): the file name suggests these come from a llama3 base model — confirm.
jerx_file = Path("../../data/raw/musique-evaluation/jerx-inferences/llama3-base.jsonl")
jerx_df = pd.read_json(jerx_file, lines=True)
jerx_df.head()

Unnamed: 0,id,paragraph_idx,paragraph_text,paragraph_title,is_supporting,text,input,generation
0,2hop__131818_161450,0,Maria Carrillo High School is a public high sc...,Maria Carrillo High School,False,# Maria Carrillo High School\nMaria Carrillo H...,[{'content': 'You are an excellent knowledge g...,Maria Carrillo High School | location | Santa ...
1,2hop__131818_161450,1,"Golestān Province (Persian: استان گلستان‎, Ost...",Golestan Province,True,# Golestan Province\nGolestān Province (Persia...,[{'content': 'You are an excellent knowledge g...,Golestan Province | location | north-east of I...
2,2hop__131818_161450,2,Voshmgir District () is a district (bakhsh) in...,Voshmgir District,True,# Voshmgir District\nVoshmgir District () is a...,[{'content': 'You are an excellent knowledge g...,"Voshmgir District | location | Aqqala County, ..."
3,2hop__131818_161450,3,52 Heroor is a village in the southern state o...,52 Heroor,False,# 52 Heroor\n52 Heroor is a village in the sou...,[{'content': 'You are an excellent knowledge g...,"52 Heroor | location | Karnataka, India\n52 He..."
4,2hop__131818_161450,4,Vennaimalai is a village of Karur District loc...,Vennaimalai,False,# Vennaimalai\nVennaimalai is a village of Kar...,[{'content': 'You are an excellent knowledge g...,Vennaimalai | location | Karur District\nVenna...


In [37]:
# Lookup table: (question id, paragraph idx) -> generated triplet text.
# Built column-wise instead of with `iterrows()`, which materialises a Series
# per row just to read three fields. Later rows still win on duplicate keys.
jerx_mapping = dict(zip(zip(jerx_df['id'], jerx_df['paragraph_idx']), jerx_df['generation']))

def extract_triplets(example: dict):
    """Attach the generated triplet text of every paragraph to `example`.

    For each paragraph the generation is looked up in `jerx_mapping` under the
    key `(example['id'], paragraph['idx'])` and stripped of surrounding
    whitespace. The resulting list (one string per paragraph, in paragraph
    order) is stored under `'triplets_str'`; `example` is modified in place
    and returned.

    Raises:
        KeyError: if a paragraph has no entry in `jerx_mapping`.
    """
    stripped_generations = []
    for paragraph in example['paragraphs']:
        lookup_key = (example['id'], paragraph['idx'])
        stripped_generations.append(jerx_mapping[lookup_key].strip())
    example["triplets_str"] = stripped_generations
    return example

In [38]:
# Add the `triplets_str` column (one stripped generation per paragraph) to every record.
df = df.apply(extract_triplets, axis=1)
print(len(df))
df.head()

20


Unnamed: 0,id,paragraphs,answer,answer_aliases,answerable,answers,question,question_decomposition,triplets_str
0,2hop__575188_342798,"[{'idx': 0, 'title': 'Liliana Mumy', 'paragrap...",Ahmad Shah Qajar,[Ahmad Shah Qajar],True,[Ahmad Shah Qajar],Who is the child of Mahmoud Mirza's father?,"[{'id': 575188, 'question': 'Who is Mahmoud Mi...",[Liliana Mumy | father | Bill Mumy\nBill Mumy ...
1,2hop__731584_700117,"[{'idx': 0, 'title': 'KAPE', 'paragraph_text':...",Berrien County,[Berrien County],True,[Berrien County],In which county is the city to which KKVU is l...,"[{'id': 731584, 'question': 'To which city is ...",[KAPE | broadcast frequency | 1550 AM\nKAPE | ...
2,2hop__690412_526810,"[{'idx': 0, 'title': 'Cabramatta Creek', 'para...",Chao Phraya River,[Chao Phraya River],True,[Chao Phraya River],For what river does the river on which Pa Sak ...,"[{'id': 690412, 'question': 'On which river is...",[Cabramatta Creek | location | Sydney\nCabrama...
3,2hop__263638_69048,"[{'idx': 0, 'title': 'Michael J. Barron', 'par...",Honorable Justice Abiodun Smith,[Honorable Justice Abiodun Smith],True,[Honorable Justice Abiodun Smith],Who is the Chief Judge of the Tebesa Nemine's ...,"[{'id': 263638, 'question': 'Where was Tebesa ...",[Michael J. Barron | birth year | 1933\nMichae...
4,2hop__142842_68489,"[{'idx': 0, 'title': 'Perfect Night: Live in L...",Snapper Foster,[Snapper Foster],True,[Snapper Foster],Who did the performer of Night Rocker play on ...,"[{'id': 142842, 'question': 'Who performed Nig...",[Perfect Night: Live in London | recorded by |...


In [39]:
import bm25s
import logging

# Only let ERROR-level (and above) messages through from the bm25s logger.
logging.getLogger("bm25s").setLevel(logging.ERROR)

def bm25_retrieval(docs: list[dict], query: str, top_k: int = 5):
    """Rank `docs` against `query` with BM25 and return the top matches.

    A fresh BM25 index is built over the `'text'` field of the given
    documents on every call.

    Args:
        docs: Candidate documents; each must provide a `'text'` entry.
        query: Query string.
        top_k: Maximum number of documents to return; clamped to `len(docs)`.

    Returns:
        The documents retrieved for the query as a list (at most `top_k`
        entries); an empty list when `docs` is empty.
    """
    # Nothing to index or rank; avoid building an index over an empty corpus.
    if not docs:
        return []
    top_k = min(top_k, len(docs))
    # Passing the docs as corpus makes `retrieve` return the documents themselves.
    retriever = bm25s.BM25(corpus=docs)
    tokenized_corpus = bm25s.tokenize([doc['text'] for doc in docs], show_progress=False)
    retriever.index(tokenized_corpus, show_progress=False)
    # Silence the progress bar here as well, consistent with the other bm25s calls.
    tokenized_query = bm25s.tokenize(query, show_progress=False)
    results, _ = retriever.retrieve(tokenized_query, k=top_k, show_progress=False)
    # One query was submitted, so the first row holds its results.
    return results[0].tolist()

In [40]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model, loaded once and shared by `semantic_retrieval`.
model = SentenceTransformer("all-MiniLM-L6-v2")

def semantic_retrieval(docs: list[dict], query: str, top_k: int = 5):
    """Rank `docs` by embedding similarity to `query` and return the top matches.

    Document texts and the query are embedded with the module-level `model`,
    scored with `model.similarity`, and sorted from highest to lowest score.

    Args:
        docs: Candidate documents; each must provide a `'text'` entry.
        query: Query string.
        top_k: Maximum number of documents to return.

    Returns:
        Up to `top_k` entries of `docs`, most similar first; an empty list
        when `docs` is empty.
    """
    # Nothing to embed or rank.
    if not docs:
        return []
    doc_embeddings = model.encode([doc['text'] for doc in docs])
    query_embedding = model.encode([query])
    # One score per document against the single query (a column of scores).
    similarities = model.similarity(doc_embeddings, query_embedding)
    ranked = similarities.argsort(dim=0, descending=True)
    # Flatten to plain ints so the list is indexed with integers rather than
    # one-element tensors.
    top_indices = ranked[:top_k].flatten().tolist()
    return [docs[i] for i in top_indices]



In [41]:
def dummy_retrieval_func(docs, query):
    """Pass-through retriever: return the candidate documents unchanged."""
    return docs


def perfect_retrieval_func(docs, query):
    """Oracle retriever: keep only documents whose `'is_supporting'` entry is truthy."""
    supporting = []
    for doc in docs:
        if doc['is_supporting']:
            supporting.append(doc)
    return supporting

In [42]:
# Question-answering function passed to every `benchmark` call below.
qa_func = answer_question_standard

In [43]:
# Accumulates one score record per benchmark run; the cells below append to it.
# NOTE: re-running a benchmark cell without re-running this one duplicates its row.
results = []

## Only paragraphs

In [44]:
%%capture
_, scores = benchmark(df, qa_func, bm25_retrieval, ignore_errors=True)
results.append({**scores, "retrieval": "bm25", "context": "paragraphs"})
jprint(scores)

In [45]:
# Embedding-based retrieval over the original paragraphs.
_, scores = benchmark(df, qa_func, semantic_retrieval, ignore_errors=True)
results.append(dict(scores, retrieval="semantic", context="paragraphs"))
jprint(scores)

  0%|          | 0/20 [00:00<?, ?it/s]

{
  "exact_match": 0.45,
  "f1": 0.54,
  "fuzzy_match": 0.55
}


## Paragraphs + Triplets

In [46]:
def enhance_paragraphs(row):
    """Append the generated triplets to the text of every paragraph in `row`.

    Each paragraph is deep-copied, and its `'paragraph_text'` becomes the
    original text, a `# Entity-relation-entity triplets` heading line and the
    generation stored in `jerx_mapping` for `(row['id'], paragraph['idx'])`,
    joined by newlines. `row['paragraphs']` is replaced by the list of copies
    and `row` is returned; the original paragraph dicts are left untouched.

    Raises:
        KeyError: if a paragraph has no entry in `jerx_mapping`.
    """
    enriched = []
    for original in row['paragraphs']:
        paragraph = deepcopy(original)
        generation = str(jerx_mapping[(row['id'], paragraph['idx'])])
        paragraph['paragraph_text'] = (
            paragraph['paragraph_text']
            + '\n' + "# Entity-relation-entity triplets"
            + '\n' + generation
        )
        enriched.append(paragraph)
    row['paragraphs'] = enriched
    return row

# Build the paragraphs+triplets variant of the dataset and spot-check one
# enriched paragraph (third paragraph of the first record).
df_paragraph_triplets = df.apply(enhance_paragraphs, axis=1)
print(df_paragraph_triplets.iloc[0]['paragraphs'][2]['paragraph_text'])

Mirza Mehdy Ispahani (also known as Sadri Ispahani) (1923–2004), son of Mirza Ahmad Ispahani, was Chairman of M.M. Ispahani from 1949 till 2004. Mirza Ali Behrouze Ispahani, son of Mirza Mehdy Ispahani was elected as the Chairman of M.M. Ispahani in 2004.
# Entity-relation-entity triplets
Mirza Mehdy Ispahani | occupation | Chairman of M.M. Ispahani
Mirza Mehdy Ispahani | tenure | 1949-2004
Mirza Mehdy Ispahani | father | Mirza Ahmad Ispahani
Mirza Mehdy Ispahani | alternative name | Sadri Ispahani
Mirza Ali Behrouze Ispahani | occupation | Chairman of M.M. Ispahani
Mirza Ali Behrouze Ispahani | father | Mirza Mehdy Ispahani


In [47]:
%%capture
_, scores = benchmark(df_paragraph_triplets, qa_func, bm25_retrieval, ignore_errors=True)
results.append({**scores, "retrieval": "bm25", "context": "paragraphs+triplets"})
jprint(scores)

In [48]:
# Embedding-based retrieval over paragraphs extended with their triplets.
_, scores = benchmark(df_paragraph_triplets, qa_func, semantic_retrieval, ignore_errors=True)
results.append(dict(scores, retrieval="semantic", context="paragraphs+triplets"))
jprint(scores)

  0%|          | 0/20 [00:00<?, ?it/s]

{
  "exact_match": 0.5,
  "f1": 0.5831818181818182,
  "fuzzy_match": 0.6
}


## Only triplets

In [49]:
def replace_paragraphs(row):
    """Replace the paragraphs of `row` with one pseudo-paragraph per triplet.

    For every paragraph, the generation stored in `jerx_mapping` under
    `(row['id'], paragraph['idx'])` is split into lines. Each non-blank line
    yields a deep copy of the source paragraph whose `'title'` is emptied and
    whose `'paragraph_text'` is the stripped triplet line; all other fields
    (e.g. `'idx'`) are inherited from the source paragraph.

    Blank lines are skipped so they do not turn into empty-text documents in
    the retrieval corpus.

    `row['paragraphs']` is replaced by the new list and `row` is returned;
    the original paragraph dicts are left untouched.

    Raises:
        KeyError: if a paragraph has no entry in `jerx_mapping`.
    """
    triplet_paragraphs = []
    for paragraph in row['paragraphs']:
        triplets_str = str(jerx_mapping[(row['id'], paragraph['idx'])])
        for line in triplets_str.splitlines():
            triplet = line.strip()
            if not triplet:
                continue
            # Always copy from the source paragraph rather than from the
            # previous triplet's copy.
            triplet_paragraph = deepcopy(paragraph)
            triplet_paragraph['title'] = ""
            triplet_paragraph['paragraph_text'] = triplet
            triplet_paragraphs.append(triplet_paragraph)
    row['paragraphs'] = triplet_paragraphs
    return row

# Build the triplets-only variant of the dataset and spot-check the first
# pseudo-paragraph of the first record.
df_only_triplets = df.apply(replace_paragraphs, axis=1)
print(df_only_triplets.iloc[0]['paragraphs'][0]['paragraph_text'])

Liliana Mumy | father | Bill Mumy


In [50]:
%%capture
_, scores = benchmark(df_only_triplets, qa_func, partial(bm25_retrieval, top_k=10), ignore_errors=True)
results.append({**scores, "retrieval": "bm25", "context": "triplets"})
jprint(scores)

In [51]:
# Embedding-based retrieval over individual triplets, with top_k raised to 10.
_, scores = benchmark(df_only_triplets, qa_func, partial(semantic_retrieval, top_k=10), ignore_errors=True)
results.append(dict(scores, retrieval="semantic", context="triplets"))
jprint(scores)

  0%|          | 0/20 [00:00<?, ?it/s]

{
  "exact_match": 0.35,
  "f1": 0.3931818181818182,
  "fuzzy_match": 0.4
}


# Report

In [52]:
# Collect every benchmark record into one table with a fixed column order.
report_df = pd.DataFrame.from_records(results, columns=['context', 'retrieval', 'exact_match', 'fuzzy_match', 'f1'])
report_df

Unnamed: 0,context,retrieval,exact_match,fuzzy_match,f1
0,paragraphs,bm25,0.4,0.5,0.511515
1,paragraphs,semantic,0.45,0.55,0.54
2,paragraphs+triplets,bm25,0.45,0.55,0.491667
3,paragraphs+triplets,semantic,0.5,0.6,0.583182
4,triplets,bm25,0.3,0.5,0.423333
5,triplets,semantic,0.35,0.4,0.393182


In [53]:
from datetime import datetime, timezone

# Write the report to a UTC-timestamped file (second resolution) so separate
# runs land in separate files. `datetime.utcnow()` is deprecated since
# Python 3.12; an aware UTC "now" produces the same formatted string.
suffix = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")
report_df.to_json(f'../../data/generated/musique-evaluation/baseline-report-{suffix}.jsonl', orient='records', lines=True)

In [54]:
# Markdown table of the BM25 runs, one row per context variant.
print(
    report_df
    .loc[report_df['retrieval'] == 'bm25']
    .drop(columns=['retrieval'])
    .to_markdown(index=False)
)

| context             |   exact_match |   fuzzy_match |       f1 |
|:--------------------|--------------:|--------------:|---------:|
| paragraphs          |          0.4  |          0.5  | 0.511515 |
| paragraphs+triplets |          0.45 |          0.55 | 0.491667 |
| triplets            |          0.3  |          0.5  | 0.423333 |


In [55]:
# Markdown table of the semantic-retrieval runs, one row per context variant.
print(
    report_df
    .loc[report_df['retrieval'] == 'semantic']
    .drop(columns=['retrieval'])
    .to_markdown(index=False)
)

| context             |   exact_match |   fuzzy_match |       f1 |
|:--------------------|--------------:|--------------:|---------:|
| paragraphs          |          0.45 |          0.55 | 0.54     |
| paragraphs+triplets |          0.5  |          0.6  | 0.583182 |
| triplets            |          0.35 |          0.4  | 0.393182 |


## Retrieval impact

In [56]:
# Markdown table comparing the retrieval methods on the plain-paragraph context.
print(
    report_df
    .loc[report_df['context'] == 'paragraphs']
    .to_markdown(index=False)
)

| context    | retrieval   |   exact_match |   fuzzy_match |       f1 |
|:-----------|:------------|--------------:|--------------:|---------:|
| paragraphs | bm25        |          0.4  |          0.5  | 0.511515 |
| paragraphs | semantic    |          0.45 |          0.55 | 0.54     |
