In [1]:
from dotenv import load_dotenv
# Load environment variables from a .env file before any client library is configured.
load_dotenv()

True

In [2]:
import json
import pandas as pd
from pathlib import Path
from copy import deepcopy
from functools import partial

from bellem.qa.llm import make_question_answer_func
from bellem.utils import set_seed, jprint
from bellem.musique.multihop import benchmark

# Fix the seed for the run (presumably seeds the RNGs used downstream; verify in bellem.utils).
set_seed(89)

In [3]:
from tqdm.auto import tqdm
# Register tqdm with pandas so that `progress_apply` is available on DataFrames.
tqdm.pandas()

In [4]:
def silence(exc_cls):
    """Build a decorator that swallows ``exc_cls`` and returns ``None`` instead.

    Args:
        exc_cls: An exception class, or a tuple of exception classes, to
            suppress. Anything valid in an ``except`` clause works.

    Returns:
        A decorator. The decorated function returns its normal result, or
        ``None`` when it raises ``exc_cls``. Any other exception propagates.
    """
    # Local import: the notebook's import cell only pulls `partial` from functools.
    from functools import wraps

    def decorator(func):
        # Keep __name__/__doc__ of the wrapped function so logs, tracebacks and
        # introspection still point at the real function rather than "wrapper".
        @wraps(func)
        def wrapper(*args, **kwargs):
            try:
                return func(*args, **kwargs)
            except exc_cls:
                # Deliberate best-effort: the caller treats None as "no result".
                return None
        return wrapper
    return decorator

In [5]:
# Take rows 100-199 (positional slice) of the evaluation dataset.
df = pd.read_json('../../data/generated/musique-evaluation/dataset.jsonl', orient='records', lines=True).iloc[100:200]
# Question decompositions are stored in a separate JSONL file and joined on 'id'.
qd_df = pd.read_json('../../data/generated/musique-evaluation/question-decomposition.jsonl', orient='records', lines=True)
# Drop the dataset's own 'question' / 'question_decomposition' columns so the merged
# frame carries the versions from qd_df. pd.merge defaults to an inner join, so ids
# missing from qd_df would be dropped; the printed shape is the sanity check for that.
df = pd.merge(df.drop(columns=['question', 'question_decomposition']), qd_df, on='id', suffixes=('', ''))
print(df.shape)
df.head()

(100, 8)


Unnamed: 0,id,paragraphs,answer,answer_aliases,answerable,answers,question,question_decomposition
0,2hop__143915_68489,"[{'idx': 0, 'title': 'Daniel Goddard (actor)',...",Snapper Foster,[Snapper Foster],True,[Snapper Foster],who did the performer of Sings America play on...,"[{'id': 143915, 'question': 'Who performed Sin..."
1,2hop__642686_7292,"[{'idx': 0, 'title': 'Borat's Television Progr...",George Benson,[George Benson],True,[George Benson],Along with Kenny G and the performer of Hello ...,"[{'id': 642686, 'question': 'Who performed Hel..."
2,2hop__391258_161450,"[{'idx': 0, 'title': 'Karimabad-e Ayaghchi', '...",in the north-east of the country south of the ...,"[Caspian Sea, in the north-east of the country...",True,"[Caspian Sea, in the north-east of the country...",Where is the province that contains Maraveh Ta...,"[{'id': 391258, 'question': 'Which province co..."
3,2hop__549146_223121,"[{'idx': 0, 'title': 'All Funked Up', 'paragra...",Asian Man Records,[Asian Man Records],True,[Asian Man Records],What record label did the person who is part o...,"[{'id': 549146, 'question': 'Who is part of Th..."
4,2hop__811015_3300,"[{'idx': 0, 'title': 'Blue Bloods (season 7)',...",season three,[season three],True,[season three],What season was the performer of Blue Skies on?,"[{'id': 811015, 'question': 'Who performed Blu..."


In [6]:
# Pre-computed per-paragraph generations. In this notebook they are only
# referenced by the commented-out `jerx_mapping` cell further down.
jerx_file = Path("../../data/raw/musique-evaluation/jerx-inferences/llama3-base.jsonl")
jerx_df = pd.read_json(jerx_file, lines=True)
jerx_df.head()

Unnamed: 0,id,paragraph_idx,paragraph_text,paragraph_title,is_supporting,text,input,generation
0,2hop__131818_161450,0,Maria Carrillo High School is a public high sc...,Maria Carrillo High School,False,# Maria Carrillo High School\nMaria Carrillo H...,[{'content': 'You are an excellent knowledge g...,Maria Carrillo High School | location | Santa ...
1,2hop__131818_161450,1,"Golestān Province (Persian: استان گلستان‎, Ost...",Golestan Province,True,# Golestan Province\nGolestān Province (Persia...,[{'content': 'You are an excellent knowledge g...,Golestan Province | location | north-east of I...
2,2hop__131818_161450,2,Voshmgir District () is a district (bakhsh) in...,Voshmgir District,True,# Voshmgir District\nVoshmgir District () is a...,[{'content': 'You are an excellent knowledge g...,"Voshmgir District | location | Aqqala County, ..."
3,2hop__131818_161450,3,52 Heroor is a village in the southern state o...,52 Heroor,False,# 52 Heroor\n52 Heroor is a village in the sou...,[{'content': 'You are an excellent knowledge g...,"52 Heroor | location | Karnataka, India\n52 He..."
4,2hop__131818_161450,4,Vennaimalai is a village of Karur District loc...,Vennaimalai,False,# Vennaimalai\nVennaimalai is a village of Kar...,[{'content': 'You are an excellent knowledge g...,Vennaimalai | location | Karur District\nVenna...


In [7]:
from bellem.jerx.fewshot.llm import make_kg_triplet_extract_fn

# Triplet extractor built for the "gpt-4-turbo" model; called once per supporting
# paragraph by `extract_triplets` below (presumably one LLM request per call).
extract_kg_triplets = make_kg_triplet_extract_fn(model="gpt-4-turbo")

def extract_triplets(example: dict):
    """Attach a ``'triplets_str'`` entry built from the supporting paragraphs.

    Every paragraph in ``example['paragraphs']`` whose ``'is_supporting'`` flag
    is truthy is sent to ``extract_kg_triplets``, prefixed with a hint that
    contains ``example['question']``. Each returned triplet is rendered as its
    parts joined by ``' | '``, one triplet per line, in paragraph order.

    Args:
        example: Mapping-like record with ``'question'`` and ``'paragraphs'``.
            It is modified in place.

    Returns:
        The same ``example`` object, with ``'triplets_str'`` set.
    """
    question_hint = "The triplets you extracted should help answering the following multi-hop question:\n" + example['question'] + "\n\n"

    rendered_lines = []
    for paragraph in example['paragraphs']:
        # Non-supporting paragraphs are skipped entirely (no extractor call).
        if not paragraph['is_supporting']:
            continue
        extracted = extract_kg_triplets(question_hint + paragraph['paragraph_text'])
        for triplet in extracted:
            rendered_lines.append(' | '.join(triplet))

    example['triplets_str'] = '\n'.join(rendered_lines)
    return example

In [8]:
# jerx_mapping = {(row['id'], row['paragraph_idx']): row['generation'] for _, row in jerx_df.iterrows()}

# def extract_triplets(example: dict):
#     example["triplets_str"] = [jerx_mapping[(example['id'], p['idx'])].strip() for p in example['paragraphs']]
#     example["triplets_str"] = [triplets_str.replace("|", "|") for triplets_str in example["triplets_str"]]
#     return example

In [9]:
# Row-wise triplet extraction with a progress bar; adds the 'triplets_str' column.
# NOTE: this rebinds `df`, so re-running the cell repeats every extraction call.
df = df.progress_apply(extract_triplets, axis=1)
print(len(df))
df.head()

  0%|          | 0/100 [00:00<?, ?it/s]

100


Unnamed: 0,id,paragraphs,answer,answer_aliases,answerable,answers,question,question_decomposition,triplets_str
0,2hop__143915_68489,"[{'idx': 0, 'title': 'Daniel Goddard (actor)',...",Snapper Foster,[Snapper Foster],True,[Snapper Foster],who did the performer of Sings America play on...,"[{'id': 143915, 'question': 'Who performed Sin...",David Hasselhoff | released | Sings America (a...
1,2hop__642686_7292,"[{'idx': 0, 'title': 'Borat's Television Progr...",George Benson,[George Benson],True,[George Benson],Along with Kenny G and the performer of Hello ...,"[{'id': 642686, 'question': 'Who performed Hel...",Kenny G | genre | Smooth Jazz\nDave Koz | genr...
2,2hop__391258_161450,"[{'idx': 0, 'title': 'Karimabad-e Ayaghchi', '...",in the north-east of the country south of the ...,"[Caspian Sea, in the north-east of the country...",True,"[Caspian Sea, in the north-east of the country...",Where is the province that contains Maraveh Ta...,"[{'id': 391258, 'question': 'Which province co...",Maraveh Tappeh County | located in | Golestan ...
3,2hop__549146_223121,"[{'idx': 0, 'title': 'All Funked Up', 'paragra...",Asian Man Records,[Asian Man Records],True,[Asian Man Records],What record label did the person who is part o...,"[{'id': 549146, 'question': 'Who is part of Th...",Mike Park | member of | The Bruce Lee Band\nTh...
4,2hop__811015_3300,"[{'idx': 0, 'title': 'Blue Bloods (season 7)',...",season three,[season three],True,[season three],What season was the performer of Blue Skies on?,"[{'id': 811015, 'question': 'Who performed Blu...",Blue Skies (Album) | artist | Diana DeGarmo\nB...


In [10]:
import bm25s
import logging

# Only let ERROR-and-above records from the "bm25s" logger through.
logging.getLogger("bm25s").setLevel(logging.ERROR)

def bm25_retrieval(docs: list[dict], query: str, top_k: int = 5):
    """Rank ``docs`` against ``query`` with BM25 and return the best matches.

    A fresh BM25 index is built over ``doc['text']`` on every call.

    Args:
        docs: Documents to search; each must have a ``'text'`` entry.
        query: Free-text query.
        top_k: Maximum number of documents to return. It is capped at
            ``len(docs)``.

    Returns:
        A list with up to ``top_k`` entries taken from the first (and only)
        query's retrieval result. Because ``docs`` is passed as the retriever's
        corpus, these are expected to be the document dicts themselves.
        Returns ``[]`` when ``docs`` is empty or ``top_k`` is not positive.
    """
    # Nothing to index or nothing requested: avoid building an empty index or
    # asking the retriever for k=0 results.
    if not docs or top_k <= 0:
        return []

    top_k = min(top_k, len(docs))
    retriever = bm25s.BM25(corpus=docs)
    tokenized_corpus = bm25s.tokenize([doc['text'] for doc in docs], show_progress=False)
    retriever.index(tokenized_corpus, show_progress=False)
    # Silence the progress bar for the query as well, matching the other calls.
    tokenized_query = bm25s.tokenize(query, show_progress=False)
    results, _ = retriever.retrieve(tokenized_query, k=top_k, show_progress=False)
    return results[0].tolist()

In [11]:
from sentence_transformers import SentenceTransformer

# Module-level embedding model; `semantic_retrieval` below reads it as a global.
model = SentenceTransformer("all-MiniLM-L6-v2")

def semantic_retrieval(docs: list[dict], query: str, top_k: int = 5):
    """Rank ``docs`` by embedding similarity to ``query``.

    Uses the module-level ``model`` to embed ``doc['text']`` for every document
    and the query, then sorts documents by descending similarity.

    Args:
        docs: Documents to search; each must have a ``'text'`` entry.
        query: Free-text query.
        top_k: Maximum number of documents to return.

    Returns:
        Up to ``top_k`` of the original document dicts, most similar first.
        Returns ``[]`` when ``docs`` is empty or ``top_k`` is not positive.
    """
    # Skip the encoder for trivial requests. This also stops a negative top_k
    # from silently slicing off the tail of the ranking.
    if not docs or top_k <= 0:
        return []

    embeddings = model.encode([doc['text'] for doc in docs])
    query_vectors = model.encode([query])
    # One score per (doc, query) pair. With a single query, flatten to a 1-D
    # vector of per-document scores.
    similarities = model.similarity(embeddings, query_vectors).flatten()
    # Convert to plain ints so the list is indexed with Python integers rather
    # than one-element tensors.
    ranked_indices = similarities.argsort(descending=True)[:top_k].tolist()
    return [docs[i] for i in ranked_indices]



In [12]:
def dummy_retrieval_func(docs, query):
    """Return ``docs`` unchanged (the same object); ``query`` is ignored."""
    return docs


def perfect_retrieval_func(docs, query):
    """Return the documents whose ``'is_supporting'`` entry is truthy.

    ``query`` is ignored; the selection relies only on the flag stored in each
    document. A new list is returned and input order is preserved.
    """
    return [doc for doc in docs if doc['is_supporting']]

In [13]:
# Completion settings handed to the QA function factory: temperature 0.0 and a
# 1024-token cap. The same `qa_func` is reused by every benchmark run below.
completion_kwargs={"temperature": 0.0, "max_tokens": 1024}
qa_func = make_question_answer_func("gpt-3.5-turbo", completion_kwargs=completion_kwargs)

In [14]:
# One score record is appended per benchmark run; the report cell turns this into a table.
# NOTE: re-running a benchmark cell without re-running this one appends a duplicate row.
results = []

## Only paragraphs

In [15]:
# Baseline: original paragraph text, filtered to supporting paragraphs by perfect_retrieval_func.
_, scores = benchmark(df, qa_func, perfect_retrieval_func, ignore_errors=True)
# Tag the scores with the retrieval/context setting for the report table.
results.append({**scores, "retrieval": "groundtruth", "context": "paragraphs"})
jprint(scores)

  0%|          | 0/100 [00:00<?, ?it/s]

Failed to answer the question 2hop__549146_223121
Unterminated string starting at: line 1 column 15 (char 14)


Using the latest cached version of the module from /Users/bdsaglam/.cache/huggingface/modules/evaluate_modules/metrics/bdsaglam--musique/9f409241d4cc6ea7853124e79cf44954a75900a0a2c0b9d20b909c2396f6b071 (last modified on Sat May  4 17:09:36 2024) since it couldn't be found locally at bdsaglam--musique, or remotely on the Hugging Face Hub.


{
  "exact_match": 0.68,
  "f1": 0.7664080364080362,
  "fuzzy_match": 0.78
}


## Paragraphs + Triplets

In [16]:
def enhance_paragraphs(row):
    """Append the row's triplets to the text of every paragraph.

    Each paragraph in ``row['paragraphs']`` is deep-copied and its
    ``'paragraph_text'`` becomes the original text, a
    ``# Entity-relation-entity triplets`` header line, and
    ``row['triplets_str']``, joined by newlines. The same row-level triplet
    string is appended to every paragraph.

    Args:
        row: Mapping-like record with ``'paragraphs'`` and ``'triplets_str'``.
            Its ``'paragraphs'`` entry is replaced; the original paragraph
            dicts are left untouched.

    Returns:
        The same ``row`` object with the rewritten ``'paragraphs'``.
    """
    def _with_triplets(paragraph):
        # Work on a copy so the source paragraph keeps its original text.
        enriched = deepcopy(paragraph)
        triplet_block = row['triplets_str']
        parts = (enriched['paragraph_text'], "# Entity-relation-entity triplets", triplet_block)
        enriched['paragraph_text'] = '\n'.join(parts)
        return enriched

    row['paragraphs'] = [_with_triplets(paragraph) for paragraph in row['paragraphs']]
    return row

# Build the "paragraphs + triplets" variant of the dataset, one row at a time.
df_paragraph_triplets = df.apply(enhance_paragraphs, axis=1) 
# NOTE: this .head() is not the last expression of the cell, so it renders nothing.
df_paragraph_triplets.head()
# Spot-check one rewritten paragraph (first row, third paragraph).
print(df_paragraph_triplets.iloc[0]['paragraphs'][2]['paragraph_text'])

Lauren Alice Koslow (born March 9, 1953) is an American actress, best known for her long - running portrayal of Kate Roberts on the NBC dramatic serial Days of Our Lives, which she has played continuously since 1996. She previously appeared in the soaps The Bold and the Beautiful and The Young and the Restless.
# Entity-relation-entity triplets
David Hasselhoff | released | Sings America (album)
Sings America (album) | release date | August 2009
Sings America (album) | contains covers by | Elvis Presley
Sings America (album) | contains covers by | The Beach Boys
Sings America (album) | contains covers by | Glen Campbell
Sings America (album) | contains covers by | Burt Bacharach
Sings America (album) | contains covers by | Madonna
Sings America (album) | bonus track | "More Than Words Can Say"
"More Than Words Can Say" | composer | David Hasselhoff
"More Than Words Can Say" | composer | Wade Hubbard
"More Than Words Can Say" | composer | Glenn Morrow
David Hasselhoff | role in Young an

In [17]:
# Same QA function and retrieval as the baseline, but on paragraphs with triplets appended.
_, scores = benchmark(df_paragraph_triplets, qa_func, perfect_retrieval_func, ignore_errors=True)
# Tag the scores with the retrieval/context setting for the report table.
results.append({**scores, "retrieval": "groundtruth", "context": "paragraphs+triplets"})
jprint(scores)

  0%|          | 0/100 [00:00<?, ?it/s]

{
  "exact_match": 0.58,
  "f1": 0.674978021978022,
  "fuzzy_match": 0.7
}


## Only triplets

In [18]:
def replace_paragraphs(row):
    """Replace the text of every paragraph with the row's triplets.

    Each paragraph in ``row['paragraphs']`` is deep-copied and its
    ``'paragraph_text'`` is overwritten with a
    ``# Entity-relation-entity triplets`` header line followed by
    ``row['triplets_str']``. Other paragraph fields are kept as they are.

    Args:
        row: Mapping-like record with ``'paragraphs'`` and ``'triplets_str'``.
            Its ``'paragraphs'`` entry is replaced; the original paragraph
            dicts are left untouched.

    Returns:
        The same ``row`` object with the rewritten ``'paragraphs'``.
    """
    rewritten = []
    for source_paragraph in row['paragraphs']:
        # Copy first so the caller's paragraph dicts keep their original text.
        clone = deepcopy(source_paragraph)
        triplet_block = row['triplets_str']
        clone['paragraph_text'] = '\n'.join(("# Entity-relation-entity triplets", triplet_block))
        rewritten.append(clone)

    row['paragraphs'] = rewritten
    return row

# Build the "triplets only" variant of the dataset, one row at a time.
df_only_triplets = df.apply(replace_paragraphs, axis=1) 
# NOTE: this .head() is not the last expression of the cell, so it renders nothing.
df_only_triplets.head()
# Spot-check one rewritten paragraph (first row, third paragraph).
print(df_only_triplets.iloc[0]['paragraphs'][2]['paragraph_text'])

# Entity-relation-entity triplets
David Hasselhoff | released | Sings America (album)
Sings America (album) | release date | August 2009
Sings America (album) | contains covers by | Elvis Presley
Sings America (album) | contains covers by | The Beach Boys
Sings America (album) | contains covers by | Glen Campbell
Sings America (album) | contains covers by | Burt Bacharach
Sings America (album) | contains covers by | Madonna
Sings America (album) | bonus track | "More Than Words Can Say"
"More Than Words Can Say" | composer | David Hasselhoff
"More Than Words Can Say" | composer | Wade Hubbard
"More Than Words Can Say" | composer | Glenn Morrow
David Hasselhoff | role in Young and Restless | Dr. William 'Snapper' Foster Jr.
Snapper Foster | fictionality | fictional character
Snapper Foster | appears in | The Young and the Restless
Snapper Foster | performer | William Gray Espy
Snapper Foster | performer | David Hasselhoff
William Gray Espy | role duration | March 26, 1973 to July 1975
D

In [19]:
# Same QA function and retrieval as the baseline, but paragraph text is replaced by triplets.
_, scores = benchmark(df_only_triplets, qa_func, perfect_retrieval_func, ignore_errors=True)
# Tag the scores with the retrieval/context setting for the report table.
results.append({**scores, "retrieval": "groundtruth", "context": "triplets"})
jprint(scores)

  0%|          | 0/100 [00:00<?, ?it/s]

Using the latest cached version of the module from /Users/bdsaglam/.cache/huggingface/modules/evaluate_modules/metrics/bdsaglam--musique/9f409241d4cc6ea7853124e79cf44954a75900a0a2c0b9d20b909c2396f6b071 (last modified on Sat May  4 17:09:36 2024) since it couldn't be found locally at bdsaglam--musique, or remotely on the Hugging Face Hub.


{
  "exact_match": 0.6,
  "f1": 0.7105796164619694,
  "fuzzy_match": 0.74
}


# Report

In [20]:
# One row per benchmark run; `columns` fixes the column order of the table.
report_df = pd.DataFrame.from_records(results, columns=['context', 'retrieval', 'exact_match', 'fuzzy_match', 'f1'])
report_df

Unnamed: 0,context,retrieval,exact_match,fuzzy_match,f1
0,paragraphs,groundtruth,0.68,0.78,0.766408
1,paragraphs+triplets,groundtruth,0.58,0.7,0.674978
2,triplets,groundtruth,0.6,0.74,0.71058


In [21]:
# Markdown rendering of the ground-truth-retrieval rows. The 'retrieval' column is
# dropped because it is constant after the filter.
print(report_df[report_df['retrieval']=='groundtruth'].drop(columns=['retrieval']).to_markdown(index=False))

| context             |   exact_match |   fuzzy_match |       f1 |
|:--------------------|--------------:|--------------:|---------:|
| paragraphs          |          0.68 |          0.78 | 0.766408 |
| paragraphs+triplets |          0.58 |          0.7  | 0.674978 |
| triplets            |          0.6  |          0.74 | 0.71058  |
