In [1]:
import json
import magentic
import logging
from copy import deepcopy
from datetime import datetime
from functools import partial
from pathlib import Path

import bm25s
from tenacity import retry, stop_after_attempt, wait_random_exponential
import pandas as pd
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm

from bellek.musique.singlehop import benchmark as benchmark_single
from bellek.musique.multihop import benchmark as benchmark_multi
from bellek.musique.qa import answer_question_standard, answer_question_cte
from bellek.utils import set_seed

# Pull settings (e.g. API credentials) from a local .env file into the environment.
load_dotenv()

# Display / progress configuration: tqdm integration for pandas and 3-decimal floats.
tqdm.pandas()
pd.set_option("display.float_format", "{:,.3f}".format)

# Fix the random seed through the project helper.
SEED = 89
set_seed(SEED)

# Only let errors from the bm25s logger through.
bm25s_logger = logging.getLogger("bm25s")
bm25s_logger.setLevel(logging.ERROR)

In [2]:
from bellek.jerx.fewshot.llm import make_kg_triplet_extract_fn

# Triplet extractor produced by the project's few-shot factory; called below with a
# paragraph's text and its result is passed to `format_triplets`.
extract_kg_triplets = make_kg_triplet_extract_fn(
    model="llama3-8b-togetherai",
    completion_params={"temperature": 0.1},
)

In [26]:
# Records under inspection, and the JSON-lines dataset they are read from.
RECORD_IDS = ["2hop__197090_126045"]
DATASET_PATH = "../../data/generated/musique-evaluation/dataset.jsonl"

# Load the dataset, keep only the selected records (in RECORD_IDS order; a missing id
# raises a KeyError from `.loc`), and restore a plain positional index.
df = (
    pd.read_json(DATASET_PATH, orient="records", lines=True)
    .set_index("id", drop=False)
    .loc[RECORD_IDS]
    .copy()
    .reset_index(drop=True)
)

In [27]:
# Manually overwrite the text of the paragraph stored at list position 16 of the first
# record. The paragraph dict is mutated in place, so the change is visible through `df`.
edited_paragraph = df.iloc[0]["paragraphs"][16]
edited_paragraph["paragraph_text"] = """Melih Gökçek had been the Metropolitan Mayor of Ankara since 1994 as a politician from the Welfare Party until 2018 where Mansur Yavas was elected. He later joined the Virtue Party and then the AKP. Initially elected in the 1994 local elections, he was re-elected in 1999, 2004 and 2009. In the 2014 local election, Gökçek stood for a fifth term. """

In [28]:
def format_triplets(triplets: list[tuple]):
    """Render triplets as text, one ``part | part | part`` line per triplet.

    Args:
        triplets: Iterable of triplets; each triplet is an iterable of parts.
            Parts are converted with ``str()`` so non-string values (e.g. a
            numeric object) no longer raise ``TypeError`` in ``str.join``.

    Returns:
        A single string with the triplets separated by newlines; the empty
        string when ``triplets`` is empty.
    """
    return "\n".join(
        " | ".join(str(part) for part in triplet) for triplet in triplets
    )

def extract_triplets(example: dict):
    """Attach formatted triplets for every paragraph of a record.

    Runs ``extract_kg_triplets`` on each paragraph's ``paragraph_text``, formats
    the result with ``format_triplets`` and stores the resulting list (one string
    per paragraph, in paragraph order) under ``example["triplets_str"]``.

    Returns:
        The same ``example`` object, updated in place.
    """
    formatted_per_paragraph = []
    for paragraph in example["paragraphs"]:
        paragraph_triplets = extract_kg_triplets(paragraph["paragraph_text"])
        formatted_per_paragraph.append(format_triplets(paragraph_triplets))
    example["triplets_str"] = formatted_per_paragraph
    return example

# Add a "triplets_str" column holding one formatted triplet string per paragraph.
# Every run invokes the extractor once per paragraph, so re-running this cell repeats
# all of those calls and rebinds `df`.
df = df.apply(extract_triplets, axis=1)
df.head()

Unnamed: 0,id,paragraphs,question,question_decomposition,answer,answer_aliases,answerable,answers,triplets_str
0,2hop__197090_126045,"[{'idx': 0, 'title': 'Bucak, Çine', 'paragraph...",Who was in charge of the city where Aydın Örs ...,"[{'id': 197090, 'question': 'Aydın Örs >> plac...",Melih Gökçek,"[Mansur Yavaş, Melih Gökçek]",True,"[Mansur Yavaş, Melih Gökçek]",[Bucak | location | District of Çine\nBucak | ...


In [29]:
# Show the question of the first record followed by the triplets of each paragraph.
row = df.iloc[0]
separator = "-" * 80

print(row["question"])
print(separator)
for paragraph_triplets in row["triplets_str"]:
    print(paragraph_triplets)

Who was in charge of the city where Aydın Örs was born?
--------------------------------------------------------------------------------
Bucak | location | District of Çine
Bucak | location | Aydın Province
Bucak | location | Turkey
Bucak | population | 217
Bucak | population as of | 2010
Kızılcapınar | location | Germencik District
Kızılcapınar | location | Aydın Province
Kızılcapınar | location | Turkey
Kızılcapınar | population | 720
Kızılcapınar | population as of | 2010
Karaağaçlı | location | Germencik District
Karaağaçlı | location | Aydın Province
Karaağaçlı | location | Turkey
Karaağaçlı | population | 388
Karaağaçlı | population as of | 2010
Yamalak | location | District of Kuyucak
Yamalak | location | Aydın Province
Yamalak | location | Turkey
Yamalak | population | 1952
Yamalak | population year | 2010
Battle of Antioch on the Meander | took place near | Yamalak
Ömerbeyli | location | Germencik District
Ömerbeyli | location | Aydın Province
Ömerbeyli | location | Turkey
Öme

In [30]:
# Retrieval functions
def dummy_retrieval(docs: list[dict], query: str, top_k: int):
    """Pass-through retriever: hand back every document untouched.

    ``query`` and ``top_k`` are accepted only to match the retriever signature
    and are ignored. The very same ``docs`` object is returned (no copy).
    """
    retrieved = docs
    return retrieved


def perfect_retrieval(docs: list[dict], query: str, top_k: int):
    """Oracle retriever: keep only documents whose ``is_supporting`` value is truthy.

    ``query`` and ``top_k`` are ignored. Input order is preserved, and a document
    without an ``is_supporting`` key raises ``KeyError``.
    """
    supporting = []
    for candidate in docs:
        if candidate["is_supporting"]:
            supporting.append(candidate)
    return supporting


# One score record is appended here per experiment configuration.
results = []

# Parameters
# Retry decorator for QA calls: stop after 3 attempts, random exponential wait
# (multiplier=1, max=30) between attempts.
qa_retry_deco = retry(
    stop=stop_after_attempt(3),
    wait=wait_random_exponential(multiplier=1, max=30),
)
llm = magentic.OpenaiChatModel("gpt-3.5-turbo", temperature=0.1)

# Hyperparameters
# Entries: (question-decomposition flag, benchmark function).
qdecomp_params = [
    (False, benchmark_single),
    # (True, benchmark_multi),
]

# Entries: (QA technique label, QA function).
prompting_params = [("Standard", answer_question_standard)]

# Entries: (retriever label, retrieval function, top_k values to sweep).
# The disabled entries name retrievers that are not defined in the cells shown here.
retrieval_params = [
    # ("Sparse", bm25_retrieval, [40]),
    # ("Dense", semantic_retrieval, [5, 10, 15, 20, 30, 40, 50, 70]),
    # ("Dummy", dummy_retrieval, [0]),
    ("Perfect", perfect_retrieval, [0]),
]

# ## Only triplets

print("Running QA experiments with only triplets")

def replace_paragraphs_with_triplets(row):
    """Replace each paragraph of a record with one pseudo-paragraph per triplet.

    Every non-blank triplet line becomes a deep copy of its source paragraph
    (so fields such as ``idx`` and ``is_supporting`` carry over) with ``title``
    set to ``""`` and ``paragraph_text`` set to the stripped triplet line.
    Paragraphs that yield no triplet lines contribute nothing to the result.

    If the record already has a ``triplets_str`` list aligned one-to-one with
    its paragraphs, those strings are reused instead of running the extractor
    again; this avoids a second extraction call per paragraph and keeps the QA
    context identical to the triplets stored on the record. If paragraph texts
    were edited after ``triplets_str`` was computed, recompute or drop that
    field first. Without a usable ``triplets_str``, triplets are extracted from
    ``paragraph_text`` as before.

    Args:
        row: Mapping-like record (dict or pandas Series) with a ``paragraphs``
            list of dicts and, optionally, a ``triplets_str`` list.

    Returns:
        The same ``row`` object with ``row["paragraphs"]`` replaced.
    """
    paragraphs = row["paragraphs"]

    # Reuse previously extracted triplets only when they line up with the paragraphs.
    triplet_strs = row["triplets_str"] if "triplets_str" in row else None
    if not isinstance(triplet_strs, (list, tuple)) or len(triplet_strs) != len(paragraphs):
        triplet_strs = [
            format_triplets(extract_kg_triplets(paragraph["paragraph_text"]))
            for paragraph in paragraphs
        ]

    paragraphs_with_triplets = []
    for paragraph, triplets_str in zip(paragraphs, triplet_strs):
        for line in triplets_str.splitlines():
            triplet = line.strip()
            if not triplet:
                # Blank lines would otherwise become empty pseudo-paragraphs.
                continue
            # Copy so the source paragraph dict is never modified.
            triplet_paragraph = deepcopy(paragraph)
            triplet_paragraph["title"] = ""
            triplet_paragraph["paragraph_text"] = triplet
            paragraphs_with_triplets.append(triplet_paragraph)

    row["paragraphs"] = paragraphs_with_triplets
    return row


# Triplet-only variant of the dataset: paragraphs are swapped for per-triplet entries.
df_only_triplets = df.apply(replace_paragraphs_with_triplets, axis=1)

# Per-configuration result frames, in execution order.
result_dfs = []

# Sweep every combination of decomposition mode, prompting technique, retriever and
# top_k with `llm` active as context manager; scores go to `results`.
with llm:
    for run in range(1):
        for qdecomp, benchmark in qdecomp_params:
            for qa_technique, qa_func in prompting_params:
                for retriever_name, retriever, top_ks in retrieval_params:
                    for top_k in top_ks:
                        answer_with_retry = qa_retry_deco(qa_func)
                        retrieve = partial(retriever, top_k=top_k)
                        result_df, scores = benchmark(
                            df_only_triplets, answer_with_retry, retrieve
                        )
                        result_dfs.append(result_df)

                        # Scores first, then the configuration that produced them.
                        record = dict(scores)
                        record.update(
                            {
                                "qdecomp": qdecomp,
                                "context": "Triplets",
                                "retrieval": retriever_name,
                                "top_k": top_k,
                                "qa": qa_technique,
                                "run": run,
                            }
                        )
                        results.append(record)


Running QA experiments with only triplets


  0%|          | 0/1 [00:00<?, ?it/s]

In [31]:
# Display the result frame produced by the last benchmark call of the experiment loop.
result_df

Unnamed: 0,id,paragraphs,question,question_decomposition,answer,answer_aliases,answerable,answers,triplets_str,predicted_answer,raw_output,exact_match,fuzzy_match
0,2hop__197090_126045,"[{'idx': 0, 'title': '', 'paragraph_text': 'Bu...",Who was in charge of the city where Aydın Örs ...,"[{'id': 197090, 'question': 'Aydın Örs >> plac...",Melih Gökçek,"[Mansur Yavaş, Melih Gökçek]",True,"[Mansur Yavaş, Melih Gökçek]",[Bucak | location | District of Çine\nBucak | ...,Melih Gökçek,"{'answer': 'Melih Gökçek', 'hops': [{'question...",True,True


In [32]:
# Print the idx and text of every paragraph flagged as supporting in the inspected record.
for candidate in row["paragraphs"]:
    if not candidate["is_supporting"]:
        continue
    print(candidate["idx"])
    print(candidate["paragraph_text"])

16
Melih Gökçek had been the Metropolitan Mayor of Ankara since 1994 as a politician from the Welfare Party until 2018 where Mansur Yavas was elected. He later joined the Virtue Party and then the AKP. Initially elected in the 1994 local elections, he was re-elected in 1999, 2004 and 2009. In the 2014 local election, Gökçek stood for a fifth term. 
17
Aydın Örs (born 1946 in Ankara) is a Turkish former basketball coach and former head coach of Fenerbahçe. He started to play basketball in 1963. He played for DSİ Spor and Sekerspor also played for 35 times with the Turkish national basketball team.


In [33]:
# Single-hop benchmark with the CTE QA function on `df`, restricted to the supporting
# paragraphs. This rebinds `result_df` and `scores` from the previous experiment.
# NOTE(review): unlike the loop above, this call is made outside `with llm:` and
# without `qa_retry_deco` — confirm that this is intended.
supporting_only = partial(perfect_retrieval, top_k=2)
result_df, scores = benchmark_single(df, answer_question_cte, supporting_only)
result_df

  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,id,paragraphs,question,question_decomposition,answer,answer_aliases,answerable,answers,triplets_str,predicted_answer,raw_output,exact_match,fuzzy_match
0,2hop__197090_126045,"[{'idx': 0, 'title': 'Bucak, Çine', 'paragraph...",Who was in charge of the city where Aydın Örs ...,"[{'id': 197090, 'question': 'Aydın Örs >> plac...",Melih Gökçek,"[Mansur Yavaş, Melih Gökçek]",True,"[Mansur Yavaş, Melih Gökçek]",[Bucak | location | District of Çine\nBucak | ...,Melih Gökçek,"{'answer': 'Melih Gökçek', 'hops': [{'question...",True,True


In [18]:
# Inspect the `llm_output` recorded for the first hop of the first record's raw output.
# NOTE(review): this cell's execution count (18) is lower than the preceding cell's (33),
# so the output shown may stem from an earlier run — re-run after the cell above to confirm.
result_df.iloc[0]['raw_output']['hops'][0]['llm_output']

{'triplets': ['Aydın Örs | born in | Ankara',
  'Melih Gökçek | Metropolitan Mayor of | Ankara'],
 'answer': 'Melih Gökçek',
 'generation': 'Triplets:\nAydın Örs | born in | Ankara\nMelih Gökçek | Metropolitan Mayor of | Ankara\n\nAnswer: Melih Gökçek'}