In [1]:
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
import os
import random
from functools import partial
from pathlib import Path

import pandas as pd
from openai import OpenAI

from bellek.utils import jprint
from bellek.utils import set_seed

set_seed(42)

In [3]:
from tqdm.auto import tqdm
tqdm.pandas()

In [4]:
# LLM_MODEL_NAME = "gpt-3.5-turbo"
LLM_MODEL_NAME = "llama-3-70b-tgi"

In [5]:
import bm25s

class KnowledgeGraph:
    def __init__(self, triplets: list[tuple[str, str, str]]):
        self.corpus = [' | '.join(triplet) for triplet in triplets]
        self.retriever = bm25s.BM25(corpus=self.corpus)
        self.tokenized_corpus = bm25s.tokenize(self.corpus)
        self.retriever.index(self.tokenized_corpus)

    def search(self, query: str, top_k: int = 10):
        top_k = min(top_k, len(self.corpus))
        results, _ = self.retriever.retrieve(bm25s.tokenize(query), k=top_k)
        return results[0].tolist()

In [6]:
import logging
from bm25s import debug_logger as bm25s_logger

bm25s_logger.setLevel(logging.WARNING)

In [7]:
DEFAULT_SYSTEM_PROMPT = """You are an expert Q&A system that is trusted around the world. Always answer the question using the provided context information, and not prior knowledge.
Some rules to follow:
1. Never directly reference the given context in your answer.
2. Avoid statements like 'Based on the context, ...' or 'The context information ...' or anything along those lines.
3. Do not use propositions like 'because', 'since', 'as', 'due to', 'as a result', 'in', 'on', 'at', etc.
4. If the answer is not present in the context, provide "N/A" as the answer.
"""

USER_PROMPT = """The context information below is provided as a set of entity-relation-entity triplets from knowledge graph.
---------------------
{context}
---------------------
Given the context information and not prior knowledge, answer the question.
{question}

Response Format:
Reasoning: Provide your reasoning for the answer.
Answer: Provide the answer in 2-4 words.
"""


def make_question_answer_func(model_name: str = "gpt-3.5-turbo", client: OpenAI = None):
    if client is None:
        client = OpenAI()

    def func(context: str, question: str) -> dict:
        messages = [
            {
                "role": "system",
                "content": DEFAULT_SYSTEM_PROMPT,
            },
            {
                "role": "user",
                "content": USER_PROMPT.format(context=context, question=question),
            },
        ]
        chat_completion = client.chat.completions.create(
            model=model_name,
            messages=messages,
        )
        text = chat_completion.choices[0].message.content
        reasoning = ""
        answer = "N/A"
        for line in text.splitlines():
            if line.lower().startswith("reasoning:"):
                reasoning = line.split(":", 1)[1].strip()
            elif line.lower().startswith("answer:"):
                answer = line.split(":", 1)[1].strip()
        return dict(answer=answer, reasoning=reasoning, raw=text)

    return func

In [8]:
DEFAULT_SYSTEM_PROMPT = """You are an excellent knowledge graph search agent that is trusted around the world. You come up with a query from a given question so that you can search the knowledge graph for relevant information. 

# Example
Question: What is the capital of France?
Query: capital of France

Question: Which city Mississipi river flows through?
Query: Mississipi river

# Task
Question: {question}
Query: 
"""

def make_query_expansion_func(model_name: str = "gpt-3.5-turbo", client: OpenAI = None):
    if client is None:
        client = OpenAI()

    def func(question: str) -> dict:
        messages = [
            {
                "role": "system",
                "content": DEFAULT_SYSTEM_PROMPT.format(question=question),
            },
        ]
        chat_completion = client.chat.completions.create(
            model=model_name,
            messages=messages,
        )
        text = chat_completion.choices[0].message.content
        return text.replace("Query: ", "").strip().split(", ")

    return func

## Dataset

In [9]:
dataset_file: Path = Path("../../data/generated/musique-evaluation/dataset.jsonl")
df = pd.read_json(dataset_file, lines=True)
df.head()

Unnamed: 0,id,paragraphs,question,question_decomposition,answer,answer_aliases,answerable
0,2hop__131818_161450,"[{'idx': 0, 'title': 'Maria Carrillo High Scho...",Where is the Voshmgir District located?,"[{'id': 131818, 'question': 'Which state is Vo...",in the north-east of the country south of the ...,"[Caspian Sea, in the north-east of the country...",True
1,2hop__444265_82341,"[{'idx': 0, 'title': 'Ocala, Florida', 'paragr...",In what part of Florida is Tom Denney's birthp...,"[{'id': 444265, 'question': 'Tom Denney >> pla...",in Northern Florida,"[in Northern Florida, Northern Florida]",True
2,2hop__711946_269414,"[{'idx': 0, 'title': 'Wild Thing (Tone Lōc son...",What record label is the performer who release...,"[{'id': 711946, 'question': 'All Your Faded Th...",Kill Rock Stars,[Kill Rock Stars],True
3,2hop__311931_417706,"[{'idx': 0, 'title': 'The Main Attraction (alb...",What record label does the performer of Emotio...,"[{'id': 311931, 'question': 'Emotional Rain >>...",Attic Records,"[Attic, Attic Records]",True
4,2hop__809785_606637,"[{'idx': 0, 'title': 'The Main Attraction (alb...",What record label does the performer of Advent...,"[{'id': 809785, 'question': 'Adventures in You...",Secret City Records,[Secret City Records],True


In [10]:
jerx_file = Path("../../data/raw/musique-evaluation/jerx-inferences/llama3-base.jsonl")
jerx_df = pd.read_json(jerx_file, lines=True)
jerx_df.head()

Unnamed: 0,id,paragraph_idx,paragraph_text,paragraph_title,is_supporting,text,input,generation
0,2hop__131818_161450,0,Maria Carrillo High School is a public high sc...,Maria Carrillo High School,False,# Maria Carrillo High School\nMaria Carrillo H...,[{'content': 'You are an excellent knowledge g...,Maria Carrillo High School | location | Santa ...
1,2hop__131818_161450,1,"Golestān Province (Persian: استان گلستان‎, Ost...",Golestan Province,True,# Golestan Province\nGolestān Province (Persia...,[{'content': 'You are an excellent knowledge g...,Golestan Province | location | north-east of I...
2,2hop__131818_161450,2,Voshmgir District () is a district (bakhsh) in...,Voshmgir District,True,# Voshmgir District\nVoshmgir District () is a...,[{'content': 'You are an excellent knowledge g...,"Voshmgir District | location | Aqqala County, ..."
3,2hop__131818_161450,3,52 Heroor is a village in the southern state o...,52 Heroor,False,# 52 Heroor\n52 Heroor is a village in the sou...,[{'content': 'You are an excellent knowledge g...,"52 Heroor | location | Karnataka, India\n52 He..."
4,2hop__131818_161450,4,Vennaimalai is a village of Karur District loc...,Vennaimalai,False,# Vennaimalai\nVennaimalai is a village of Kar...,[{'content': 'You are an excellent knowledge g...,Vennaimalai | location | Karur District\nVenna...


In [11]:
jerx_mapping = {(row['id'], row['paragraph_idx']): row['generation'] for _, row in jerx_df.iterrows()}

def extract_triplets(example: dict):
    generations = '\n'.join(jerx_mapping[(example['id'], p['idx'])] for p in example['paragraphs'] if p['is_supporting'])
    # generations = '\n'.join(jerx_mapping[(example['id'], p['idx'])] for p in example['paragraphs'])
    example["triplets"] = [line.split(" | ") for line in generations.split('\n') if line.strip()]
    return example

In [12]:
df = df.apply(extract_triplets, axis=1)
df.head()

Unnamed: 0,id,paragraphs,question,question_decomposition,answer,answer_aliases,answerable,triplets
0,2hop__131818_161450,"[{'idx': 0, 'title': 'Maria Carrillo High Scho...",Where is the Voshmgir District located?,"[{'id': 131818, 'question': 'Which state is Vo...",in the north-east of the country south of the ...,"[Caspian Sea, in the north-east of the country...",True,"[[Golestan Province, location, north-east of I..."
1,2hop__444265_82341,"[{'idx': 0, 'title': 'Ocala, Florida', 'paragr...",In what part of Florida is Tom Denney's birthp...,"[{'id': 444265, 'question': 'Tom Denney >> pla...",in Northern Florida,"[in Northern Florida, Northern Florida]",True,"[[Ocala, location, Florida], [Ocala, location ..."
2,2hop__711946_269414,"[{'idx': 0, 'title': 'Wild Thing (Tone Lōc son...",What record label is the performer who release...,"[{'id': 711946, 'question': 'All Your Faded Th...",Kill Rock Stars,[Kill Rock Stars],True,"[[All Your Faded Things, album, ], [All Your F..."
3,2hop__311931_417706,"[{'idx': 0, 'title': 'The Main Attraction (alb...",What record label does the performer of Emotio...,"[{'id': 311931, 'question': 'Emotional Rain >>...",Attic Records,"[Attic, Attic Records]",True,"[[Lee Aaron (album), release date, 1987-02-17]..."
4,2hop__809785_606637,"[{'idx': 0, 'title': 'The Main Attraction (alb...",What record label does the performer of Advent...,"[{'id': 809785, 'question': 'Adventures in You...",Secret City Records,[Secret City Records],True,"[[Adventures in Your Own Backyard, type, album..."


In [13]:
i = 1
example = df.iloc[i]
example

id                                                       2hop__444265_82341
paragraphs                [{'idx': 0, 'title': 'Ocala, Florida', 'paragr...
question                  In what part of Florida is Tom Denney's birthp...
question_decomposition    [{'id': 444265, 'question': 'Tom Denney >> pla...
answer                                                  in Northern Florida
answer_aliases                      [in Northern Florida, Northern Florida]
answerable                                                             True
triplets                  [[Ocala, location, Florida], [Ocala, location ...
Name: 1, dtype: object

In [14]:
print(example['question'])

In what part of Florida is Tom Denney's birthplace located?


In [15]:
print(example['question_decomposition'][0]['question'])
print(example['question_decomposition'][1]['question'])

Tom Denney >> place of birth
where is #1 in the state of florida


## Search

In [16]:
i = random.choice(range(len(df)))
example = df.iloc[i]
example['question_decomposition']

[{'id': 144303,
  'question': 'What city was Eutychides born?',
  'answer': 'Sicyon',
  'paragraph_support_idx': 8},
 {'id': 483189,
  'question': '#1 >> part of',
  'answer': 'Greek mythology',
  'paragraph_support_idx': 10}]

In [17]:
kg = KnowledgeGraph(example['triplets'])

Split strings:   0%|          | 0/21 [00:00<?, ?it/s]

BM25S Count Tokens:   0%|          | 0/21 [00:00<?, ?it/s]

BM25S Compute Scores:   0%|          | 0/21 [00:00<?, ?it/s]

In [18]:
question = example['question_decomposition'][0]['question']
print(question)
for triplet in kg.search(question):
    print(triplet)

What city was Eutychides born?


Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

Eutychides | teacher | Lysippus
Eutychides | work | Statue of Tyche
Eutychides | role | Greek sculptor
Eutychides | work | Vatican (small copy)
Antioch | recipient | Statue of Tyche
Sicyon | previous name of polis | Mecone
Sicyon | eponym | polis
Tyche | subject | statue
Sicyon | previous name of polis | Aegiale
Sicyon | father | Marathon


## Multi-hop question answering

In [19]:
qa_func = make_question_answer_func(LLM_MODEL_NAME)

In [20]:
def mhqa(example: dict, top_k: int = 10) -> dict:
    kg = KnowledgeGraph(example['triplets'])

    # First question
    question1 = example["question_decomposition"][0]["question"]
    query1 = question1
    docs1 = kg.search(query1, top_k=top_k)
    context1 = "\n".join(docs1)
    result1 = qa_func(context=context1, question=question1)
    hop1 = {
        "question": question1,
        "query" : query1,
        "context": context1,
        "answer": result1["answer"],
        "reasoning": result1["reasoning"],
    }

    # Second question
    if result1["answer"] == "N/A":
        return {
            "answer": "N/A",
            "hops": [hop1],
        }

    question2 = example["question_decomposition"][1]["question"]
    question2 = question2.replace("#1", result1["answer"])
    query2 = question2
    docs2 = kg.search(query2, top_k=top_k)
    context2 = "\n".join(docs2)
    result2 = qa_func(context=context2, question=question2)
    hop2 = {
        "question": question2,
        "query": query2,
        "context": context2,
        "answer": result2["answer"],
        "reasoning": result2["reasoning"],
    }
    return {
        "answer": result2["answer"],
        "hops": [hop1, hop2],
    }

In [21]:
i = 0
example = df.iloc[i]
result = mhqa(example)
jprint(result)
print(example['answer_aliases'])

Split strings:   0%|          | 0/9 [00:00<?, ?it/s]

BM25S Count Tokens:   0%|          | 0/9 [00:00<?, ?it/s]

BM25S Compute Scores:   0%|          | 0/9 [00:00<?, ?it/s]

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

{
  "answer": "north-east Iran",
  "hops": [
    {
      "question": "Which state is Voshmgir District located?",
      "query": "Which state is Voshmgir District located?",
      "context": "Voshmgir District | city | Anbar Olum\nVoshmgir District | population in families | 5,266\nVoshmgir District | population | 25,149\nVoshmgir District | rural districts | Mazraeh-ye Jonubi Rural District, Mazraeh-ye Shomali Rural District\nVoshmgir District | location | Aqqala County, Golestan Province, Iran\nGolestan Province | country | Iran\nGolestan Province | capital | Gorgan\nGolestan Province | location relative to Caspian Sea | south\nGolestan Province | location | north-east of Iran",
      "answer": "Golestan Province",
      "reasoning": "The context information provides the location of Voshmgir District as \"Aqqala County, Golestan Province, Iran\". Since Golestan Province is a province in Iran, and provinces are typically equivalent to states, we can infer that Voshmgir District is loc

In [None]:
df['mhqa_result'] = df.progress_apply(mhqa, axis=1)
df["predicted_answer"] = df["mhqa_result"].map(lambda x: x["answer"] or "N/A")

In [23]:
from bellek.musique.eval import calculate_metrics, compare_answers

comp_df = compare_answers(df)
scores = calculate_metrics(comp_df)
scores["fuzzy_match"] = comp_df["fuzzy_match"].mean()
jprint(scores)

{
  "exact_match": 0.4,
  "f1": 0.48717421467421473,
  "fuzzy_match": 0.485
}


## Inspect

In [24]:
i = 4
example = df.iloc[i]
example

id                                                      2hop__809785_606637
paragraphs                [{'idx': 0, 'title': 'The Main Attraction (alb...
question                  What record label does the performer of Advent...
question_decomposition    [{'id': 809785, 'question': 'Adventures in You...
answer                                                  Secret City Records
answer_aliases                                        [Secret City Records]
answerable                                                             True
triplets                  [[Adventures in Your Own Backyard, type, album...
mhqa_result               {'answer': 'Secret City Records', 'hops': [{'q...
predicted_answer                                        Secret City Records
exact_match                                                            True
fuzzy_match                                                            True
Name: 4, dtype: object

In [25]:
example['question_decomposition']

[{'id': 809785,
  'question': 'Adventures in Your Own Backyard >> performer',
  'answer': 'Patrick Watson',
  'paragraph_support_idx': 15},
 {'id': 606637,
  'question': '#1 >> record label',
  'answer': 'Secret City Records',
  'paragraph_support_idx': 18}]

In [26]:
example['answer_aliases']

['Secret City Records']

In [27]:
example['mhqa_result']

{'answer': 'Secret City Records',
 'hops': [{'question': 'Adventures in Your Own Backyard >> performer',
   'query': 'Adventures in Your Own Backyard >> performer',
   'context': 'Adventures in Your Own Backyard | type | album\nAdventures in Your Own Backyard | artist | Patrick Watson\nAdventures in Your Own Backyard | release date | April 2012\nAdventures in Your Own Backyard | genre | musical style (simpler and more emotional)\nFireweed | music video | features live action and animation\nFireweed | album single release date | (unknown)\nBeijing | album performance | on CBC Radio\'s "Q" radio show\nTracy\'s Waters | album single release date | 2009-03-05\nPatrick Watson | album label | Secret City Records\nPatrick Watson | album release date | 2009-04-28',
   'answer': 'Patrick Watson',
   'reasoning': 'The query "Adventures in Your Own Backyard >> performer" is asking about the performer associated with the album "Adventures in Your Own Backyard". From the context information, we can