# MuSiQue multi-hop baseline

In [None]:
#|default_exp musique.multihopjerx

In [None]:
#|hide
from fastcore.test import *
from nbdev.showdoc import *

In [None]:
#|export

import json
from typing import Callable

import pandas as pd
from tqdm.auto import tqdm

from bellem.jerx.reward.llm import QuestionAnsweringResult
from bellem.musique.eval import calculate_metrics, compare_answers

tqdm.pandas()

In [None]:
# |export


def make_docs(example):
    """Yield one document dict per paragraph of a MuSiQue example.

    Each yielded dict holds the paragraph rendered as a ``# <title>`` heading
    line followed by the paragraph text (key ``text``), the paragraph's
    ``is_supporting`` flag, the id of the example it came from
    (``parent_id``) and the paragraph's own ``idx``.
    """
    for para in example["paragraphs"]:
        position = para["idx"]
        heading, content = para["title"], para["paragraph_text"]
        yield {
            "text": f"# {heading}\n{content}",
            "is_supporting": para["is_supporting"],
            "parent_id": example["id"],
            "idx": position,
        }

In [None]:
#|export
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/qa.llm.ipynb.

import json
import os

from openai import OpenAI
from pydantic import BaseModel, Field

# System prompt: tells the model to answer from the supplied context only and
# to reply with a single-line JSON object carrying "reasoning" and "answer"
# keys -- the same keys make_question_answer_func reads from the parsed output.
DEFAULT_SYSTEM_PROMPT = """You are an excellent Q&A system that is trusted around the world. Always answer the question using the provided context information, and not prior knowledge.

Some rules to follow:
1. Never directly reference the given context in your answer.
2. Avoid statements like 'Based on the context, ...' or 'The context information ...' or anything along those lines.

Output format:
Your output must be a single line in JSON such as:
{"reasoning": "Provide step by step reasoning for the answer.", "answer": "Provide the final answer in 2-4 words."}
"""

# User prompt template; the `{context}` and `{question}` placeholders are
# filled by make_qa_chat via str.format.
# NOTE(review): the wording describes the context as knowledge-graph triplets,
# while BaselineMultiHop in this file passes paragraph text ("# title" + body)
# as context -- confirm the wording is intended for this baseline.
USER_PROMPT = """The context information below is provided as a set of entity-relation-entity triplets from knowledge graph.
---------------------
{context}
---------------------
Given the context information and not prior knowledge, answer the question.
{question}
"""

def make_qa_chat(context: str, question: str) -> list[dict]:
    """Build the chat messages for one question-answering request.

    Returns a two-element list: the system message carrying
    `DEFAULT_SYSTEM_PROMPT`, followed by the user message produced by
    filling `USER_PROMPT` with `context` and `question`.
    """
    system_message = {"role": "system", "content": DEFAULT_SYSTEM_PROMPT}
    user_message = {
        "role": "user",
        "content": USER_PROMPT.format(context=context, question=question),
    }
    return [system_message, user_message]

# %% ../../nbs/qa.llm.ipynb 6
def parse_llm_generation(output: str):
    """Decode the model's raw text completion as JSON and return the result.

    Raises whatever `json.loads` raises when `output` is not valid JSON.
    """
    parsed = json.loads(output)
    return parsed

# %% ../../nbs/qa.llm.ipynb 7
class QuestionAnsweringResult(BaseModel):
    """Data model for answering the question."""

    # All three fields are declared without a default value.
    # `reasoning` and `answer` are filled by make_question_answer_func from the
    # "reasoning" and "answer" keys of the parsed model output.
    reasoning: str = Field(description="Concise reasoning for the answer.")
    answer: str = Field(description="The answer to the question in 2-4 words.")
    # make_question_answer_func stores the unparsed completion text here.
    raw_output: str = Field(description="The raw output from the model.")


def make_question_answer_func(
    model_name: str = "gpt-3.5-turbo",
    client: OpenAI = None,
    completion_kwargs: dict | None = None,
):
    """Create a `(context, question) -> QuestionAnsweringResult` callable.

    Args:
        model_name: Model identifier passed to the chat completion call.
        client: OpenAI client to use; a new `OpenAI()` is created when omitted.
        completion_kwargs: Extra keyword arguments forwarded to
            `client.chat.completions.create`; defaults to none.

    Returns:
        A function that builds the chat with `make_qa_chat`, requests a
        completion, parses the first choice's text with
        `parse_llm_generation`, and wraps the parsed "answer" and "reasoning"
        together with the raw text in a `QuestionAnsweringResult`.
    """
    llm_client = OpenAI() if client is None else client
    extra_kwargs = {} if completion_kwargs is None else completion_kwargs

    def func(context: str, question: str) -> QuestionAnsweringResult:
        response = llm_client.chat.completions.create(
            model=model_name,
            messages=make_qa_chat(context, question),
            **extra_kwargs,
        )
        # Only the first choice is used.
        raw_text = response.choices[0].message.content
        parsed = parse_llm_generation(raw_text)
        return QuestionAnsweringResult(
            answer=parsed["answer"],
            reasoning=parsed["reasoning"],
            raw_output=raw_text,
        )

    return func


In [None]:
#|export

class BaselineMultiHop:
    """Two-hop question answering baseline driven by the gold decomposition.

    The first two entries of the example's ``question_decomposition`` are
    answered in order; the ``#1`` placeholder in the second sub-question is
    replaced by the answer to the first one.

    Args:
        qa_func: Callable invoked as ``qa_func(context=..., question=...)``
            returning an object with ``answer`` and ``reasoning`` attributes.
        retrieval_func: Callable invoked as ``retrieval_func(docs, query)``
            returning the documents (dicts with a ``text`` key) to use as
            context for that query.
    """

    def __init__(self, qa_func, retrieval_func):
        self.qa_func = qa_func
        self.retrieval_func = retrieval_func

    def _answer_hop(self, docs, question: str) -> dict:
        """Retrieve context for `question`, answer it, and return the hop record."""
        # The sub-question itself is used as the retrieval query.
        query = question
        retrieved = self.retrieval_func(docs, query)
        context = "\n".join(doc["text"] for doc in retrieved)
        result = self.qa_func(context=context, question=question)
        return {
            "question": question,
            "query": query,
            "context": context,
            "answer": result.answer,
            "reasoning": result.reasoning,
        }

    def _call(self, example) -> QuestionAnsweringResult:
        """Run both hops on `example` and return the final result.

        The returned ``raw_output`` is the JSON-encoded list of hop records
        (question, query, context, answer, reasoning) that were executed.
        """
        docs = list(make_docs(example))
        decomposition = example["question_decomposition"]

        # First question
        hop1 = self._answer_hop(docs, decomposition[0]["question"])

        # Without a first-hop answer there is nothing to substitute into the
        # second question, so stop early. The result is built with keyword
        # arguments and a JSON `raw_output`, like the two-hop result below.
        if hop1["answer"] == "N/A":
            return QuestionAnsweringResult(
                answer="N/A",
                reasoning=hop1["reasoning"],
                raw_output=json.dumps([hop1]),
            )

        # Second question: substitute the first answer for the "#1" reference.
        question2 = decomposition[1]["question"].replace("#1", hop1["answer"])
        hop2 = self._answer_hop(docs, question2)
        return QuestionAnsweringResult(
            answer=hop2["answer"],
            reasoning=hop2["reasoning"],
            raw_output=json.dumps([hop1, hop2]),
        )

    def __call__(self, example, ignore_errors: bool = False) -> QuestionAnsweringResult:
        """Answer `example`, optionally converting failures into an N/A result.

        Args:
            example: Mapping with ``id``, ``paragraphs`` and
                ``question_decomposition`` entries.
            ignore_errors: When true, any exception raised while answering is
                printed and an ``"N/A"`` result whose ``raw_output`` is the
                exception text is returned; otherwise the exception propagates.
        """
        try:
            output = self._call(example)
        except Exception as exc:
            if not ignore_errors:
                raise
            example_id = example["id"]
            print(f"Failed to answer the question {example_id}\n{exc}")
            output = QuestionAnsweringResult(reasoning="", answer="N/A", raw_output=str(exc))
        return output

In [None]:
#|export

def benchmark(
    dataf: pd.DataFrame,
    qa_func: Callable,
    retrieval_func: Callable,
    ignore_errors: bool = False,
) -> tuple[pd.DataFrame, dict]:
    """Run the multi-hop baseline over every row of `dataf` and score it.

    Args:
        dataf: One example per row.
        qa_func: Question-answering callable handed to `BaselineMultiHop`.
        retrieval_func: Retrieval callable handed to `BaselineMultiHop`.
        ignore_errors: Forwarded to the pipeline call for each row.

    Returns:
        The frame returned by `compare_answers` (after `predicted_answer` and
        `raw_llm_output` were added to each row) and the scores from
        `calculate_metrics`, extended with the mean of the `fuzzy_match`
        column under the key ``"fuzzy_match"``.
    """
    multihop = BaselineMultiHop(qa_func, retrieval_func)

    def answer_row(row):
        prediction = multihop(row, ignore_errors=ignore_errors)
        row["predicted_answer"] = prediction.answer
        # The whole result object is stored, not just its raw_output string.
        row["raw_llm_output"] = prediction
        return row

    answered = dataf.progress_apply(answer_row, axis=1)
    compared = compare_answers(answered)
    metrics = calculate_metrics(compared)
    metrics["fuzzy_match"] = compared["fuzzy_match"].mean()
    return compared, metrics

In [None]:
# Load the evaluation set from a JSON Lines file (one record per line),
# report how many examples it holds, and preview the first rows.
df = pd.read_json('../data/generated/musique-evaluation/dataset.jsonl', orient='records', lines=True)
print(len(df))
df.head()

100


Unnamed: 0,id,paragraphs,question,question_decomposition,answer,answer_aliases,answerable,answers
0,2hop__131818_161450,"[{'idx': 0, 'title': 'Maria Carrillo High Scho...",Where is the Voshmgir District located?,"[{'id': 131818, 'question': 'Which state is Vo...",in the north-east of the country south of the ...,"[Caspian Sea, in the north-east of the country...",True,"[Caspian Sea, in the north-east of the country..."
1,2hop__444265_82341,"[{'idx': 0, 'title': 'Ocala, Florida', 'paragr...",In what part of Florida is Tom Denney's birthp...,"[{'id': 444265, 'question': 'Tom Denney >> pla...",in Northern Florida,"[in Northern Florida, Northern Florida]",True,"[in Northern Florida, Northern Florida]"
2,2hop__711946_269414,"[{'idx': 0, 'title': 'Wild Thing (Tone Lōc son...",What record label is the performer who release...,"[{'id': 711946, 'question': 'All Your Faded Th...",Kill Rock Stars,[Kill Rock Stars],True,[Kill Rock Stars]
3,2hop__311931_417706,"[{'idx': 0, 'title': 'The Main Attraction (alb...",What record label does the performer of Emotio...,"[{'id': 311931, 'question': 'Emotional Rain >>...",Attic Records,"[Attic, Attic Records]",True,"[Attic, Attic Records]"
4,2hop__809785_606637,"[{'idx': 0, 'title': 'The Main Attraction (alb...",What record label does the performer of Advent...,"[{'id': 809785, 'question': 'Adventures in You...",Secret City Records,[Secret City Records],True,[Secret City Records]


In [None]:
from bellem.qa.llm import make_question_answer_func

# QA function built with make_question_answer_func's default arguments.
qa_func = make_question_answer_func()
# Identity "retrieval": every document of the example is returned, whatever the query.
retrieval_func = lambda docs, query: docs
pipeline = BaselineMultiHop(qa_func, retrieval_func)

In [None]:
# Run the pipeline on a single example (row `i`) and show the prediction
# next to the reference answer.
i = 1
example = df.iloc[i].to_dict()
# Convert the result model to a plain dict for key-based access below.
output = pipeline(example).dict()
print("Question:", example['question'])
print("Reference answer:", example['answer'])
print("Predicted answer:", output['answer'])
print("Reasoning:", output['reasoning'])

Question: In what part of Florida is Tom Denney's birthplace located?
Reference answer: in Northern Florida
Predicted answer: Northern Florida
Reasoning: Ocala is located in Northern Florida.


In [None]:
# `raw_output` holds the JSON-encoded list of per-hop records; decode it to inspect each hop.
json.loads(output['raw_output'])

[{'question': 'Tom Denney >> place of birth',
  'query': 'Tom Denney >> place of birth',
  'context': '# Ocala, Florida\nOcala (/ oʊ ˈkælə / oh - KAL - ə) is a city located in Northern Florida. As of the 2013 census, its population, estimated by the United States Census Bureau, was 57,468, making it the 45th most populated city in Florida.\n# Jasmine Estates, Florida\nJasmine Estates is a census-designated place (CDP) in Pasco County, Florida, United States. The population was 18,989 at the 2010 census.\n# Ridgecrest, Florida\nRidgecrest is a census-designated place (CDP) in Pinellas County, Florida, United States. The population was 2,558 at the 2010 census.\n# Villas, Florida\nVillas is a census designated place (CDP) in Lee County, Florida, United States. The population was 11,346 at the 2000 census. It is part of the Cape Coral-Fort Myers, Florida Metropolitan Statistical Area.\n# Lacoochee, Florida\nLacoochee is a census-designated place (CDP) in Pasco County, Florida, United Stat

In [None]:
# Smoke-test `benchmark` on two randomly sampled examples.
mdf, scores = benchmark(df.sample(2), qa_func, retrieval_func)
print(scores)
mdf

  0%|          | 0/2 [00:00<?, ?it/s]

{'exact_match': 0.0, 'f1': 0.0, 'fuzzy_match': 0.0}


Unnamed: 0,id,paragraphs,question,question_decomposition,answer,answer_aliases,answerable,answers,predicted_answer,raw_llm_output,exact_match,fuzzy_match
28,2hop__723101_150107,"[{'idx': 0, 'title': 'Bruce Chandler', 'paragr...",Who published Communication of the association...,"[{'id': 723101, 'question': 'W. Bruce Croft >>...",Association for Computing Machinery,"[ACM, Association for Computing Machinery]",True,"[ACM, Association for Computing Machinery]",W. Bruce Croft,reasoning='Based on the entities mentioned in ...,False,False
98,2hop__342746_679190,"[{'idx': 0, 'title': 'Adam's Rib', 'paragraph_...",Who is the spouse of the screenwriter of The A...,"[{'id': 342746, 'question': 'The Actress >> sc...",Garson Kanin,[Garson Kanin],True,[Garson Kanin],Not mentioned,reasoning='Only spouses of notable individuals...,False,False


In [None]:
#|hide
# Run nbdev's export step for this notebook project.
import nbdev; nbdev.nbdev_export()