In [1]:
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
import os
import warnings
from pathlib import Path

import pandas as pd

from bellek.text.utils import fuzzy_match
from bellek.utils import set_seed, jprint

set_seed(41)

In [3]:
ds_df = pd.read_json('../../data/generated/musique-evaluation/dataset.jsonl', orient='records', lines=True)
qd_df = pd.read_json('../../data/generated/musique-evaluation/question-decomposition.jsonl', orient='records', lines=True)
df = pd.merge(ds_df.drop(columns=['question', 'question_decomposition']), qd_df, on='id', suffixes=('', ''))
df.head()

Unnamed: 0,id,paragraphs,answer,answer_aliases,answerable,question,question_decomposition
0,2hop__131818_161450,"[{'idx': 0, 'title': 'Maria Carrillo High Scho...",in the north-east of the country south of the ...,[in the north-east of the country south of the...,True,Where is the Voshmgir District located?,[{'question': 'Which country is the Voshmgir D...
1,2hop__444265_82341,"[{'idx': 0, 'title': 'Ocala, Florida', 'paragr...",in Northern Florida,"[Northern Florida, in Northern Florida]",True,In what part of Florida is Tom Denney's birthp...,[{'question': 'Where is Tom Denney's birthplac...
2,2hop__711946_269414,"[{'idx': 0, 'title': 'Wild Thing (Tone Lōc son...",Kill Rock Stars,[Kill Rock Stars],True,What record label is the performer who release...,[{'question': 'Who is the performer that relea...
3,2hop__311931_417706,"[{'idx': 0, 'title': 'The Main Attraction (alb...",Attic Records,"[Attic, Attic Records]",True,What record label does the performer of Emotio...,[{'question': 'Who is the performer of Emotion...
4,2hop__809785_606637,"[{'idx': 0, 'title': 'The Main Attraction (alb...",Secret City Records,[Secret City Records],True,What record label does the performer of Advent...,[{'question': 'Who is the performer of Adventu...


In [4]:
def make_docs(example, only_supporting=False):
    ps = example["paragraphs"]
    for p in ps:
        if only_supporting and not p["is_supporting"]:
            continue
        idx = p["idx"]
        title = p["title"]
        body = p["paragraph_text"]
        is_supporting = p["is_supporting"]
        text = f"# {title}\n{body}"
        yield dict(
            text=text,
            metadata={"parent_id": example["id"], "idx": idx, "is_supporting": is_supporting},
        )

In [5]:
def present_example(example, predicted_answer):
    text = "\n\n".join([p["paragraph_text"] for p in example['paragraphs']])
    print("="*80)
    print("Question:", example["question"])
    print("Reference Answer:", example['answer'])
    print("Predicted Answer:", predicted_answer)
    print("-"*80)
    print("Paragraphs")
    print(text)

In [6]:
from magentic import chatprompt, SystemMessage, UserMessage
from pydantic import BaseModel, Field

class Output(BaseModel):
    """Output containing the answers for questions."""
    reasoning: str = Field(description="Multi-hop reasoning for the answer.")
    answer: str


SYSTEM_PROMPT = """You are an expert multi-hop Q&A system that is trusted around the world.
You are given a question and its decomposition to sub-questions. You reach to the answer by answering each sub-question. Always answer the query using the provided context information, and not prior knowledge.
Some rules to follow:
1. Never directly reference the given context in your answer.
2. Avoid statements like 'Based on the context, ...' or 'The context information ...' or anything along those lines.
3. Your answers must be 2-4 words long.

Response Format:
Reasoning: ...
Answer for sub-question 1: ...
Final Answer: ...
"""

USER_PROMPT = """Context information is below.
---------------------
{context_str}
---------------------
Given the context information and not prior knowledge, answer the question.
{query_str}
"""

@chatprompt(
    SystemMessage(SYSTEM_PROMPT),
    UserMessage(USER_PROMPT),
)
def answer_question(context_str: str, query_str: str) -> str: ...


In [7]:
from magentic import OpenaiChatModel

llm = OpenaiChatModel(
    model="meta-llama/Meta-Llama-3-70B-Instruct",
    temperature=0.0,
    base_url="http://127.0.0.1:8080/v1",
    api_key="anything",
)

In [8]:
def format_question(example):
    sub_questions = '\n'.join([f"  Sub-question {i+1}: {item['question']}" for i, item in enumerate(example['question_decomposition'])])
    return f"\n{sub_questions}"

In [9]:
def answer(example):
    documents = list(make_docs(example, only_supporting=False))
    context = "\n\n".join([doc["text"] for doc in documents])
    with llm:
        text = answer_question(context_str=context, query_str=format_question(example))
    lines = text.splitlines()
    answer = None
    for line in lines:
        if line.lower().startswith("final answer"):
            answer = line.split(":")[1].strip()
    example['predicted_answer'] = answer
    example['raw_llm_output'] = text
    return example

In [10]:
def safe_answer(example):
    try:
        return answer(example)
    except Exception as exc:
        id = example['id']
        print(f"Failed to answer the question {id}\n{exc}")
        example['predicted_answer'] = ''
        example['raw_llm_output'] = ''
        return example

In [11]:
# from langchain.globals import set_debug
# set_debug(True)

# i = 0
# example = df.iloc[i].to_dict()
# example_ = answer(example)
# print("Question:", example['question'])
# print("Reference answer:", example['answer'])
# print("Predicted answer:", example_['predicted_answer'])

# print("-"*20)
# jprint(example_['raw_llm_output'])

# set_debug(False)

In [12]:
df = df.apply(safe_answer, axis=1)

Failed to answer the question 2hop__398344_85513
list index out of range


## Evaluation

In [20]:
from bellek.musique.eval import calculate_metrics, compare_answers

df['predicted_answer'] = df['predicted_answer'].map(lambda x: x or "N/A")
df = compare_answers(df)
scores = calculate_metrics(df)
scores['fuzzy_match'] = df['fuzzy_match'].mean()
print(scores)

{'exact_match': 0.05, 'f1': 0.1432119305268393, 'fuzzy_match': 0.18}


In [19]:
df

Unnamed: 0,id,paragraphs,answer,answer_aliases,answerable,question,question_decomposition,predicted_answer,raw_llm_output,exact_match,fuzzy_match
0,2hop__131818_161450,"[{'idx': 0, 'title': 'Maria Carrillo High Scho...",in the north-east of the country south of the ...,[in the north-east of the country south of the...,True,Where is the Voshmgir District located?,[{'question': 'Which country is the Voshmgir D...,Not applicable,Reasoning: \nThe context information provides ...,False,False
1,2hop__444265_82341,"[{'idx': 0, 'title': 'Ocala, Florida', 'paragr...",in Northern Florida,"[Northern Florida, in Northern Florida]",True,In what part of Florida is Tom Denney's birthp...,[{'question': 'Where is Tom Denney's birthplac...,Northern Florida,Reasoning: \nTom Denney is an American musicia...,True,True
2,2hop__711946_269414,"[{'idx': 0, 'title': 'Wild Thing (Tone Lōc son...",Kill Rock Stars,[Kill Rock Stars],True,What record label is the performer who release...,[{'question': 'Who is the performer that relea...,,Reasoning: \nThe context information provides ...,False,True
3,2hop__311931_417706,"[{'idx': 0, 'title': 'The Main Attraction (alb...",Attic Records,"[Attic, Attic Records]",True,What record label does the performer of Emotio...,[{'question': 'Who is the performer of Emotion...,,Reasoning: \nThe context information provides ...,False,True
4,2hop__809785_606637,"[{'idx': 0, 'title': 'The Main Attraction (alb...",Secret City Records,[Secret City Records],True,What record label does the performer of Advent...,[{'question': 'Who is the performer of Adventu...,,Reasoning: \nThe context information provides ...,False,True
...,...,...,...,...,...,...,...,...,...,...,...
195,2hop__123166_47134,"[{'idx': 0, 'title': 'Jesse Alexander', 'parag...",Benny Beaver,[Benny Beaver],True,What is the mascot of William J. Ripple's univ...,[{'question': 'What is the name of William J. ...,Oregon State University and Benny Beaver,Reasoning: \nWe are looking for the university...,False,True
196,2hop__312223_144857,"[{'idx': 0, 'title': 'Steve Somers', 'paragrap...",Clatskanie,"[Clatskanie, Oregon, Clatskanie]",True,What city was the author of What We Talk About...,[{'question': 'Who is the author of What We Ta...,,Reasoning: \nThe context information provides ...,False,True
197,2hop__51965_165532,"[{'idx': 0, 'title': 'The Big Arvo', 'paragrap...",Sherry Boucher,[Sherry Boucher],True,Who is the spouse of the actor who plays Paul ...,[{'question': 'Who plays Paul in Breakfast at ...,,Reasoning: \nWe are looking for the actor who ...,False,True
198,2hop__168816_144857,"[{'idx': 0, 'title': 'Minstrel Point', 'paragr...",Clatskanie,"[Clatskanie, Oregon, Clatskanie]",True,In what city was the author of Elephant born?,[{'question': 'Who is the author of Elephant?'...,,Reasoning: \nThe context information provides ...,False,True
