In [None]:
# !pip install magentic

In [1]:
from dotenv import load_dotenv
# Load settings from a .env file into the process environment (python-dotenv).
# NOTE(review): no cell visible here reads an environment variable directly — the
# model client is constructed with an explicit base_url/api_key — confirm this
# call is still needed.
load_dotenv()

True

In [2]:
from pathlib import Path

import pandas as pd

from bellek.text.utils import fuzzy_match
from bellek.utils import set_seed, jprint

# Fix the random seed for the run. Which generators get seeded is decided by
# bellek.utils.set_seed, whose implementation is not shown here.
set_seed(89)

In [3]:
# Load the evaluation examples and the separately generated question
# decompositions (both JSON Lines files), then join them on the example id.
ds_df = pd.read_json(
    '../../data/generated/musique-evaluation/dataset.jsonl',
    orient='records',
    lines=True,
)
qd_df = pd.read_json(
    '../../data/generated/musique-evaluation/question-decomposition.jsonl',
    orient='records',
    lines=True,
)
# `question` and `question_decomposition` are dropped from the dataset side so
# that the versions coming from the decomposition file are the ones kept.
df = ds_df.drop(columns=['question', 'question_decomposition']).merge(
    qd_df,
    on='id',
    suffixes=('', ''),
)
df.head()

Unnamed: 0,id,paragraphs,answer,answer_aliases,answerable,question,question_decomposition
0,2hop__131818_161450,"[{'idx': 0, 'title': 'Maria Carrillo High Scho...",in the north-east of the country south of the ...,[in the north-east of the country south of the...,True,Where is the Voshmgir District located?,[{'question': 'Which country is the Voshmgir D...
1,2hop__444265_82341,"[{'idx': 0, 'title': 'Ocala, Florida', 'paragr...",in Northern Florida,"[Northern Florida, in Northern Florida]",True,In what part of Florida is Tom Denney's birthp...,[{'question': 'Where is Tom Denney's birthplac...
2,2hop__711946_269414,"[{'idx': 0, 'title': 'Wild Thing (Tone Lōc son...",Kill Rock Stars,[Kill Rock Stars],True,What record label is the performer who release...,[{'question': 'Who is the performer that relea...
3,2hop__311931_417706,"[{'idx': 0, 'title': 'The Main Attraction (alb...",Attic Records,"[Attic, Attic Records]",True,What record label does the performer of Emotio...,[{'question': 'Who is the performer of Emotion...
4,2hop__809785_606637,"[{'idx': 0, 'title': 'The Main Attraction (alb...",Secret City Records,[Secret City Records],True,What record label does the performer of Advent...,[{'question': 'Who is the performer of Adventu...


In [4]:
def make_docs(example, only_supporting=False):
    """Yield one document dict per paragraph of an example.

    Args:
        example: Mapping with an "id" and a "paragraphs" sequence; each
            paragraph provides "idx", "title", "paragraph_text" and
            "is_supporting".
        only_supporting: When true, paragraphs whose "is_supporting" value is
            falsy are skipped.

    Yields:
        dict: ``{"text": ..., "metadata": ...}``. ``text`` is the title written
        as a ``# `` heading line followed by the paragraph body; ``metadata``
        holds the parent example id, the paragraph index and the supporting
        flag.
    """
    for paragraph in example["paragraphs"]:
        if only_supporting and not paragraph["is_supporting"]:
            continue
        idx = paragraph["idx"]
        title, body = paragraph["title"], paragraph["paragraph_text"]
        supporting_flag = paragraph["is_supporting"]
        text = f"# {title}\n{body}"
        metadata = dict(parent_id=example["id"], idx=idx, is_supporting=supporting_flag)
        yield {"text": text, "metadata": metadata}

In [5]:
def present_example(example, predicted_answer):
    """Print a question, its reference and predicted answers, and its paragraphs.

    Args:
        example: Mapping with "question", "answer" and "paragraphs" (each
            paragraph providing "paragraph_text").
        predicted_answer: Value shown on the "Predicted Answer:" line.
    """
    # Assembled first, so a malformed paragraph fails before anything is printed.
    joined_paragraphs = "\n\n".join(p["paragraph_text"] for p in example['paragraphs'])
    heavy_rule = "=" * 80
    light_rule = "-" * 80

    print(heavy_rule)
    print("Question:", example["question"])
    print("Reference Answer:", example['answer'])
    print("Predicted Answer:", predicted_answer)
    print(light_rule)
    print("Paragraphs")
    print(joined_paragraphs)

In [13]:
def format_question(example):
    """Return the query text for an example: its "question" value, unchanged."""
    question = example['question']
    return question

In [23]:
from magentic import chatprompt, SystemMessage, UserMessage
from pydantic import BaseModel, Field

# Structured answer schema: free-text reasoning plus the final answer string.
# NOTE(review): no cell visible here references this model — `answer_question`
# is annotated to return a plain `str` — so it looks unused; confirm before
# relying on it.
class Output(BaseModel):
    """Output containing the answers for questions."""
    reasoning: str = Field(description="Multi-hop reasoning for the answer.")
    answer: str


# System message: tells the model to answer only from the supplied context,
# to keep the answer 2-4 words long, and to reply in a "Reason:... / Answer:..."
# layout. The `answer` function in this notebook extracts the prediction from
# the reply line that starts with "Answer", so the layout and the parser must
# stay in sync.
SYSTEM_PROMPT = """You are an expert Q&A system that is trusted around the world. You are given a question that requires multi-hop reasoning. Always answer the query using the provided context information, and not prior knowledge.
Some rules to follow:
1. Never directly reference the given context in your answer.
2. Avoid statements like 'Based on the context, ...' or 'The context information ...' or anything along those lines.
3. Your answer must be 2-4 words long.

Response Format:
Reason:...
Answer:...
"""

# User message template. The `{context_str}` and `{query_str}` placeholders
# match the parameter names of `answer_question`; presumably they are filled
# from the arguments of the same name — confirm against the magentic docs.
USER_PROMPT = """Context information is below.
---------------------
{context_str}
---------------------
Given the context information and not prior knowledge, answer the question.
{query_str}
"""

# Prompt-function stub: the body is intentionally `...`. The `chatprompt`
# decorator receives the system and user templates; presumably it formats them
# with the call's arguments and returns the model's reply as `str` (per the
# return annotation) — see the magentic documentation to confirm.
@chatprompt(
    SystemMessage(SYSTEM_PROMPT),
    UserMessage(USER_PROMPT),
)
def answer_question(context_str: str, query_str: str) -> str: ...


In [24]:
from magentic import OpenaiChatModel

# Chat model that prompt functions use when called inside `with llm:`.
# It points at a server on localhost; the api_key value looks like a
# placeholder for that local server rather than a real credential — confirm.
llm_settings = dict(
    model="meta-llama/Meta-Llama-3-70B-Instruct",
    temperature=0.0,
    base_url="http://127.0.0.1:8080/v1",
    api_key="anything",
)
llm = OpenaiChatModel(**llm_settings)

In [28]:
def _parse_answer(text):
    """Extract the answer from a "Reason:... / Answer:..." style reply.

    Args:
        text: Raw model reply.

    Returns:
        The stripped text after the first colon of the last line that starts
        with "answer" (case-insensitive, leading whitespace ignored), or None
        when no such line contains a colon.
    """
    parsed = None
    for line in text.splitlines():
        candidate = line.strip()
        if not candidate.lower().startswith("answer"):
            continue
        # Split on the FIRST colon only, so answers that contain a colon
        # themselves (times, ratios, titles) are kept whole.
        _, colon, value = candidate.partition(":")
        if colon:  # an "Answer ..." line without a colon carries no parsable value
            parsed = value.strip()
    return parsed


def answer(example):
    """Ask the model to answer one example and record the result on it.

    The context is built from all paragraphs of the example (supporting or
    not), each rendered by `make_docs`, separated by blank lines.

    Args:
        example: Mapping (dict or DataFrame row) accepted by `make_docs` and
            `format_question`. It is modified in place.

    Returns:
        The same `example`, with two keys set: 'predicted_answer' (parsed
        answer text, or None if the reply has no parsable "Answer:" line) and
        'raw_llm_output' (the unmodified model reply).
    """
    documents = list(make_docs(example, only_supporting=False))
    context = "\n\n".join(doc["text"] for doc in documents)
    with llm:
        text = answer_question(context_str=context, query_str=format_question(example))
    example['predicted_answer'] = _parse_answer(text)
    example['raw_llm_output'] = text
    return example

In [29]:
def safe_answer(example):
    """Run `answer` on one example without letting a failure abort a batch run.

    Args:
        example: Mapping (dict or DataFrame row) with at least an 'id'; passed
            on to `answer`.

    Returns:
        The result of `answer(example)`. If that raises, the failure is
        printed and `example` is returned with 'predicted_answer' and
        'raw_llm_output' both set to None.
    """
    try:
        return answer(example)
    except Exception as exc:  # deliberate best-effort: one bad example must not stop the batch
        # Named `example_id` rather than `id` so the builtin is not shadowed.
        example_id = example['id']
        # Include the exception type: some exceptions (e.g. KeyError) stringify
        # to just the offending value, which is unreadable on its own.
        print(f"Failed to answer the question {example_id}\n{type(exc).__name__}: {exc}")
        example['predicted_answer'] = None
        example['raw_llm_output'] = None
        return example

In [30]:
# Smoke test on a single example before running the whole dataset.
# `answer` writes its results into the dict it is given and returns that same
# dict, so `example` and `example_` refer to one object.
i = 0
example = df.iloc[i].to_dict()
example_ = answer(example)
for label, record, key in (
    ("Question:", example, 'question'),
    ("Reference answer:", example, 'answer'),
    ("Predicted answer:", example_, 'predicted_answer'),
):
    print(label, record[key])

print("-" * 20)
jprint(example_['raw_llm_output'])

Question: Where is the Voshmgir District located?
Reference answer: in the north-east of the country south of the Caspian Sea
Predicted answer: Iran
--------------------
"Reason: Voshmgir District is mentioned as a district in Aqqala County, Golestan Province, Iran.\nAnswer: Iran"


In [31]:
# Answer every row of the dataset. `safe_answer` records a failed row as None
# instead of raising, so a single bad example cannot abort the run.
df = df.apply(safe_answer, axis="columns")

## Evaluation

In [39]:
from bellek.musique.eval import calculate_metrics, compare_answers

# Rows without a usable prediction (None or empty string) are scored as "N/A".
# NOTE(review): `or` would not replace NaN (it is truthy); fine as long as
# failures are stored as None — confirm.
df['predicted_answer'] = df['predicted_answer'].map(lambda predicted: predicted or "N/A")
df = compare_answers(df)

# Aggregate metrics, plus the mean of the per-row `fuzzy_match` column
# (presumably added by `compare_answers` — it is not created in this notebook).
scores = calculate_metrics(df)
scores['fuzzy_match'] = df['fuzzy_match'].mean()
print(scores)

{'exact_match': 0.5, 'f1': 0.582357142857143, 'fuzzy_match': 0.61}


In [37]:
# Persist the per-example results as JSON Lines.
# NOTE(review): the file name says "supporting-only", but `answer` builds its
# context with `only_supporting=False` (all paragraphs) — confirm which one is
# intended before comparing this file with other baselines.
# NOTE(review): this cell's execution count (37) is lower than the evaluation
# cell's (39), so the saved file may predate the last evaluation run; re-run
# the notebook top to bottom before relying on it.
df.to_json(
    '../../data/generated/musique-evaluation/baseline-context-supporting-only-q-llama.jsonl',
    orient='records',
    lines=True,
)