In [1]:
from dotenv import load_dotenv
# Load variables from a .env file into the process environment before any later
# cell reads them. NOTE(review): presumably this is where the OpenAI / W&B
# credentials come from -- confirm against the project's .env template.
load_dotenv()

True

In [2]:
import warnings
from pathlib import Path

import pandas as pd

from bellek.text.utils import fuzzy_match
from bellek.utils import set_seed, jprint

# Fix the random seed once, up front, so repeated runs are comparable.
# NOTE(review): which RNGs `bellek.utils.set_seed` covers is not visible here.
set_seed(89)

In [3]:
import os
import wandb
from bellek.langchain.obs import patch_wandb_tracer_serialize_io
from bellek.wandb import generate_run_id

# Configure LangChain -> W&B tracing and the W&B run through environment variables.
os.environ["LANGCHAIN_WANDB_TRACING"] = "true"
os.environ["WANDB_PROJECT"] = "thesis-mhqa-baseline-context"
os.environ["WANDB_RUN_ID"] = generate_run_id(8)

# `__vsc_ipynb_file__` is injected into the notebook globals by VS Code only.
# Look it up defensively so the cell also survives "Restart & Run All" in other
# front ends (JupyterLab, nbconvert, papermill) instead of raising KeyError.
notebook_path = globals().get('__vsc_ipynb_file__')
if notebook_path:
    os.environ["WANDB_NOTEBOOK_NAME"] = os.path.basename(notebook_path)

# NOTE(review): `answer()` in this notebook builds the context from every
# paragraph (only_supporting=False), so this note looks stale -- confirm the
# intended wording before the next run.
os.environ["WANDB_NOTES"] = "Model sees only question"

patch_wandb_tracer_serialize_io()

# The freshly generated run id is passed through `resume`, exactly as before.
wandb_run = wandb.init(project=os.environ["WANDB_PROJECT"], resume=os.environ["WANDB_RUN_ID"])

[34m[1mwandb[0m: Currently logged in as: [33mbdsaglam[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [4]:
# Evaluation examples (paragraphs, answers, ...) and the question decompositions,
# both stored as JSON Lines.
ds_df = pd.read_json(
    '../../data/generated/musique-evaluation/dataset.jsonl',
    orient='records',
    lines=True,
)
qd_df = pd.read_json(
    '../../data/generated/musique-evaluation/question-decomposition.jsonl',
    orient='records',
    lines=True,
)

# `question` / `question_decomposition` are dropped from the dataset frame first,
# so after the join on `id` those two columns come from `qd_df`.
df = ds_df.drop(columns=['question', 'question_decomposition']).merge(
    qd_df,
    on='id',
    suffixes=('', ''),
)
df.head()

Unnamed: 0,id,paragraphs,answer,answer_aliases,answerable,question,question_decomposition
0,2hop__131818_161450,"[{'idx': 0, 'title': 'Maria Carrillo High Scho...",in the north-east of the country south of the ...,[Caspian Sea],True,Where is the Voshmgir District located?,[{'question': 'Which country is the Voshmgir D...
1,2hop__444265_82341,"[{'idx': 0, 'title': 'Ocala, Florida', 'paragr...",in Northern Florida,[Northern Florida],True,In what part of Florida is Tom Denney's birthp...,[{'question': 'Where is Tom Denney's birthplac...
2,2hop__711946_269414,"[{'idx': 0, 'title': 'Wild Thing (Tone Lōc son...",Kill Rock Stars,[],True,What record label is the performer who release...,[{'question': 'Who is the performer that relea...
3,2hop__311931_417706,"[{'idx': 0, 'title': 'The Main Attraction (alb...",Attic Records,[Attic],True,What record label does the performer of Emotio...,[{'question': 'Who is the performer of Emotion...
4,2hop__809785_606637,"[{'idx': 0, 'title': 'The Main Attraction (alb...",Secret City Records,[],True,What record label does the performer of Advent...,[{'question': 'Who is the performer of Adventu...


In [5]:
def make_docs(example, only_supporting=False):
    """Lazily turn the paragraphs of one example into document dicts.

    Args:
        example: Mapping with an ``"id"`` and a ``"paragraphs"`` sequence; every
            paragraph provides ``"idx"``, ``"title"``, ``"paragraph_text"`` and
            ``"is_supporting"``.
        only_supporting: When true, paragraphs whose ``"is_supporting"`` value is
            falsy are skipped.

    Yields:
        One dict per kept paragraph with a ``"text"`` entry (title rendered as a
        ``# `` heading line followed by the paragraph body) and a ``"metadata"``
        entry holding ``parent_id``, ``idx`` and ``is_supporting``.
    """
    for para in example["paragraphs"]:
        if only_supporting and not para["is_supporting"]:
            continue
        idx, title, body, supporting = (
            para[key] for key in ("idx", "title", "paragraph_text", "is_supporting")
        )
        yield {
            "text": f"# {title}\n{body}",
            "metadata": {"parent_id": example["id"], "idx": idx, "is_supporting": supporting},
        }

In [6]:
def present_example(example, predicted_answer):
    """Print one example for manual inspection.

    Output order: an ``=`` rule, the question, the reference answer, the
    predicted answer, a ``-`` rule, then every paragraph body separated by a
    blank line.

    Args:
        example: Mapping with ``"question"``, ``"answer"`` and ``"paragraphs"``
            (each paragraph providing ``"paragraph_text"``).
        predicted_answer: The model's answer to display next to the reference.
    """
    # Build the paragraph dump first so a malformed example fails before printing.
    paragraph_bodies = (paragraph["paragraph_text"] for paragraph in example["paragraphs"])
    joined_paragraphs = "\n\n".join(paragraph_bodies)

    heavy_rule, light_rule = "=" * 80, "-" * 80
    print(heavy_rule)
    print("Question:", example["question"])
    print("Reference Answer:", example["answer"])
    print("Predicted Answer:", predicted_answer)
    print(light_rule)
    print("Paragraphs")
    print(joined_paragraphs)

In [7]:
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.chains.openai_functions import create_structured_output_runnable
from langchain_core.pydantic_v1 import BaseModel, Field

# System message: frames the task as context-grounded multi-hop QA and
# constrains the answer to a short (2-4 word) span. This is prompt text sent to
# the model -- any edit changes the experiment.
SYSTEM_PROMPT = """You are an expert Q&A system that is trusted around the world. You are given a question that requires multi-hop reasoning. Always answer the query using the provided context information, and not prior knowledge.
Some rules to follow:
1. Never directly reference the given context in your answer.
2. Avoid statements like 'Based on the context, ...' or 'The context information ...' or anything along those lines.
3. Your answer must be 2-4 words long."""

# User message template. `{context_str}` and `{query_str}` are the two input
# variables supplied when the chain is invoked (see `answer`).
USER_PROMPT = """Context information is below.
---------------------
{context_str}
---------------------
Given the context information and not prior knowledge, answer the question.
{query_str}
"""

# Two-message chat prompt: the fixed system instructions followed by the
# per-example user message.
prompt = ChatPromptTemplate.from_messages([
    ("system", SYSTEM_PROMPT), 
    ("user", USER_PROMPT),
])

# Chat model used for every example. NOTE(review): temperature=0 is presumably
# chosen to keep generations as repeatable as possible -- confirm.
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)

# Schema for the structured answer requested from the model.
# NOTE(review): the class docstring and the Field description below are
# presumably forwarded to the model as part of the schema, so treat them as
# prompt text rather than as documentation -- confirm before rewording.
class Output(BaseModel):
    """Output containing the answers for questions."""
    # Final short answer; `answer()` copies it into `predicted_answer`.
    answer: str
    # Model's explanation; only kept as part of `raw_llm_output`.
    reasoning: str = Field(description="Multi-hop reasoning for the answer.")

# Combine schema, model and prompt into one runnable; `answer()` calls
# `.invoke(...)` on it and converts the result with `.dict()`.
# NOTE(review): the recorded output of this cell shows two `warn_deprecated`
# warnings, so some of the LangChain entry points used here are deprecated --
# check which ones before upgrading the dependency.
chain = create_structured_output_runnable(Output, llm, prompt)

  warn_deprecated(
  warn_deprecated(


In [8]:
def format_question(example):
    """Return the text used as the query for ``example``.

    Currently this is the example's ``"question"`` value, unchanged.
    """
    question = example["question"]
    return question

In [9]:
def answer(example):
    """Answer one example with the chain and record the result on it.

    Every paragraph of the example (supporting or not) is rendered via
    `make_docs` and joined with blank lines to form the context. The example is
    modified in place: ``predicted_answer`` receives the ``"answer"`` field of
    the chain output and ``raw_llm_output`` receives the whole output as a dict.

    Returns:
        The same ``example`` object, with the two fields added.
    """
    context = "\n\n".join(doc["text"] for doc in make_docs(example, only_supporting=False))
    chain_inputs = {"context_str": context, "query_str": format_question(example)}
    llm_output = chain.invoke(chain_inputs).dict()

    example['raw_llm_output'] = llm_output
    example['predicted_answer'] = llm_output.get("answer")
    return example

In [10]:
def safe_answer(example):
    """Best-effort variant of `answer` that never raises.

    If answering fails with any ``Exception``, the failure is printed together
    with the example id and both ``predicted_answer`` and ``raw_llm_output`` are
    set to ``None`` so the row still carries the expected fields.

    Returns:
        The example, either as returned by `answer` or marked as failed.
    """
    try:
        result = answer(example)
    except Exception as exc:
        example_id = example['id']
        print(f"Failed to answer the question {example_id}\n{exc}")
        example['predicted_answer'] = None
        example['raw_llm_output'] = None
        result = example
    return result

In [11]:
# from langchain.globals import set_debug
# set_debug(True)

# i = 0
# example = df.iloc[i].to_dict()
# example_ = answer(example)
# print("Question:", example['question'])
# print("Reference answer:", example['answer'])
# print("Predicted answer:", example_['predicted_answer'])

# print("-"*20)
# jprint(example_['raw_llm_output'])

# set_debug(False)

In [12]:
# Query the model once per row; rows that fail keep `predicted_answer` and
# `raw_llm_output` as None (see `safe_answer`). This is the expensive cell.
df = df.apply(safe_answer, axis="columns")

## Evaluation

In [13]:
import evaluate
import pandas as pd

def calculate_musique_scores(dataf: pd.DataFrame) -> dict:
    """Score predictions with the ``bdsaglam/musique`` metric.

    Args:
        dataf: Frame with ``predicted_answer``, ``answer`` and
            ``answer_aliases`` columns; ``answer_aliases`` holds a list per row.

    Returns:
        The dict produced by the metric's ``compute`` (``exact_match`` and
        ``f1`` in the recorded run of this notebook).
    """
    metric = evaluate.load("bdsaglam/musique")
    # `safe_answer` stores None for examples that could not be answered. Score
    # those as an empty prediction so the metric always receives strings,
    # consistent with `_fuzzy_match`, which also treats them as misses.
    predictions = [
        pred if isinstance(pred, str) else ""
        for pred in dataf["predicted_answer"]
    ]
    # Each reference list is the gold answer followed by its aliases.
    references = [
        [gold, *aliases]
        for gold, aliases in zip(dataf["answer"], dataf["answer_aliases"])
    ]
    return metric.compute(predictions=predictions, references=references)

def _fuzzy_match(example):
    pred, ref = example['predicted_answer'], example['answer']
    return pred is not None and ((pred in ref) or (ref in pred) or fuzzy_match(pred, ref))

In [14]:
# Per-example lenient match flag, stored as a new column on the results frame.
df["fuzzy_match"] = df.apply(_fuzzy_match, axis=1)

# log scores
scores = {
    "fuzzy_match": df["fuzzy_match"].mean(),
    **calculate_musique_scores(df),
}
print(scores)
wandb_run.log(scores)

# log evaluation results
cols_to_keep = [
    'id',
    'question',
    'question_decomposition',
    'answer',
    'answer_aliases',
    'predicted_answer',
    'fuzzy_match',
    'raw_llm_output',
]
eval_table = wandb.Table(dataframe=df[cols_to_keep])
wandb_run.log({"evaluation-table": eval_table})

{'fuzzy_match': 0.395, 'exact_match': 0.345, 'f1': 0.4478809523809524}


In [15]:
# Close the W&B run started in the setup cell; the scores and the evaluation
# table were logged to it in the previous cell.
wandb_run.finish()

VBox(children=(Label(value='0.256 MB of 0.256 MB uploaded (0.020 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
exact_match,▁
f1,▁
fuzzy_match,▁

0,1
exact_match,0.345
f1,0.44788
fuzzy_match,0.395
