In [1]:
from dotenv import load_dotenv
# Load environment variables from a local .env file, if one is found, before
# any library reads them. NOTE(review): presumably this supplies the API
# credentials used by the OpenAI / W&B clients below — confirm.
load_dotenv()

True

In [3]:
import warnings
from pathlib import Path

import pandas as pd

from bellek.text.utils import fuzzy_match
from bellek.utils import set_seed, jprint

# Fix the seed once, up front. NOTE(review): which RNGs `set_seed` covers is
# defined in bellek.utils and not visible here — confirm.
set_seed(89)

In [4]:
import os
import wandb
from bellek.langchain.obs import patch_wandb_tracer_serialize_io
from bellek.wandb import generate_run_id

# Configure LangChain -> Weights & Biases tracing through environment variables.
os.environ["LANGCHAIN_WANDB_TRACING"] = "true"
os.environ["WANDB_PROJECT"] = "thesis-mhqa-baseline-context-langchain"
os.environ["WANDB_RUN_ID"] = generate_run_id(8)

# `__vsc_ipynb_file__` is only injected into the namespace by the VS Code
# notebook kernel. Under other frontends (JupyterLab, nbconvert, papermill)
# it is absent, so record the notebook name only when it is available instead
# of failing the whole cell with a KeyError.
_notebook_path = globals().get('__vsc_ipynb_file__')
if _notebook_path:
    os.environ["WANDB_NOTEBOOK_NAME"] = os.path.basename(_notebook_path)

# NOTE(review): this note says the model sees only the question, but `answer`
# further down passes every paragraph as context — confirm the note is intended.
os.environ["WANDB_NOTES"] = "Model sees only question"

patch_wandb_tracer_serialize_io()

# NOTE(review): the freshly generated run id is passed via `resume`; whether
# that is accepted depends on the installed wandb version — confirm.
wandb_run = wandb.init(project=os.environ["WANDB_PROJECT"], resume=os.environ["WANDB_RUN_ID"])

[34m[1mwandb[0m: Currently logged in as: [33mbdsaglam[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [13]:
# Examples (paragraphs, answers, ...) — one JSON record per line.
ds_df = pd.read_json(
    '../../data/generated/musique-kg-llm/train/dataset.jsonl',
    orient='records',
    lines=True,
)
# Question decompositions for the same examples — one JSON record per line.
qd_df = pd.read_json(
    '../../data/generated/musique-kg-llm/train/question-decomposition.jsonl',
    orient='records',
    lines=True,
)
# Take `question` / `question_decomposition` from the decomposition file by
# dropping them from the dataset frame before joining on `id`.
df = (
    ds_df
    .drop(columns=['question', 'question_decomposition'])
    .merge(qd_df, on='id', suffixes=('', ''))
)
df.head()

Unnamed: 0,id,paragraphs,answer,answer_aliases,answerable,question,question_decomposition
0,2hop__128801_205185,"[{'idx': 0, 'title': 'Pama, Burkina Faso', 'pa...",Midland County,"[Midland County, Texas]",True,What county is the town where KNFM is licensed...,[{'question': 'Which town is KNFM licensed in?...
1,2hop__719559_217649,"[{'idx': 0, 'title': 'Antoine Marchand', 'para...",Warner Bros.,[],True,What's the record label of the artist who put ...,[{'question': 'Who is the artist behind the so...
2,2hop__128806_205185,"[{'idx': 0, 'title': 'Spanish Town', 'paragrap...",Midland County,"[Midland County, Texas]",True,What region is the town where KQRX is liscense...,[{'question': 'In which town is KQRX licensed?...
3,2hop__837090_278127,"[{'idx': 0, 'title': 'The Opening (album)', 'p...",Roc-A-Fella Records,[],True,What is the record label of the Do It Again pe...,[{'question': 'Who is the performer of the son...
4,2hop__128895_11424,"[{'idx': 0, 'title': 'Ehrhardt, South Carolina...",15504,[],True,How many households were there in the town WPU...,[{'question': 'In which town is WPUR licensed?...


In [14]:
def make_docs(example, only_supporting=False):
    """Lazily turn the paragraphs of one example into document dicts.

    Args:
        example: Mapping with an ``"id"`` and a ``"paragraphs"`` list; each
            paragraph provides ``"idx"``, ``"title"``, ``"paragraph_text"``
            and ``"is_supporting"``.
        only_supporting: When truthy, paragraphs whose ``"is_supporting"``
            value is falsy are skipped.

    Yields:
        A dict with ``"text"`` (title rendered as a ``# `` heading followed by
        the paragraph body) and ``"metadata"`` (``parent_id``, ``idx``,
        ``is_supporting``).
    """
    for paragraph in example["paragraphs"]:
        wanted = (not only_supporting) or paragraph["is_supporting"]
        if wanted:
            position = paragraph["idx"]
            rendered = "# {}\n{}".format(paragraph["title"], paragraph["paragraph_text"])
            supporting_flag = paragraph["is_supporting"]
            yield {
                "text": rendered,
                "metadata": {
                    "parent_id": example["id"],
                    "idx": position,
                    "is_supporting": supporting_flag,
                },
            }

In [15]:
def present_example(example, predicted_answer):
    """Print a question, its reference and predicted answers, and its paragraphs.

    Args:
        example: Mapping with ``"question"``, ``"answer"`` and ``"paragraphs"``
            (each paragraph providing ``"paragraph_text"``).
        predicted_answer: The answer to show next to the reference answer.
    """
    rule_width = 80
    bodies = (paragraph["paragraph_text"] for paragraph in example['paragraphs'])
    joined_paragraphs = "\n\n".join(bodies)

    print("=" * rule_width)
    for label, key in (("Question:", "question"), ("Reference Answer:", "answer")):
        print(label, example[key])
    print("Predicted Answer:", predicted_answer)
    print("-" * rule_width)
    print("Paragraphs", joined_paragraphs, sep="\n")

In [16]:
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.chains.openai_functions import create_structured_output_runnable
from langchain_core.pydantic_v1 import BaseModel, Field

# System message: instructs the model to answer from the supplied context
# only, without referring to that context, in 2-4 words.
SYSTEM_PROMPT = """You are an expert Q&A system that is trusted around the world. You are given a question that requires multi-hop reasoning. Always answer the query using the provided context information, and not prior knowledge.
Some rules to follow:
1. Never directly reference the given context in your answer.
2. Avoid statements like 'Based on the context, ...' or 'The context information ...' or anything along those lines.
3. Your answer must be 2-4 words long."""

# User message template. The `{context_str}` and `{query_str}` placeholders
# are the two variables supplied when the chain is invoked.
USER_PROMPT = """Context information is below.
---------------------
{context_str}
---------------------
Given the context information and not prior knowledge, answer the question.
{query_str}
"""

# Two-message chat prompt: system instructions followed by the user template.
prompt = ChatPromptTemplate.from_messages([
    ("system", SYSTEM_PROMPT), 
    ("user", USER_PROMPT),
])

# Chat model used for answering; temperature is set to 0.
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)

# Structured response schema.
# NOTE(review): the class docstring and the Field description are likely
# forwarded to the model as part of the schema — confirm before rewording them.
class Output(BaseModel):
    """Output containing the answers for questions."""
    # The answer text; read back via `output.get("answer")` in `answer`.
    answer: str
    # Free-text explanation accompanying the answer.
    reasoning: str = Field(description="Multi-hop reasoning for the answer.")

# Runnable built from the prompt, the model and the `Output` schema; `answer`
# calls `.dict()` on what it returns.
# NOTE(review): this cell's captured output shows a deprecation warning —
# presumably from this helper; check the installed langchain version.
chain = create_structured_output_runnable(Output, llm, prompt)

  warn_deprecated(


In [17]:
def format_question(example):
    """Return the text sent to the model as the query: the example's question."""
    question_text = example['question']
    return question_text

In [18]:
def answer(example):
    """Answer one example with the chain and store the result on the example.

    Every paragraph of the example (supporting or not) is rendered through
    ``make_docs`` and joined into a single context string.

    Args:
        example: Mapping accepted by ``make_docs`` and ``format_question``;
            it is modified in place.

    Returns:
        The same ``example`` with ``'predicted_answer'`` and
        ``'raw_llm_output'`` set.
    """
    context = "\n\n".join(
        [doc["text"] for doc in make_docs(example, only_supporting=False)]
    )
    chain_inputs = {"context_str": context, "query_str": format_question(example)}
    output = chain.invoke(chain_inputs).dict()

    example['predicted_answer'] = output.get("answer")
    example['raw_llm_output'] = output
    return example

In [19]:
def safe_answer(example):
    """Best-effort wrapper around ``answer``.

    Any ``Exception`` raised while answering is printed together with the
    example id, and the example is returned with ``'predicted_answer'`` and
    ``'raw_llm_output'`` set to ``None`` so a batch run can continue.
    """
    try:
        answered = answer(example)
    except Exception as exc:
        example_id = example['id']
        print(f"Failed to answer the question {example_id}\n{exc}")
        for field in ('predicted_answer', 'raw_llm_output'):
            example[field] = None
        return example
    return answered

In [20]:
# from langchain.globals import set_debug
# set_debug(True)

# i = 0
# example = df.iloc[i].to_dict()
# example_ = answer(example)
# print("Question:", example['question'])
# print("Reference answer:", example['answer'])
# print("Predicted answer:", example_['predicted_answer'])

# print("-"*20)
# jprint(example_['raw_llm_output'])

# set_debug(False)

In [21]:
# Run the chain on every row; `safe_answer` sets 'predicted_answer' and
# 'raw_llm_output' on each row (both None when answering failed).
# Note: `df` is rebound here, so re-running this cell queries the model again
# for every row.
df = df.apply(safe_answer, axis=1)

## Evaluation

In [22]:
def _exact_match(example):
    pred, ref = example['predicted_answer'], example['answer']
    return pred is not None and pred == ref

def _fuzzy_match(example):
    pred, ref = example['predicted_answer'], example['answer']
    return pred is not None and ((pred in ref) or (ref in pred) or fuzzy_match(pred, ref))

In [23]:
# Score every row with both matchers.
df["exact_match"] = df.apply(_exact_match, axis=1)
df["fuzzy_match"] = df.apply(_fuzzy_match, axis=1)

# Aggregate each match column into a single score (its mean) and log it.
scores = {metric: df[metric].mean() for metric in ("exact_match", "fuzzy_match")}
print(scores)
wandb_run.log(scores)

# Upload the per-example results as a W&B table.
cols_to_keep = [
    'id',
    'question',
    'question_decomposition',
    'answer',
    'answer_aliases',
    'predicted_answer',
    'exact_match',
    'fuzzy_match',
    'raw_llm_output',
]
eval_table = wandb.Table(dataframe=df[cols_to_keep])
wandb_run.log({"evaluation-table": eval_table})

{'exact_match': 0.41, 'fuzzy_match': 0.61}


In [24]:
# Finish the W&B run started by `wandb.init` above, after all scores and the
# evaluation table have been logged.
wandb_run.finish()

VBox(children=(Label(value='0.183 MB of 0.183 MB uploaded (0.022 MB deduped)\r'), FloatProgress(value=1.0, max…



0,1
exact_match,▁
fuzzy_match,▁

0,1
exact_match,0.41
fuzzy_match,0.61
