In [1]:
# !pip install weaviate-client==3.26.2

# Multi-hop question answering with agent

In [2]:
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
import nest_asyncio
nest_asyncio.apply()

In [4]:
import os
import warnings
from pathlib import Path

import pandas as pd
import weaviate
from pydantic import BaseModel
from llama_index import Document, ServiceContext
from llama_index.prompts.base import Prompt
from llama_index.prompts.prompt_type import PromptType
from llama_index import VectorStoreIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.vector_stores import WeaviateVectorStore
from llama_index.storage.storage_context import StorageContext
from llama_index.llms import ChatMessage, OpenAI
from llama_index.agent import OpenAIAgent
from llama_index.tools.function_tool import FunctionTool
from llama_index.callbacks import CallbackManager, LlamaDebugHandler
from tqdm.auto import tqdm

from bellem.llama_index.obs import make_phoenix_trace_callback_handler
from bellem.utils import generate_time_id, set_seed

set_seed(42)
tqdm.pandas()


/Users/bdsaglam/dev/repos/bellem/.venv/lib/python3.10/site-packages/pydantic/_internal/_config.py:284: PydanticDeprecatedSince20: Support for class-based `config` is deprecated, use ConfigDict instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.7/migration/
/Users/bdsaglam/dev/repos/bellem/.venv/lib/python3.10/site-packages/litellm/proxy/_types.py:83: PydanticDeprecatedSince20: `pydantic.config.Extra` is deprecated, use literal values instead (e.g. `extra='allow'`). Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.7/migration/
  extra = Extra.allow  # Allow extra fields
/Users/bdsaglam/dev/repos/bellem/.venv/lib/python3.10/site-packages/litellm/proxy/_types.py:86: PydanticDeprecatedSince20: Pydantic V1 style `@root_validator` validators are deprecated. You should migrate to Pydantic V2 style `@model_validator` validators, see the migration guide fo

In [5]:
ds_df = pd.read_json('../../data/generated/musique-evaluation/dataset.jsonl', orient='records', lines=True)
qd_df = pd.read_json('../../data/generated/musique-evaluation/question-decomposition.jsonl', orient='records', lines=True)
df = pd.merge(ds_df.drop(columns=['question', 'question_decomposition']), qd_df, on='id', suffixes=('', ''))
# df = df.sample(2)
df.head()

Unnamed: 0,id,paragraphs,answer,answer_aliases,answerable,question,question_decomposition
0,2hop__131818_161450,"[{'idx': 0, 'title': 'Maria Carrillo High Scho...",in the north-east of the country south of the ...,[in the north-east of the country south of the...,True,Where is the Voshmgir District located?,[{'question': 'Which country is the Voshmgir D...
1,2hop__444265_82341,"[{'idx': 0, 'title': 'Ocala, Florida', 'paragr...",in Northern Florida,"[Northern Florida, in Northern Florida]",True,In what part of Florida is Tom Denney's birthp...,[{'question': 'Where is Tom Denney's birthplac...
2,2hop__711946_269414,"[{'idx': 0, 'title': 'Wild Thing (Tone Lōc son...",Kill Rock Stars,[Kill Rock Stars],True,What record label is the performer who release...,[{'question': 'Who is the performer that relea...
3,2hop__311931_417706,"[{'idx': 0, 'title': 'The Main Attraction (alb...",Attic Records,"[Attic, Attic Records]",True,What record label does the performer of Emotio...,[{'question': 'Who is the performer of Emotion...
4,2hop__809785_606637,"[{'idx': 0, 'title': 'The Main Attraction (alb...",Secret City Records,[Secret City Records],True,What record label does the performer of Advent...,[{'question': 'Who is the performer of Adventu...


In [6]:
phoenix_handler = make_phoenix_trace_callback_handler(Path(f"/tmp/phoenix/thesis-kg-llm/baseline-agent/traces-{generate_time_id()}.jsonl"))
callback_manager = CallbackManager(handlers=[
    phoenix_handler,
    # LlamaDebugHandler(print_trace_on_end=True),
])

In [7]:
# llm = OpenAI(model="gpt-3.5-turbo", temperature=0)

# actually, llama-3-70b-tgi
llm = OpenAI(
    model="gpt-3.5-turbo",
    temperature=0.0,
    api_base="http://localhost:8080/v1",
    api_key="_",
)
embed_model = HuggingFaceEmbedding("sentence-transformers/all-MiniLM-L6-v2")
service_context = ServiceContext.from_defaults(llm=llm, embed_model=embed_model, callback_manager=callback_manager)

In [8]:
def make_docs(example, only_supporting=False):
    ps = example["paragraphs"]
    for p in ps:
        if only_supporting and not p["is_supporting"]:
            continue
        idx = p["idx"]
        title = p["title"]
        body = p["paragraph_text"]
        is_supporting = p["is_supporting"]
        text = f"# {title}\n{body}"
        yield Document(
            text=text,
            metadata={"parent_id": example["id"], "idx": idx, "is_supporting": is_supporting},
            excluded_embed_metadata_keys=["parent_id", "idx", "is_supporting"],
            excluded_llm_metadata_keys=["parent_id", "idx", "is_supporting"],
        )

In [9]:
weaviate_client = weaviate.Client(os.getenv('WEAVIATE_CLUSTER_URL'))
vector_store = WeaviateVectorStore(weaviate_client=weaviate_client)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

def make_query_engine(example):
    documents = list(make_docs(example, only_supporting=False))
    vector_index = VectorStoreIndex.from_documents(documents, storage_context=storage_context, service_context=service_context)
    query_engine = vector_index.as_query_engine(
        service_context=service_context,
        vector_store_query_mode="hybrid",
        alpha=0.6,
        similarity_top_k=3,
    )
    return query_engine

            Please consider upgrading to the latest version. See https://weaviate.io/developers/weaviate/client-libraries/python for details.


In [10]:
def make_answer_sub_question(example):
    def answer_sub_question(question: str) -> str:
        """Use this tool to answer a sub-question."""
        return make_query_engine(example).query(question).response
    return FunctionTool.from_defaults(fn=answer_sub_question)

In [11]:
def make_persist_tool():
    outputs = []
    def persist_answers(sub_answers: list[str], final_answer: str) -> str:
        """Save the sub-answers and final answer to a database at the end."""
        outputs.append(dict(final_answer=final_answer, sub_answers=sub_answers))
        return "SUCCESS"

    tool =  FunctionTool.from_defaults(fn=persist_answers)
    def _get_output():
        if outputs:
            return outputs[-1]
        else:
            return None
    tool._get_output = _get_output
    return tool

In [12]:
SYSTEM_PROMPT_STR = """
You are helpful multi-hop question answering assistant that answers the given question by answering each sub-question. You must use `answer_sub_question` tool to answer each sub-question. After you get the answer for the first question, you reformulate the second sub-question and repeat the same procedure. 
Your answers must be in 2-4 words. When you reach the final answer, you persist your answers to a database by calling `persist_answers` function only once. After that, you output "FINISH".
""".strip()

def make_mhqa_agent(example):
    tools = [
        make_answer_sub_question(example),
        make_persist_tool(),
    ]
    prefix_messages = [
        ChatMessage(content=SYSTEM_PROMPT_STR, role="system"),
    ]
    return OpenAIAgent.from_tools(
        llm=OpenAI(model='gpt-3.5-turbo'),
        tools=tools, 
        prefix_messages=prefix_messages,
        # verbose=True,
    )

In [13]:
def format_question(example):
    sub_questions = '\n'.join([f"\t{i+1}.{item['question']}" for i, item in enumerate(example['question_decomposition'])])
    return f"{example['question']}\n\n{sub_questions}"

In [14]:
def mhqa(example):
    """Multi-hop question answering."""
    agent = make_mhqa_agent(example)
    response = agent.query(format_question(example)).response
    output = agent.agent_worker._get_tools(None)[1]._get_output()
    if output is None:
        example['predicted_answer'] = None
        example['predicted_sub_answers'] = None
    else:
        example['predicted_answer'] = output.get("final_answer")
        example['predicted_sub_answers'] = output.get('sub_answers')
    return example

In [15]:
i = 0
example = df.iloc[i]
example_ = mhqa(example)
print("Reference answer:", example['answer'])
print("Predictions:")
print(example_['predicted_answer'])
print(example_['predicted_sub_answers'])

Voshmgir District is located in Iran.
Reference answer: in the north-east of the country south of the Caspian Sea
Predictions:
Voshmgir District is located in Iran.
['Iran', 'Iran is situated in the north-east of the country south of the Caspian Sea.']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  example['predicted_answer'] = output.get("final_answer")
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  example['predicted_answer'] = output.get("final_answer")
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  example['predicted_sub_answers'] = output.get('sub_answers')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-

In [16]:
df = df.progress_apply(mhqa, axis=1)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/200 [00:00<?, ?it/s]

Voshmgir District is located in Iran.
Ocala, Florida
Cold Crush Records
A&M Records (Canada)
Secret City Records
Nieuwe Waterweg, North Sea
Tasman Sea
Sire Records
Real World label
Florida
New Renaissance Records
Gmina Stężyca
Lesotho
Northern Florida
Kicking Mule Records and Blind Pig Records
Jacqueline Rayner
Thomas Bach
Nieuwe Waterweg
Gmina Daleszyce
Mtetengwe River
Florida
Ramstein Air Base
ACM
Columbia Records
Belarus
International Federation of Association Football
1150
Little Naches River
Routledge
cycling promotion initiative
Assam Football Association
5am
Coastal mountains at elevations of 2,000 to 4,000 feet above sea level
Abiodun Smith
Chao Phraya River
Limpopo River
June 10, 1819
Eastern block of Catalan languages
Fair Trade Services
Honorable Justice Abiodun Smith
Selous Game Reserve
Unknown
Britta Holmberg
Mahmoud Mirza
Richard Stallman
Charles
Rosaline Patricia Irorefe Bozimo
Northern Florida
Johan Remkes
Chaya Mushka Schneersohn
Golestan Province
Lisbon
Richard Stallm

In [19]:
df.to_json('../../data/generated/musique-evaluation/baseline-agent.jsonl', orient='records', lines=True)

In [20]:
from bellem.musique.eval import calculate_metrics, compare_answers

df['predicted_answer'] = df['predicted_answer'].map(lambda x: x or "N/A")
df = compare_answers(df)

# log scores
scores = calculate_metrics(df)
print(scores)

{'exact_match': 0.44, 'f1': 0.5683546589289624}


## Inspect

In [None]:
df[['id', 'question', 'answer', 'predicted_answer', 'predicted_sub_answers', 'fuzzy_match']]