# Multi-hop question answering with agent

In [None]:
import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

In [None]:
from dotenv import load_dotenv
load_dotenv()

True

In [None]:
import nest_asyncio
nest_asyncio.apply()

In [None]:
import os
import random
import warnings
import json
import types
from pathlib import Path

import pandas as pd
from pydantic import BaseModel

from bellem.text.utils import fuzzy_match
from bellem.utils import generate_time_id, set_seed

set_seed(42)

In [None]:
from bellem.llama_index.data_structs.data_structs import patch_kg_data_struct
from bellem.llama_index.graph_stores.kuzu import KuzuGraphStore
from bellem.llama_index.indices.knowledge_graph.base import patch_knowledge_graph_index

patch_kg_data_struct()
patch_knowledge_graph_index()

In [None]:
# Root directory of the generated evaluation artifacts, relative to this notebook.
DATA_DIR = Path("../../data/generated/musique-evaluation")
# One sub-directory per example id: knowledge-graph artifacts (Kuzu database, persisted index, HTML network view).
KG_DIRECTORY = DATA_DIR / 'knowledge-graphs'
# One sub-directory per example id: question-answering artifacts (e.g. traces.jsonl).
QA_DIRECTORY = DATA_DIR / 'question-answering'

In [None]:
import kuzu
from llama_index import ServiceContext, StorageContext, load_index_from_storage
from llama_index.indices.knowledge_graph.retrievers import KGRetrieverMode


def load_storage_context(example_id: str):
    """Build the llama_index storage context for a single example.

    Opens the Kuzu database located at ``KG_DIRECTORY / example_id / "kuzu"``,
    wraps it in a ``KuzuGraphStore`` and pairs it with the persisted index
    stored in the sibling ``index`` directory.
    """
    example_dir = KG_DIRECTORY / example_id
    kuzu_db = kuzu.Database(str(example_dir / "kuzu"))
    return StorageContext.from_defaults(
        persist_dir=example_dir / "index",
        graph_store=KuzuGraphStore(kuzu_db),
    )

def load_index(example_id: str):
    """Load the persisted index of an example, passing ``include_embeddings=True``."""
    context = load_storage_context(example_id)
    return load_index_from_storage(context, include_embeddings=True)


In [None]:
# Read the dataset and the answer-comparison records (both JSON lines).
ds_df = pd.read_json(DATA_DIR / 'dataset.jsonl', orient='records', lines=True)
comp_df = pd.read_json(DATA_DIR / 'answer-eval/comparisons.jsonl', orient='records', lines=True)

# Drop the columns that would otherwise be duplicated, then join the two frames on the example id.
df = pd.merge(
    ds_df.drop(columns=['answerable', 'answer', 'answer_aliases']),
    comp_df.drop(
        columns=[
            'answerable',
            'paragraphs',
            'question_decomposition',
            'question',
            'answer',
            'answer_aliases',
            'answers',
        ]
    ),
    on='id',
    suffixes=('', ''),
)
# Index by example id while keeping `id` available as a regular column.
df = df.set_index("id", drop=False)
df.head()

Unnamed: 0_level_0,id,paragraphs,question,question_decomposition,answers,predicted_answer,exact_match,fuzzy_match
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2hop__131818_161450,2hop__131818_161450,"[{'idx': 0, 'title': 'Maria Carrillo High Scho...",Where is the Voshmgir District located?,"[{'id': 131818, 'question': 'Which state is Vo...","[Caspian Sea, in the north-east of the country...",Golestan Province,False,False
2hop__711946_269414,2hop__711946_269414,"[{'idx': 0, 'title': 'Wild Thing (Tone Lōc son...",What record label is the performer who release...,"[{'id': 711946, 'question': 'All Your Faded Th...",[Kill Rock Stars],Cold Crush Records,False,False
2hop__311931_417706,2hop__311931_417706,"[{'idx': 0, 'title': 'The Main Attraction (alb...",What record label does the performer of Emotio...,"[{'id': 311931, 'question': 'Emotional Rain >>...","[Attic, Attic Records]",Attic Records,True,True
2hop__358582_189042,2hop__358582_189042,"[{'idx': 0, 'title': 'The Main Attraction (alb...",What is the record label of the Thrill of a Li...,"[{'id': 358582, 'question': 'Thrill of a Lifet...",[New Renaissance Records],Capitol Records,False,False
2hop__341176_711757,2hop__341176_711757,"[{'idx': 0, 'title': 'Gmina Pabianice', 'parag...",What other district is found in the same count...,"[{'id': 341176, 'question': 'Gmina Stężyca, Lu...","[Ryki, Gmina Ryki]",Gmina Stężyca,False,False


In [None]:
def load_triplets(example, kg_directory=None):
    """Collect the triplets stored for one example.

    Reads ``<kg_directory>/<example id>/documents.jsonl`` (one JSON document
    per line) and concatenates the ``triplets`` list of every document.

    Args:
        example: Mapping-like row (``dict`` or ``pd.Series``) with an ``id`` entry.
        kg_directory: Directory that holds one sub-directory per example id.
            Defaults to the notebook-level ``KG_DIRECTORY``.

    Returns:
        A flat list of triplets in file order, or an empty list when the
        example has no ``documents.jsonl`` file.

    Raises:
        KeyError: If a document has no ``triplets`` entry.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    if kg_directory is None:
        # Resolved lazily so the function follows the notebook configuration.
        kg_directory = KG_DIRECTORY
    docs_filepath = Path(kg_directory) / str(example['id']) / "documents.jsonl"
    if not docs_filepath.exists():
        return []
    triplets = []
    # The documents contain non-ASCII text, so do not rely on the locale default encoding.
    with open(docs_filepath, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at the end of the file).
                continue
            triplets.extend(json.loads(line)['triplets'])
    return triplets

In [None]:
# Add a `triplets` column by calling load_triplets on every row
# (rows without a documents.jsonl file get an empty list).
df['triplets'] = df.apply(load_triplets, axis=1)

In [None]:
df.head()

Unnamed: 0_level_0,id,paragraphs,question,question_decomposition,answers,predicted_answer,exact_match,fuzzy_match,triplets
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2hop__131818_161450,2hop__131818_161450,"[{'idx': 0, 'title': 'Maria Carrillo High Scho...",Where is the Voshmgir District located?,"[{'id': 131818, 'question': 'Which state is Vo...","[Caspian Sea, in the north-east of the country...",Golestan Province,False,False,[]
2hop__711946_269414,2hop__711946_269414,"[{'idx': 0, 'title': 'Wild Thing (Tone Lōc son...",What record label is the performer who release...,"[{'id': 711946, 'question': 'All Your Faded Th...",[Kill Rock Stars],Cold Crush Records,False,False,[]
2hop__311931_417706,2hop__311931_417706,"[{'idx': 0, 'title': 'The Main Attraction (alb...",What record label does the performer of Emotio...,"[{'id': 311931, 'question': 'Emotional Rain >>...","[Attic, Attic Records]",Attic Records,True,True,[]
2hop__358582_189042,2hop__358582_189042,"[{'idx': 0, 'title': 'The Main Attraction (alb...",What is the record label of the Thrill of a Li...,"[{'id': 358582, 'question': 'Thrill of a Lifet...",[New Renaissance Records],Capitol Records,False,False,"[[Thrill of a Lifetime, album type, King Kobra..."
2hop__341176_711757,2hop__341176_711757,"[{'idx': 0, 'title': 'Gmina Pabianice', 'parag...",What other district is found in the same count...,"[{'id': 341176, 'question': 'Gmina Stężyca, Lu...","[Ryki, Gmina Ryki]",Gmina Stężyca,False,False,[]


In [None]:
import textwrap

def format_paragraph(paragraph):
    """Render a paragraph record as ``Paragraph <idx> - <paragraph_text>``."""
    idx, text = paragraph['idx'], paragraph['paragraph_text']
    return "Paragraph {} - {}".format(idx, text)

def present_row(row):
    """Print a human-readable summary of one evaluated example.

    Output order: the example id, "Success"/"Fail" (from ``fuzzy_match``), the
    question with its decomposition (sub-question, then its answer indented
    one level deeper), the predicted and reference answers, and finally every
    supporting paragraph wrapped at 120 columns.
    """
    print(row['id'])
    print()
    if row['fuzzy_match']:
        print("Success")
    else:
        print("Fail")
    print()
    print(row['question'])
    for step in row['question_decomposition']:
        sub_question, sub_answer = step['question'], step['answer']
        print("\t{}".format(sub_question))
        print("\t\t{}".format(sub_answer))
    print()
    print("Prediction: {}".format(row['predicted_answer']))
    print("Reference: {}".format(row['answers']))
    print()
    for paragraph in row['paragraphs']:
        # Only the paragraphs flagged as supporting are shown.
        if not paragraph['is_supporting']:
            continue
        wrapped_lines = textwrap.wrap(format_paragraph(paragraph), width=120)
        for wrapped_line in wrapped_lines:
            print(wrapped_line)
        print()


In [None]:
import phoenix as px
from phoenix import TraceDataset
from phoenix.trace.utils import json_lines_to_df


def get_trace_dataset(filepath: Path):
    """Build a Phoenix ``TraceDataset`` from a JSON-lines file, ignoring blank lines."""
    with open(filepath) as trace_file:
        # Keep only the lines that still contain something after stripping whitespace.
        non_blank_lines = list(filter(str.strip, trace_file))
    spans_df = json_lines_to_df(non_blank_lines)
    return TraceDataset(spans_df)


def launch_phoenix(example_id: str):
    """Launch the Phoenix app on the ``traces.jsonl`` file of the given example.

    Returns whatever ``px.launch_app`` returns.
    """
    traces_path = QA_DIRECTORY / example_id / "traces.jsonl"
    trace_dataset = get_trace_dataset(traces_path)
    return px.launch_app(trace=trace_dataset, notebook_environment='local')

2024-09-05 10:38:56,549 - phoenix.datasets.dataset - INFO - Dataset: phoenix_dataset_7cb9cc42-35b7-4e82-862d-564e3f501140 initialized


In [None]:
from IPython.display import display, HTML

def display_knowledge_graph(example_id: str):
    """Render the saved ``kuzu-network.html`` of an example inline and return its path."""
    html_path = KG_DIRECTORY / example_id / "kuzu-network.html"
    markup = html_path.read_text()
    display(HTML(markup))
    return html_path

In [None]:
def report_success(dataf):
    """Split a results frame into successes and failures and print a summary.

    Prints the success and failure counts on one line and their share of all
    rows (two decimals) on the next one.

    Args:
        dataf: DataFrame with a boolean ``fuzzy_match`` column.

    Returns:
        ``(success_dataf, fail_dataf)``: the rows where ``fuzzy_match`` is
        true and the rows where it is false.
    """
    matched = dataf['fuzzy_match']
    success_dataf = dataf.loc[matched]
    fail_dataf = dataf.loc[~matched]
    total = len(dataf)
    # An empty frame has no meaningful rate: report nan instead of raising ZeroDivisionError.
    success_rate = len(success_dataf) / total if total else float('nan')
    fail_rate = len(fail_dataf) / total if total else float('nan')
    print(len(success_dataf), len(fail_dataf))
    print(f"{success_rate:.2f}", f"{fail_rate:.2f}")
    return success_dataf, fail_dataf

In [None]:
success_df, fail_df = report_success(df)

45 55
0.45 0.55


In [None]:
# Pick one failed example at random (or pin a specific id with the commented line) and print it.
example_id = random.choice(fail_df.index)
# example_id = "2hop__197090_126045"
row = df.loc[example_id]
sub_questions = [item["question"] for item in row["question_decomposition"]]
present_row(row)
# Optional inspection helpers: uncomment to render the knowledge graph or open the Phoenix trace viewer.
# kg_path = display_knowledge_graph(example_id)
# phoenix_session = launch_phoenix(example_id)
# phoenix_session.view()

2hop__131818_161450

Fail

Where is the Voshmgir District located?
	Which state is Voshmgir District located?
		Golestan Province
	Where is #1 located?
		in the north-east of the country south of the Caspian Sea

Prediction: Golestan Province
Reference: ['Caspian Sea', 'in the north-east of the country south of the Caspian Sea']

Paragraph 1 - Golestān Province (Persian: استان گلستان‎, Ostān-e Golestān) is one of the 31 provinces of Iran, located
in the north-east of the country south of the Caspian Sea. Its capital is Gorgan.

Paragraph 2 - Voshmgir District () is a district (bakhsh) in Aqqala County, Golestan Province, Iran. At the 2006 census,
its population was 25,149, in 5,266 families. The District has one city: Anbar Olum. The District has two rural
districts ("dehestan"): Mazraeh-ye Jonubi Rural District and Mazraeh-ye Shomali Rural District.



In [None]:
# Present every failed example whose decomposition contains a "when" sub-question.
# The loop uses its own variable names so that it does not overwrite the
# `example_id`, `row` and `sub_questions` selected in the previous cell;
# the Debug section below loads the index for `example_id`.
for failed_id in fail_df.index.values:
    failed_row = df.loc[failed_id]
    failed_sub_questions = [item["question"] for item in failed_row["question_decomposition"]]
    if any('when' in q.lower() for q in failed_sub_questions):
        present_row(failed_row)

2hop__199513_13732

Fail

How old was Mary when engaged to the person from whom São José dos Campos takes it's name?
	São José dos Campos >> named after
		Joseph
	When she was betrothed to #1 , approximately how old was Mary?
		12–14 years old

Prediction: Widower in Israel
Reference: ['12–14 years old']

Paragraph 18 - According to the apocryphal Gospel of James, Mary was the daughter of Saint Joachim and Saint Anne.
Before Mary's conception, Anne had been barren and was far advanced in years. Mary was given to service as a consecrated
virgin in the Temple in Jerusalem when she was three years old, much like Hannah took Samuel to the Tabernacle as
recorded in the Old Testament. Some apocryphal accounts state that at the time of her betrothal to Joseph, Mary was
12–14 years old, and he was thirty years old, but such accounts are unreliable.

Paragraph 19 - São José dos Campos (, meaning Saint Joseph of the Fields) is a major city and the seat of the
municipality of the same name in the

## Debug

In [None]:
# Load the persisted index of the current `example_id` and build a retriever in
# hybrid mode, with include_text=False and verbose output enabled.
index = load_index(example_id)
retriever = index.as_retriever(
    retriever_mode=KGRetrieverMode.HYBRID,
    include_text=False,
    verbose=True,
)

2024-06-30 20:51:34,644 - llama_index.indices.loading - INFO - Loading all indices.


In [None]:
keyword = "Tom Denney"

In [None]:
nodes = retriever.retrieve(keyword)

2024-06-30 20:51:36,461 - httpx - INFO - HTTP Request: POST http://localhost:8111/chat/completions "HTTP/1.1 200 OK"


[1;3;32mExtracted keywords: ['Tom Denney', 'Tom', 'Denney']
[0m[1;3;34mKG context:
The following are knowledge sequence in max depth 2 in the form of directed graph like:
`subject -[predicate]->, object, <-[predicate_next_hop]-, object_next_hop ...`
['Shelby High School', 'member of', 'Northern Ohio League (1944-2017)']
['Shelby High School', 'member of', 'Sandusky Bay Conference (2017-)']
['Hanna Theatre', 'renovation by', 'Great Lakes Theater Festival']
['Shelby High School', 'location', 'Shelby, Ohio, United States']
['Shelby High School', 'part of', 'Shelby City School District']
['Shelby High School', 'serves', 'students in grades 9-12']
['Hanna Theatre', 'type', 'classic theater company']
['Hanna Theatre', 'location', 'downtown Cleveland']
['Hanna Theatre', 'opening date', 'March 28, 1921']
['Hanna Theatre', 'location', 'Playhouse Square']
['Hanna Theatre', 'location', 'United States']
['Hanna Theatre', 'reopening date', '2008']
['Hanna Theatre', 'location', 'Ohio']
[0m

In [None]:
len(nodes)

1

In [None]:
nodes[0].metadata['kg_rel_texts']

["['Shelby High School', 'member of', 'Northern Ohio League (1944-2017)']",
 "['Shelby High School', 'member of', 'Sandusky Bay Conference (2017-)']",
 "['Hanna Theatre', 'renovation by', 'Great Lakes Theater Festival']",
 "['Shelby High School', 'location', 'Shelby, Ohio, United States']",
 "['Shelby High School', 'part of', 'Shelby City School District']",
 "['Shelby High School', 'serves', 'students in grades 9-12']",
 "['Hanna Theatre', 'type', 'classic theater company']",
 "['Hanna Theatre', 'location', 'downtown Cleveland']",
 "['Hanna Theatre', 'opening date', 'March 28, 1921']",
 "['Hanna Theatre', 'location', 'Playhouse Square']",
 "['Hanna Theatre', 'location', 'United States']",
 "['Hanna Theatre', 'reopening date', '2008']",
 "['Hanna Theatre', 'location', 'Ohio']"]

In [None]:
# `jerx_inference_df` is not created by any visible cell of this notebook, so an
# unguarded lookup raises NameError and stops a "Run All". Only print the
# supporting rows when the frame has been loaded into the kernel.
if "jerx_inference_df" in globals():
    for _, row in jerx_inference_df.loc[example_id].iterrows():
        if not row['is_supporting']:
            continue
        print(row['text'])
        print('-'*80)
        print(row['generation'])
        print('='*80)
else:
    print("jerx_inference_df is not defined; load it before running this cell.")

NameError: name 'jerx_inference_df' is not defined

In [None]:
# NOTE(review): reaches into the retriever's private `_index_struct`; presumably
# returns the node ids registered under `keyword` — confirm against the patched data struct.
retriever._index_struct.search_node_by_keyword(keyword)

In [None]:
# NOTE(review): dumps the private index struct's `table` attribute for inspection;
# presumably the keyword-to-node mapping — confirm.
retriever._index_struct.table