# Multi-hop question answering with agent

In [1]:
import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

In [2]:
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
import nest_asyncio
nest_asyncio.apply()

In [4]:
import os
import random
import warnings
import json
import types
from pathlib import Path

import pandas as pd
from pydantic import BaseModel

from bellem.text.utils import fuzzy_match
from bellem.utils import generate_time_id, set_seed

set_seed(42)

In [5]:
from bellem.llama_index.data_structs.data_structs import patch_kg_data_struct
from bellem.llama_index.graph_stores.kuzu import KuzuGraphStore
from bellem.llama_index.indices.knowledge_graph.base import patch_knowledge_graph_index

patch_kg_data_struct()
patch_knowledge_graph_index()

In [None]:
DATA_DIR = Path("../../data/generated/musique-evaluation")
KG_DIRECTORY = DATA_DIR / 'knowledge-graphs'
QA_DIRECTORY = DATA_DIR / 'question-answering'

In [7]:
import kuzu
from llama_index import ServiceContext, StorageContext, load_index_from_storage
from llama_index.indices.knowledge_graph.retrievers import KGRetrieverMode


def load_storage_context(example_id: str):
    directory = KG_DIRECTORY / example_id
    db = kuzu.Database(str(directory / "kuzu"))
    graph_store = KuzuGraphStore(db)
    storage_context = StorageContext.from_defaults(persist_dir=directory / "index", graph_store=graph_store)
    return storage_context

def load_index(example_id: str):
    storage_context = load_storage_context(example_id)
    return load_index_from_storage(
        storage_context,
        include_embeddings=True,
    )


In [8]:
ds_df = pd.read_json(DATA_DIR / 'dataset.jsonl', orient='records', lines=True)
comp_df = pd.read_json(DATA_DIR / 'answer-eval/comparisons.jsonl', orient='records', lines=True)
df = pd.merge(ds_df.drop(columns=['answerable', 'answer', 'answer_aliases']), comp_df.drop(columns=['answerable', 'paragraphs', 'question_decomposition', 'question', 'answer', 'answer_aliases', 'answers']), on='id', suffixes=('', ''))
df.set_index("id", drop=False, inplace=True)
df.head()

Unnamed: 0_level_0,id,paragraphs,question,question_decomposition,answers,predicted_answer,exact_match,fuzzy_match
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2hop__128801_205185,2hop__128801_205185,"[{'idx': 0, 'title': 'Pama, Burkina Faso', 'pa...",What county is the town where KNFM is licensed...,"[{'id': 128801, 'question': 'What town is KNFM...","[Midland County, Midland County, Texas]",Midland County,True,True
2hop__719559_217649,2hop__719559_217649,"[{'idx': 0, 'title': 'Antoine Marchand', 'para...",What's the record label of the artist who put ...,"[{'id': 719559, 'question': 'Me and Julio Down...",[Warner Bros.],Columbia Records,False,False
2hop__128806_205185,2hop__128806_205185,"[{'idx': 0, 'title': 'Spanish Town', 'paragrap...",What region is the town where KQRX is liscense...,"[{'id': 128806, 'question': 'What town is KQRX...","[Midland County, Midland County, Texas]",Southern Plains,False,False
2hop__128895_11424,2hop__128895_11424,"[{'idx': 0, 'title': 'Ehrhardt, South Carolina...",How many households were there in the town WPU...,"[{'id': 128895, 'question': 'What town is WPUR...","[15,504]","15,504 households",False,True
2hop__143485_815489,2hop__143485_815489,"[{'idx': 0, 'title': 'Boulevard Records (U.S.)...",What is the record label of the person who rec...,"[{'id': 143485, 'question': 'Who recorded Some...","[Custard, Custard Records]",Custard Records,True,True


In [9]:
def load_triplets(example):
    id = example['id']
    docs_filepath = DATA_DIR / f"knowledge-graphs/{id}/documents.jsonl"
    if not docs_filepath.exists():
        return []
    triplets = []
    with open(docs_filepath) as f:
        for line in f:
            doc = json.loads(line)
            triplets.extend(doc['triplets'])
    return triplets    

In [10]:
df['triplets'] = df.apply(load_triplets, axis=1)

In [11]:
df.head()

Unnamed: 0_level_0,id,paragraphs,question,question_decomposition,answers,predicted_answer,exact_match,fuzzy_match,triplets
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2hop__128801_205185,2hop__128801_205185,"[{'idx': 0, 'title': 'Pama, Burkina Faso', 'pa...",What county is the town where KNFM is licensed...,"[{'id': 128801, 'question': 'What town is KNFM...","[Midland County, Midland County, Texas]",Midland County,True,True,[]
2hop__719559_217649,2hop__719559_217649,"[{'idx': 0, 'title': 'Antoine Marchand', 'para...",What's the record label of the artist who put ...,"[{'id': 719559, 'question': 'Me and Julio Down...",[Warner Bros.],Columbia Records,False,False,[]
2hop__128806_205185,2hop__128806_205185,"[{'idx': 0, 'title': 'Spanish Town', 'paragrap...",What region is the town where KQRX is liscense...,"[{'id': 128806, 'question': 'What town is KQRX...","[Midland County, Midland County, Texas]",Southern Plains,False,False,[]
2hop__128895_11424,2hop__128895_11424,"[{'idx': 0, 'title': 'Ehrhardt, South Carolina...",How many households were there in the town WPU...,"[{'id': 128895, 'question': 'What town is WPUR...","[15,504]","15,504 households",False,True,[]
2hop__143485_815489,2hop__143485_815489,"[{'idx': 0, 'title': 'Boulevard Records (U.S.)...",What is the record label of the person who rec...,"[{'id': 143485, 'question': 'Who recorded Some...","[Custard, Custard Records]",Custard Records,True,True,[]


In [12]:
import textwrap

def format_paragraph(paragraph):
    return f"Paragraph {paragraph['idx']} - {paragraph['paragraph_text']}"

def present_row(row):
    print(row['id'])
    print()
    print("Success" if row['fuzzy_match'] else "Fail")
    print()
    print(row['question'])
    for item in row['question_decomposition']:
        q = item['question']
        a = item['answer']
        print(f"\t{q}")
        print(f"\t\t{a}")
    print()
    print(f"Prediction: {row['predicted_answer']}")
    print(f"Reference: {row['answers']}")
    print()
    for p in row['paragraphs']:
        if p['is_supporting']:
            for line in textwrap.wrap(format_paragraph(p), width=120):
                print(line)
            print()


In [13]:
import phoenix as px
from phoenix import TraceDataset
from phoenix.trace.utils import json_lines_to_df


def get_trace_dataset(filepath: Path):
    with open(filepath) as f:
        lines = [line for line in f.readlines() if line.strip()]
    return TraceDataset(json_lines_to_df(lines))


def launch_phoenix(example_id: str):
    filepath = QA_DIRECTORY / example_id / "traces.jsonl"
    return px.launch_app(trace=get_trace_dataset(filepath), notebook_environment='local')

2024-08-03 13:33:58,320 - phoenix.datasets.dataset - INFO - Dataset: phoenix_dataset_b44aaa4e-c0e7-401a-814c-232a37b522a0 initialized


In [14]:
from IPython.display import display, HTML

def display_knowledge_graph(example_id: str):
    kg_path = KG_DIRECTORY / example_id /  "kuzu-network.html"
    display(HTML(kg_path.read_text()))
    return kg_path

In [15]:
def report_success(dataf):
    fail_dataf = dataf.loc[~dataf['fuzzy_match']]
    success_dataf = dataf.loc[dataf['fuzzy_match']]
    print(len(success_dataf), len(fail_dataf))
    print(f"{len(success_dataf)/len(dataf):.2f}", f"{len(fail_dataf)/len(dataf):.2f}")
    return success_dataf, fail_dataf

In [16]:
success_df, fail_df = report_success(df)

763 229
0.77 0.23


In [17]:
example_id = random.choice(fail_df.index)
# example_id = "2hop__197090_126045"
row = df.loc[example_id]
sub_questions = [item["question"] for item in row["question_decomposition"]]
present_row(row)
kg_path = display_knowledge_graph(example_id)
phoenix_session = launch_phoenix(example_id)
phoenix_session.view()

2hop__635132_754802

Fail

What area shares border with the region that contains Washington Township?
	Washington Township >> located in the administrative territorial entity
		Henry County
	#1 >> shares border with
		Franklin County

Prediction: Turkeycock Wildlife Management Area
Reference: ['Franklin County']

Paragraph 13 - Washington Township is one of the thirteen townships of Henry County, Ohio, United States. As of the 2010
census the population was 1,912, of whom 1,794 lived in the unincorporated portion of the township.

Paragraph 16 - Turkeycock Mountain is a mountain summit located in Franklin County, Virginia and Henry County, Virginia.
Rising out of the eastern foothills of the Blue Ridge Mountains, Turkeycock Mountain rises to above sea level and is
located at . A portion of the mountain is protected as the Turkeycock Wildlife Management Area, which is open to the
public.



FileNotFoundError: [Errno 2] No such file or directory: '../../data/generated/musique-training/knowledge-graphs/2hop__635132_754802/kuzu-network.html'

## Debug

In [None]:
index = load_index(example_id)
retriever = index.as_retriever(
    retriever_mode=KGRetrieverMode.HYBRID,
    include_text=False,
    verbose=True,
)

2024-06-30 20:51:34,644 - llama_index.indices.loading - INFO - Loading all indices.


In [None]:
keyword = "Tom Denney"

In [None]:
nodes = retriever.retrieve(keyword)

2024-06-30 20:51:36,461 - httpx - INFO - HTTP Request: POST http://localhost:8111/chat/completions "HTTP/1.1 200 OK"


[1;3;32mExtracted keywords: ['Tom Denney', 'Tom', 'Denney']
[0m[1;3;34mKG context:
The following are knowledge sequence in max depth 2 in the form of directed graph like:
`subject -[predicate]->, object, <-[predicate_next_hop]-, object_next_hop ...`
['Shelby High School', 'member of', 'Northern Ohio League (1944-2017)']
['Shelby High School', 'member of', 'Sandusky Bay Conference (2017-)']
['Hanna Theatre', 'renovation by', 'Great Lakes Theater Festival']
['Shelby High School', 'location', 'Shelby, Ohio, United States']
['Shelby High School', 'part of', 'Shelby City School District']
['Shelby High School', 'serves', 'students in grades 9-12']
['Hanna Theatre', 'type', 'classic theater company']
['Hanna Theatre', 'location', 'downtown Cleveland']
['Hanna Theatre', 'opening date', 'March 28, 1921']
['Hanna Theatre', 'location', 'Playhouse Square']
['Hanna Theatre', 'location', 'United States']
['Hanna Theatre', 'reopening date', '2008']
['Hanna Theatre', 'location', 'Ohio']
[0m

In [None]:
len(nodes)

1

In [None]:
nodes[0].metadata['kg_rel_texts']

["['Shelby High School', 'member of', 'Northern Ohio League (1944-2017)']",
 "['Shelby High School', 'member of', 'Sandusky Bay Conference (2017-)']",
 "['Hanna Theatre', 'renovation by', 'Great Lakes Theater Festival']",
 "['Shelby High School', 'location', 'Shelby, Ohio, United States']",
 "['Shelby High School', 'part of', 'Shelby City School District']",
 "['Shelby High School', 'serves', 'students in grades 9-12']",
 "['Hanna Theatre', 'type', 'classic theater company']",
 "['Hanna Theatre', 'location', 'downtown Cleveland']",
 "['Hanna Theatre', 'opening date', 'March 28, 1921']",
 "['Hanna Theatre', 'location', 'Playhouse Square']",
 "['Hanna Theatre', 'location', 'United States']",
 "['Hanna Theatre', 'reopening date', '2008']",
 "['Hanna Theatre', 'location', 'Ohio']"]

In [None]:
for _, row in jerx_inference_df.loc[example_id].iterrows():
    if not row['is_supporting']:
        continue
    print(row['text'])
    print('-'*80)
    print(row['generation'])
    print('='*80)

NameError: name 'jerx_inference_df' is not defined

In [None]:
retriever._index_struct.search_node_by_keyword(keyword)

In [None]:
retriever._index_struct.table