# Multi-hop question answering with agent

In [None]:
import logging
# Configure the root logger: INFO level, with timestamp, logger name and level on every record.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

In [None]:
from dotenv import load_dotenv
# Load environment variables from a .env file (returns True when a file was found and loaded).
load_dotenv()

True

In [None]:
import nest_asyncio
# Patch asyncio so event loops can be nested inside the notebook's already-running loop.
# NOTE(review): no async code is visible in this notebook — confirm this is still needed.
nest_asyncio.apply()

In [None]:
import os
import random
import warnings
import json
import types
from pathlib import Path

import pandas as pd
from pydantic import BaseModel

from bellem.text.utils import fuzzy_match
from bellem.utils import generate_time_id, set_seed

# Fix the random seed for reproducibility (see bellem.utils.set_seed for which RNGs it seeds).
set_seed(42)

In [None]:
# Hugging Face Hub namespace that the generated datasets are pushed to.
HF_HUB_USER_NAME = "bdsaglam"

In [None]:
# Root directory of the generated MuSiQue training artifacts (relative to this notebook).
DATA_DIR = Path("../../data/generated/musique-training")
# Per-example knowledge-graph outputs live under DATA_DIR/knowledge-graphs/<example id>/.
KG_DIRECTORY = DATA_DIR / 'knowledge-graphs'
# Question-answering outputs directory.
# NOTE(review): not referenced by any cell visible here — confirm it is still needed.
QA_DIRECTORY = DATA_DIR / 'question-answering'

In [None]:
# Read the dataset examples and the per-example answer comparison results (both JSON Lines).
ds_df = pd.read_json(DATA_DIR / 'dataset.jsonl', orient='records', lines=True)
comp_df = pd.read_json(DATA_DIR / 'answer-eval/comparisons.jsonl', orient='records', lines=True)

# Drop columns from each side before joining so that 'id' is the only shared column;
# the result is indexed by 'id' while keeping 'id' available as a regular column.
df = pd.merge(
    ds_df.drop(columns=['answerable', 'answer', 'answer_aliases']),
    comp_df.drop(
        columns=[
            'answerable',
            'paragraphs',
            'question_decomposition',
            'question',
            'answer',
            'answer_aliases',
            'answers',
        ]
    ),
    on='id',
    suffixes=('', ''),
).set_index("id", drop=False)
df.head()

Unnamed: 0_level_0,id,paragraphs,question,question_decomposition,answers,predicted_answer,exact_match,fuzzy_match
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2hop__128801_205185,2hop__128801_205185,"[{'idx': 0, 'title': 'Pama, Burkina Faso', 'pa...",What county is the town where KNFM is licensed...,"[{'id': 128801, 'question': 'What town is KNFM...","[Midland County, Midland County, Texas]",Midland County seat,False,True
2hop__719559_217649,2hop__719559_217649,"[{'idx': 0, 'title': 'Antoine Marchand', 'para...",What's the record label of the artist who put ...,"[{'id': 719559, 'question': 'Me and Julio Down...",[Warner Bros.],Not explicitly stated,False,False
2hop__128806_205185,2hop__128806_205185,"[{'idx': 0, 'title': 'Spanish Town', 'paragrap...",What region is the town where KQRX is liscense...,"[{'id': 128806, 'question': 'What town is KQRX...","[Midland County, Midland County, Texas]",Southern Plains region,False,False
2hop__128895_11424,2hop__128895_11424,"[{'idx': 0, 'title': 'Ehrhardt, South Carolina...",How many households were there in the town WPU...,"[{'id': 128895, 'question': 'What town is WPUR...","[15,504]","15,504 households",False,True
2hop__143485_815489,2hop__143485_815489,"[{'idx': 0, 'title': 'Boulevard Records (U.S.)...",What is the record label of the person who rec...,"[{'id': 143485, 'question': 'Who recorded Some...","[Custard, Custard Records]",Custard Records Atlantic,False,True


In [None]:
def load_triplets(example):
    """Load the extracted triplets for one example as ' | '-delimited strings.

    Reads ``DATA_DIR/knowledge-graphs/<example id>/documents.jsonl`` (one JSON
    document per line) and collects the ``'triplets'`` entry of every document.

    Args:
        example: Mapping (e.g. a DataFrame row) with an ``'id'`` key.

    Returns:
        A list of strings, one per triplet, with the triplet's parts joined by
        ``' | '``. Returns an empty list when the documents file does not exist.
    """
    example_id = example['id']  # avoid shadowing the built-in `id`
    docs_filepath = DATA_DIR / f"knowledge-graphs/{example_id}/documents.jsonl"
    if not docs_filepath.exists():
        return []
    triplets = []
    # Read as UTF-8 explicitly: the platform default encoding may fail on,
    # or mis-decode, the non-ASCII characters present in the documents.
    with open(docs_filepath, encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines, which json.loads would reject.
            if not line.strip():
                continue
            doc = json.loads(line)
            triplets.extend(doc['triplets'])
    # Each triplet is assumed to be a sequence of strings, as str.join requires.
    return [' | '.join(triplet) for triplet in triplets]

In [None]:
def make_jerx_text(example):
    """Join the text of the example's supporting paragraphs with blank lines.

    Args:
        example: Mapping with a ``'paragraphs'`` list; each paragraph is a
            mapping with ``'paragraph_text'`` and ``'is_supporting'`` keys.

    Returns:
        The supporting paragraph texts, in order, separated by ``'\\n\\n'``.
    """
    supporting_texts = []
    for paragraph in example['paragraphs']:
        if paragraph['is_supporting']:
            supporting_texts.append(paragraph['paragraph_text'])
    return '\n\n'.join(supporting_texts)

In [None]:
# Attach, per example, the triplets loaded from disk and the concatenated supporting-paragraph text.
df['triplets'] = df.apply(load_triplets, axis=1)
df['text'] = df.apply(make_jerx_text, axis=1)

In [None]:
# Summary statistics of the number of triplets per example.
df['triplets'].map(len).describe()

count    193.000000
mean      13.259067
std        2.724356
min        7.000000
25%       12.000000
50%       13.000000
75%       15.000000
max       20.000000
Name: triplets, dtype: float64

In [None]:
import textwrap

def format_paragraph(paragraph):
    """Render a paragraph as ``'Paragraph <idx> - <paragraph_text>'``.

    Args:
        paragraph: Mapping with ``'idx'`` and ``'paragraph_text'`` keys.

    Returns:
        The single-line display string for the paragraph.
    """
    index = paragraph['idx']
    body = paragraph['paragraph_text']
    return "Paragraph {} - {}".format(index, body)

def present_row(row):
    """Print a human-readable summary of one evaluated example.

    Prints, in order: the example id, "Success"/"Fail" according to
    ``row['fuzzy_match']``, the question with its decomposition (sub-question
    indented by one tab, its answer by two), the predicted and reference
    answers, and every supporting paragraph wrapped at 120 characters.

    Args:
        row: Mapping (e.g. a DataFrame row) with the keys ``'id'``,
            ``'fuzzy_match'``, ``'question'``, ``'question_decomposition'``,
            ``'predicted_answer'``, ``'answers'`` and ``'paragraphs'``.
    """
    # Header: example id and outcome, each followed by a blank line.
    print(row['id'], end="\n\n")
    outcome = "Success" if row['fuzzy_match'] else "Fail"
    print(outcome, end="\n\n")

    # Question and its decomposition into sub-questions and answers.
    print(row['question'])
    for step in row['question_decomposition']:
        sub_question, sub_answer = step['question'], step['answer']
        print(f"\t{sub_question}")
        print(f"\t\t{sub_answer}")
    print()

    print(f"Prediction: {row['predicted_answer']}")
    print(f"Reference: {row['answers']}", end="\n\n")

    # Only supporting paragraphs are shown; each is followed by a blank line.
    for paragraph in row['paragraphs']:
        if not paragraph['is_supporting']:
            continue
        wrapped_lines = textwrap.wrap(format_paragraph(paragraph), width=120)
        for wrapped_line in wrapped_lines:
            print(wrapped_line)
        print()


In [None]:
# Inspect a single example (by position) together with its extracted triplets.
i = 0
example = df.iloc[i]
present_row(example)
print(example['triplets'])

2hop__128801_205185

Success

What county is the town where KNFM is licensed the capital of?
	What town is KNFM liscensed in?
		Midland
	#1 >> capital of
		Midland County

Prediction: Midland County seat
Reference: ['Midland County', 'Midland County, Texas']

Paragraph 1 - Midland is a city in and the county seat of Midland County, Texas, United States, on the Southern Plains
of the state's western area. A small portion of the city extends into Martin County.

Paragraph 11 - KNFM (92.3 FM), branded as "Lonestar 92", is a Country music formatted radio station that serves the
Midland–Odessa metropolitan area. The station broadcasts on FM frequency 92.3 and is under ownership of Townsquare
Media.

['Midland | location | Texas', 'Midland | type | City', 'Midland | county seat of | Midland County', 'Midland | region | Southern Plains', 'Midland County | location | Texas', 'Martin County | location | Texas', 'Midland | extends into | Martin County', 'KNFM | branded as | Lonestar 92', 'KNFM |

In [None]:
from bellem.jerx.fewshot.llm import DEFAULT_FEW_SHOT_EXAMPLE_MESSAGES, DEFAULT_JERX_SYSTEM_MESSAGE_FOR_LLAMA

def make_few_shot_chat(example):
    """Build a few-shot chat transcript for triplet extraction from one example.

    The transcript consists of the system prompt, the default few-shot
    user/assistant turns, a user turn carrying the example's source text, and
    the assistant turn with the example's triplets (one per line).

    Args:
        example: Mapping with a ``'text'`` string and a ``'triplets'`` list of
            strings.

    Returns:
        A dict ``{'messages': [...]}`` where each message is a dict with
        ``'role'`` and ``'content'`` keys.
    """
    messages = [
        dict(role="system", content=DEFAULT_JERX_SYSTEM_MESSAGE_FOR_LLAMA),
        *DEFAULT_FEW_SHOT_EXAMPLE_MESSAGES,
        # The input the triplets were extracted from; without this user turn the
        # final assistant message would have no corresponding prompt.
        {'role': 'user', 'content': example['text']},
        {'role': 'assistant', 'content': "\n".join(example['triplets'])},
    ]
    return {'messages': messages}

In [None]:
# Preview the chat transcript produced for the example inspected above.
make_few_shot_chat(example)

{'messages': [{'role': 'system',
   'content': "You are an excellent knowledge graph construction agent. Extract knowledge triplets in the form of (subject, relation, object) from user's messages. Avoid stopwords. Use ' | ' as delimiter and provide one triplet per line."},
  {'role': 'user',
   'content': 'Glenhis Hernández (born 7 October 1990 in Havana) is a taekwondo practitioner from Cuba. She was the 2013 World\nChampion in middleweight.\n\nThe current mayor of Havana ("President of the People\'s Power Provincial Assembly") is Marta Hernández Romero, she\nwas elected on March 5, 2011.'},
  {'role': 'assistant',
   'content': 'Glenhis Hernández (Athlete) | born on | October 7, 1990\nGlenhis Hernández (Athlete) | birth place | Havana\nGlenhis Hernández (Athlete) | specializes in | taekwondo\nGlenhis Hernández (Athlete) | won | 2013 World Champion title (Middleweight)\nMarta Hernández Romero (Politician) | serves as | mayor of Havana\nMarta Hernández Romero (Politician) | holds | the

In [None]:
# Keep only the rows whose `exact_match` flag is True.
mask = df['exact_match']
success_df = df.loc[mask]

In [None]:
from datasets import Dataset

2024-06-30 21:27:04,894 - datasets - INFO - PyTorch version 2.2.2 available.


In [None]:
jerx_ds_name = "musique-answerable-2hop-jerx"

# One record per row of success_df. Each record must read from the loop variable
# `row`; reading the notebook-level `example` instead would upload the same
# (text, triplets) pair once for every row.
examples = [
    {'text': row['text'], 'triplets': row['triplets']}
    for _, row in success_df.iterrows()
]
jerx_ds = Dataset.from_list(examples)
jerx_ds.push_to_hub(f"{HF_HUB_USER_NAME}/{jerx_ds_name}", split="train")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/bdsaglam/musique-answerable-2hop-jerx/commit/71558fb2a1491e87f4dcd72e5a907a2c4e012c1d', commit_message='Upload dataset', commit_description='', oid='71558fb2a1491e87f4dcd72e5a907a2c4e012c1d', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Convert every (text, triplets) record into a chat transcript, keeping only the
# resulting 'messages' column, and publish it as the "-chat" variant of the dataset.
jerx_chat_ds = jerx_ds.map(make_few_shot_chat, remove_columns=['text', 'triplets'])
jerx_chat_ds.push_to_hub(f"{HF_HUB_USER_NAME}/{jerx_ds_name}-chat", split="train")

Map:   0%|          | 0/63 [00:00<?, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/bdsaglam/musique-answerable-2hop-jerx-chat/commit/0f909e14f75d8ed14aa41a5beda9fd0326f298bd', commit_message='Upload dataset', commit_description='', oid='0f909e14f75d8ed14aa41a5beda9fd0326f298bd', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Number of chat examples that were published.
len(jerx_chat_ds)

63