# Multi-hop question answering with an agent

In [1]:
from dotenv import load_dotenv
# Load variables from a `.env` file into the process environment (python-dotenv).
# The call's boolean return value is what the cell echoes as its output.
load_dotenv()

True

In [2]:
import nest_asyncio
# Patch asyncio so that an event loop can be re-entered.
# NOTE(review): presumably needed because the notebook kernel already runs its
# own loop — confirm it is still required by the cells that follow.
nest_asyncio.apply()

In [3]:
import os
import warnings
from pathlib import Path

import pandas as pd
from pydantic import BaseModel

from bellek.text.utils import fuzzy_match
from bellek.utils import generate_time_id, set_seed

# Fix the random seed so re-runs are repeatable.
# NOTE(review): which RNGs bellek's `set_seed` covers is not visible here — confirm.
set_seed(42)

In [8]:
# Load the three JSON-lines artefacts of the training split; all are joined on `id`.
ds_df = pd.read_json(
    '../../data/generated/musique-kg-llm/train/dataset.jsonl',
    orient='records',
    lines=True,
)
qd_df = pd.read_json(
    '../../data/generated/musique-kg-llm/train/question-decomposition.jsonl',
    orient='records',
    lines=True,
)
comp_df = pd.read_json(
    '../../data/generated/musique-kg-llm/train/answer-eval/comparisons.jsonl',
    orient='records',
    lines=True,
)

# Columns that would be duplicated across the files are dropped before each
# join, so `question` / `question_decomposition` come from the decomposition
# file rather than from the dataset or the comparisons file.
# NOTE(review): the empty suffixes presumably make pandas raise if any other
# column name still overlaps — confirm before removing them.
df = (
    ds_df
    .drop(columns=['question', 'question_decomposition'])
    .merge(qd_df, on='id', suffixes=('', ''))
    .drop(columns=['answerable'])
    .merge(
        comp_df.drop(columns=['question', 'reference_answer']),
        on='id',
        suffixes=('', ''),
    )
)
df

Unnamed: 0,id,paragraphs,answer,answer_aliases,question,question_decomposition,predicted_answer,exact_match,fuzzy_match
0,2hop__128801_205185,"[{'idx': 0, 'title': 'Pama, Burkina Faso', 'pa...",Midland County,"[Midland County, Texas]",What county is the town where KNFM is licensed...,[{'question': 'Which town is KNFM licensed in?...,Midland County,True,True
1,2hop__719559_217649,"[{'idx': 0, 'title': 'Antoine Marchand', 'para...",Warner Bros.,[],What's the record label of the artist who put ...,[{'question': 'Who is the artist behind the so...,Columbia Records,False,False
2,2hop__128806_205185,"[{'idx': 0, 'title': 'Spanish Town', 'paragrap...",Midland County,"[Midland County, Texas]",What region is the town where KQRX is liscense...,[{'question': 'In which town is KQRX licensed?...,Austin,False,False
3,2hop__837090_278127,"[{'idx': 0, 'title': 'The Opening (album)', 'p...",Roc-A-Fella Records,[],What is the record label of the Do It Again pe...,[{'question': 'Who is the performer of the son...,ABC Records,False,False
4,2hop__128895_11424,"[{'idx': 0, 'title': 'Ehrhardt, South Carolina...",15504,[],How many households were there in the town WPU...,[{'question': 'In which town is WPUR licensed?...,Not available.,False,False
...,...,...,...,...,...,...,...,...,...
95,2hop__651488_94210,"[{'idx': 0, 'title': 'Retsil, Washington', 'pa...",Giorgio Vasari,"[Buontalenti, Alfonso Parigi, Vasari, vasari, ...",Who was the place where Pieta is located desig...,[{'question': 'By whom was the place where Pie...,Michelangelo Buonarroti,False,False
96,2hop__362083_467995,"[{'idx': 0, 'title': 'Cambridge Singers', 'par...",RCA Records,[RCA],What is the record label of the performer of M...,[{'question': 'Who is the performer of Make th...,RCA Records,True,True
97,2hop__525596_543261,"[{'idx': 0, 'title': 'Paul Marchand', 'paragra...",Delaware,"[State of Delaware, DE]",The Roman Catholic Diocese of Jim Norton's bir...,"[{'question': 'Where was Jim Norton born?'}, {...","Dublin, Ireland",False,False
98,2hop__394596_8607,"[{'idx': 0, 'title': 'Berea, South Carolina', ...",South Hampshire,[],What metro area is JAKAZiD's birthplace a part...,"[{'question': 'Where was JAKAZiD born?'}, {'qu...",South Hampshire,True,True


In [9]:
# Boolean selector: True for rows whose prediction did NOT fuzzy-match the reference.
mask = ~df.loc[:, 'fuzzy_match']

In [13]:
# Keep only the rows selected by `mask` (the failed predictions) and preview them.
fail_df = df[mask]
fail_df.head(5)

Unnamed: 0,id,paragraphs,answer,answer_aliases,question,question_decomposition,predicted_answer,exact_match,fuzzy_match
1,2hop__719559_217649,"[{'idx': 0, 'title': 'Antoine Marchand', 'para...",Warner Bros.,[],What's the record label of the artist who put ...,[{'question': 'Who is the artist behind the so...,Columbia Records,False,False
2,2hop__128806_205185,"[{'idx': 0, 'title': 'Spanish Town', 'paragrap...",Midland County,"[Midland County, Texas]",What region is the town where KQRX is liscense...,[{'question': 'In which town is KQRX licensed?...,Austin,False,False
3,2hop__837090_278127,"[{'idx': 0, 'title': 'The Opening (album)', 'p...",Roc-A-Fella Records,[],What is the record label of the Do It Again pe...,[{'question': 'Who is the performer of the son...,ABC Records,False,False
4,2hop__128895_11424,"[{'idx': 0, 'title': 'Ehrhardt, South Carolina...",15504,[],How many households were there in the town WPU...,[{'question': 'In which town is WPUR licensed?...,Not available.,False,False
7,2hop__712682_217649,"[{'idx': 0, 'title': 'So Beautiful or So What'...",Warner Bros.,[],What is the record label of the singer-songwri...,[{'question': 'Who is the singer-songwriter of...,Hear Music,False,False


In [18]:
# Raw paragraph records attached to the first failed example.
fail_df['paragraphs'].iloc[0]

[{'idx': 0,
  'title': 'Antoine Marchand',
  'paragraph_text': 'Antoine Marchand is a record label established in 2003 by the Dutch early music performer Ton Koopman. Antoine Marchand is the French translation of Ton Koopman. The label is distributed by Dutch Jazz and classics distributor Challenge.',
  'is_supporting': False},
 {'idx': 1,
  'title': 'Me and Julio Down by the Schoolyard',
  'paragraph_text': '"Me and Julio Down by the Schoolyard" is a song by American singer-songwriter Paul Simon. It was the second single from his second self-titled studio album (1972), released on Columbia Records.',
  'is_supporting': True},
 {'idx': 2,
  'title': 'Top and Bottom Brass',
  'paragraph_text': 'Top and Bottom Brass is an album by trumpeter Clark Terry featuring performances recorded in early 1959 and originally released on the Riverside label.',
  'is_supporting': False},
 {'idx': 3,
  'title': "Carryin' On",
  'paragraph_text': "Carryin' On is an album by American jazz guitarist Grant 

In [47]:
import textwrap

def format_paragraph(paragraph):
    """Render a paragraph record as a ``#<idx>`` tag followed by its text.

    Args:
        paragraph: Mapping providing the ``'idx'`` and ``'paragraph_text'``
            entries.

    Returns:
        A string made of ``#<idx>``, a newline, and the paragraph text.
    """
    tag = "#" + format(paragraph['idx'])
    text = format(paragraph['paragraph_text'])
    return "\n".join((tag, text))

def present_row(row):
    """Print a readable summary of one example for manual inspection.

    Printed in order: the example id, the question followed by its
    tab-indented decomposition sub-questions, the predicted and reference
    answers, and every supporting paragraph wrapped to 120 columns.

    Args:
        row: Mapping-like record with the entries ``'id'``, ``'question'``,
            ``'question_decomposition'``, ``'predicted_answer'``,
            ``'answer'`` and ``'paragraphs'``.

    Returns:
        None. All output goes to stdout.
    """
    print(row['id'], end="\n\n")

    print(row['question'])
    for step in row['question_decomposition']:
        print(f"\t{step['question']}")
    print()

    print(f"Prediction: {row['predicted_answer']}")
    print(f"Reference: {row['answer']}", end="\n\n")

    # Only the paragraphs flagged as supporting evidence are shown.
    supporting = (p for p in row['paragraphs'] if p['is_supporting'])
    for paragraph in supporting:
        # textwrap.wrap treats the newline produced by format_paragraph as
        # ordinary whitespace, so the "#<idx>" tag shares the first printed
        # line with the start of the paragraph text.
        wrapped_lines = textwrap.wrap(format_paragraph(paragraph), width=120)
        for wrapped_line in wrapped_lines:
            print(wrapped_line)
        print()


In [50]:
# Position within `fail_df` (not the original index label) of the failed
# example to inspect; change the value and re-run the cell to browse others.
i = 4
present_row(fail_df.iloc[i])

2hop__712682_217649

What is the record label of the singer-songwriter who released the album So Beautiful or So What?
	Who is the singer-songwriter of the album So Beautiful or So What?
	What is the record label of #1?

Prediction: Hear Music
Reference: Warner Bros.

#0 So Beautiful or So What is the twelfth solo studio album by American folk rock singer-songwriter Paul Simon. It was
released on April 8, 2011, by Hear Music.

#10 The Rhythm of the Saints is the eighth solo studio album by American singer-songwriter Paul Simon, released on
October 16, 1990 on Warner Bros. Like its predecessor, "Graceland" (1986), the album gained commercial success and
received mostly favorable reviews from critics.

