In [1]:
from dotenv import load_dotenv
# Load key/value pairs from a local .env file into the process environment.
# NOTE(review): no environment variable is read in the cells visible here — confirm which keys later cells rely on.
load_dotenv()

True

In [2]:
import os
from functools import partial
from pathlib import Path

import pandas as pd
from openai import OpenAI

from bellem.utils import jprint
from bellem.utils import set_seed

# Fix the seed up front so reruns are repeatable.
# NOTE(review): presumably seeds the common RNGs; the exact set is defined in bellem.utils — confirm.
set_seed(42)

In [3]:
from tqdm.auto import tqdm
# Register tqdm's progress-bar variants (e.g. `progress_apply`) on pandas objects.
# NOTE(review): no `progress_*` call appears in the cells visible here — confirm it is used further down.
tqdm.pandas()

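## Dataset

Load the MuSiQue evaluation examples and the per-paragraph JERX generations, then attach to each example the triplets generated for its supporting paragraphs.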

In [4]:
# Evaluation examples stored as JSON Lines (one record per line, hence lines=True).
dataset_file: Path = Path("../../data/generated/musique-evaluation/dataset.jsonl")
df = pd.read_json(dataset_file, lines=True)
# Preview the first rows only; the full frame is not dumped.
df.head()

Unnamed: 0,id,paragraphs,question,question_decomposition,answer,answer_aliases,answerable
0,2hop__131818_161450,"[{'idx': 0, 'title': 'Maria Carrillo High Scho...",Where is the Voshmgir District located?,"[{'id': 131818, 'question': 'Which state is Vo...",in the north-east of the country south of the ...,"[Caspian Sea, in the north-east of the country...",True
1,2hop__444265_82341,"[{'idx': 0, 'title': 'Ocala, Florida', 'paragr...",In what part of Florida is Tom Denney's birthp...,"[{'id': 444265, 'question': 'Tom Denney >> pla...",in Northern Florida,"[in Northern Florida, Northern Florida]",True
2,2hop__711946_269414,"[{'idx': 0, 'title': 'Wild Thing (Tone Lōc son...",What record label is the performer who release...,"[{'id': 711946, 'question': 'All Your Faded Th...",Kill Rock Stars,[Kill Rock Stars],True
3,2hop__311931_417706,"[{'idx': 0, 'title': 'The Main Attraction (alb...",What record label does the performer of Emotio...,"[{'id': 311931, 'question': 'Emotional Rain >>...",Attic Records,"[Attic, Attic Records]",True
4,2hop__809785_606637,"[{'idx': 0, 'title': 'The Main Attraction (alb...",What record label does the performer of Advent...,"[{'id': 809785, 'question': 'Adventures in You...",Secret City Records,[Secret City Records],True


In [5]:
# Per-paragraph model generations, also JSON Lines. Each row is keyed by a question `id` and a
# `paragraph_idx`; the `generation` column holds newline-separated "subject | relation | object" lines.
jerx_file = Path("../../data/raw/musique-evaluation/jerx-inferences/llama3-base.jsonl")
jerx_df = pd.read_json(jerx_file, lines=True)
jerx_df.head()

Unnamed: 0,id,paragraph_idx,paragraph_text,paragraph_title,is_supporting,text,input,generation
0,2hop__131818_161450,0,Maria Carrillo High School is a public high sc...,Maria Carrillo High School,False,# Maria Carrillo High School\nMaria Carrillo H...,[{'content': 'You are an excellent knowledge g...,Maria Carrillo High School | location | Santa ...
1,2hop__131818_161450,1,"Golestān Province (Persian: استان گلستان‎, Ost...",Golestan Province,True,# Golestan Province\nGolestān Province (Persia...,[{'content': 'You are an excellent knowledge g...,Golestan Province | location | north-east of I...
2,2hop__131818_161450,2,Voshmgir District () is a district (bakhsh) in...,Voshmgir District,True,# Voshmgir District\nVoshmgir District () is a...,[{'content': 'You are an excellent knowledge g...,"Voshmgir District | location | Aqqala County, ..."
3,2hop__131818_161450,3,52 Heroor is a village in the southern state o...,52 Heroor,False,# 52 Heroor\n52 Heroor is a village in the sou...,[{'content': 'You are an excellent knowledge g...,"52 Heroor | location | Karnataka, India\n52 He..."
4,2hop__131818_161450,4,Vennaimalai is a village of Karur District loc...,Vennaimalai,False,# Vennaimalai\nVennaimalai is a village of Kar...,[{'content': 'You are an excellent knowledge g...,Vennaimalai | location | Karur District\nVenna...


In [6]:
# Lookup table: (question id, paragraph index) -> generated text for that paragraph.
# Built column-wise; if a key occurs more than once the last row wins.
jerx_mapping = dict(
    zip(
        zip(jerx_df['id'], jerx_df['paragraph_idx']),
        jerx_df['generation'],
    )
)

def extract_triplets(example: dict):
    """Attach the triplets generated for an example's supporting paragraphs.

    For every paragraph flagged ``is_supporting``, the text stored in
    ``jerx_mapping`` under ``(example['id'], paragraph['idx'])`` is read line
    by line. Whitespace-only lines are skipped and every other line is split
    on ``" | "``.

    The collected lists are written to ``example["triplets"]`` in paragraph
    order (the input is modified in place) and the same object is returned.
    A supporting paragraph missing from ``jerx_mapping`` raises ``KeyError``.
    """
    triplets = []
    for paragraph in example['paragraphs']:
        if not paragraph['is_supporting']:
            continue
        generation = jerx_mapping[(example['id'], paragraph['idx'])]
        for line in generation.split('\n'):
            # Blank separator lines carry no triplet.
            if line.strip():
                triplets.append(line.split(" | "))
    example["triplets"] = triplets
    return example

In [7]:
# Row-wise apply (axis=1): extract_triplets adds a `triplets` entry to every row,
# so the resulting frame gains a `triplets` column.
df = df.apply(extract_triplets, axis=1)
df.head()

Unnamed: 0,id,paragraphs,question,question_decomposition,answer,answer_aliases,answerable,triplets
0,2hop__131818_161450,"[{'idx': 0, 'title': 'Maria Carrillo High Scho...",Where is the Voshmgir District located?,"[{'id': 131818, 'question': 'Which state is Vo...",in the north-east of the country south of the ...,"[Caspian Sea, in the north-east of the country...",True,"[[Golestan Province, location, north-east of I..."
1,2hop__444265_82341,"[{'idx': 0, 'title': 'Ocala, Florida', 'paragr...",In what part of Florida is Tom Denney's birthp...,"[{'id': 444265, 'question': 'Tom Denney >> pla...",in Northern Florida,"[in Northern Florida, Northern Florida]",True,"[[Ocala, location, Florida], [Ocala, location ..."
2,2hop__711946_269414,"[{'idx': 0, 'title': 'Wild Thing (Tone Lōc son...",What record label is the performer who release...,"[{'id': 711946, 'question': 'All Your Faded Th...",Kill Rock Stars,[Kill Rock Stars],True,"[[All Your Faded Things, album, ], [All Your F..."
3,2hop__311931_417706,"[{'idx': 0, 'title': 'The Main Attraction (alb...",What record label does the performer of Emotio...,"[{'id': 311931, 'question': 'Emotional Rain >>...",Attic Records,"[Attic, Attic Records]",True,"[[Lee Aaron (album), release date, 1987-02-17]..."
4,2hop__809785_606637,"[{'idx': 0, 'title': 'The Main Attraction (alb...",What record label does the performer of Advent...,"[{'id': 809785, 'question': 'Adventures in You...",Secret City Records,[Secret City Records],True,"[[Adventures in Your Own Backyard, type, album..."


In [8]:
# Pick a single row (by position) to walk through by hand in the cells that follow.
i = 1
example = df.iloc[i]
example

id                                                       2hop__444265_82341
paragraphs                [{'idx': 0, 'title': 'Ocala, Florida', 'paragr...
question                  In what part of Florida is Tom Denney's birthp...
question_decomposition    [{'id': 444265, 'question': 'Tom Denney >> pla...
answer                                                  in Northern Florida
answer_aliases                      [in Northern Florida, Northern Florida]
answerable                                                             True
triplets                  [[Ocala, location, Florida], [Ocala, location ...
Name: 1, dtype: object

In [9]:
# Print the question in full; the row preview truncates it.
print(example['question'])

In what part of Florida is Tom Denney's birthplace located?


In [10]:
# Print the first two single-hop sub-questions of the decomposition.
# The second one refers to the first hop's answer through a "#1" placeholder.
print(example['question_decomposition'][0]['question'])
print(example['question_decomposition'][1]['question'])

Tom Denney >> place of birth
where is #1 in the state of florida


## Search

Index one example's triplets with BM25 and query the index with each decomposed sub-question.

In [11]:
import bm25s

class KnowledgeGraph:
    """BM25 keyword index over a list of triplets.

    Every triplet is flattened into a single string (its fields joined by
    ``delimiter``) and indexed with ``bm25s``. ``search`` ranks those strings
    against a free-text query and returns the matching triplets.
    """

    def __init__(self, triplets: list[tuple[str, str, str]]):
        """Build the index.

        Args:
            triplets: Sequences of strings, normally (subject, relation,
                object). May be empty, in which case nothing is indexed and
                every search returns ``[]``.
        """
        self.delimiter = ' >> '
        # Keep the source triplets so search() can return the original fields
        # by position. Re-splitting the joined string would miscount fields
        # whenever a field itself contains the delimiter.
        self.triplets = [list(triplet) for triplet in triplets]
        self.corpus = [self.delimiter.join(triplet) for triplet in self.triplets]
        if self.corpus:
            # No corpus is attached to the retriever, so retrieve() yields
            # document indices rather than document strings.
            self.retriever = bm25s.BM25()
            self.tokenized_corpus = bm25s.tokenize(self.corpus)
            self.retriever.index(self.tokenized_corpus)
        else:
            # Nothing to index; search() short-circuits before touching these.
            self.retriever = None
            self.tokenized_corpus = None

    def search(self, query: str, top_k: int = 3) -> list[list[str]]:
        """Return up to ``top_k`` triplets ranked by BM25 score for ``query``.

        Args:
            query: Free-text query; tokenized with ``bm25s.tokenize``.
            top_k: Maximum number of results. Capped at the number of indexed
                triplets; a non-positive value yields an empty list.

        Returns:
            A list of triplets, best match first. Each triplet is a fresh list
            of the original field strings.
        """
        top_k = min(top_k, len(self.corpus))
        if top_k <= 0:
            return []
        indices, _ = self.retriever.retrieve(bm25s.tokenize(query), k=top_k)
        # One query was submitted, so row 0 holds its ranked document indices.
        return [list(self.triplets[idx]) for idx in indices[0].tolist()]

In [12]:
# Index only the selected example's triplets; the progress bars come from bm25s
# tokenizing and scoring them.
kg = KnowledgeGraph(example['triplets'])

Split strings:   0%|          | 0/9 [00:00<?, ?it/s]

BM25S Count Tokens:   0%|          | 0/9 [00:00<?, ?it/s]

BM25S Compute Scores:   0%|          | 0/9 [00:00<?, ?it/s]

In [13]:
# First hop: the raw sub-question text is used directly as the search query.
question = example['question_decomposition'][0]['question']
question

'Tom Denney >> place of birth'

In [14]:
# Retrieve the three best-scoring triplets for the first hop.
kg.search(question, top_k=3)

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

[['Tom Denney', 'birth place', 'Ocala, Florida'],
 ['Tom Denney', 'birth date', 'November 23, 1982'],
 ['Tom Denney', 'profession', 'musician']]

In [15]:
# Second hop: the text still carries the "#1" placeholder for the first hop's answer.
question2 = example['question_decomposition'][1]['question']
question2

'where is #1 in the state of florida'

In [16]:
# Fill the "#1" placeholder with the first hop's entity instead of retyping the
# sub-question, so the query always tracks the dataset text in `question2`.
# The entity is still entered by hand for this walkthrough.
kg.search(question2.replace("#1", "Ocala"), top_k=3)

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

[['Ocala', 'location', 'Florida'],
 ['Ocala', 'location type', 'Northern Florida'],
 ['Tom Denney', 'birth place', 'Ocala, Florida']]