In [1]:
from dotenv import load_dotenv
# Read variables from a .env file into the process environment
# (presumably the API credentials used by the LLM calls below — confirm).
load_dotenv()

True

In [2]:
import logging
import os
import sys
import json
from pathlib import Path

import numpy as np
import pandas as pd

from IPython.display import Markdown, display

# Route INFO-level (and above) log records to stdout so they appear in the cell output.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

In [3]:
from langchain.globals import set_llm_cache
from langchain.cache import SQLiteCache

# Register a SQLite-backed LangChain LLM cache stored in a file on disk.
# NOTE(review): the absolute /tmp path is hard-coded and will not exist on every platform.
set_llm_cache(SQLiteCache(database_path="/tmp/langchain-cache.db"))

In [4]:
# Root folder for every artefact this notebook writes; created on first run.
DATA_DIR = Path("../../tmp/data/musique-llama2/")
DATA_DIR.mkdir(parents=True, exist_ok=True)

## Dataset

In [5]:
from datasets import load_dataset, DatasetDict

# Load the dataset by its hub identifier; the cell below reads its 'train' and 'validation' splits.
raw_dsd = load_dataset("bdsaglam/musique-answerable-2hop-subset")

In [6]:
# Work with only the first `n` rows of the train and validation splits.
n = 10
dsd = DatasetDict(
    {split: raw_dsd[split].select(range(n)) for split in ("train", "validation")}
)
dsd

DatasetDict({
    train: Dataset({
        features: ['id', 'paragraphs', 'question', 'question_decomposition', 'answer', 'answer_aliases', 'answerable', 'text'],
        num_rows: 10
    })
    validation: Dataset({
        features: ['id', 'paragraphs', 'question', 'question_decomposition', 'answer', 'answer_aliases', 'answerable', 'text'],
        num_rows: 10
    })
})

In [43]:
# Dump the train subset to ./temp.json in the current working directory.
# NOTE(review): this cell's execution count (43) is out of sequence with its neighbours and
# nothing visible below reads temp.json — looks like leftover scratch work; confirm before keeping.
dsd['train'].to_json('./temp.json')

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

93684

In [7]:
def present_example(example, show_context=False):
    """Print a multi-hop example: the question, then each decomposition step.

    Args:
        example: Mapping with a 'question' string, a 'paragraphs' list whose
            items carry 'paragraph_text', and a 'question_decomposition' list
            whose items carry 'question', 'paragraph_support_idx' and 'answer'.
        show_context: When true, also print every paragraph text, one per line,
            under an "All paragraphs" heading.

    Returns:
        None. Everything is written to stdout.
    """
    # Two spaces plus the single separator space print() would insert.
    pad = '   '
    print(example['question'])
    bodies = [para['paragraph_text'] for para in example['paragraphs']]
    for hop in example['question_decomposition']:
        sub_question = hop['question']
        # The support index is used as a position into the paragraph list.
        support = hop['paragraph_support_idx']
        sub_answer = hop['answer']
        print(f'{pad}Q: {sub_question}')
        print(f'{pad}P.{support}: {bodies[support]}')
        print(f'{pad}A: {sub_answer}')
    if not show_context:
        return
    print("\nAll paragraphs")
    print('\n'.join(bodies))

# Preview the first training example (question and decomposition only, no full context).
present_example(dsd['train'][0])

What county is the town where KNFM is licensed the capital of?
   Q: What town is KNFM liscensed in?
   P.11: KNFM (92.3 FM), branded as "Lonestar 92", is a Country music formatted radio station that serves the Midland–Odessa metropolitan area. The station broadcasts on FM frequency 92.3 and is under ownership of Townsquare Media.
   A: Midland
   Q: #1 >> capital of
   P.1: Midland is a city in and the county seat of Midland County, Texas, United States, on the Southern Plains of the state's western area. A small portion of the city extends into Martin County.
   A: Midland County


## Question answering with LLM and KG

In [8]:
from llama_index.llms import OpenAI
from llama_index import ServiceContext

In [24]:
# Select one training example to walk through the pipeline and give it its own output folder.
SAMPLE_IDX = 3
example = dsd['train'][SAMPLE_IDX]
SAMPLE_DIR = DATA_DIR / f"sample-{SAMPLE_IDX}"
SAMPLE_DIR.mkdir(exist_ok=True, parents=True)
# ensure_ascii=False writes non-ASCII characters verbatim, so the file encoding is pinned to
# UTF-8 rather than left to the platform default (which can fail or produce mojibake).
with open(SAMPLE_DIR / "example.json", 'w', encoding='utf-8') as f:
    json.dump(example, f, ensure_ascii=False, indent=2)

present_example(example, show_context=True)

What is the record label of the Do It Again performer?
   Q: Do It Again >> performer
   P.16: "Do It Again (Put Ya Hands Up)" is the lead single from rapper Jay-Z's fourth album "Vol. 3... Life and Times of S. Carter". The song features production by Rockwilder, including guest vocals by Amil and Beanie Sigel.
   A: Jay-Z
   Q: #1 >> record label
   P.1: Christión was a male duo featuring brothers Kenni Ski and Allen Anthony, the first R&B act to be signed to Jay-Z's Roc-A-Fella Records. They released their single "Full of Smoke" on Roc-A-Fella in 1996, reaching #53 on the Hot 100 and #15 on the R&B chart.
   A: Roc-A-Fella Records

All paragraphs
The Opening is a live album by American jazz pianist Mal Waldron featuring a performance recorded in Paris in 1970 and released on the French Futura label.
Christión was a male duo featuring brothers Kenni Ski and Allen Anthony, the first R&B act to be signed to Jay-Z's Roc-A-Fella Records. They released their single "Full of Smoke" on Roc-A

### Question decomposition

In [25]:
from bellek.ml.llm.qdecomp import make_question_decomposer

# Build the question decomposer; it is called below with `question=` and its result is
# iterated and indexed as a sequence of sub-question strings.
qdecomposer = make_question_decomposer()

In [26]:
# Decompose the sample question into sub-questions and save the prediction next to the sample.
sub_questions = qdecomposer(question=example['question'])
pred_q_decomp = {"question_decomposition": [{"question": q} for q in sub_questions]}
# ensure_ascii=False writes non-ASCII characters verbatim, so the file encoding is pinned to
# UTF-8 rather than left to the platform default.
with open(SAMPLE_DIR / 'question-decomposition-pred.json', 'w', encoding='utf-8') as f:
    json.dump(pred_q_decomp, f, ensure_ascii=False, indent=2)

# Show the reference decomposition and the predicted one for a side-by-side read.
print(example['question'])
print()
print("Groundtruth")
for qd in example['question_decomposition']:
    print('  ' + qd['question'])
print()
print("Prediction")
for question in sub_questions:
    print('  ' + question)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
What is the record label of the Do It Again performer?

Groundtruth
  Do It Again >> performer
  #1 >> record label

Prediction
  Who is the performer of the song "Do It Again"?
  What is the record label of #1?


### Knowledge graph construction

In [27]:
from llama_index import Document

def make_docs(example, only_supporting=False):
    """Lazily turn the paragraphs of ``example`` into ``Document`` objects.

    Args:
        example: Mapping with a 'paragraphs' list; each paragraph provides
            'idx', 'title', 'paragraph_text' and 'is_supporting'.
        only_supporting: When true, paragraphs whose 'is_supporting' value is
            falsy are skipped.

    Yields:
        One ``Document`` per kept paragraph. Its text is the title as a
        '# ' heading line followed by the paragraph body, and its metadata
        holds the paragraph's 'idx' and 'is_supporting' values.
    """
    fields = ("idx", "title", "paragraph_text", "is_supporting")
    for para in example['paragraphs']:
        if only_supporting and not para["is_supporting"]:
            continue
        idx, title, body, is_supporting = (para[name] for name in fields)
        yield Document(
            text="# {}\n{}".format(title, body),
            metadata={"idx": idx, "is_supporting": is_supporting},
        )

In [28]:
# Materialise the generator to inspect the documents built from supporting paragraphs only.
list(make_docs(example, only_supporting=True))

[Document(id_='17d6cf86-3f5f-4912-bb2e-2212952ae847', embedding=None, metadata={'idx': 1, 'is_supporting': True}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='0ef6a826ed972648506d37b30b32d4d8e6b8d6c72c2c1e70fa6e23996080a5c3', text='# Christión\nChristión was a male duo featuring brothers Kenni Ski and Allen Anthony, the first R&B act to be signed to Jay-Z\'s Roc-A-Fella Records. They released their single "Full of Smoke" on Roc-A-Fella in 1996, reaching #53 on the Hot 100 and #15 on the R&B chart.', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'),
 Document(id_='f782834e-5cb5-42e0-a8fc-ad6d5379481f', embedding=None, metadata={'idx': 16, 'is_supporting': True}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='518414da957e0c1717c5290ccd1a8ccc5fb2d8567c803a25027e741bbf702726', text='# Do It Again (Put Ya Hands U

In [29]:
from llama_index.prompts.base import Prompt
from llama_index.prompts.prompt_type import PromptType

# Prompt template for triplet extraction. It has two placeholders, {max_knowledge_triplets}
# and {text}, and contains one worked example whose answer uses the 'subject|relation|object'
# one-per-line format. The [INST] / <<SYS>> markers look like the Llama-2 chat format
# (presumably matching the "musique-llama2" data folder — confirm).
# NOTE(review): the template text is runtime data; whitespace and punctuation inside it are
# left exactly as written.
KG_TRIPLET_EXTRACT_TMPL = """<s>[INST] <<SYS>>
You are a helpful assistant that extracts up to {max_knowledge_triplets}  entity-relation-entity triplets from given text. Use '|' as delimiter and provide one triplet per line.
<</SYS>>
Alaa Abdul Zahra plays for Al Shorta SC. His club is AL Kharaitiyat SC, which has its ground at, Al Khor. [/INST] Al Kharaitiyat SC|ground|Al Khor
Alaa Abdul-Zahra|club|Al Kharaitiyat SC
Alaa Abdul-Zahra|club|Al Shorta SC </s><s>[INST] {text} [/INST] """

# Wrap the template as a knowledge-triplet-extraction prompt.
# NOTE(review): the only visible use of this prompt in the notebook is a commented-out
# `kg_triple_extract_template=` argument in the index-building cell.
KG_TRIPLET_EXTRACT_PROMPT = Prompt(
    KG_TRIPLET_EXTRACT_TMPL, 
    prompt_type=PromptType.KNOWLEDGE_TRIPLET_EXTRACT,
)

In [30]:
import kuzu
from bellek.kuzu import KuzuGraphStore
from llama_index import KnowledgeGraphIndex
from llama_index.storage.storage_context import StorageContext

In [31]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Embedding model handed to the service context that builds the knowledge graph index.
embed_model = HuggingFaceEmbedding('sentence-transformers/all-MiniLM-L6-v2')

In [32]:
# knowledge graph database
# NOTE(review): the database is opened here, before the `kg_db_path.exists()` check further
# down. If opening creates the path (presumably it does — confirm), that check cannot tell
# a freshly created store from a previously populated one.
kg_db_path = SAMPLE_DIR / "kg"
db = kuzu.Database(str(kg_db_path))
graph_store = KuzuGraphStore(db)
storage_context = StorageContext.from_defaults(graph_store=graph_store)

# documents to index into knowledge graph
# Only the supporting paragraphs of the sample are indexed.
documents = list(make_docs(example, only_supporting=True))

# language model to use for triplet extraction
llm = OpenAI(temperature=0, model="gpt-3.5-turbo")
kg_service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model=embed_model,
)

# index documents
# NOTE(review): `False and ...` makes the "load" branch unreachable, so the graph is rebuilt
# from the documents on every run of this cell. The two branches also disagree on
# `include_embeddings` (False when loading, True when building) — confirm which is intended
# before re-enabling the load path.
if False and kg_db_path.exists():
    print("Loading the knowledge graph for the sample...")
    index = KnowledgeGraphIndex(
        storage_context=storage_context,
        service_context=kg_service_context,
        include_embeddings=False,
    )
else:
    print("Building the knowledge graph for the sample...")
    index = KnowledgeGraphIndex.from_documents(
        documents=documents,
        max_triplets_per_chunk=10,
        storage_context=storage_context,
        service_context=kg_service_context,
        include_embeddings=True,
        # kg_triple_extract_template=KG_TRIPLET_EXTRACT_PROMPT,
    )


Building the knowledge graph for the sample...
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


In [33]:
from pyvis.network import Network

# Convert the index's graph to a pyvis network and save it as HTML in the sample folder.
g = index.get_networkx_graph()
net = Network(notebook=True, cdn_resources="in_line", directed=True)
net.from_nx(g)
# Plain string literal: the file name has no placeholders, so no f-string is needed.
net.save_graph(str(SAMPLE_DIR / "kg.html"))

### Query

In [34]:
# Build a query engine on top of the knowledge graph index.
query_engine = index.as_query_engine(
    include_text=True, 
    embedding_mode="hybrid",
    response_mode="simple_summarize",
    verbose=True,
)

# Fetch the engine's current text-QA prompt so its user message can be replaced with a
# version that asks for a short (2-4 word) answer.
original_text_qa_prompt = query_engine.get_prompts()['response_synthesizer:text_qa_template']
text_qa_prompt_user_msg_content = """Context information is below.
---------------------
{context_str}
---------------------
Given the context information and not prior knowledge, answer the query.
Query: {query_str}
Answer in 2-4 words: """
# NOTE(review): this assigns into the object returned by get_prompts(). The indexing assumes
# conditionals[0][1] is a chat template whose message_templates[1] is the user message —
# TODO confirm against the installed llama_index version. If the returned prompt is shared
# library state rather than a copy, the mutation also affects other engines — confirm.
original_text_qa_prompt.conditionals[0][1].message_templates[1].content = text_qa_prompt_user_msg_content
query_engine.update_prompts({'response_synthesizer:text_qa_template': original_text_qa_prompt})

In [35]:
# First hop: ask the first sub-question as-is.
hop1_question = sub_questions[0]
hop1_response = query_engine.query(hop1_question)
hop1_answer = hop1_response.response
# Second hop: substitute the first answer for the "#1" placeholder, then ask.
hop2_question = sub_questions[1].replace("#1", hop1_answer)
hop2_response = query_engine.query(hop2_question)
hop2_answer = hop2_response.response

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[1;3;32mExtracted keywords: ['"Do It Again"', 'performer', 'Again', 'Do', 'It', 'song']
[0m[1;3;34mKG context:
The following are knowledge sequence in max depth 2 in the form of directed graph like:
`subject -[predicate]->, object, <-[predicate_next_hop]-, object_next_hop ...`
('"Do It Again (Put Ya Hands Up)"', 'features', 'guest vocals')
('"Do It Again (Put Ya Hands Up)"', 'is', 'the lead single')
[0mINFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[1;3;32mExtracted keywords: ['record', 'label', 'lead', 'record label', 'lead single', 'single']
[0mINFO:llama_index.indices.knowledge_graph.retrievers:> Querying with idx: e9e14795-a671-4fe0-83f2-ee85359ff03d: # Do It Again (Put Ya Hands Up)
"Do It Again (Put Ya Hands Up)" is the lead s...
[1;3;34mKG context:
Th

In [36]:
# Reference decomposition: each sub-question followed by its gold answer, indented.
print(example['question'])
for qd in example['question_decomposition']:
    print(f"> {qd['question']}")
    print(f"     {qd['answer']}")

What is the record label of the Do It Again performer?
> Do It Again >> performer
     Jay-Z
> #1 >> record label
     Roc-A-Fella Records


In [37]:
# Predicted chain: each asked sub-question followed by the engine's answer, indented.
print(example['question'])
for asked, answered in ((hop1_question, hop1_answer), (hop2_question, hop2_answer)):
    print('>', asked)
    print(' ' * 4, answered)

What is the record label of the Do It Again performer?
> Who is the performer of the song "Do It Again"?
     the lead single
> What is the record label of the lead single?
     Rockwilder


In [38]:
# Print the sample with its full context again (same call as in the sample-selection cell),
# so the gold chain can be read next to the predicted answers above.
present_example(example, show_context=True)

What is the record label of the Do It Again performer?
   Q: Do It Again >> performer
   P.16: "Do It Again (Put Ya Hands Up)" is the lead single from rapper Jay-Z's fourth album "Vol. 3... Life and Times of S. Carter". The song features production by Rockwilder, including guest vocals by Amil and Beanie Sigel.
   A: Jay-Z
   Q: #1 >> record label
   P.1: Christión was a male duo featuring brothers Kenni Ski and Allen Anthony, the first R&B act to be signed to Jay-Z's Roc-A-Fella Records. They released their single "Full of Smoke" on Roc-A-Fella in 1996, reaching #53 on the Hot 100 and #15 on the R&B chart.
   A: Roc-A-Fella Records

All paragraphs
The Opening is a live album by American jazz pianist Mal Waldron featuring a performance recorded in Paris in 1970 and released on the French Futura label.
Christión was a male duo featuring brothers Kenni Ski and Allen Anthony, the first R&B act to be signed to Jay-Z's Roc-A-Fella Records. They released their single "Full of Smoke" on Roc-A