In [11]:
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment
# (presumably the OpenAI API key needed by the LLM calls below — TODO confirm).
load_dotenv()

True

In [1]:
import os
import json
from pathlib import Path

import numpy as np
import pandas as pd
import torch

In [3]:
# Root directory under which this notebook writes its per-sample artifacts
# (relative path, resolved against the notebook's working directory).
DATA_DIR = Path('../data/musique/')

## Dataset

In [4]:
from datasets import load_dataset, DatasetDict

# Load the 'answerable' configuration of the 'bdsaglam/musique' dataset.
raw_dsd = load_dataset('bdsaglam/musique', 'answerable')
# Bare last expression so the notebook displays the split summary.
raw_dsd

DatasetDict({
    train: Dataset({
        features: ['id', 'paragraphs', 'question', 'question_decomposition', 'answer', 'answerable'],
        num_rows: 19938
    })
    validation: Dataset({
        features: ['id', 'paragraphs', 'question', 'question_decomposition', 'answer', 'answerable'],
        num_rows: 2417
    })
})

In [5]:
# Top-level fields of a single training example.
raw_dsd['train'][0].keys()

dict_keys(['id', 'paragraphs', 'question', 'question_decomposition', 'answer', 'answerable'])

In [6]:
# 'paragraphs' is a single dict of columns (idx, title, paragraph_text, is_supporting).
raw_dsd['train'][0]['paragraphs'].keys()

dict_keys(['idx', 'title', 'paragraph_text', 'is_supporting'])

In [7]:
# Text of the first paragraph attached to the first training example.
raw_dsd['train'][0]['paragraphs']['paragraph_text'][0]

'Pakistan Super League (Urdu: پاکستان سپر لیگ \u202c \u200e; PSL) is a Twenty20 cricket league, founded in Lahore on 9 September 2015 with five teams and now comprises six teams. Instead of operating as an association of independently owned teams, the league is a single entity in which each franchise is owned and controlled by investors.'

In [8]:
# Work on a small subset: keep only the first `n` examples of each split.
n = 100
dsd = DatasetDict({
    split_name: raw_dsd[split_name].select(range(n))
    for split_name in ('train', 'validation')
})
dsd

DatasetDict({
    train: Dataset({
        features: ['id', 'paragraphs', 'question', 'question_decomposition', 'answer', 'answerable'],
        num_rows: 100
    })
    validation: Dataset({
        features: ['id', 'paragraphs', 'question', 'question_decomposition', 'answer', 'answerable'],
        num_rows: 100
    })
})

In [9]:
def present_example(example, show_context=False):
    """Print a question together with its decomposition.

    For every hop the sub-question, the paragraph it is supported by and the
    hop answer are printed, indented under the original question.

    Args:
        example: A dataset row with the keys 'question', 'paragraphs'
            (holding a 'paragraph_text' list) and 'question_decomposition'
            (holding the parallel lists 'question', 'answer' and
            'paragraph_support_idx').
        show_context: When true, all paragraph texts of the example are
            printed after the decomposition.
    """
    print(example['question'])
    paragraph_texts = example['paragraphs']['paragraph_text']
    decomposition = example['question_decomposition']
    hops = zip(
        decomposition['question'],
        decomposition['answer'],
        decomposition['paragraph_support_idx'],
    )
    for sub_question, sub_answer, support_idx in hops:
        # The supporting index is used as a position into the paragraph list.
        supporting_text = paragraph_texts[support_idx]
        print('  ', f'Q: {sub_question}')
        print('  ', f"P.{support_idx}:", supporting_text)
        print('  ', f'A: {sub_answer}')
    if not show_context:
        return
    print("\nAll paragraphs")
    print('\n'.join(paragraph_texts))

### Question decomposition

In [10]:
import langchain
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.schema import (
    AIMessage,
    HumanMessage,
    SystemMessage
)
from langchain.prompts import PromptTemplate
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)

from langchain.cache import InMemoryCache

# Install an in-memory LLM cache on the langchain module; it lives only as long
# as the kernel, so a restart discards every cached completion.
langchain.llm_cache = InMemoryCache()

In [43]:
# System prompt for question decomposition: one instruction paragraph followed
# by two few-shot examples. Each example answers with a "Sub-questions:" header
# and a numbered list, which is the layout parse_sub_questions looks for.
# The second sub-question refers to the first answer through the `#1` placeholder.
# NOTE(review): the few-shot wording is awkward in places ("birthed",
# "When #1 founded?", "Which institute does own"); rewording would change what
# the model sees, so the text is deliberately left as is here.
QD_PROMPT_TEMPLATE = """
Decompose the given question into 2 sub-questions such that when sub-questions are answered, the original question can be answered correctly.
The second subquestion must refer to the answer of the first question by `#1` as in the examples below. Do not create open-ended sub-questions like "Who is ..." or "How is ...".

Question: What year saw the creation of the region where the county of Hertfordshire is located?
Sub-questions:
1. In which state is Hertfordshire located?
2. When was #1 birthed?

Question: When was the institute that owned The Collegian founded?
Sub-questions:
1. Which institute does own The Collegian?
2. When #1 founded?

""".strip()


def parse_sub_questions(output: str):
    """Yield the sub-questions listed after a "Sub-questions" header.

    Lines before the header are ignored. After the header, every non-blank
    line is yielded with a leading "<number>." enumerator removed.

    Compared with a plain ``split('.', 1)`` this parser
    * skips blank lines instead of yielding empty strings,
    * accepts a header that is preceded by whitespace, and
    * only strips the text before the first dot when it is a number, so a
      line without an enumerator that contains a dot (e.g. "U.S.") is kept
      intact.

    Args:
        output: Raw text produced by the decomposition model.

    Yields:
        str: One cleaned sub-question per listed line, in order.
    """
    header_seen = False
    for raw_line in output.splitlines():
        line = raw_line.strip()
        if line.lower().startswith('sub-questions'):
            # Header lines are never part of the result, even if repeated.
            header_seen = True
            continue
        if not header_seen or not line:
            continue
        enumerator, dot, remainder = line.partition('.')
        if dot and enumerator.strip().isdigit():
            line = remainder.strip()
        if line:
            yield line


def make_question_decomposer(model: str = 'gpt-3.5-turbo', temperature: float = 0):
    """Build a callable that splits a question into sub-questions with a chat LLM.

    The chat prompt consists of QD_PROMPT_TEMPLATE as the system message and
    the question as the human message.

    Args:
        model: Model name handed to ChatOpenAI.
        temperature: Sampling temperature handed to ChatOpenAI.

    Returns:
        A function taking ``question`` and returning the list of sub-questions
        parsed from the model output by parse_sub_questions.
    """
    prompt = ChatPromptTemplate.from_messages([
        SystemMessagePromptTemplate.from_template(QD_PROMPT_TEMPLATE),
        HumanMessagePromptTemplate.from_template("Question: {question}"),
    ])
    decomposition_chain = LLMChain(
        llm=ChatOpenAI(temperature=temperature, model=model),
        prompt=prompt,
    )

    def decompose(question):
        raw_output = decomposition_chain.run(question=question)
        return [sub_question for sub_question in parse_sub_questions(raw_output)]

    return decompose

# Decomposer built with the factory defaults (model 'gpt-3.5-turbo', temperature 0).
qdecomposer = make_question_decomposer()

### Question answering with KG

In [44]:
from llama_index import Document

def make_docs(example, only_supporting=False):
    """Yield one Document per paragraph of a dataset example.

    The document text is the paragraph title as a '#' heading followed by the
    paragraph body; the paragraph index and its supporting flag are stored as
    metadata.

    Args:
        example: A dataset row whose 'paragraphs' entry holds the parallel
            lists 'idx', 'title', 'paragraph_text' and 'is_supporting'.
        only_supporting: When true, paragraphs whose supporting flag is falsy
            are left out.

    Yields:
        Document: One document per retained paragraph, in dataset order.
    """
    paragraphs = example['paragraphs']
    columns = (
        paragraphs['idx'],
        paragraphs['title'],
        paragraphs['paragraph_text'],
        paragraphs['is_supporting'],
    )
    for para_idx, heading, content, supporting in zip(*columns):
        if supporting or not only_supporting:
            yield Document(
                text=f"# {heading}\n{content}",
                metadata={'idx': para_idx, 'is_supporting': supporting},
            )

In [45]:
import logging
import sys
from langchain.embeddings import HuggingFaceEmbeddings
from llama_index.llms import OpenAI
from llama_index import ServiceContext
from llama_index import LangchainEmbedding

# Route log records of level INFO and above to stdout so they appear in the cell output.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

# define the embedding model: a LangChain HuggingFaceEmbeddings instance (default
# model) wrapped in LangchainEmbedding, embedding one text per batch
embedding_llm = LangchainEmbedding(
    HuggingFaceEmbeddings(),
    embed_batch_size=1,
)
# define LLM
llm = OpenAI(temperature=0, model="gpt-3.5-turbo")
# Bundle the LLM, the embedding model and the chunk size; this context is passed to the index below.
service_context = ServiceContext.from_defaults(llm=llm, chunk_size=512, embed_model=embedding_llm)

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2
INFO:sentence_transformers.SentenceTransformer:Use pytorch device: cpu


In [46]:
import kuzu

from llama_index.graph_stores import KuzuGraphStore
from llama_index import KnowledgeGraphIndex
from llama_index.llms import OpenAI
from llama_index.storage.storage_context import StorageContext
from IPython.display import Markdown, display

In [179]:
# Position (within the reduced training split) of the example studied in the rest of the notebook.
SAMPLE_IDX = 18
example = dsd['train'][SAMPLE_IDX]

In [180]:
# Every artifact derived from the selected example is stored in its own directory.
SAMPLE_DIR = DATA_DIR / f"sample-{SAMPLE_IDX}"
SAMPLE_DIR.mkdir(exist_ok=True, parents=True)
# ensure_ascii=False writes non-ASCII characters verbatim, so the file is opened
# with an explicit UTF-8 encoding instead of the platform-dependent default.
with open(SAMPLE_DIR / "example.json", 'w', encoding='utf-8') as f:
    json.dump(example, f, ensure_ascii=False, indent=2)

present_example(example, show_context=True)

When did the country formerly known as Zaire become independent?
   Q: zaire is the former name of which african nation
   P.14: Zaire (/ zɑːˈɪər /), officially the Republic of Zaire (French: République du Zaïre; French pronunciation: ​ (za. iʁ)), was the name for the Democratic Republic of the Congo that existed between 1971 and 1997 in Central Africa. The country was a one - party totalitarian dictatorship, run by Mobutu Sese Seko and his ruling Popular Movement of the Revolution party. Zaire was established following Mobutu's seizure of power in a military coup in 1965, following five years of political upheaval following independence known as the Congo Crisis. Zaire had a strongly centralist constitution and foreign assets were nationalised. The period is sometimes referred to as the Second Congolese Republic.
   A: the Democratic Republic of the Congo
   Q: when did #1 become independent
   P.8: After an uprising by the Congolese people, Belgium surrendered to the independence of 

In [181]:
# Decompose the question with the LLM and store the result next to the example.
subquestions = qdecomposer(question=example['question'])

decomp = {
    'question': example['question'],
    'subquestions': subquestions,
}
# ensure_ascii=False writes non-ASCII characters verbatim, so the file is opened
# with an explicit UTF-8 encoding instead of the platform-dependent default.
with open(SAMPLE_DIR / 'question-decomp.json', 'w', encoding='utf-8') as f:
    json.dump(decomp, f, ensure_ascii=False, indent=2)


# Show the dataset's reference decomposition next to the predicted one.
print("Groundtruth")
print(example['question_decomposition']['question'])
print()
print("Prediction")
print(example['question'])
for q in subquestions:
    print('  ' + q)


Groundtruth
['zaire is the former name of which african nation', 'when did #1 become independent']

Prediction
When did the country formerly known as Zaire become independent?
  What is the current name of the country formerly known as Zaire?
  When did #1 gain independence?


In [182]:
# Hand-crafted
# Question-aware prompt for knowledge-triplet extraction. Placeholders:
#   {max_knowledge_triplets} - upper bound on the number of triplets to extract
#   {question}               - the question the triplets should be relevant to
#   {text}                   - the text to extract triplets from
# One worked example is included between the dashed separators.

KG_TRIPLET_EXTRACT_TMPL = """
Some text is provided below. Given the text, extract up to {max_knowledge_triplets}  knowledge triplets in the form of (subject, predicate, object) that might be relevant to the following question.
Prioritize triplets that:
1. Offer temporal information like 'founded in,' 'created on,' 'abolished in,' etc.
2. Provide spatial details such as 'located in,' 'borders,' 'from,' etc.
3. Show ownership or affiliation via terms like 'owned by,' 'affiliated with,' 'publisher of,' etc.
4. Offer identification or categorization like 'is,' 'are,' 'was,' etc.
Avoid stopwords.
---------------------
Example:
Question: When was the institute that owned The Collegian founded?
Text: The Collegian is the bi-weekly official student publication of Houston Baptist University in Houston, Texas.
Triplets:
(The Collegian, is, bi-weekly official student publication)
(The Collegian, owned by, Houston Baptist University)
(Houston Baptist University, in, Houston)
(Houston, in, Texas)
---------------------
Question: {question}
Text: {text}
Triplets:
""".strip()

In [183]:
# Show the raw template text with its placeholders still unfilled.
print(KG_TRIPLET_EXTRACT_TMPL)

Some text is provided below. Given the text, extract up to {max_knowledge_triplets}  knowledge triplets in the form of (subject, predicate, object) that might be relevant to the following question.
Prioritize triplets that:
1. Offer temporal information like 'founded in,' 'created on,' 'abolished in,' etc.
2. Provide spatial details such as 'located in,' 'borders,' 'from,' etc.
3. Show ownership or affiliation via terms like 'owned by,' 'affiliated with,' 'publisher of,' etc.
4. Offer identification or categorization like 'is,' 'are,' 'was,' etc.
Avoid stopwords.
---------------------
Example:
Question: When was the institute that owned The Collegian founded?
Text: The Collegian is the bi-weekly official student publication of Houston Baptist University in Houston, Texas.
Triplets:
(The Collegian, is, bi-weekly official student publication)
(The Collegian, owned by, Houston Baptist University)
(Houston Baptist University, in, Houston)
(Houston, in, Texas)
---------------------
Question

In [184]:
from llama_index.prompts.base import Prompt
from llama_index.prompts.prompt_type import PromptType

# Wrap the hand-crafted template in a Prompt tagged as a knowledge-triplet-extraction prompt.
KG_TRIPLET_EXTRACT_PROMPT = Prompt(
    KG_TRIPLET_EXTRACT_TMPL, 
    prompt_type=PromptType.KNOWLEDGE_TRIPLET_EXTRACT,
)

def make_kg_extract_prompt(question: str):
    """Return the triplet-extraction prompt, bound to `question` when applicable.

    Args:
        question: The question the extracted triplets should be relevant to.

    Returns:
        KG_TRIPLET_EXTRACT_PROMPT with its `question` variable pre-filled if
        the template text contains a "{question}" placeholder; otherwise the
        shared KG_TRIPLET_EXTRACT_PROMPT object itself.
    """
    template_is_question_aware = "{question}" in KG_TRIPLET_EXTRACT_TMPL
    return (
        KG_TRIPLET_EXTRACT_PROMPT.partial_format(question=question)
        if template_is_question_aware
        else KG_TRIPLET_EXTRACT_PROMPT
    )

In [185]:
# The Kuzu database backing the knowledge graph lives inside the sample directory.
kg_db_path = SAMPLE_DIR / "kg"
db = kuzu.Database(str(kg_db_path))
graph_store = KuzuGraphStore(db)
storage_context = StorageContext.from_defaults(graph_store=graph_store)
# only_supporting=True: only paragraphs flagged as supporting are turned into documents.
documents = list(make_docs(example, only_supporting=True))
# NOTE(review): `False and ...` permanently disables the load branch, so the graph
# is rebuilt on every run. The existence check also runs after the database has
# been opened on the same path; if opening creates that path, the check would
# always pass once re-enabled — TODO confirm and evaluate it before `kuzu.Database(...)`.
if False and kg_db_path.exists():
    print("Loading the knowledge graph for the sample...")
    index = KnowledgeGraphIndex(
        storage_context=storage_context,
        service_context=service_context,
        include_embeddings=True,
    )
else:
    print("Building the knowledge graph for the sample...")
    # Triplets are extracted with the question-aware prompt for this example's question.
    index = KnowledgeGraphIndex.from_documents(
        documents,
        max_triplets_per_chunk=10,
        storage_context=storage_context,
        service_context=service_context,
        include_embeddings=True,
        kg_triple_extract_template=make_kg_extract_prompt(example['question'])
    )


Building the knowledge graph for the sample...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [186]:
from pyvis.network import Network

# Export the knowledge graph as a self-contained interactive HTML page in the sample directory.
g = index.get_networkx_graph()
net = Network(notebook=True, cdn_resources="in_line", directed=True)
net.from_nx(g)
# Plain string literal: the previous f-string had no placeholders.
net.save_graph(str(SAMPLE_DIR / "kg.html"))

In [187]:
# Answer-synthesis prompt: the model must reply with a single phrase taken from
# the retrieved context, or with N/A when the context does not hold the answer.
KG_QA_TMPL = """
Context information is below.
---------------------
{context_str}
---------------------
Given the context information and not prior knowledge, answer the question with one phrase, i.e. it shouldn't be a sentence. If the answer is not in the context, output N/A.
Question: {query_str}
Answer in one phrase: 
"""

# The prompt is handed over through the `text_qa_template` argument rather than
# by assigning to `query_engine._response_synthesizer._text_qa_template.prompt.template`.
# That in-place assignment reached into private attributes and rewrote a prompt
# object that outlives this query engine, so re-running the cell showed the
# already-modified text as the "default" template.
query_engine = index.as_query_engine(
    include_text=False,
    response_mode="simple_summarize",
    text_qa_template=Prompt(KG_QA_TMPL, prompt_type=PromptType.QUESTION_ANSWER),
)

print(KG_QA_TMPL)


Context information is below.
---------------------
{context_str}
---------------------
Given the context information and not prior knowledge, answer the question with one phrase, i.e. it shouldn't be a sentence. If the answer is not in the context, output N/A.
Question: {query_str}
Answer in one phrase: 



In [188]:
# Hop 1: answer the first predicted sub-question against the knowledge graph.
print(example['question'])
hop1_question = subquestions[0]
print('> ', hop1_question)
# Only the response text is kept; it is substituted into the second sub-question later.
hop1_answer = query_engine.query(hop1_question).response
print('< ', hop1_answer)

When did the country formerly known as Zaire become independent?
>  What is the current name of the country formerly known as Zaire?
INFO:llama_index.indices.knowledge_graph.retriever:> Starting query: What is the current name of the country formerly known as Zaire?
INFO:llama_index.indices.knowledge_graph.retriever:> Query keywords: ['formerly known', 'current name', 'country', 'name', 'current', 'Zaire', 'formerly', 'known']


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:llama_index.indices.knowledge_graph.retriever:> Extracted relationships: The following are knowledge sequence in max depth 2 in the form of `subject [predicate, object, predicate_next_hop, object_next_hop ...]`
Zaire ['run by', 'Mobutu Sese Seko', 'leader of', 'Popular Movement of the Revolution party']
Zaire ['renamed to', 'Zaire', 'sometimes referred to as', 'Second Congolese Republic']
Zaire ['renamed to', 'Zaire', 'established following', "Mobutu's seizure of power"]
Zaire ['following', 'five years of political upheaval', 'known as', 'Congo Crisis']
Zaire ['renamed to', 'Zaire', 'following', 'five years of political upheaval']
Zaire ['established following', "Mobutu's seizure of power", 'in', '1965']
Zaire ['renamed to', 'Zaire', 'officially known as', 'Republic of Zaire']
Zaire ['renamed to', 'Zaire', 'existed between', '1971 and 1997']
Zaire ['sometimes referred to as', 'Second Congolese Republic']
Zaire ['established following', "Mobutu's seizure of power"]
Zaire ['renamed 

In [189]:
# Hop 2: replace the `#1` placeholder with the hop-1 answer and query again.
# When hop 1 produced nothing usable (empty, or the N/A marker requested by the
# answer prompt), the second query is skipped: splicing a non-answer into the
# sub-question only yields a meaningless query.
hop1_entity = (hop1_answer or "").strip()
if not hop1_entity or hop1_entity.rstrip('.').upper() == "N/A":
    print('! ', "hop 1 returned no answer; skipping hop 2")
    hop2_question = None
    hop2_answer = "N/A"
else:
    hop2_question = subquestions[1].replace("#1", hop1_entity)
    print('> ', hop2_question)
    hop2_answer = query_engine.query(hop2_question).response
print('< ', hop2_answer)

>  When did The current name of the country formerly known as Zaire is not provided in the given context. I'm sorry, but I can't answer the question based on the provided information. gain independence?
INFO:llama_index.indices.knowledge_graph.retriever:> Starting query: When did The current name of the country formerly known as Zaire is not provided in the given context. I'm sorry, but I can't answer the question based on the provided information. gain independence?
INFO:llama_index.indices.knowledge_graph.retriever:> Query keywords: ['current name', 'independence', 'name', 'country', 'formerly', 'formerly known', 'current', 'Zaire', 'known']


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:llama_index.indices.knowledge_graph.retriever:> Extracted relationships: The following are knowledge sequence in max depth 2 in the form of `subject [predicate, object, predicate_next_hop, object_next_hop ...]`
Zaire ['run by', 'Mobutu Sese Seko', 'leader of', 'Popular Movement of the Revolution party']
Zaire ['renamed to', 'Zaire', 'sometimes referred to as', 'Second Congolese Republic']
Zaire ['renamed to', 'Zaire', 'established following', "Mobutu's seizure of power"]
Zaire ['following', 'five years of political upheaval', 'known as', 'Congo Crisis']
Zaire ['renamed to', 'Zaire', 'following', 'five years of political upheaval']
Zaire ['established following', "Mobutu's seizure of power", 'in', '1965']
Zaire ['renamed to', 'Zaire', 'officially known as', 'Republic of Zaire']
Zaire ['renamed to', 'Zaire', 'existed between', '1971 and 1997']
Zaire ['sometimes referred to as', 'Second Congolese Republic']
Zaire ['established following', "Mobutu's seizure of power"]
Zaire ['renamed 

In [190]:
# Reference answers of the dataset's own decomposition, for comparison with the hop answers above.
print(example['question_decomposition']['answer'])

['the Democratic Republic of the Congo', '1960']
