# Knowledge Graph Index

In [1]:
# OpenAI-compatible client settings (placeholder key and custom API base)
import os
import time

import openai
# Publish the placeholder key and the API base through the environment first,
# then mirror the same two values onto the `openai` module so both agree.
os.environ.update(
    OPENAI_API_KEY="EMPTY",
    OPENAI_API_BASE="http://10.0.0.222:30307/v1",
)
openai.api_key = os.environ['OPENAI_API_KEY']
openai.api_base = os.environ['OPENAI_API_BASE']

# Model name handed to the LLM client further down; the commented lines are
# the other candidates, kept for switching between runs.
#model = "Writer/camel-5b-hf"
#model = "mosaicml/mpt-7b-instruct"
model = "mosaicml/mpt-30b-instruct"

In [2]:
import logging
import sys

# Extend the import search path with the parent directory
# (presumably so the local `kron` package can be imported — confirm).
sys.path.append('../')

# Emit log records at INFO level and above on stdout.
logging.basicConfig(level=logging.INFO, stream=sys.stdout)

## Using Knowledge Graph

#### Building the Knowledge Graph

In [3]:

from llama_index.storage.storage_context import StorageContext
from llama_index import SimpleDirectoryReader, ServiceContext
from llama_index.graph_stores import SimpleGraphStore 
from llama_index.langchain_helpers.text_splitter import SentenceSplitter
from llama_index.node_parser import SimpleNodeParser
from llama_index.prompts.base import Prompt
from llama_index.prompts.prompt_type import PromptType


import tiktoken

from kron.llm_predictor.KronOpenAILLM import OpenAI
from kron.llm_predictor.KronLLMPredictor import KronLLMPredictor
from kron.indices.knowledge_graph.KronKnowledgeGraphIndex import KronKnowledgeGraphIndex 
from kron.prompts.kg_prompts import KRON_KG_TRIPLET_EXTRACT_PROMPT

INFO:numexpr.utils:NumExpr defaulting to 8 threads.


In [4]:
# Writer/camel sends "<|endoftext|>" back, so the tokenizer installed into
# llama_index has to accept that special token.
from llama_index.utils import globals_helper

enc = tiktoken.get_encoding("gpt2")


def tokenizer(text):
    """Encode `text` with the "gpt2" encoding, allowing the "<|endoftext|>" special token."""
    return enc.encode(text, allowed_special={"<|endoftext|>"})


# Replace the tokenizer held by llama_index's global helper.
globals_helper._tokenizer = tokenizer

In [5]:
# Load the source documents from the local data/ directory.
reader = SimpleDirectoryReader('data/')
documents = reader.load_data()

In [6]:
# Few-shot triplet-extraction prompt with four worked examples.
# Placeholders filled at format time: {max_knowledge_triplets} and {text}.
CAMEL_INLINE_KG_PROMPT_TEMPLATE = (
    # --- preamble ---
    "Below is an instruction that describes a task, "
    "paired with an input that provides further context. "
    "Write a response that appropriately completes the request.\n\n"
    # --- task statement ---
    "### Instruction:\n"
    "Some text is provided below. Given the text, "
    "extract up to {max_knowledge_triplets} knowledge triplets "
    "in the form of (subject, predicate, object). \n\n"
    # --- example 1 ---
    "### Input: \n"
    "Text: Alice is Bob's mother. \n"
    "Triplets: \n"
    "    (Alice, is mother of, Bob) \n"
    # --- example 2 ---
    "Text: Philz is a coffee shop founded in Berkeley in 1982. \n"
    "Triplets: \n"
    "    (Philz, is, coffee shop) \n"
    "    (Philz, founded in, Berkeley) \n"
    "    (Philz, founded in, 1982) \n"
    # --- example 3 ---
    "Text: This small and colorful book is intended for children. "
    "It was named after the Moon, and was gifted to Jack. \n"
    "Triplets: \n"
    "    (book, intended for, children)\n"
    "    (book, is, small) \n"
    "    (book, is, colorful) \n"
    "    (book, named after, Moon) \n"
    "    (book, gifted to, Jack) \n"
    # --- example 4 ---
    "Text: Nick saw a few dwellings, brightly painted cottages, shining in the sun. "
    "They were not ready for guests. \n"
    "Triplets: \n"
    "    (dwellings, are, cottages) \n"
    "    (dwellings, shine in, sun) \n"
    "    (dwellings, not ready for, guests) \n"
    "    (dwellings, seen by, Nick) \n"
    "    (dwellings, are, a few) \n"
    "    (cottages, are, brightly painted) \n"
    # --- the text to process and the cue for the model's answer ---
    "\n### Text: {text} \n"
    "\n### Triplets:"
)

# Wrap the raw template string in a Prompt tagged as a
# knowledge-triplet-extraction prompt.
CAMEL_INLINE_KG_TRIPLET_EXTRACT_PROMPT = Prompt(
    CAMEL_INLINE_KG_PROMPT_TEMPLATE,
    prompt_type=PromptType.KNOWLEDGE_TRIPLET_EXTRACT,
)

In [7]:
# Shorter few-shot triplet-extraction prompt: two worked examples and a
# "### Response:" cue. Placeholders: {max_knowledge_triplets} and {text}.
MPT_SHORT_INLINE_KG_PROMPT_TEMPLATE = (
    # --- preamble ---
    "Below is an instruction that describes a task, "
    "paired with an input that provides further context. "
    "Write a response that appropriately completes the request.\n\n"
    # --- task statement (the two spaces after the placeholder are in the original) ---
    "### Instruction:\n"
    "Some text is provided below. Given the text, "
    "extract up to {max_knowledge_triplets}  knowledge triplets "
    "in the form of (subject, predicate, object). \n\n"
    # --- example 1 ---
    "### Input: \n"
    "Text: Alice is Bob's mother. \n"
    "Triplets: \n"
    "    (Alice, is mother of, Bob) \n"
    # --- example 2 ---
    "Text: Philz is a coffee shop founded in Berkeley in 1982. \n"
    "Triplets: \n"
    "    (Philz, is, coffee shop) \n"
    "    (Philz, founded in, Berkeley) \n"
    "    (Philz, founded in, 1982) \n"
    # --- the text to process and the cue for the model's answer ---
    "### Text: {text} \n\n"
    "### Response:"
)

# Wrap the short template string in a Prompt tagged as a
# knowledge-triplet-extraction prompt.
MPT_SHORT_INLINE_KG_TRIPLET_EXTRACT_PROMPT = Prompt(
    MPT_SHORT_INLINE_KG_PROMPT_TEMPLATE,
    prompt_type=PromptType.KNOWLEDGE_TRIPLET_EXTRACT,
)

In [8]:
# Longer few-shot triplet-extraction prompt: four worked examples plus an
# "Avoid duplicates." instruction. Placeholders: {max_knowledge_triplets} and {text}.
MPT_LONG_INLINE_KG_PROMPT_TEMPLATE = (
    # --- preamble ---
    "Below is an instruction that describes a task, "
    "paired with an input that provides further context. "
    "Write a response that appropriately completes the request.\n\n"
    # --- task statement ---
    "### Instruction:\n"
    "Some text is provided below. Given the text, "
    "extract up to {max_knowledge_triplets} knowledge triplets "
    "in the form of (subject, predicate, object). Avoid duplicates. \n\n"
    # --- example 1 ---
    "### Input: \n"
    "Text: Alice is Bob's mother. \n"
    "Triplets: \n"
    "    (Alice, is mother of, Bob) \n"
    # --- example 2 ---
    "Text: Philz is a coffee shop founded in Berkeley in 1982. \n"
    "Triplets: \n"
    "    (Philz, is, coffee shop) \n"
    "    (Philz, founded in, Berkeley) \n"
    "    (Philz, founded in, 1982) \n"
    # --- example 3 ---
    "Text: This small and colorful book is intended for children. "
    "It was named after the Moon, and was gifted to Jack. \n"
    "Triplets: \n"
    "    (book, intended for, children)\n"
    "    (book, is, small) \n"
    "    (book, is, colorful) \n"
    "    (book, named after, Moon) \n"
    "    (book, gifted to, Jack) \n"
    # --- example 4 ---
    "Text: Nick saw a few dwellings, brightly painted cottages, shining in the sun. "
    "They were not ready for guests. \n"
    "Triplets: \n"
    "    (dwellings, are, cottages) \n"
    "    (dwellings, shine in, sun) \n"
    "    (dwellings, not ready for, guests) \n"
    "    (dwellings, seen by, Nick) \n"
    "    (dwellings, are, a few) \n"
    "    (cottages, are, brightly painted) \n"
    # --- the text to process and the cue for the model's answer ---
    "### Text: {text} \n"
    "### Triplets:"
)

# Wrap the long template string in a Prompt tagged as a
# knowledge-triplet-extraction prompt.
MPT_LONG_INLINE_KG_TRIPLET_EXTRACT_PROMPT = Prompt(
    MPT_LONG_INLINE_KG_PROMPT_TEMPLATE,
    prompt_type=PromptType.KNOWLEDGE_TRIPLET_EXTRACT,
)

In [9]:
# Baseline settings; they stay in effect when `model` matches none of the
# substrings tested below.
max_triplets, chunk_size, chunk_overlap = 2, 192, 48
prompt, max_tokens = None, 274

# Per-model overrides; branches are tested in order and the first match wins.
if 'mpt-30b' in model:
    # Alternative kept from earlier runs:
    #   prompt = MPT_SHORT_INLINE_KG_TRIPLET_EXTRACT_PROMPT, max_tokens = 320
    prompt, max_tokens = MPT_LONG_INLINE_KG_TRIPLET_EXTRACT_PROMPT, 510
    max_triplets = 3
    chunk_size, chunk_overlap = 128, 32  # 160 is noted as an alternative chunk_size
elif 'camel-5b' in model:
    # camel generates more tokens from the same text than mpt
    prompt, max_tokens = CAMEL_INLINE_KG_TRIPLET_EXTRACT_PROMPT, 704
elif 'mpt-7b' in model:
    prompt, max_tokens = CAMEL_INLINE_KG_TRIPLET_EXTRACT_PROMPT, 560

# Show which prompt object was selected (None when no branch matched).
print(prompt)

<llama_index.prompts.base.Prompt object at 0x7f545a6b5640>


In [10]:
# LLM client: the kron OpenAI wrapper, which supports local models.
llm = OpenAI(model=model, temperature=0.01)

# Sizing rule from the original author: chunk_size + prompt length + expected
# length of the returned triplets must be less than max_tokens.
# NOTE(review): the recorded metadata output reports num_output equal to
# max_tokens, so this budget may really be against the context window — confirm.
llm.max_tokens = max_tokens

llm_predictor = KronLLMPredictor(llm)
print(llm_predictor.metadata)

context_window=2048 num_output=510 is_chat_model=False


In [11]:
# Text splitter configured with the chunk size and overlap chosen for the model.
text_splitter = SentenceSplitter(
    chunk_overlap=chunk_overlap,
    chunk_size=chunk_size,
)

In [12]:
# Node parser that uses the splitter configured above.
node_parser = SimpleNodeParser(
    text_splitter=text_splitter,
)

In [13]:
# Service context bundling the node parser and the LLM predictor.
service_context = ServiceContext.from_defaults(
    node_parser=node_parser,
    llm_predictor=llm_predictor,
)

In [14]:

# Graph store and the storage context the index is built into.
graph_store = SimpleGraphStore()
storage_context = StorageContext.from_defaults(graph_store=graph_store)

# NOTE: can take a while! (the recorded run below took about 54,661 s, ~15 hours)
# Elapsed time is measured with time.perf_counter(), a monotonic clock, so a
# system clock adjustment during a long build cannot distort the duration the
# way wall-clock time.time() could.
start = time.perf_counter()
index = KronKnowledgeGraphIndex.from_documents(
    documents,
    max_triplets_per_chunk=max_triplets,
    storage_context=storage_context,
    service_context=service_context,
    kg_triple_extract_template=prompt,
)
end = time.perf_counter()
print(f"Knowledge Graph built in: {end-start}s")

Processing nodes:   0%|          | 0/189 [00:00<?, ?it/s]

Knowledge Graph built in: 54661.5436360836s


In [15]:
# Persist the index under storage/<model name with "/" replaced by "-">-long-inline.
safe_model_name = model.replace('/', '-')
persist_path = f"storage/{safe_model_name}-long-inline"
index.storage_context.persist(persist_dir=persist_path)

In [16]:
## Render the knowledge graph to an HTML page with pyvis
from pyvis.network import Network

# limit=5000 is meant to be high enough to display all nodes (per the original note).
g = index.get_networkx_graph(limit=5000)

net = Network(
    notebook=True,
    directed=True,
    cdn_resources="in_line",
    height='1000px',
    width='100%',
)
net.from_nx(g)
#net.show_buttons(filter_=True)

# Output file name: <model name with "/" replaced by "-">-long-inline-kg-prompt.html
html_name = f"{model.replace('/', '-')}-long-inline-kg-prompt.html"
# Kept as the last expression so the cell still displays its return value.
net.show(html_name)


mosaicml-mpt-30b-instruct-long-inline-kg-prompt.html
