# Knowledge Graph Index

In [1]:
import os
import time

import openai
# Point both the process environment and the openai module at the same
# OpenAI-compatible endpoint. Each literal is bound once so the two places
# that need it cannot drift apart.
# NOTE(review): "EMPTY" looks like a placeholder key for a server that does
# not check credentials - confirm against the endpoint's configuration.
api_key = "EMPTY"
api_base = "http://10.0.0.222:30307/v1"

openai.api_key = api_key
openai.api_base = api_base
os.environ['OPENAI_API_KEY'] = api_key
os.environ['OPENAI_API_BASE'] = api_base

# Model identifier used for the LLM and for naming the output artifacts.
model = "Writer/camel-5b-hf"
# Alternatives that were tried:
#model = "mosaicml/mpt-7b-instruct"
#model = "mosaicml/mpt-30b-instruct"

In [2]:
import logging
import sys

# Make the kron extensions to llama_index (OpenAI-compatible API support)
# importable. The membership check keeps the cell idempotent: re-running it
# no longer appends a duplicate entry to sys.path each time.
if '../llama_index/' not in sys.path:
    sys.path.append('../llama_index/')

# Send log records at INFO level and above to stdout.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

## Using Knowledge Graph

#### Building the Knowledge Graph

In [3]:

from llama_index.storage.storage_context import StorageContext
from llama_index import SimpleDirectoryReader, ServiceContext
from llama_index.graph_stores import SimpleGraphStore 
from llama_index.langchain_helpers.text_splitter import SentenceSplitter
from llama_index.node_parser import SimpleNodeParser

import tiktoken

#extensions to llama_index to support openai compatible endpoints, e.g. llama-api
from kron.llm_predictor.KronOpenAILLM import OpenAI
from kron.llm_predictor.KronLLMPredictor import KronLLMPredictor
from kron.indices.knowledge_graph.KronKnowledgeGraphIndex import KronKnowledgeGraphIndex 
from kron.prompts.kg_prompts import KRON_KG_TRIPLET_EXTRACT_PROMPT

INFO:numexpr.utils:NumExpr defaulting to 4 threads.


In [4]:
# Writer/camel uses <|endoftext|>, so the GPT-2 encoding is told to accept it
# as a special token, and the resulting callable replaces the tokenizer held
# by llama_index's globals_helper.
from llama_index.utils import globals_helper

enc = tiktoken.get_encoding("gpt2")


def tokenizer(text):
    """Encode ``text`` with the GPT-2 encoding, allowing ``<|endoftext|>``."""
    return enc.encode(text, allowed_special={"<|endoftext|>"})


globals_helper._tokenizer = tokenizer

In [5]:
# Root directory of the corpus. The original absolute path stays as the
# default, but it can now be overridden by exporting CORPUS_BASE, so the
# notebook runs on other machines without editing this cell.
# A trailing '/' on the override is stripped to avoid a '//' in TXT_BASE.
CORPUS_BASE = os.environ.get(
    'CORPUS_BASE',
    '/home/arylwen/datasets/documents/ArxivHealthcareNLP',
).rstrip('/')

# Directory of text files that gets passed to SimpleDirectoryReader.
TXT_BASE = f'{CORPUS_BASE}/text_cleaned/'
#TXT_BASE = '/home/arylwen/datasets/documents/text_cleaned/'

In [6]:
# Load the corpus found under TXT_BASE.
directory_reader = SimpleDirectoryReader(TXT_BASE)
documents = directory_reader.load_data()

In [7]:
# Define the LLM: near-deterministic sampling against the selected model.
llm = OpenAI(
    model=model,
    temperature=0.01,
)

# Sizing constraint (from the original author): chunk_size + prompt_length +
# expected length of the returned triples must be less than max_tokens.
# Some sentences can be really long, and the text splitter will then enter an
# infinite loop.
# The trailing "a-b" notes appear to record the chunk_size-chunk_overlap pair
# each value was used with - TODO confirm.
#llm.max_tokens = 274 #128-32
#llm.max_tokens = 400 #256-64
llm.max_tokens = 384 #192-48

# Wrap the LLM in the kron predictor and show the metadata it reports.
llm_predictor = KronLLMPredictor(llm)
print(llm_predictor.metadata)

context_window=2048 num_output=384 is_chat_model=False


In [8]:
# Define the TextSplitter: 192-token chunks with a 48-token overlap, using a
# single newline as the paragraph separator.
# Previously tried: SentenceSplitter(chunk_size=128, chunk_overlap=32)
text_splitter = SentenceSplitter(
    chunk_size=192,
    chunk_overlap=48,
    paragraph_separator='\n',
)


In [9]:
# Define the NodeParser, built on the sentence splitter configured above.
node_parser = SimpleNodeParser(
    text_splitter=text_splitter,
)

In [10]:
# Define the ServiceContext that bundles the kron LLM predictor with the
# custom node parser; everything else is left at from_defaults() defaults.
service_context = ServiceContext.from_defaults(
    llm_predictor=llm_predictor,
    node_parser=node_parser,
)

In [11]:

# Graph store that receives the extracted triplets, wrapped in a storage context.
graph_store = SimpleGraphStore()
storage_context = StorageContext.from_defaults(graph_store=graph_store)

# NOTE: can take a while!

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time. time.time() follows the wall clock and can jump (NTP or manual
# adjustment) during a long build, which would give a wrong duration.
start = time.perf_counter()
index = KronKnowledgeGraphIndex.from_documents(
    documents,
    max_triplets_per_chunk=2,
    storage_context=storage_context,
    service_context=service_context,
)
end = time.perf_counter()
print(f"Knowledge Graph built in: {end-start}s")

**********splits:1
**********splits:201
**********splits:377
**********splits:192
**********splits:104
**********splits:97
**********splits:64
**********splits:93
**********splits:164
**********splits:253
**********splits:207
**********splits:29
**********splits:181
**********splits:49
**********splits:117
**********splits:509
**********splits:142
**********splits:160
**********splits:84
**********splits:112
**********splits:101
**********splits:52
**********splits:205
**********splits:147
**********splits:287
**********splits:170
**********splits:107
**********splits:72
**********splits:94
**********splits:170
**********splits:1
**********splits:113
**********splits:117
**********splits:1
**********splits:2
**********splits:1
**********splits:75
**********splits:267
**********splits:1
**********splits:125
**********splits:1
**********splits:65
**********splits:229
**********splits:172
**********splits:803
**********splits:504
**********splits:159
**********splits:82
**********splits:4

Processing nodes:   0%|          | 0/16754 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Persist the index's storage context under a per-model directory; '/' in the
# model id is replaced with '-' so it does not introduce an extra path level.
persist_path = f"storage/{model.replace('/', '-')}-default"
index_storage = index.storage_context
index_storage.persist(persist_dir=persist_path)

In [None]:
## create graph
# Export the index as a networkx graph (limit=1000 is passed through to
# get_networkx_graph) and render it as a directed pyvis network in an HTML
# file named after the model.
from pyvis.network import Network

html_name = f"{model.replace('/', '-')}-default-kg-prompt.html"

g = index.get_networkx_graph(limit=1000)
net = Network(
    height='800px',
    width='100%',
    notebook=True,
    cdn_resources="in_line",
    directed=True,
)
net.from_nx(g)
#net.show_buttons(filter_=True)
net.show(html_name)