# Knowledge Graph Index 
## Default llama index prompt

In [2]:
import os
import time
import pathlib

import openai
# Credentials and base URL for the OpenAI-compatible endpoint. Each value is
# written once and then applied both to the process environment and to the
# `openai` module attributes.
# NOTE(review): the key literal reads like a placeholder, not a real secret - confirm before sharing.
api_key = "sk-48characterstofakeanopenaikey48charactersopenai0"
api_base = "http://10.0.0.222:30307/v1"

os.environ.update({'OPENAI_API_KEY': api_key, 'OPENAI_API_BASE': api_base})
openai.api_key, openai.api_base = api_key, api_base

# Model requested from the endpoint; the alternatives are kept for quick switching.
model = "Writer/camel-5b-hf"
#model = "mosaicml/mpt-7b-instruct"
#model = "mosaicml/mpt-30b-instruct"

In [3]:
import logging
import sys

# The kron extensions to llama_index (support for OpenAI-compatible APIs) are
# imported from a relative checkout, so that folder has to be on sys.path.
# The membership check keeps the cell idempotent: re-running it in the same
# kernel no longer appends a duplicate entry each time.
KRON_EXTENSIONS_PATH = '../llama-index/'
if KRON_EXTENSIONS_PATH not in sys.path:
    sys.path.append(KRON_EXTENSIONS_PATH)

# Send INFO-level log records to stdout so they show up in the cell output.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

## Using Knowledge Graph

#### Building the Knowledge Graph

In [4]:

from llama_index import StorageContext
from llama_index import SimpleDirectoryReader, ServiceContext
from llama_index import KnowledgeGraphIndex
from llama_index.graph_stores import SimpleGraphStore 
from llama_index import load_index_from_storage 
from llama_index.langchain_helpers.text_splitter import SentenceSplitter
from llama_index.node_parser import SimpleNodeParser

import tiktoken

#extensions to llama_index to support openai compatible endpoints, e.g. llama-api
from kron.llm_predictor.KronOpenAILLM import KronOpenAI
from kron.llm_predictor.KronLLMPredictor import KronLLMPredictor
#from kron.indices.knowledge_graph.KronKnowledgeGraphIndex import KronKnowledgeGraphIndex 
#from kron.prompts.kg_prompts import KRON_KG_TRIPLET_EXTRACT_PROMPT

INFO:numexpr.utils:NumExpr defaulting to 4 threads.




In [4]:
# Writer/camel uses the "<|endoftext|>" marker, so the tokenizer installed into
# llama_index has to accept it as an allowed special token.
from llama_index.utils import globals_helper

enc = tiktoken.get_encoding("gpt2")


def tokenizer(text):
    """Encode `text` with the gpt2 tiktoken encoding, permitting "<|endoftext|>"."""
    return enc.encode(text, allowed_special={"<|endoftext|>"})


# Replace the tokenizer that llama_index's globals_helper holds.
globals_helper._tokenizer = tokenizer

In [5]:
# Corpus layout: documents are read from TXT_BASE and moved to
# PROCESSED_TXT_BASE once they have been indexed.
CORPUS_BASE = '/home/arylwen/datasets/documents/ArxivHealthcareNLP'
TXT_BASE = f'{CORPUS_BASE}/text_cleaned/'
PROCESSED_TXT_BASE = f'{CORPUS_BASE}/text_cleaned_out/'

# '/' in the model id is replaced so the id can be used as a folder name.
model_slug = model.replace('/', '-')

# Folder the index is persisted to / loaded from.
persist_path = f"storage/{model_slug}-default"

# Folder for successive versions of the pyvis graph.
HTML_FOLDER = f"html/{model_slug}-default"
# exist_ok=True makes the cell safe to re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(HTML_FOLDER, exist_ok=True)

In [6]:
# Load every file found under TXT_BASE. filename_as_id=True is requested so
# that later cells can recover the source file name from each document's id_.
documents = SimpleDirectoryReader(
    TXT_BASE,
    filename_as_id=True,
).load_data()

In [7]:
# LLM definition.
# Stock alternative: llm = OpenAI(temperature=0.01, model=model)
#
# Budget rule: chunk_size + prompt_length + expected length of the returned
# triples must be less than max_tokens.
# Some sentences can be really long, in which case the text splitter enters
# an infinite loop.
# Values tried so far (trailing note as recorded with each value):
#   274  # 128-32
#   400  # 256-64
#   384  # 192-48  (current)
MAX_OUTPUT_TOKENS = 384

llm = KronOpenAI(model=model, temperature=0.01)
llm.max_tokens = MAX_OUTPUT_TOKENS

llm_predictor = KronLLMPredictor(llm)
print(llm_predictor.metadata)

context_window=2048 num_output=384 is_chat_model=False model_name='unknown'


In [8]:
# Text splitter: 192-token chunks with a 48-token overlap, treating a single
# newline as the paragraph separator.
# Previous setting: SentenceSplitter(chunk_size=128, chunk_overlap=32)
text_splitter = SentenceSplitter(
    chunk_size=192,
    chunk_overlap=48,
    paragraph_separator='\n',
)


In [9]:
# Node parser built around the splitter configured above in this notebook.
node_parser = SimpleNodeParser(
    text_splitter=text_splitter,
)

In [10]:
# Service context that bundles the custom LLM predictor with the node parser;
# it is handed to the index when the index is loaded or built.
service_context = ServiceContext.from_defaults(
    llm_predictor=llm_predictor,
    node_parser=node_parser,
)

In [None]:
## create pyvis graph
def save_pyvis_network_graph(file_name, kg_index=None):
    """Export the knowledge graph as a pyvis HTML page inside HTML_FOLDER.

    Args:
        file_name: Base name of the page, typically the name of the document
            that was just indexed. ".html" is appended unless the name already
            ends with it.
        kg_index: Object exposing ``get_networkx_graph(limit=...)``. When
            omitted, the notebook-level ``index`` variable is used, which is
            what existing one-argument callers rely on.

    Prints the path of the page before writing it.
    """
    # Imported locally because no cell in this notebook imports pyvis; without
    # it the function raises NameError on a fresh kernel.
    from pyvis.network import Network

    if kg_index is None:
        kg_index = index  # notebook-level index created by the ingestion cell

    # High limit so that all nodes are displayed.
    g = kg_index.get_networkx_graph(limit=6000)
    net = Network(height='1000px', width='100%', notebook=True, cdn_resources="in_line", directed=True)
    net.from_nx(g)

    # Callers pass the source document name (e.g. "paper.txt"); the output is
    # an HTML page, so give it an .html extension.
    # NOTE(review): pyvis is believed to reject names not ending in ".html" - confirm for the installed version.
    if not file_name.endswith('.html'):
        file_name = f'{file_name}.html'

    html_name = f'{HTML_FOLDER}/{file_name}'
    print(html_name)
    net.show(html_name)

In [11]:


def _archive_source_file(file_name):
    """Move `file_name` from TXT_BASE to PROCESSED_TXT_BASE.

    The destination folder is created on demand, because renaming into a
    missing directory raises FileNotFoundError.
    """
    processed_dir = pathlib.Path(PROCESSED_TXT_BASE)
    processed_dir.mkdir(parents=True, exist_ok=True)
    (pathlib.Path(TXT_BASE) / file_name).rename(processed_dir / file_name)


if os.path.exists(persist_path):
    # A persisted index exists: load it and add the documents found in TXT_BASE.
    start = time.time()
    print(f'Loading index from {persist_path}')
    # rebuild storage context
    storage_context = StorageContext.from_defaults(persist_dir=persist_path)
    # load index
    index = load_index_from_storage(storage_context=storage_context,
                                    service_context=service_context,
                                    max_triplets_per_chunk=2,
                                    show_progress=True)
    for d in documents:
        # d.id_ is the full path of the input file
        file_name = pathlib.Path(d.id_).name
        print(f'Processing: {file_name}')
        # index the document: extract triples and insert them into the KG graph
        index.insert(document=d)
        # Persist BEFORE moving the source file: if persisting (or the graph
        # export below) fails, the file is still in TXT_BASE and is picked up
        # again on the next run instead of silently missing from the index.
        index.storage_context.persist(persist_dir=persist_path)
        # move the file to the processed folder
        _archive_source_file(file_name)
        # index is modified after each doc, so export a graph per document
        save_pyvis_network_graph(file_name)
    end = time.time()
    print(f"Documents added in: {end-start}s")
else:
    print('No KGIndex found, starting fresh.')
    graph_store = SimpleGraphStore()
    storage_context = StorageContext.from_defaults(graph_store=graph_store)

    # NOTE: can take a while!

    start = time.time()
    index = KnowledgeGraphIndex.from_documents(
        documents,
        max_triplets_per_chunk=2,
        storage_context=storage_context,
        service_context=service_context,
    )
    # Save the index first, so a failure while moving files or exporting
    # graphs cannot leave files marked as processed without a saved index.
    index.storage_context.persist(persist_dir=persist_path)
    # move files to the processed files folder/s3/other location
    for d in documents:
        # d.id_ is the full path of the input file
        file_name = pathlib.Path(d.id_).name
        _archive_source_file(file_name)
        save_pyvis_network_graph(file_name)
    end = time.time()
    print(f"Knowledge Graph built in: {end-start}s")


Loading index from storage/Writer-camel-5b-hf-default
INFO:llama_index.indices.loading:Loading all indices.
Documents added in: 33301.90764045715s
