# Knowledge Graph Index 
## Custom prompt

In [None]:
import os
import time
import pathlib
from pyvis.network import Network

import openai
# Configure the OpenAI client through both the environment and the module
# attributes. The key looks like a placeholder and the base URL is a
# private-network address rather than api.openai.com.
os.environ.update({
    'OPENAI_API_KEY': "sk-48characterstofakeanopenaikey48charactersopenai0",
    'OPENAI_API_BASE': "http://10.0.0.222:30307/v1",
})
openai.api_key = os.environ['OPENAI_API_KEY']
openai.api_base = os.environ['OPENAI_API_BASE']

import sys
import logging

# Add '../llama-index' to the module search path so packages located there
# can be imported by the cells below.
sys.path.append('../llama-index')
# Send log records of level INFO and above to stdout.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

# KG hyper-parameters

In [None]:

# Model identifier handed to the LLM wrapper; keep exactly one line active.
model = "Writer/camel-5b-hf"
#model = "mosaicml/mpt-7b-instruct"
#model = "mosaicml/mpt-30b-instruct"

# Corpus name; selects the file corpora/<CORPUS>.properties.
CORPUS = 'ArxivHealthcareNLP'
#CORPUS = 'arxiv_cl'

# Index name: the model id with '/' replaced by '-' plus a variant suffix.
# It names the storage/ and html/ sub-folders used further down.
#INDEX_NAME = model.replace('/', '-') + "-inline-no-coref"
INDEX_NAME = model.replace('/', '-') + "-inline-coref"

In [None]:
def load_properties(filepath, sep='=', comment_char='#'):
    """Parse a simple ``key<sep>value`` properties file into a dict.

    Blank lines and lines whose first non-blank text starts with
    ``comment_char`` are skipped. Every other line is split at the first
    occurrence of ``sep``: the left part, stripped, is the key; the right
    part, stripped of whitespace and then of surrounding double quotes, is
    the value. A line without ``sep`` maps the whole line to an empty
    string. When a key occurs more than once, the last occurrence wins.

    Args:
        filepath: Path of the properties file to read (opened in text mode).
        sep: Separator between key and value.
        comment_char: Prefix that marks a line as a comment.

    Returns:
        dict mapping each key to its string value.
    """
    with open(filepath, "rt") as handle:
        stripped_lines = (raw.strip() for raw in handle)
        entries = (
            entry.partition(sep)
            for entry in stripped_lines
            if entry and not entry.startswith(comment_char)
        )
        # partition() keeps any further separators inside the value part.
        return {name.strip(): rest.strip().strip('"') for name, _, rest in entries}

# Load the settings of the selected corpus from corpora/<CORPUS>.properties;
# the 'corpus_base' entry is read further down to locate the text folders.
corpus_properties = load_properties(f"corpora/{CORPUS}.properties")
# Bare expression: displays the parsed dict as the notebook cell output.
corpus_properties

## Using Knowledge Graph

#### Building the Knowledge Graph

In [None]:

from llama_index import StorageContext
from llama_index import SimpleDirectoryReader, ServiceContext
from llama_index import KnowledgeGraphIndex
from llama_index.graph_stores import SimpleGraphStore 
from llama_index import load_index_from_storage 
from llama_index.langchain_helpers.text_splitter import SentenceSplitter
from llama_index.node_parser import SimpleNodeParser
from llama_index.prompts.base import Prompt
from llama_index.prompts.prompt_type import PromptType

import tiktoken

from kron.llm_predictor.KronOpenAILLM import KronOpenAI
from kron.llm_predictor.KronLLMPredictor import KronLLMPredictor

In [None]:
# Locations derived from the corpus properties and the index name.
CORPUS_BASE = corpus_properties['corpus_base']
# Text files waiting to be indexed / files that have already been indexed.
TXT_BASE = f'{CORPUS_BASE}/text_cleaned_in/'
PROCESSED_TXT_BASE = f'{CORPUS_BASE}/text_cleaned_out/'
# Directory where the index is persisted.
persist_path = f"storage/{INDEX_NAME}"
# Folder that receives successive versions of the pyvis graph.
HTML_FOLDER = f"html/{INDEX_NAME}"

# Create the output folders the first time they are needed.
for _out_dir in (HTML_FOLDER, PROCESSED_TXT_BASE):
    if not os.path.exists(_out_dir):
        print(f'Creating {_out_dir}.')
        os.makedirs(_out_dir)

In [None]:
# Writer/camel sends <|endoftext|> back, so the tokenizer installed into
# llama_index has to accept that marker in the text it encodes.
from llama_index.utils import globals_helper

enc = tiktoken.get_encoding("gpt2")


def tokenizer(text):
    """Encode ``text`` with the gpt2 encoding, allowing ``<|endoftext|>``."""
    return enc.encode(text, allowed_special={"<|endoftext|>"})


# Replace the tokenizer held by llama_index's global helper.
globals_helper._tokenizer = tokenizer

In [None]:
# Read every file under TXT_BASE. filename_as_id=True matters: the indexing
# loop further down derives each input file's name from the document id.
documents = SimpleDirectoryReader(TXT_BASE, filename_as_id=True).load_data()

In [None]:
# Instruction-style triplet-extraction prompt with four worked examples.
# Placeholders filled at run time: {max_knowledge_triplets} and {text}.
# The text to process goes under "### Text:" and the model is expected to
# continue after "### Triplets:". Whitespace inside the literals is part of
# the prompt, so do not reformat the strings.
CAMEL_INLINE_KG_PROMPT_TEMPLATE = (
            "Below is an instruction that describes a task, paired with an input that provides further context. "
            "Write a response that appropriately completes the request.\n\n"
            "### Instruction:\n"  
            "Some text is provided below. Given the text, extract up to {max_knowledge_triplets} knowledge triplets in the form of " 
            "(subject, predicate, object). \n\n"  
            "### Input: \n"
            "Text: Alice is Bob's mother. \n" 
            "Triplets: \n"
            "    (Alice, is mother of, Bob) \n"
            "Text: Philz is a coffee shop founded in Berkeley in 1982. \n"
            "Triplets: \n"
            "    (Philz, is, coffee shop) \n"
            "    (Philz, founded in, Berkeley) \n"
            "    (Philz, founded in, 1982) \n"
            "Text: This small and colorful book is intended for children. It was named after the Moon, and was gifted to Jack. \n"
            "Triplets: \n"
            "    (book, intended for, children)\n"
            "    (book, is, small) \n"
            "    (book, is, colorful) \n"
            "    (book, named after, Moon) \n"
            "    (book, gifted to, Jack) \n"    
            "Text: Nick saw a few dwellings, brightly painted cottages, shining in the sun. They were not ready for guests. \n"
            "Triplets: \n"
            "    (dwellings, are, cottages) \n"
            "    (dwellings, shine in, sun) \n"
            "    (dwellings, not ready for, guests) \n"
            "    (dwellings, seen by, Nick) \n"
            "    (dwellings, are, a few) \n"
            "    (cottages, are, brightly painted) \n"
            "\n### Text: {text} \n"
            "\n### Triplets:"
)

# Wrap the template as a llama_index Prompt tagged as a triplet-extraction prompt.
CAMEL_INLINE_KG_TRIPLET_EXTRACT_PROMPT = Prompt(
    CAMEL_INLINE_KG_PROMPT_TEMPLATE, prompt_type=PromptType.KNOWLEDGE_TRIPLET_EXTRACT
)

In [None]:
# Shorter variant of the triplet-extraction prompt: only two worked examples,
# and the model is expected to continue after "### Response:".
# Placeholders filled at run time: {max_knowledge_triplets} and {text}.
# Whitespace inside the literals is part of the prompt, so do not reformat.
MPT_SHORT_INLINE_KG_PROMPT_TEMPLATE = (
            "Below is an instruction that describes a task, paired with an input that provides further context. "
            "Write a response that appropriately completes the request.\n\n"
            "### Instruction:\n"  
            "Some text is provided below. Given the text, extract up to {max_knowledge_triplets}  knowledge triplets in the form of " 
            "(subject, predicate, object). \n\n" 
            "### Input: \n"
            "Text: Alice is Bob's mother. \n" 
            "Triplets: \n"
            "    (Alice, is mother of, Bob) \n"
            "Text: Philz is a coffee shop founded in Berkeley in 1982. \n"
            "Triplets: \n"
            "    (Philz, is, coffee shop) \n"
            "    (Philz, founded in, Berkeley) \n"
            "    (Philz, founded in, 1982) \n"
            "### Text: {text} \n\n"
            "### Response:"
)

# Wrap the template as a llama_index Prompt tagged as a triplet-extraction prompt.
MPT_SHORT_INLINE_KG_TRIPLET_EXTRACT_PROMPT = Prompt(
    MPT_SHORT_INLINE_KG_PROMPT_TEMPLATE, prompt_type=PromptType.KNOWLEDGE_TRIPLET_EXTRACT
)

In [None]:
# Long variant of the triplet-extraction prompt: four worked examples and an
# explicit "Avoid duplicates." instruction; the model is expected to continue
# after "### Triplets:".
# Placeholders filled at run time: {max_knowledge_triplets} and {text}.
# Whitespace inside the literals is part of the prompt, so do not reformat.
MPT_LONG_INLINE_KG_PROMPT_TEMPLATE = (
            "Below is an instruction that describes a task, paired with an input that provides further context. "
            "Write a response that appropriately completes the request.\n\n"
            "### Instruction:\n"  
            "Some text is provided below. Given the text, extract up to {max_knowledge_triplets} knowledge triplets in the form of " 
            "(subject, predicate, object). Avoid duplicates. \n\n"  
            "### Input: \n"
            "Text: Alice is Bob's mother. \n" 
            "Triplets: \n"
            "    (Alice, is mother of, Bob) \n"
            "Text: Philz is a coffee shop founded in Berkeley in 1982. \n"
            "Triplets: \n"
            "    (Philz, is, coffee shop) \n"
            "    (Philz, founded in, Berkeley) \n"
            "    (Philz, founded in, 1982) \n"
            "Text: This small and colorful book is intended for children. It was named after the Moon, and was gifted to Jack. \n"
            "Triplets: \n"
            "    (book, intended for, children)\n"
            "    (book, is, small) \n"
            "    (book, is, colorful) \n"
            "    (book, named after, Moon) \n"
            "    (book, gifted to, Jack) \n"    
            "Text: Nick saw a few dwellings, brightly painted cottages, shining in the sun. They were not ready for guests. \n"
            "Triplets: \n"
            "    (dwellings, are, cottages) \n"
            "    (dwellings, shine in, sun) \n"
            "    (dwellings, not ready for, guests) \n"
            "    (dwellings, seen by, Nick) \n"
            "    (dwellings, are, a few) \n"
            "    (cottages, are, brightly painted) \n"
            "### Text: {text} \n"
            "### Triplets:"
)

# Wrap the template as a llama_index Prompt tagged as a triplet-extraction prompt.
MPT_LONG_INLINE_KG_TRIPLET_EXTRACT_PROMPT = Prompt(
    MPT_LONG_INLINE_KG_PROMPT_TEMPLATE, prompt_type=PromptType.KNOWLEDGE_TRIPLET_EXTRACT
)

In [None]:
# Defaults shared by every model unless a branch below overrides them.
max_triplets, chunk_size, chunk_overlap = 2, 192, 48

# Pick the prompt and generation budget by substring match on the model id.
if 'mpt-30b' in model:
    # Alternative kept for reference:
    #   MPT_SHORT_INLINE_KG_TRIPLET_EXTRACT_PROMPT with max_tokens = 320
    prompt, max_tokens = MPT_LONG_INLINE_KG_TRIPLET_EXTRACT_PROMPT, 510
    # chunk_size 160 was also tried here.
    max_triplets, chunk_size, chunk_overlap = 3, 128, 32
elif 'camel-5b' in model or 'mpt-7b' in model:
    # NOTE(review): mpt-7b reuses the Camel prompt - confirm this is intended.
    prompt = CAMEL_INLINE_KG_TRIPLET_EXTRACT_PROMPT
    # camel generates more tokens from the same text than mpt
    max_tokens = 704 if 'camel-5b' in model else 560
else:
    # Unknown model: no custom prompt.
    prompt, max_tokens = None, 274

print(str(prompt))

In [None]:
# Define the LLM: KronOpenAI is the kron OpenAI wrapper, which supports
# local models. It is created with a near-zero temperature for the selected model.
llm=KronOpenAI(temperature=0.01, model=model)
# Author's sizing rule: chunk_size + prompt length + expected length of the
# returned triplets must be less than max_tokens.
llm.max_tokens = max_tokens
# Wrap the LLM in the predictor object that the service context expects.
llm_predictor = KronLLMPredictor(llm)
# Print the predictor metadata as a quick check of the configuration.
print(llm_predictor.metadata)

In [None]:
# Define the text splitter: a sentence splitter using the per-model
# chunk_size / chunk_overlap, with a single newline as paragraph separator.
text_splitter = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap, paragraph_separator='\n')

In [None]:
# Define the node parser, built on the text splitter configured above.
node_parser = SimpleNodeParser(text_splitter=text_splitter)

In [None]:
# Define the service context, bundling the LLM predictor and the node parser
# for use by the index.
service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor, node_parser=node_parser)

In [None]:
## create pyvis graph
def save_pyvis_network_graph(file_name):
    """Render the current knowledge graph to ``<HTML_FOLDER>/<file_name>.html``.

    Reads the module-level ``index`` and ``HTML_FOLDER``. The graph is
    requested with ``limit=6000`` so that all nodes are displayed.

    Args:
        file_name: Base name of the HTML file (``.html`` is appended).
    """
    nx_graph = index.get_networkx_graph(limit=6000)
    pyvis_net = Network(
        directed=True,
        notebook=True,
        cdn_resources="in_line",
        height='1000px',
        width='100%',
    )
    pyvis_net.from_nx(nx_graph)
    pyvis_net.show(f'{HTML_FOLDER}/{file_name}.html')


In [None]:
# Bootstrap: when nothing has been persisted yet, create an empty index and
# persist it, so the loading cell that follows finds a storage directory.
if not os.path.exists(persist_path):
    print('No KGIndex found, creating new empty index.')
    graph_store = SimpleGraphStore()
    storage_context = StorageContext.from_defaults(graph_store=graph_store)
    index = KnowledgeGraphIndex(
        [],
        # Use the per-model settings selected earlier instead of a hard-coded
        # triplet count and the library's default extraction prompt.
        # prompt is None for unknown models, which keeps the default prompt.
        max_triplets_per_chunk=max_triplets,
        kg_triple_extract_template=prompt,
        storage_context=storage_context,
        service_context=service_context,
    )
    index.storage_context.persist(persist_dir=persist_path)

In [None]:
# Incremental path: load the persisted index and add the pending documents
# one by one. Fallback path: build the whole index from scratch.
# Both paths apply the per-model `prompt` and `max_triplets` selected earlier;
# prompt is None for unknown models, which keeps the library default prompt.
if os.path.exists(persist_path):
    start = time.time()
    print(f'Loading index from {persist_path}')
    # rebuild storage context
    storage_context = StorageContext.from_defaults(persist_dir=persist_path)
    # load index; the extra keyword arguments configure the loaded index
    index = load_index_from_storage(
        storage_context=storage_context,
        service_context=service_context,
        max_triplets_per_chunk=max_triplets,
        kg_triple_extract_template=prompt,
        show_progress=True,
    )
    ## add documents to index
    for d in documents:
        # d.id_ is the full path of the input file
        file_name = pathlib.Path(d.id_).name
        print(f'Processing: {file_name}')
        # index the document: extract triplets and insert them into the KG
        index.insert(document=d)
        # Persist before moving the file: if persisting fails, the document
        # stays in the input folder and is picked up again on the next run.
        index.storage_context.persist(persist_dir=persist_path)
        # move the file to the processed folder
        in_file_name = f'{TXT_BASE}/{file_name}'
        processed_file_name = f'{PROCESSED_TXT_BASE}/{file_name}'
        pathlib.Path(in_file_name).rename(processed_file_name)
        # index is modified after each doc, so save a graph snapshot per doc
        save_pyvis_network_graph(file_name)
    end = time.time()
    print(f"Documents added in: {end-start}s")
else:
    # Only reached when no persisted index exists, i.e. the bootstrap cell
    # that creates an empty index was not run.
    print('No KGIndex found, starting fresh.')
    graph_store = SimpleGraphStore()
    storage_context = StorageContext.from_defaults(graph_store=graph_store)

    # NOTE: can take a while!

    start = time.time()
    index = KnowledgeGraphIndex.from_documents(
        documents,
        max_triplets_per_chunk=max_triplets,
        kg_triple_extract_template=prompt,
        storage_context=storage_context,
        service_context=service_context,
    )
    # Save the index before touching the input files, so a failed persist
    # leaves every document in place for a retry.
    index.storage_context.persist(persist_dir=persist_path)
    # move files to the processed files folder/s3/other location
    for d in documents:
        # d.id_ is the full path of the input file
        file_name = pathlib.Path(d.id_).name
        in_file_name = f'{TXT_BASE}/{file_name}'
        processed_file_name = f'{PROCESSED_TXT_BASE}/{file_name}'
        pathlib.Path(in_file_name).rename(processed_file_name)
        save_pyvis_network_graph(file_name)
    end = time.time()
    print(f"Knowledge Graph built in: {end-start}s.")
