# Knowledge Graph Index 
## S3 storage
### Default llama index KG prompt

In [1]:
import os
import time
import pathlib
from pyvis.network import Network

import sys
#kron extensions to llama_index to support openai compatible api
sys.path.append('../llama-index/')

import logging
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

import openai
# Point both the process environment and the openai module at the
# OpenAI-compatible endpoint. The key is a placeholder (its own text says it
# is fake), not a real credential.
os.environ.update({
    'OPENAI_API_KEY': "sk-48characterstofakeanopenaikey48charactersopenai0",
    'OPENAI_API_BASE': "http://10.0.0.222:30307/v1",
})
openai.api_key = os.environ['OPENAI_API_KEY']
openai.api_base = os.environ['OPENAI_API_BASE']

import dotenv
# Load variables from a .env file (if present) before reading them below.
dotenv.load_dotenv()
# S3 credentials and endpoint come from AF_-prefixed environment variables.
# Direct indexing of os.environ raises KeyError when a variable is missing.
AWS_ACCESS_KEY_ID = os.environ['AF_AWS_ACCESS_KEY_ID'] 
AWS_SECRET_ACCESS_KEY = os.environ['AF_AWS_SECRET_ACCESS_KEY'] 
#AWS_DEFAULT_REGION = os.environ['AF_AWS_DEFAULT_REGION'] 
AWS_ENDPOINT_URL = os.environ['AF_AWS_ENDPOINT_URL'] 

# Model identifier: passed to get_service_context() and used to derive
# INDEX_NAME. The commented lines are alternative models.
model = "Writer/camel-5b-hf"
#model = "mosaicml/mpt-7b-instruct"
#model = "mosaicml/mpt-30b-instruct"

# Corpus label (not referenced by the cells visible in this section).
CORPUS = 'ArxivHealthcareNLP'
#CORPUS = 'arxiv_cl'

# Index name: the model id with '/' replaced by '-', plus a suffix naming the
# variant. It becomes part of the S3 paths for the HTML output and the
# persisted index.
INDEX_NAME = f"{model.replace('/', '-')}-default-no-coref"
#INDEX_NAME = f"{model.replace('/', '-')}-default-coref"

## Using Knowledge Graph

#### Building the Knowledge Graph

In [2]:

from llama_index import StorageContext
from llama_index import SimpleDirectoryReader, ServiceContext
from llama_index import KnowledgeGraphIndex
from llama_index.graph_stores import SimpleGraphStore 
from llama_index import load_index_from_storage 
from llama_index.langchain_helpers.text_splitter import SentenceSplitter
from llama_index.node_parser import SimpleNodeParser

import tiktoken

#extensions to llama_index to support openai compatible endpoints, e.g. llama-api
from kron.llm_predictor.KronOpenAILLM import KronOpenAI
from kron.llm_predictor.KronLLMPredictor import KronLLMPredictor

INFO:numexpr.utils:NumExpr defaulting to 4 threads.


In [3]:
#Writer/camel uses the <|endoftext|> special token, so the tokenizer must be told to allow it
def monkey_patch_global_helper_tokenizer():
    """Swap llama_index's global tokenizer for a GPT-2 tiktoken encoder.

    The replacement encodes text with the "gpt2" encoding and allows the
    ``<|endoftext|>`` special token to appear in the input.
    """
    from llama_index.utils import globals_helper

    gpt2_encoding = tiktoken.get_encoding("gpt2")

    def encode_allowing_endoftext(text):
        return gpt2_encoding.encode(text, allowed_special={"<|endoftext|>"})

    globals_helper._tokenizer = encode_allowing_endoftext

In [4]:
#documents = SimpleDirectoryReader(TXT_BASE, filename_as_id=True).load_data()
from kron.readers import S3Reader

s3_prefix = 'dags/ArxivHealthcareNLP/txt_cleaned/publicdomain'

def load_documents(s3_prefix):
    """Load every document under ``s3_prefix`` from the 'papers-kg' bucket.

    The last path component of the prefix is used as the license label and is
    attached to each document as ``metadata['license']``.

    Args:
        s3_prefix: key prefix inside the bucket, e.g.
            'dags/ArxivHealthcareNLP/txt_cleaned/publicdomain'. A trailing
            '/' is tolerated when deriving the license label.

    Returns:
        Whatever ``S3Reader.load_data()`` returns (the loaded documents).
    """
    # Strip a trailing '/' first: 'a/b/'.split('/')[-1] would otherwise be ''
    # and every document would silently get an empty license.
    # The prefix handed to S3Reader is left exactly as the caller supplied it.
    license_name = s3_prefix.rstrip('/').split('/')[-1]

    loader = S3Reader(
        bucket='papers-kg',
        prefix=s3_prefix,
        filename_as_id=True,
        aws_access_id=AWS_ACCESS_KEY_ID,
        aws_access_secret=AWS_SECRET_ACCESS_KEY,
        s3_endpoint_url=AWS_ENDPOINT_URL,
        # The callback argument is ignored: every file gets the same label.
        file_metadata=lambda x: {'license': license_name},
    )
    return loader.load_data()

documents = load_documents(s3_prefix)

In [5]:
# update metadata and exclusions
def update_documents(documents):
    """Shorten each document id and exclude 'license' metadata, in place.

    The id is reduced to the text after its last '/' and before the first
    '.txt' in that remainder. 'license' is then listed in both the embedding
    and the LLM metadata exclusion keys. Returns None.
    """
    for doc in documents:
        # keep only the file name, without directory or '.txt' suffix
        base_name = doc.id_.rpartition('/')[2]
        doc.id_ = base_name.partition('.txt')[0]
        doc.excluded_embed_metadata_keys = ['license']
        doc.excluded_llm_metadata_keys = ['license']
update_documents(documents)
documents

[Document(id_='2211.01705v1.A_speech_corpus_for_chronic_kidney_disease', embedding=None, metadata={'license': 'publicdomain'}, excluded_embed_metadata_keys=['license'], excluded_llm_metadata_keys=['license'], relationships={}, hash='56df9c2451410ff42336601c4e8f4dd4f9338fefeaf74cad63f875a476e431a9', text="Jihyun Mun1,Sunhee Kim2,Myeong Ju Kim3,Jiwon Ryu4,Sejoong Kim3,Minhwa Chung1 1Department of Linguistics,Seoul National University,Republic of Korea 2Department of French Language Education,Seoul National University,Republic of Korea 3Center for Artificial Intelligence in Healthcare,Seoul National University,Republic of Korea 4Department of Internal Medicine,Seoul National University Bundang Hospital,Republic of Korea 5Department of Internal Medicine,Seoul National University College of Medicine,Republic of Korea In this study,authors present speech corpus of patients with chronic kidney disease CKD that will be used for research on pathological voice analysis,automatic illness identifi

In [6]:
def get_service_context(model):
    """Assemble the ServiceContext used for building and loading the index.

    Wires a KronOpenAI LLM (temperature 0.01, max_tokens 384) into a
    KronLLMPredictor, prints the predictor's metadata, and pairs it with a
    node parser that splits text into 192-sized chunks with an overlap of 48.

    Args:
        model: model name handed to KronOpenAI.

    Returns:
        The ServiceContext built by ``ServiceContext.from_defaults``.
    """
    # LLM: chunk_size + prompt_length + expected length of the returned
    # triples must be less than max_tokens
    kron_llm = KronOpenAI(temperature=0.01, model=model)
    kron_llm.max_tokens = 384  # original note: 192-48
    predictor = KronLLMPredictor(kron_llm)
    print(predictor.metadata)

    # Text splitter feeding the node parser
    sentence_splitter = SentenceSplitter(
        chunk_size=192,
        chunk_overlap=48,
        paragraph_separator='\n',
    )

    return ServiceContext.from_defaults(
        llm_predictor=predictor,
        node_parser=SimpleNodeParser(text_splitter=sentence_splitter),
    )

service_context = get_service_context(model)

context_window=2048 num_output=384 is_chat_model=False is_function_calling_model=False model_name='unknown'


In [7]:
import s3fs
# Filesystem handle built from AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY and
# pointed at AWS_ENDPOINT_URL. It is used below to check for, persist and load
# the index, and to write the rendered HTML graphs.
s3_fs = s3fs.S3FileSystem(
    key = AWS_ACCESS_KEY_ID,
    secret = AWS_SECRET_ACCESS_KEY,
    endpoint_url= AWS_ENDPOINT_URL,
)

In [8]:
# The first path component is the bucket name: "html-kg" holds the rendered
# HTML graphs and "indices-kg" holds the persisted indices. Both are keyed by
# INDEX_NAME.
HTML_FOLDER = f"html-kg/{INDEX_NAME}"
persist_path = f"indices-kg/{INDEX_NAME}"

In [9]:
## create pyvis graph
## use generate_html with a s3 write
def save_pyvis_network_graph(index, file_name, limit=6000):
    """Render the index's graph with pyvis and write the HTML through s3_fs.

    Relies on the module-level ``HTML_FOLDER`` (destination folder) and
    ``s3_fs`` (filesystem used for the write).

    Args:
        index: object exposing ``get_networkx_graph(limit=...)``; here the
            knowledge-graph index.
        file_name: base name of the output file; written to
            ``{HTML_FOLDER}/{file_name}.html``.
        limit: forwarded to ``index.get_networkx_graph``. Defaults to 6000,
            the value the original author hard-coded to display all nodes.
    """
    graph = index.get_networkx_graph(limit=limit)
    net = Network(
        height='1000px',
        width='100%',
        notebook=True,
        cdn_resources="in_line",
        directed=True,
    )
    net.from_nx(graph)
    html_name = f'{HTML_FOLDER}/{file_name}.html'
    # Generate the HTML as a string and write it with s3_fs instead of
    # calling net.show(html_name).
    html = net.generate_html(html_name)
    s3_fs.write_text(html_name, html)

In [10]:
# Bootstrap: when nothing exists at persist_path yet, create an empty
# KnowledgeGraphIndex and persist it so the loading cell has something to load.
if not s3_fs.exists(persist_path):
    print('No KGIndex found, creating new empty index.')
    # start from an empty SimpleGraphStore wrapped in a new storage context
    graph_store = SimpleGraphStore()
    storage_context = StorageContext.from_defaults(graph_store=graph_store)
    index = KnowledgeGraphIndex(
        [],  # no nodes yet; documents are inserted one by one later
        max_triplets_per_chunk=2,
        storage_context=storage_context,
        service_context=service_context,
    )
    # write the empty index to persist_path through the s3_fs filesystem
    index.storage_context.persist(persist_dir=persist_path, fs=s3_fs)

In [11]:
# Load the persisted index, insert every loaded document into it, and after
# each document write the graph HTML and persist the index again.
if s3_fs.exists(persist_path):
    start = time.time()
    print(f'Loading index from {persist_path}')
    # rebuild storage context
    storage_context = StorageContext.from_defaults(persist_dir=persist_path, fs=s3_fs)
    # load index
    index = load_index_from_storage(storage_context=storage_context, 
                                    service_context=service_context, 
                                    max_triplets_per_chunk=2,
                                    show_progress = True)
    ## add documents to index
    for d in documents:
        file_name = pathlib.Path(d.id_).name
        print(f'Processing: {file_name}')
        #index the document: extract triples and insert them into the KG graph
        index.insert(document = d, show_progress = True)
        #move the file to the processed folder
        #(disabled: TXT_BASE / PROCESSED_TXT_BASE are not defined in the visible cells)
        #in_file_name = f'{TXT_BASE}/{file_name}'
        #processed_file_name = f'{PROCESSED_TXT_BASE}/{file_name}'
        #pathlib.Path(in_file_name).rename(processed_file_name)
        #index is modified after each doc, so write the graph HTML and persist every iteration
        save_pyvis_network_graph(index, file_name)
        index.storage_context.persist(persist_dir=persist_path, fs=s3_fs)
    end = time.time()
    # elapsed time covers loading the index plus all inserts, renders and persists
    print(f"Documents added in: {end-start}s")
else:
    print('No KG Index found, please initialize the Index first.')

Loading index from indices-kg/Writer-camel-5b-hf-default-no-coref
INFO:llama_index.indices.loading:Loading all indices.
Processing: 2211.01705v1.A_speech_corpus_for_chronic_kidney_disease
(Jihyun Mun, 1, Department of Linguistics, Seoul National University, Republic of Korea)
(Sunhee Kim, 2, Department of French Language Education, Seoul National University, Republic of Korea)
(Myeong Ju Kim, 3, Center for Artificial Intelligence in Healthcare, Seoul National University, Republic of Korea)
(Jiwon Ryu, 4, Department of Internal Medicine, Seoul National University Bund
(speakers, in this, corpus)
(CKD patients, with varying, degrees)
(delivered, sustained, vowels)
(sentence, and paragraph, stimuli)
(compared, and analyzed, voice)
(phoneme-level pronunciation,prosody, glottal source)
(aerodynamic parameters)
(there may not be, blood and urine tests, CKD)
(CKD affects, variety of bodily systems, respiratory system)
(CKD patients, reduced strength and endurance, respiratory muscles)
(CKD pa