In [1]:
import os
import time
import pathlib

import openai
# Point both the process environment and the openai module at the
# OpenAI-compatible endpoint. "EMPTY" is presumably a placeholder key for a
# server that does not check credentials -- confirm against the deployment.
_endpoint_settings = {
    "OPENAI_API_KEY": "EMPTY",
    "OPENAI_API_BASE": "http://10.0.0.222:30307/v1",
}
os.environ.update(_endpoint_settings)
openai.api_key = _endpoint_settings["OPENAI_API_KEY"]
openai.api_base = _endpoint_settings["OPENAI_API_BASE"]

# Model identifier used for the LLM and for naming the storage directory.
# Alternatives that can be swapped in:
#model = "mosaicml/mpt-7b-instruct"
#model = "mosaicml/mpt-30b-instruct"
model = "Writer/camel-5b-hf"

# Directory holding the persisted index; slashes in the model id are
# replaced with dashes so the id forms a single path component.
persist_path = "storage/" + "-".join(model.split("/")) + "-default"

In [2]:
import logging
import sys

# kron extensions to llama_index (support for OpenAI-compatible APIs) are
# imported from this directory. The path is relative, so it is resolved against
# the kernel's working directory.
KRON_EXTENSIONS_DIR = '../llama_index/'
# Guard the append so that re-running this cell does not accumulate duplicate
# sys.path entries.
if KRON_EXTENSIONS_DIR not in sys.path:
    sys.path.append(KRON_EXTENSIONS_DIR)

# Route log records at INFO level and above to stdout so they show up in the
# cell output.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

## Using Knowledge Graph

In [3]:
### Query Knowledge Graph

In [4]:

from llama_index import StorageContext
from llama_index import SimpleDirectoryReader, ServiceContext
from llama_index.graph_stores import SimpleGraphStore 
from llama_index import load_index_from_storage 
from llama_index.langchain_helpers.text_splitter import SentenceSplitter
from llama_index.node_parser import SimpleNodeParser

import tiktoken

#extensions to llama_index to support openai compatible endpoints, e.g. llama-api
from kron.llm_predictor.KronOpenAILLM import OpenAI
from kron.llm_predictor.KronLLMPredictor import KronLLMPredictor
from kron.indices.knowledge_graph.KronKnowledgeGraphIndex import KronKnowledgeGraphIndex 
from kron.prompts.kg_prompts import KRON_KG_TRIPLET_EXTRACT_PROMPT

INFO:numexpr.utils:NumExpr defaulting to 4 threads.


In [5]:
#writer/camel uses endoftext 
from llama_index.utils import globals_helper
enc = tiktoken.get_encoding("gpt2")


def tokenizer(text):
    """Encode ``text`` with the gpt2 encoding.

    The literal ``<|endoftext|>`` marker is passed as an allowed special
    token when encoding.
    """
    return enc.encode(text, allowed_special={"<|endoftext|>"})


# Install the tokenizer on llama_index's shared globals helper.
globals_helper._tokenizer = tokenizer

In [6]:
# Define the LLM for the configured model.
llm = OpenAI(model=model, temperature=0.01)

# chunk_size + prompt length + expected length of the returned triples must
# be less than max_tokens. Some sentences can be really long and the text
# splitter will enter an infinite loop.
# Previously tried values (the trailing note appears to be
# chunk_size-chunk_overlap):
#   llm.max_tokens = 274  # 128-32
#   llm.max_tokens = 400  # 256-64
llm.max_tokens = 384  # 192-48

# Wrap the LLM in the kron predictor and show its metadata.
llm_predictor = KronLLMPredictor(llm)
print(llm_predictor.metadata)

context_window=2048 num_output=384 is_chat_model=False


In [7]:
# Text splitter: chunk_size=192 with chunk_overlap=48, using a newline as the
# paragraph separator.
text_splitter = SentenceSplitter(
    paragraph_separator='\n',
    chunk_overlap=48,
    chunk_size=192,
)

# Node parser built on that splitter.
node_parser = SimpleNodeParser(text_splitter=text_splitter)

# Service context that bundles the LLM predictor with the node parser.
service_context = ServiceContext.from_defaults(
    node_parser=node_parser,
    llm_predictor=llm_predictor,
)


In [8]:
print(f'Loading index from {persist_path}')

# Rebuild the storage context from the persisted directory.
storage_context = StorageContext.from_defaults(persist_dir=persist_path)

# Load the index, passing the same extra keyword arguments as before.
index_load_options = dict(max_triplets_per_chunk=2, show_progress=True)
index = load_index_from_storage(
    storage_context=storage_context,
    service_context=service_context,
    **index_load_options,
)

Loading index from storage/Writer-camel-5b-hf-default
INFO:llama_index.indices.loading:Loading all indices.


In [9]:
# Query engine over the loaded index with default settings.
query_engine = index.as_query_engine()

# Ask the first question; the trailing bare expression keeps the response as
# the cell's displayed output.
spgc_response = query_engine.query("What is SPGC?")
spgc_response

INFO:llama_index.indices.knowledge_graph.retriever:> Starting query: What is SPGC?
INFO:llama_index.indices.knowledge_graph.retriever:> Query keywords: ['stands', 'SPGC', 'SPGC stands for Sustainable Palm Oil.<|endoftext|>', 'Palm', 'Sustainable', 'endoftext', 'Oil']
ERROR:llama_index.indices.knowledge_graph.retriever:Index was not constructed with embeddings, skipping embedding usage...
INFO:llama_index.indices.knowledge_graph.retriever:> Querying with idx: 5e30414e-2db8-427e-aa6f-dec734e86d85: and to the best of our knowledge no work has investigated acoustic features o...


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/arylwen/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


INFO:llama_index.indices.knowledge_graph.retriever:> Extracted relationships: The following are knowledge triplets in max depth 2 in the form of `subject [predicate, object, predicate_next_hop, object_next_hop ...]`
SPGC ['investigate', 'acoustic features']
SPGC ['multilingual', 'AD recognition']
SPGC ['speech', 'dementia']
SPGC ['languages', 'predictive power']
SPGC ['discuss', 'machine learning']
SPGC ['architectures', 'novel']
SPGC ['features', 'extraction']


Response(response="Answer: The SPGC is a speech processing and machine learning challenge focused on multilingual Alzheimer's dementia recognition through spontaneous speech.<|endoftext|>", source_nodes=[NodeWithScore(node=TextNode(id_='5e30414e-2db8-427e-aa6f-dec734e86d85', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='/home/arylwen/datasets/documents/ArxivHealthcareNLP/text_cleaned/2301.05562v1.Multilingual_Alzheimer_s_Dementia_Recognition_through_Spontaneous_Speech__a_Signal_Processing_Grand_Challenge.pdf.txt', node_type=None, metadata={}, hash='a82022a658c8899762e25a93b7b1b077d2de105945c51cdbdd5bd8c5d1b1cfbe'), <NodeRelationship.PREVIOUS: '2'>: RelatedNodeInfo(node_id='86314e96-ddc3-4c08-9a7e-e56840b38a24', node_type=None, metadata={}, hash='c4d2fc7ad72066001804fcef469bc7fc58b52271f2ad43e2243441a7178736c2'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='4b

In [10]:
# Query the knowledge graph index; the trailing bare expression keeps the
# response as the cell's displayed output.
ad_recognition_response = query_engine.query("What is AD recognition?")
ad_recognition_response

INFO:llama_index.indices.knowledge_graph.retriever:> Starting query: What is AD recognition?
INFO:llama_index.indices.knowledge_graph.retriever:> Query keywords: ['identifying', 'diagnosing', 'related', "AD (Alzheimer's Disease) recognition refers to the process of identifying and diagnosing individuals with Alzheimer's disease or related disorders.<|endoftext|>", 'recognition', 'individuals', 'refers', 'AD', 'process', 'disease', 'endoftext', 'Disease', 'disorders', 'Alzheimer']
ERROR:llama_index.indices.knowledge_graph.retriever:Index was not constructed with embeddings, skipping embedding usage...
INFO:llama_index.indices.knowledge_graph.retriever:> Extracted relationships: The following are knowledge triplets in max depth 2 in the form of `subject [predicate, object, predicate_next_hop, object_next_hop ...]`


Response(response="Answer: AD recognition is the process of identifying and recognizing Alzheimer's disease by analyzing medical images, such as MRIs or CT scans.<|endoftext|>", source_nodes=[NodeWithScore(node=TextNode(id_='f4c85938-76ba-41ba-ab3a-02d7198f8548', embedding=None, metadata={'kg_rel_texts': [], 'kg_rel_map': {'identifying': [], 'diagnosing': [], 'related': [], "AD (Alzheimer's Disease) recognition refers to the process of identifying and diagnosing individuals with Alzheimer's disease or related disorders.<|endoftext|>": [], 'recognition': [], 'individuals': [], 'refers': [], 'AD': [], 'process': [], 'disease': [], 'endoftext': [], 'Disease': [], 'disorders': [], 'Alzheimer': []}}, excluded_embed_metadata_keys=['kg_rel_map', 'kg_rel_texts'], excluded_llm_metadata_keys=['kg_rel_map', 'kg_rel_texts'], relationships={}, hash='960573c33a2ec416442182de990340a8637ce21a788e118b153daa4746a15e2c', text='The following are knowledge triplets in max depth 2 in the form of `subject [p

In [12]:
# Ask how the two concepts relate; the trailing bare expression keeps the
# response as the cell's displayed output.
relationship_response = query_engine.query(
    "What is the relationship between AD recognition and SPGC?"
)
relationship_response


INFO:llama_index.indices.knowledge_graph.retriever:> Starting query: What is the relationship between AD recognition and SPGC?
INFO:llama_index.indices.knowledge_graph.retriever:> Query keywords: ['medical', 'tool', 'project', 'involving', 'patients', 'institutions', "AD recognition is a technique used to identify and classify Alzheimer's disease patients based on their symptoms and medical history. SPGC is a collaborative research project involving multiple institutions and organizations working together to develop a standardized diagnostic tool and treatment protocols for Alzheimer's disease.<|endoftext|>", 'used', 'history', 'disease', 'working', 'identify', 'based', 'SPGC', 'together', 'organizations', 'standardized', 'symptoms', 'multiple', 'recognition', 'develop', 'classify', 'research', 'technique', 'AD', 'collaborative', 'diagnostic', 'treatment', 'protocols', 'endoftext', 'Alzheimer']
ERROR:llama_index.indices.knowledge_graph.retriever:Index was not constructed with embedding

Response(response='Answer: The relationship between AD recognition and SPGC is that the SPGC is a speech recognition challenge focused on multilingual AD detection using spontaneous speech, and AD recognition is a subfield of speech processing and machine learning that focuses on understanding and diagnosing cognitive impairments using', source_nodes=[NodeWithScore(node=TextNode(id_='5e30414e-2db8-427e-aa6f-dec734e86d85', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='/home/arylwen/datasets/documents/ArxivHealthcareNLP/text_cleaned/2301.05562v1.Multilingual_Alzheimer_s_Dementia_Recognition_through_Spontaneous_Speech__a_Signal_Processing_Grand_Challenge.pdf.txt', node_type=None, metadata={}, hash='a82022a658c8899762e25a93b7b1b077d2de105945c51cdbdd5bd8c5d1b1cfbe'), <NodeRelationship.PREVIOUS: '2'>: RelatedNodeInfo(node_id='86314e96-ddc3-4c08-9a7e-e56840b38a24', node_type=