In [1]:
#%pip install llama_index transformers
#%pip install python-dotenv
#%pip install nest_asyncio
# %pip install diskcache
#%pip install llama-index-embeddings-huggingface



import os
from llama_index.core import Settings
from llama_index.core import SimpleDirectoryReader
import nest_asyncio

from llama_index.core import  VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.openai import OpenAIEmbedding

from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.extractors import TitleExtractor
from llama_index.core.ingestion import IngestionPipeline
from llama_index.llms.openai import OpenAI
from llama_index.core.postprocessor import SentenceTransformerRerank

from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext

import chromadb

import time

from diskcache import Cache
import getpass

  from .autonotebook import tqdm as notebook_tqdm





In [2]:

import openai

# Shared, module-level state used by the cells below.

# On-disk Chroma client rooted at ./chroma.db; save_index() and load_index()
# create/read their collection through this client.
chroma_client = chromadb.PersistentClient("./chroma.db")
# Disk-backed cache at ./cache; the chat cell stores responses in it keyed by
# the raw query string.
cache = Cache("./cache")
# NOTE(review): presumably required because the notebook kernel already runs an
# event loop and the libraries used below start their own — confirm.
nest_asyncio.apply()

# The key is read interactively so it never appears in the notebook source.
print("Please enter your OpenAI API Key")
api_key = getpass.getpass()
# No key is passed explicitly to OpenAI()/OpenAIEmbedding() below, so they are
# assumed to pick it up from this module-level attribute — TODO confirm.
openai.api_key = api_key

# Default LLM and embedding model registered on the LlamaIndex Settings object.
Settings.llm = OpenAI(model="gpt-4o-mini")
Settings.embed_model = OpenAIEmbedding()
# Directory that save_index() hands to build_index() as the document source.
pdf_dir_path = "./Dataset_folder"
# Cross-encoder reranker; query_comm() passes it as a node postprocessor and it
# is configured to keep 3 nodes (top_n=3).
rerank = SentenceTransformerRerank(
    model="cross-encoder/ms-marco-MiniLM-L-2-v2", top_n=3
)

Please enter your OpenAI API Key


In [3]:
#print(os.environ)
#print(api_key)

# docs = []
# for doc in documnet:
#     docs.append(doc.text)
# print(docs)

In [4]:
def build_index(pdf_dir_path, storage_context):
    """Ingest every document under ``pdf_dir_path`` and index the resulting nodes.

    Args:
        pdf_dir_path: Directory handed to ``SimpleDirectoryReader``.
        storage_context: Storage context the ``VectorStoreIndex`` is created with.

    Returns:
        The ``VectorStoreIndex`` built from the ingested nodes.
    """
    reader = SimpleDirectoryReader(pdf_dir_path)
    documents = reader.load_data()

    # Transformations are applied in list order: split, extract titles, embed.
    # NOTE(review): query_comm() reads a `document_title` metadata key, which is
    # presumably produced by TitleExtractor — confirm before removing that step.
    ingestion_steps = [
        SentenceSplitter(),
        TitleExtractor(),
        OpenAIEmbedding(model_name="text-embedding-ada-002"),
    ]
    ingested_nodes = IngestionPipeline(transformations=ingestion_steps).run(
        documents=documents
    )

    return VectorStoreIndex(nodes=ingested_nodes, storage_context=storage_context)

In [5]:
def save_index():
    """Build the index from ``pdf_dir_path`` into the persistent Chroma collection.

    The collection is fetched if it already exists and created otherwise, so the
    function can be re-run after a previous, partially failed attempt.

    Returns:
        The index returned by ``build_index``, or ``None`` if the collection
        could not be opened or the build raised.
    """
    print("Creating and saving the index")
    # get_or_create_collection replaces the former create/except/get sequence,
    # which reported *any* creation failure as "Collection already exists" and
    # could then raise from the fallback lookup.
    try:
        chroma_collection = chroma_client.get_or_create_collection(
            name="Insurance_Doc_RAG_LlamaIndex"
        )
    except Exception as e:
        print(f"Error while opening the collection: {e}")
        return None

    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    # Best effort: a failed build is reported and signalled with None so the
    # calling cell can decide what to do.
    try:
        index = build_index(pdf_dir_path, storage_context)
    except Exception as e:
        print(f"Error while building the index: {e}")
        return None

    print("Index created and saved")
    return index

In [6]:
def load_index():
    """Load the index backed by the existing Chroma collection.

    Returns:
        The index wrapping the stored vectors, or ``None`` when the collection
        is missing, cannot be read, holds no entries, or the index cannot be
        constructed. ``None`` tells the caller that a rebuild is needed.
    """
    try:
        chroma_collection = chroma_client.get_collection(name="Insurance_Doc_RAG_LlamaIndex")
        stored_items = chroma_collection.count()
    except Exception as e:
        print(f"Error loading the collection: {e}")
        return None

    # A collection can exist but be empty (e.g. a build that failed after the
    # collection was created). Treat that like "no index" so the caller
    # rebuilds instead of querying an empty store.
    if stored_items == 0:
        print("Collection is empty; the index needs to be rebuilt")
        return None

    print("Loading the index")
    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    try:
        index = VectorStoreIndex.from_vector_store(
            vector_store=vector_store,
            storage_context=storage_context
        )
        return index
    except Exception as e:
        print(f"Error while loading the index: {e}")
        return None

In [7]:
# Vector_Index = VectorStoreIndex()
# index = Vector_Index.build_index_from_nodes(nodes=nodes)
# Load documents
# pdf_dir_path = r"C:\Users\sandy\Downloads\Policy+Documents"
# docs = SimpleDirectoryReader(pdf_dir_path).load_data()

# with open('vector_store_index.pkl', 'rb') as f:
#     loaded_index = pickle.load(f)

# if loaded_index is not None:
#     index = loaded_index
# else:
#     index = build_index(pdf_dir_path)
#     with open("vector_store_index.pkl", "wb") as f:
#         pickle.dump(index, f)

In [8]:
# retriever = index.as_retriever()

# results = retriever.retrieve("What is the procedure to claim the insurance?")

In [9]:
# for res in results:
#     print(res.node.metadata["document_title"])

In [10]:
#from transformers import AutoModelForSequenceClassification, AutoTokenizer
# %pip install tf_keras
#%pip uninstall keras
#%pip install keras==2.11.0

In [11]:
def query_comm(query, index):
    """Answer ``query`` against ``index`` with a reference-citing system prompt.

    The function first retrieves nodes with the index's default retriever and
    embeds their text, page labels and document titles into a system prompt.
    It then runs a "compact" query engine that fetches 3 candidates and passes
    them through the module-level ``rerank`` postprocessor.

    Args:
        query: The user's question.
        index: Index object providing ``as_retriever()`` and ``as_query_engine()``.

    Returns:
        Whatever ``query_engine.query(query)`` returns.
    """
    retriever = index.as_retriever()
    results = retriever.retrieve(query)

    doc_texts = [res.node.text for res in results]
    # .get() instead of [] so a node without page/title metadata does not abort
    # the whole query with a KeyError.
    page_labels = [res.node.metadata.get('page_label', 'Unknown') for res in results]
    doc_titles = [res.node.metadata.get('document_title', 'Unknown') for res in results]

    system_message = f"""You are an Question answering expert. The user will ask you a question/query. 
    The Question is : {query}
    Now, the Documents related to the question is : {doc_texts}
    If the question is related to the document, answer it using the information in the document.
    If the question is not related to the document, answer "Please contact the insurance company/agent as I am not able to answer the question".
    If you answer the question, please provide the relevant document as reference.
    Reference Format : Page Number | Document Name.
    Page Numbers is {page_labels}
    Document Names is {doc_titles}
    Example:
    Reference 1 : Page 4 | Accidental Death Benefit Claims Procedure and Exclusions
    etc.
    Use all the info retrieved and if used then give the reference. Multiple references can be used.
    """

    llm = OpenAI(model="gpt-4o-mini", system_prompt=system_message)

    # The keyword was previously misspelled ("similirty_top_k"), so the intended
    # candidate count of 3 never reached the retriever feeding the reranker.
    query_engine = index.as_query_engine(
        response_mode="compact",
        similarity_top_k=3,
        llm=llm,
        node_postprocessors=[rerank],
    )
    response = query_engine.query(query)
    return response

In [13]:
# Reuse the persisted index when one can be loaded; otherwise build it now.
index = load_index()
if index is None:
    index = save_index()

if index is None:
    # Both loading and building failed. Stop here instead of falling through
    # to query_comm(), which would crash on a None index.
    print("Failed to create or load the index")
else:
    print("Index is ready")

    print("Welcome to Insurance Documentation Chatbot. Please enter your query.")
    query = input()
    print("User: ", query)
    print("-"*100)
    time.sleep(1)
    print("Searching in Cache... Please wait!")
    # Read the cache once: a second lookup could miss if the entry expires
    # between the two reads, leaving `response` as None.
    response = cache.get(query)
    if response is not None:
        print("Data found in Cache. Retrieving relevant information...")
        time.sleep(1)
    else:
        print("Data not found in Cache. Searching in Documents...")
        time.sleep(1)
        print("Data Found. Retrieving relevant information...")
        response = query_comm(query, index)
        # Cached under the exact query string; expire=600 is assumed to be in
        # seconds (diskcache convention) — confirm.
        cache.set(query, response, expire=600)
    print("-"*100)
    print(response)
    print("-"*100)
    print("Thanks for using Insurance Documentation Chatbot. Have a great day!")

Loading the index
Index is ready
Welcome to Insurance Documentation Chatbot. Please enter your query.
User:  What is the procedure to claim the insurance?
----------------------------------------------------------------------------------------------------
Searching in Cache... Please wait!
Data found in Cache. Retrieving relevant information...
----------------------------------------------------------------------------------------------------
To claim the insurance, the following procedure should be followed:

1. Ensure that the claim is valid and meets the definitions and exclusion criteria outlined in the policy.
2. If the claim is for Open Heart Replacement or Repair of Heart Valves, it must be determined to be medically necessary by a Consultant Cardiologist or Surgeon, supported by relevant imaging findings and established diagnostic reports.
3. The benefit as set out in the Scheme Member’s Certificate of Insurance will be paid to the Nominee of the deceased Scheme Member for Sin