In [3]:
#%pip install pdfplumber

import pdfplumber
import pandas as pd
import ast


In [4]:
def extract_text_from_pdf(pdf_path):
    """Extract per-page text from a PDF, rendering detected tables as text grids.

    The plain text of every page is extracted with pdfplumber. For each table
    found on the page, the raw text lying inside the table's bounding box is
    replaced by the table's cells formatted with ``DataFrame.to_string``.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        pandas.DataFrame with one row per page and the columns
        ``page_number`` (1-based) and ``page_text`` (whitespace-stripped).
        The columns are present even when the PDF has no pages.
    """
    page_data = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            all_text = page.extract_text() or ""

            for table in page.find_tables():
                text_within_table = page.within_bbox(table.bbox).extract_text() or ""
                # str.replace("", x) inserts x between every character, so a
                # table region without extractable text must be skipped
                # instead of being used as the search pattern.
                if not text_within_table:
                    continue

                table_frame = pd.DataFrame(table.extract())
                table_data = table_frame.to_string(index=False, header=False)
                all_text = all_text.replace(text_within_table, table_data)

            page_data.append({
                "page_number": page_number,
                "page_text": all_text.strip(),
            })

    return pd.DataFrame(page_data, columns=["page_number", "page_text"])

# Parse the policy PDF into one row per page; the resulting frame is bound to `df`.
pdf_path = r"./HDFC-Life-Group-Poorna-Suraksha-101N137V02-Policy-Document.pdf"
df = extract_text_from_pdf(pdf_path)
# Print the frame to eyeball the page numbers and the start of each page's text.
print(df)

    page_number                                          page_text
0             1  Part A\n<<Date>>\n<<Master Policyholder’s Name...
1             2  MASTER POLICY DOCUMENT- HDFC Life Group Poorna...
2             3  POLICY SCHEDULE\n1. Master Policy Number:<< sy...
3             4  Part B\nDefinitions\nThe following capitalized...
4             5  i. Acute condition - Acute condition is a dise...
5             6  (38) Sum Assured- means the amount payable und...
6             7  Part C\n1. Benefits:\n(1) Benefits on Death or...
7             8  21. Progressive\nScleroderma 22. Muscular\nDys...
8             9  In case if the Scheme Member surrenders the Ce...
9            10  Part D\n1. Additional Sum Assured option:\nThe...
10           11  2. Assignment or Transfer\nAssignment shall be...
11           12  thereof, within 15 days from the date of recei...
12           13  Part E\n1. Additional Servicing Charges\nNil\n...
13           14  Part F\n1. Waiting Period and Exclusions:\ni.

In [5]:
# Show the full extracted text of the row labelled 2 to inspect how tables were rendered.
print(df.loc[2, "page_text"])


POLICY SCHEDULE
1. Master Policy Number:<< system/operations generated>>
2. Date of Proposal:<<<< system/operations generated>>
3. Date of Inception: <<Date of Inception>>
4. Effective Date:<< system/operations service generated>>
5. Master Policyholder:<<Name of Company/Group>>
6. Name of the Scheme:<<NAME of Scheme>>
7. Scheme Type:<<Compulsory/Voluntary>>
8. Plan option:<<Name of Plan option>>
9. Eligibility to join the Scheme for the Scheme Member:
         Eligibility                          Age (Last Birthday) (in years)                                                      None  None
Minimum Age at Entry                                                  < 18 >                                                      None  None
Maximum Age at Entry                                  Single Premium Payment                                               Life Option <79 >
                None                                                    None Extra Life Option &\nAccelerated Critical I

In [6]:
#%pip install tiktoken
import tiktoken

# Load the cl100k_base tokenizer once so count_tokens can reuse it.
# NOTE(review): the collections in this notebook embed with "text-embedding-3-small";
# presumably that model shares this encoding — confirm.
encoding = tiktoken.get_encoding("cl100k_base")  # 'cl100k_base' is used for 'text-embedding-ada-002'

def count_tokens(text):
    """Return the number of tokens the module-level `encoding` produces for `text`."""
    token_ids = encoding.encode(text)
    return len(token_ids)

# Count the pages whose text is longer than the 8191-token limit checked here.
# Any such page would need to be split into smaller chunks before embedding.
# The counter is initialised once, before the loop, so it accumulates across
# all pages instead of being reset on every iteration.
exceeded_token_count = 0
for index, row in df.iterrows():
    text = row['page_text']
    token_count = count_tokens(text)
    if token_count > 8191:
        exceeded_token_count += 1

print(f"Total pages with exceeded token count: {exceeded_token_count}")




Total pages with exceeded token count: 0


In [7]:
# Note: ChromaDB embeds documents automatically using the collection's embedding function, so we don't need to embed the data here.
# The code below is therefore not required and has been commented out.

#import openai

# 

# df2 = df.copy()

# def create_embeddings(df):
#     df["embeddings"] = None
#     for page_number, row in df.iterrows():
#         embeddings = openai.embeddings.create(input=[row["page_text"]], model="text-embedding-3-small")
#         df.at[page_number, "embeddings"] = embeddings.data[0].embedding
#     return df

# df2 = create_embeddings(df2)
# print(df2)



In [8]:
# Label every page with its page number and the first 100 characters of its
# text; the label is stored in a new "metadata" column.
df["metadata"] = None
for page in df.itertuples():
    section_preview = page.page_text[:100]
    df.at[page.Index, "metadata"] = f"Page Number: {page.page_number} | Section: {section_preview}"

df.head(10)




Unnamed: 0,page_number,page_text,metadata
0,1,Part A\n<<Date>>\n<<Master Policyholder’s Name...,Page Number: 1 | Section: Part A\n<<Date>>\n<<...
1,2,MASTER POLICY DOCUMENT- HDFC Life Group Poorna...,Page Number: 2 | Section: MASTER POLICY DOCUME...
2,3,POLICY SCHEDULE\n1. Master Policy Number:<< sy...,Page Number: 3 | Section: POLICY SCHEDULE\n1. ...
3,4,Part B\nDefinitions\nThe following capitalized...,Page Number: 4 | Section: Part B\nDefinitions\...
4,5,i. Acute condition - Acute condition is a dise...,Page Number: 5 | Section: i. Acute condition -...
5,6,(38) Sum Assured- means the amount payable und...,Page Number: 6 | Section: (38) Sum Assured- me...
6,7,Part C\n1. Benefits:\n(1) Benefits on Death or...,Page Number: 7 | Section: Part C\n1. Benefits:...
7,8,21. Progressive\nScleroderma 22. Muscular\nDys...,Page Number: 8 | Section: 21. Progressive\nScl...
8,9,In case if the Scheme Member surrenders the Ce...,Page Number: 9 | Section: In case if the Schem...
9,10,Part D\n1. Additional Sum Assured option:\nThe...,Page Number: 10 | Section: Part D\n1. Addition...


In [9]:
import chromadb
from chromadb.utils import embedding_functions
import openai
import getpass

# Read the OpenAI API key interactively so it is never stored in the notebook.
print("Please enter your OpenAI API key: Input is hidden")
openai_api_key = getpass.getpass()

client = chromadb.Client()

embedding_function = embedding_functions.OpenAIEmbeddingFunction(model_name="text-embedding-3-small", api_key=openai_api_key)

# get_or_create_collection always binds the OpenAI embedding function, both on
# the first run and when the cell is re-run against an existing collection.
# Fetching an existing collection without it would leave queries using a
# different embedder than the one the documents were stored with. It also
# avoids inspecting list_collections(), whose element type differs between
# Chroma versions.
main_collection = client.get_or_create_collection(
    "insurance_policy_documents",
    embedding_function=embedding_function,
)

# Add one document per page; Chroma embeds the text through the collection's
# embedding function.
for idx, row in df.iterrows():
    metadata = row["metadata"]
    text = row["page_text"]

    # Chroma expects each metadata entry to be a dict; wrap the label string.
    if not isinstance(metadata, dict):
        metadata = {"metadata": str(metadata)}

    # Best effort: report a failing page and keep ingesting the others.
    try:
        main_collection.add(
            documents=[text],
            metadatas=[metadata],
            ids=[str(idx)]
        )
        print(f"Successfully added item {idx}")
    except Exception as e:
        print(f"Error adding item {idx}: {e}")


Please enter your OpenAI API key: Input is hidden
Successfully added item 0
Successfully added item 1
Successfully added item 2
Successfully added item 3
Successfully added item 4
Successfully added item 5
Successfully added item 6
Successfully added item 7
Successfully added item 8
Successfully added item 9
Successfully added item 10
Successfully added item 11
Successfully added item 12
Successfully added item 13
Successfully added item 14
Successfully added item 15
Successfully added item 16
Successfully added item 17
Successfully added item 18
Successfully added item 19
Successfully added item 20
Successfully added item 21
Successfully added item 22
Successfully added item 23
Successfully added item 24
Successfully added item 25
Successfully added item 26
Successfully added item 27
Successfully added item 28
Successfully added item 29
Successfully added item 30


In [10]:
# Create the cache collection on first run and reuse it afterwards.
# get_or_create_collection binds the embedding function in both cases, so a
# re-run of this cell does not fall back to a different embedder, and it does
# not depend on the element type returned by list_collections().
cache_collection = client.get_or_create_collection(
    "insurance_policy_cache",
    embedding_function=embedding_function,
)

def query_cache_layer(query):
    """Look up the single closest cached document for `query`.

    Returns whatever ``cache_collection.query`` returns, unchanged.
    """
    return cache_collection.query(query_texts=[query], n_results=1)
#query_cache_layer("what is Waiting Period and Exclusions?")

In [11]:

def add_to_cache(query, results):
    """Store every retrieved document, with its own metadata, in the cache collection.

    Args:
        query: The user question that produced `results`. It prefixes the
            ids so that documents cached for different questions do not
            collide on the same id.
        results: Mapping with ``"documents"`` and ``"metadatas"``. Both the
            flat shape (a list of documents and a list of metadata dicts)
            and the nested shape (one inner list per query, first inner list
            used) are accepted.

    Returns:
        None. Nothing is added when there are no documents. A failure inside
        the collection is printed rather than raised.
    """
    docs = results.get("documents") or []
    metadatas = results.get("metadatas") or []

    # Unwrap the nested per-query shape so both shapes are handled the same way.
    if docs and isinstance(docs[0], list):
        docs = docs[0]
        metadatas = metadatas[0] if metadatas else []

    if not docs:
        return

    ids = [f"{query}{position}" for position in range(len(docs))]
    try:
        cache_collection.add(
            documents=list(docs),
            metadatas=list(metadatas) if metadatas else None,
            ids=ids,
        )
    except Exception as e:
        print(f"Error adding items to cache: {e}")

In [12]:
#query = "What is the procedure to claim the insurance?"
#query_embedding = openai.embeddings.create(input=[query], model="text-embedding-3-small").data[0].embedding

def main_collection_query(query, n_results=3, max_distance=1.10):
    """Query the main collection and keep only sufficiently close matches.

    Args:
        query: Question text to search for.
        n_results: Number of nearest documents to request from the collection.
        max_distance: Matches whose distance is not strictly below this value
            are dropped.

    Returns:
        dict with the flat lists ``"distances"``, ``"metadatas"`` and
        ``"documents"`` for the kept matches, in the order the collection
        returned them. All three lists are empty when nothing is close enough.
    """
    results = main_collection.query(
        query_texts=[query],
        n_results=n_results,
    )
    filtered_results = {
        "distances": [],
        "metadatas": [],
        "documents": [],
    }

    # Index 0 selects the result lists of the single query text sent above.
    hits = zip(results["distances"][0], results["metadatas"][0], results["documents"][0])
    for distance, metadata, doc in hits:
        if distance < max_distance:
            filtered_results["distances"].append(distance)
            filtered_results["metadatas"].append(metadata)
            filtered_results["documents"].append(doc)

    return filtered_results
#main_collection_query("What is Waiting Period and Exclusions?")

In [13]:
import google.generativeai as genai

# Read the Gemini API key interactively (so it is not stored in the notebook)
# and register it with the genai SDK before any model is created.
print("Please enter your Google Gemini API key: Input is hidden")
gemini_api_key = getpass.getpass()
genai.configure(api_key=gemini_api_key)

def chat_comm(user_message):
    """Send `user_message` to Gemini in a fresh chat session and return the reply text.

    A new model object (temperature 0.1) and a new chat are created on every call.
    """
    gemini = genai.GenerativeModel(
        "gemini-1.5-flash-latest",
        generation_config={"temperature": 0.1},
    )
    session = gemini.start_chat()
    return session.send_message(user_message).text

  from .autonotebook import tqdm as notebook_tqdm


Please enter your Google Gemini API key: Input is hidden


In [14]:
def display_result(query, docs, metadatas):
    """Build the answer prompt for `query` and return the model's reply.

    The question, the retrieved documents and their metadata are interpolated
    into a fixed instruction prompt, which is sent through ``chat_comm``.
    """
    prompt = f"""
    You are a helpful assistant that can answer questions about the insurance policy document.
    You are given a question and a list of documents that are relevant to the question.
    You need to answer the question based on the documents.
    You need to display the answer in a readable format. It should be clear, concise and easy to understand.
    The Query is: {query}
    The Relevant Document/ documents are: {docs}
    Also, after the answer is given, give the source of the data so the user knows which document the answer is coming from.
    Use the Metadata as the reference of the data.
    Give all available references in the answer and ask them to check the documents for more information.
    The Metadata is: {metadatas}
    1 Shot example:
    **Reference 1: Page Number: 16 | Section: a. Completed claim form, (including NEFT details and bank account proof as specified in the claim**
    **Reference 2: Page Number: 12 | Section: thereof, within 15 days from the date of receipt of the Certificate of Insurance, as per IRDAI**
    etc
    List all available references in the answer at the end of the answer.
    If the answer is not found in the documents, say that you are not able to find the answer in the documents and tell them to contact the insurance provider or the agent/customer care.
    """
    return chat_comm(prompt)
    #"""Once the answer is given, give the source of the data so the user knows which document the answer is coming from.
    #The format of the answer should be:
      #"Source: {result["metadatas"][0]}""""

In [15]:


from transformers import  AutoTokenizer, AutoModelForSequenceClassification
import numpy as np
import torch

# Tokenizer and sequence-classification model used by rerank_results to score
# (query, document) pairs.
# NOTE(review): the load-time warning recorded under this cell reports the
# classifier weights as newly initialized, i.e. the scoring head has not been
# trained for this checkpoint. The ranking it produces is presumably arbitrary;
# consider a checkpoint fine-tuned for reranking — TODO confirm.
tokenizer = AutoTokenizer.from_pretrained("allenai/longformer-base-4096")
rerank_model = AutoModelForSequenceClassification.from_pretrained("allenai/longformer-base-4096")

def rerank_results(query, results, main_flag=True):
    """Order retrieved documents by the rerank model's score for `query`.

    Args:
        query: The user question each document is paired with.
        results: Mapping with ``"documents"`` and ``"metadatas"``.
        main_flag: When true (the default) both entries are treated as nested
            lists and their first element is used; when false they are used
            as flat lists.

    Returns:
        Tuple ``(reranked_docs, reranked_metadatas)`` sorted by descending
        score of the first logit. Both lists are empty when there are no
        documents to rank.
    """
    docs = results["documents"]
    metadatas = results["metadatas"]
    if main_flag:
        docs = docs[0] if docs else []
        metadatas = metadatas[0] if metadatas else []

    # Nothing to rank: return early instead of sending an empty batch through
    # the tokenizer and the model.
    if not docs:
        return [], []

    inputs = tokenizer([query] * len(docs), docs, return_tensors="pt", padding=True, truncation=True, max_length=1024)
    with torch.no_grad():  # inference only, no gradients needed
        outputs = rerank_model(**inputs)
    scores = outputs.logits[:, 0].cpu().numpy().flatten()
    sorted_indices = np.argsort(scores)[::-1].tolist()  # highest score first
    reranked_docs = [docs[idx] for idx in sorted_indices]
    reranked_metadatas = [metadatas[idx] for idx in sorted_indices]
    return reranked_docs, reranked_metadatas


Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
import time

# Interactive driver: answer one question, trying the cache collection first
# and falling back to the main collection.
print("Welcome to the Insurance Policy Document RAG Chatbot.")
time.sleep(1)
print("Please ask any question about the insurance policy document and the chatbot will answer it based on the given document.")
print("-"*100)
user_message = input()
print("\nUser:", user_message)

print("Searching in cache... Please wait")
cache_result = query_cache_layer(user_message)
time.sleep(1)

# A cache hit is any cached document closer than 1.10, the same cut-off that
# main_collection_query applies by default.
cache_distances = cache_result['distances'][0] if cache_result['distances'] else []
cache_hit = any(distance < 1.10 for distance in cache_distances)

if cache_hit:
    print("Data found in cache!")
    time.sleep(1)
    print("Reranking documents to give best answer... Please wait")
    time.sleep(1)
    reranked_docs, reranked_metadatas = rerank_results(user_message, cache_result)
else:
    print("Data was not found in cache. Searching in main collection... Please wait")
    time.sleep(1)
    result = main_collection_query(user_message)
    if result["documents"]:
        # Only claim success once the query has actually returned documents.
        print("Data found in Main Collection!. Reranking documents to give best answer... Please wait")
        time.sleep(1)
        add_to_cache(user_message, result)
        reranked_docs, reranked_metadatas = rerank_results(user_message, result, main_flag=False)
    else:
        # Nothing passed the distance filter. Skip caching and reranking, which
        # both expect at least one document, and let the prompt's "answer not
        # found" instruction shape the reply.
        print("No sufficiently relevant pages were found in the main collection.")
        reranked_docs, reranked_metadatas = [], []

response = display_result(user_message, reranked_docs, reranked_metadatas)
print("-"*100)
print(response)
print("-"*100)
print("Thanks for using the Insurance Policy Document RAG Chatbot. Have a nice day!")



Welcome to the Insurance Policy Document RAG Chatbot.
Please ask any question about the insurance policy document and the chatbot will answer it based on the given document.
----------------------------------------------------------------------------------------------------

User: What is the procedure to claim the insurance?
Searching in cache... Please wait
Data found in cache!
Reranking documents to give best answer... Please wait


Input ids are automatically padded to be a multiple of `config.attention_window`: 512


----------------------------------------------------------------------------------------------------
To claim insurance, the Master Policyholder must inform the insurer within 30 days of the death or illness of a Scheme Member.  A claim must then be filed with the insurer on behalf of the Nominee of the deceased Scheme Member using the insurer's prescribed form and all required documents within 90 days of the death/illness.  The insurer may condone delays in claim intimation if the claim is genuine and the delay is due to reasons beyond the claimant's control.

The required documents vary depending on whether the death was due to natural or unnatural causes.  See the document for a complete list of required documentation.  Note that additional documents may be requested depending on the circumstances of the death.


**Reference 1: Page Number: 15 | Section: Claims Procedure**

Please refer to the provided document for more detailed information on claim procedures and required documenta