In [20]:
import pandas as pd
import os 
from dotenv import load_dotenv
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.vectorstores import Chroma

In [22]:
DATA_DIR = "niva_bupa_pdfs"
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")


In [8]:
for root, dirs, files in os.walk(DATA_DIR):
    print(files)

['pdf_links.txt']
['heartbeat-proposal-form.pdf', 'health-premia-proposal-form.pdf', 'health-companion-proposal-form.pdf', 'health-recharge-proposal-form.pdf', 'health-pulse-proposal-form.pdf', 'go-active-proposal-form.pdf']
['health-recharge-and-health-companion-brochure.pdf', 'heartbeat-brochure.pdf', 'health-companion-brochure.pdf', 'health-pulse-brochure.pdf', 'goactive-brochure.pdf', 'health-premia-brochure.pdf', 'health-recharge-brochure.pdf']
['health-premia-policy-document.pdf', 'health-pulse-policy-document.pdf', 'health-companion-policy-document.pdf', 'goactive-policy-document.pdf', 'health-recharge-policy-document.pdf', 'heartbeat-policy-document.pdf']
['health-recharge-claim-form.pdf', 'heartbeat-claim-form.pdf', 'health-companion-claim-form.pdf', 'goactive-claim-form.pdf']


In [9]:
def chunk_strategy(file_path):
    file_name = os.path.basename(file_path).lower()

    if 'policy' in file_name:
        return {
            'chunk_size': 1500,
            'chunk_overlap': 300
        }

    elif 'form' in file_name:
        return {
            'chunk_size': 500,
            'chunk_overlap': 100
        }
    
    elif 'brochure' in file_name:
        return {
            'chunk_size': 1000,
            'chunk_overlap': 200
        }

    else:
        return {
        'chunk_size': 800,
        'chunk_overlap': 150
    } 

In [16]:
chunk_info = {}
documents = []

for root, dirs, files in os.walk(DATA_DIR):
    for file in files:
        if file.endswith('.pdf'):
            file_path = os.path.join(root, file)

            params = chunk_strategy(file_path)
            #print(file_path, params)

            text_splitter = RecursiveCharacterTextSplitter(
                        chunk_size=params['chunk_size'],
                        chunk_overlap=params['chunk_overlap'],
                        length_function=len,
                        separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
                    )
            
            loader = PyPDFLoader(file_path)
            doc = loader.load()
            chunks = text_splitter.split_documents(doc)

            chunk_info[file] = {
                        'params': params,
                        'num_chunks': len(chunks),
                        'avg_chunk_size': sum(len(c.page_content) for c in chunks) / len(chunks)   # number of characters per chunk
                    }
            documents.extend(doc)
            print(f"Processed {file}: {len(chunks)} chunks created")

Processed heartbeat-proposal-form.pdf: 94 chunks created
Processed health-premia-proposal-form.pdf: 73 chunks created
Processed health-companion-proposal-form.pdf: 74 chunks created
Processed health-recharge-proposal-form.pdf: 60 chunks created
Processed health-pulse-proposal-form.pdf: 59 chunks created
Processed go-active-proposal-form.pdf: 58 chunks created
Processed health-recharge-and-health-companion-brochure.pdf: 5 chunks created
Processed heartbeat-brochure.pdf: 14 chunks created
Processed health-companion-brochure.pdf: 4 chunks created
Processed health-pulse-brochure.pdf: 9 chunks created
Processed goactive-brochure.pdf: 29 chunks created
Processed health-premia-brochure.pdf: 2 chunks created
Processed health-recharge-brochure.pdf: 6 chunks created
Processed health-premia-policy-document.pdf: 247 chunks created
Processed health-pulse-policy-document.pdf: 255 chunks created
Processed health-companion-policy-document.pdf: 95 chunks created
Processed goactive-policy-document.pdf: 

In [15]:
documents

[Document(metadata={'source': 'niva_bupa_pdfs/application_form/heartbeat-proposal-form.pdf', 'page': 0, 'page_label': '1'}, page_content='Product Name: Heartbeat, Product UIN No.:IRDAI/NL-HLT/MBHI/P-H/V.III/19/16-17\nHeartbeat Proposal Form\n1. Proposer Details\nTitle\nName\nDOB\nLandmark City\nAlternate numberMobile number \nEmployment: Salaried Self-employed Student Housewife Other, please specify   \nPremium paid by Relationship with Proposer \nYes No Yes No If Yes, please tick the relevant option \na. Unorganized sector b. Informal sector\nc. Economically vulnerable or backward classes\nCurrent address\nDistrict State\nPin code Landline number\nEmail ID \nAadhaar Number\nPAN Number (Mandatory for premium above Rupees 1 lac)\nNationality\nOtherGender   Male Female\nAnnual income (Rs)\nD D M M Y Y YY\nd. Other categories of persons\nBank details:\nBank name \nBranch\nCity\nAccount number\nAccount type: Savings Current\nIFSC Code\n1\nNotes:\n1. This form is to be completed by the PROP

In [17]:
embeddings = OpenAIEmbeddings()
vectorstore = Chroma.from_documents(
    documents=documents,
    embedding=embeddings,
    persist_directory="./niva_bupa_vectorstore"
)

  embeddings = OpenAIEmbeddings()


In [34]:
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})  # Retrieve top 3 relevant chunks

In [39]:
from langchain_core.prompts import ChatPromptTemplate

prompt = ChatPromptTemplate.from_template(
    "You are an insurance policy expert. Answer the user's query based on the given document context. \n\n"
    "Context:\n{context}\n\n"
    "User Query: {input}\n\n"
    "Answer:"
)

In [40]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chat_models import ChatOpenAI

# Load OpenAI Chat Model
llm = ChatOpenAI(model_name="ft:gpt-4o-mini-2024-07-18:personal::B1zvbetx", openai_api_key=openai_api_key)

# Create a document chain
document_chain = create_stuff_documents_chain(llm, prompt)

# Create retrieval chain
retrieval_chain = create_retrieval_chain(retriever, document_chain)

In [42]:
user_query = "What is atronuat?"  # Example query

response = retrieval_chain.invoke({"input": user_query})

print("Chatbot Response:")
print(response["answer"])

Chatbot Response:
It seems there might be a typographical error in your query. If you meant "what is a 'congenital anomaly'?" then according to the policy document, a congenital anomaly refers to a condition that is present since birth and is abnormal with reference to physical structure or function. If you meant something else, please clarify.


In [43]:
response

{'input': 'What is atronuat?',
 'context': [Document(metadata={'page': 0, 'page_label': '1', 'source': 'niva_bupa_pdfs/brochures/health-premia-brochure.pdf'}, page_content='PLATINUM\nVARIANT BROCHURE\nYour health deserves nothing less.'),
  Document(metadata={'page': 0, 'page_label': '1', 'source': 'niva_bupa_pdfs/policy_wordings/goactive-policy-document.pdf'}, page_content="TM\nGoActive\nPolicy Document \n1. Preamble\n This is a contract of insurance between You and Us which is subject to the payment of the full premium in advance and the \nterms, conditions and exclusions to this Policy. This Policy has been issued on the basis of the Disclosure to Information \nNorm, including the information provided by You in respect of the Insured Persons in the Proposal Form and the Information \nSummary Sheet.\n Please inform Us immediately of any change in the address or any other changes aﬀecting You or any Insured Person. \n Note: The terms listed in Section 2(Deﬁnitions) and used elsewhere 