<a href="https://colab.research.google.com/github/amaan2398/Mr.HelpMate-AI/blob/main/Mr.HelpMate-AI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Setup and Imports

In [None]:
!pip install -U langchain-community
!pip install -U pypdf
!pip install -U faiss-cpu
!pip install langchain-google-genai



## 1.1. Library Installation and Imports

In [None]:
import os

import numpy as np

from google.colab import userdata

from langchain.vectorstores import FAISS
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.llms import HuggingFaceHub, Ollama
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough

from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings

## 1.2. API Key Configuration

In [None]:
# Read the API credentials from Colab's secret store instead of writing them
# into the notebook. A value that is already present in the environment is
# kept, so this cell no longer overwrites a configured key with ''.
for secret_name in ('HUGGINGFACEHUB_API_TOKEN', 'GOOGLE_API_KEY'):
    if os.environ.get(secret_name):
        continue
    try:
        os.environ[secret_name] = userdata.get(secret_name)
    except (userdata.SecretNotFoundError, userdata.NotebookAccessError):
        # Fall back to the previous empty placeholder so the variable stays defined.
        os.environ[secret_name] = ''
        print(f'Warning: Colab secret {secret_name} is not available.')

# 2. Data Loading and Preprocessing

## 2.1. Loading Documents from Directory

In [None]:
# List the entries of the Policy_Docs folder; as the cell's last expression,
# the resulting list is shown as the cell output.
os.listdir('Policy_Docs')

['Principal-Sample-Life-Insurance-Policy.pdf']

## 2.2. Document Splitting and Chunking

In [None]:
# Point a PDF directory loader at the Policy_Docs folder.
loader = PyPDFDirectoryLoader('Policy_Docs')
# Load and split in one call; no splitter is passed here.
# NOTE(review): presumably the library's default splitter is applied, and the
# result is chunked again by the RecursiveCharacterTextSplitter cell further
# down — confirm the double split is intended.
pdfpages = loader.load_and_split()

In [None]:
# Load the documents without splitting so page-level statistics can be reported.
doc_before_split = loader.load()
print(f'Number of pages: {len(doc_before_split)}')
print()
# Any non-empty result counts as loaded data. The previous `> 1` comparison
# printed "No data loaded!" for a document that has exactly one page.
if len(doc_before_split) > 0:
    print(f'Number of characters in first page: {len(doc_before_split[0].page_content)}')
    print()
    # Show only the first 100 characters as a preview of the content.
    print(f'Glimps of data: \n{doc_before_split[0].page_content[:100]}...')
else:
    print('No data loaded!')

Number of pages: 64

Number of characters in first page: 203

Glimps of data: 
GROUP POLICY FOR: 
RHODE ISLAND JOHN DOE 
 
ALL MEMBERS 
Group Member Life Insurance 
 
Print Date: ...


## 2.3. Document Statistics

In [None]:
# Report the character count of every loaded page, numbering pages from 1.
for page_number, doc in enumerate(doc_before_split, start=1):
    print(f'Page {page_number}: {len(doc.page_content)} chars')

Page 1: 203 chars
Page 2: 34 chars
Page 3: 1578 chars
Page 4: 34 chars
Page 5: 836 chars
Page 6: 1131 chars
Page 7: 1289 chars
Page 8: 1187 chars
Page 9: 2301 chars
Page 10: 1460 chars
Page 11: 1845 chars
Page 12: 2164 chars
Page 13: 1682 chars
Page 14: 1980 chars
Page 15: 219 chars
Page 16: 2027 chars
Page 17: 1786 chars
Page 18: 1842 chars
Page 19: 943 chars
Page 20: 1928 chars
Page 21: 2268 chars
Page 22: 1362 chars
Page 23: 2136 chars
Page 24: 724 chars
Page 25: 501 chars
Page 26: 1556 chars
Page 27: 556 chars
Page 28: 2170 chars
Page 29: 2321 chars
Page 30: 2631 chars
Page 31: 2548 chars
Page 32: 2541 chars
Page 33: 2771 chars
Page 34: 533 chars
Page 35: 1474 chars
Page 36: 1961 chars
Page 37: 783 chars
Page 38: 1932 chars
Page 39: 1247 chars
Page 40: 2010 chars
Page 41: 1483 chars
Page 42: 2305 chars
Page 43: 2359 chars
Page 44: 2286 chars
Page 45: 1060 chars
Page 46: 1773 chars
Page 47: 2359 chars
Page 48: 2462 chars
Page 49: 2368 chars
Page 50: 2560 chars
Page 51: 1707 chars
Pa

In [None]:
# Splitter settings: chunk_size of 1000 with an overlap of 100, lengths
# measured with the built-in len, separators treated as plain strings.
splitter_options = {
    'chunk_size': 1000,
    'chunk_overlap': 100,
    'length_function': len,
    'is_separator_regex': False,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Chunk the documents produced by the loader.
doc_after_split = text_splitter.split_documents(pdfpages)

In [None]:
# Display two of the chunks (positions 10 and 11) to spot-check the split.
doc_after_split[10: 12]

[Document(metadata={'producer': 'ADEP Document Services - PDF Generator', 'creator': 'Acrobat PDFMaker 10.1 for Word', 'creationdate': '2014-07-16T12:50:27-05:00', 'author': 'Apache POI', 'company': 'Principal Financial Group', 'contenttypeid': '0x010100963E1E66C16CFF4188CB6D2716FCB7F5', 'itemretentionformula': '', 'moddate': '2014-07-16T12:50:34-05:00', 'order': '467600.000000', 'paper copies': '1', 'sourcemodified': 'D:20140716175024', 'title': 'Life Policy', '_copysource': '2486961970434790079.docx', '_dlc_policyid': '', 'source': 'Policy_Docs/Principal-Sample-Life-Insurance-Policy.pdf', 'total_pages': 64, 'page': 7, 'page_label': '8'}, page_content='This policy has been updated effective January 1, 2014 \n \n \n \nGC 6001 TABLE OF CONTENTS, PAGE 3  \n \n \n \n Section A - Member Life Insurance \n \n \n Schedule of Insurance Article 1 \n Death Benefits Payable Article 2 \n Beneficiary Article 3 \n Facility of Payment Article 4 \n Settlement of Proceeds Article 5 \n Member Life Insur

In [None]:
def avg_doc_length(docs):
    """Return the mean ``page_content`` length of ``docs``, floored to an int.

    Args:
        docs: A sized collection of objects exposing a ``page_content`` string.

    Returns:
        The floor of the average character count, or 0 when ``docs`` is empty
        (the lambda this replaces raised ZeroDivisionError in that case).
    """
    if not docs:
        return 0
    return sum(len(doc.page_content) for doc in docs) // len(docs)
# Compare the average document size before and after chunking.
avg_char_count_before = avg_doc_length(doc_before_split)
avg_char_count_after = avg_doc_length(doc_after_split)

print(
    f'Average number of characters in original docs: {avg_char_count_before}',
    f'Average number of characters in split docs: {avg_char_count_after}',
    sep='\n',
)

Average number of characters in original docs: 1618
Average number of characters in split docs: 756


# 3. Creating the RAG System Components

## 3.1. Generating Embeddings

In [None]:
# Embedding client for the text-embedding-004 model, authenticated with the
# key held in the GOOGLE_API_KEY environment variable.
embeddings = GoogleGenerativeAIEmbeddings(
    google_api_key=os.environ.get("GOOGLE_API_KEY"),
    model="models/text-embedding-004",
)

In [None]:
# Embed the first chunk to check that the embedding call works and to
# inspect the vector's shape and leading values.
first_chunk_text = doc_after_split[0].page_content
sample_embedding = np.array(embeddings.embed_query(first_chunk_text))
print(
    f'Embedding shape: {sample_embedding.shape}',
    f'Embedding values: {sample_embedding[:10]}',
    sep='\n',
)

Embedding shape: (768,)
Embedding values: [ 0.06530181  0.01385744  0.02795277 -0.00104217 -0.01671013  0.00738504
  0.00435269  0.05256361  0.02740174  0.05536716]


## 3.2. Building the Vector Store (FAISS)

In [None]:
# Build the FAISS store from the split chunks using the embedding model.
faiss_vector_store = FAISS.from_documents(
    documents=doc_after_split,
    embedding=embeddings,
)


## 3.3. Initializing the Retriever

In [None]:
# Wrap the FAISS store in a similarity retriever configured with k = 5.
retriever_options = {'search_type': 'similarity', 'search_kwargs': {'k': 5}}
faiss_retriever = faiss_vector_store.as_retriever(**retriever_options)

# Print the concrete class to confirm that a retriever object was created.
print(f"FAISS Retriever Type: {type(faiss_retriever)}")

FAISS Retriever Type: <class 'langchain_core.vectorstores.base.VectorStoreRetriever'>


# 4. Testing and Verification

In [None]:
# Sample questions used to exercise the retriever and, later, the RAG chain.
queries = [
    'What is a "Member" according to this policy?',
    'What is the grace period for paying premiums?',
    'Who is considered a "Dependent"?'
]

## 4.1. Retrieving Relevant Documents

In [None]:
query = queries[0]

# `invoke` is the supported Runnable entry point for retrievers;
# `get_relevant_documents` is deprecated in langchain-core.
relevant_docs = faiss_retriever.invoke(query)
print(f'Number of relevant docs: {len(relevant_docs)}')
# Show at most the top three hits, separated by a row of asterisks. Slicing
# (instead of indexing [0], [1], [2]) avoids an IndexError when the retriever
# returns fewer than three documents.
for rank, doc in enumerate(relevant_docs[:3]):
    if rank > 0:
        print('*'*100)
    print(doc)

Number of relevant docs: 5
page_content='Payment of premium b eyond the effective date of the change constitutes the Policyholder's 
consent to the change. 
 
 
Article 3 - Policyholder Eligibility Requirements 
 
To be an eligible group and to remain an eligible group, the Policyholder must:' metadata={'producer': 'ADEP Document Services - PDF Generator', 'creator': 'Acrobat PDFMaker 10.1 for Word', 'creationdate': '2014-07-16T12:50:27-05:00', 'author': 'Apache POI', 'company': 'Principal Financial Group', 'contenttypeid': '0x010100963E1E66C16CFF4188CB6D2716FCB7F5', 'itemretentionformula': '', 'moddate': '2014-07-16T12:50:34-05:00', 'order': '467600.000000', 'paper copies': '1', 'sourcemodified': 'D:20140716175024', 'title': 'Life Policy', '_copysource': '2486961970434790079.docx', '_dlc_policyid': '', 'source': 'Policy_Docs/Principal-Sample-Life-Insurance-Policy.pdf', 'total_pages': 64, 'page': 15, 'page_label': '16'}
******************************************************************

In [None]:
query = queries[1]

# `invoke` is the supported Runnable entry point for retrievers;
# `get_relevant_documents` is deprecated in langchain-core.
relevant_docs = faiss_retriever.invoke(query)
print(f'Number of relevant docs: {len(relevant_docs)}')
# Show at most the top three hits, separated by a row of asterisks. Slicing
# (instead of indexing [0], [1], [2]) avoids an IndexError when the retriever
# returns fewer than three documents.
for rank, doc in enumerate(relevant_docs[:3]):
    if rank > 0:
        print('*'*100)
    print(doc)

Number of relevant docs: 5
page_content='Payment of premium b eyond the effective date of the change constitutes the Policyholder's 
consent to the change. 
 
 
Article 3 - Policyholder Eligibility Requirements 
 
To be an eligible group and to remain an eligible group, the Policyholder must:' metadata={'producer': 'ADEP Document Services - PDF Generator', 'creator': 'Acrobat PDFMaker 10.1 for Word', 'creationdate': '2014-07-16T12:50:27-05:00', 'author': 'Apache POI', 'company': 'Principal Financial Group', 'contenttypeid': '0x010100963E1E66C16CFF4188CB6D2716FCB7F5', 'itemretentionformula': '', 'moddate': '2014-07-16T12:50:34-05:00', 'order': '467600.000000', 'paper copies': '1', 'sourcemodified': 'D:20140716175024', 'title': 'Life Policy', '_copysource': '2486961970434790079.docx', '_dlc_policyid': '', 'source': 'Policy_Docs/Principal-Sample-Life-Insurance-Policy.pdf', 'total_pages': 64, 'page': 15, 'page_label': '16'}
******************************************************************

In [None]:
query = queries[2]

# `invoke` is the supported Runnable entry point for retrievers;
# `get_relevant_documents` is deprecated in langchain-core.
relevant_docs = faiss_retriever.invoke(query)
print(f'Number of relevant docs: {len(relevant_docs)}')
# Show at most the top three hits, separated by a row of asterisks. Slicing
# (instead of indexing [0], [1], [2]) avoids an IndexError when the retriever
# returns fewer than three documents.
for rank, doc in enumerate(relevant_docs[:3]):
    if rank > 0:
        print('*'*100)
    print(doc)

Number of relevant docs: 5
page_content='Dependent is acquired, provided the new Dependent is not then confined in a Hospital or 
Skilled Nursing Facility.  Requests for insurance and Proof of Good Health are not required' metadata={'producer': 'ADEP Document Services - PDF Generator', 'creator': 'Acrobat PDFMaker 10.1 for Word', 'creationdate': '2014-07-16T12:50:27-05:00', 'author': 'Apache POI', 'company': 'Principal Financial Group', 'contenttypeid': '0x010100963E1E66C16CFF4188CB6D2716FCB7F5', 'itemretentionformula': '', 'moddate': '2014-07-16T12:50:34-05:00', 'order': '467600.000000', 'paper copies': '1', 'sourcemodified': 'D:20140716175024', 'title': 'Life Policy', '_copysource': '2486961970434790079.docx', '_dlc_policyid': '', 'source': 'Policy_Docs/Principal-Sample-Life-Insurance-Policy.pdf', 'total_pages': 64, 'page': 32, 'page_label': '33'}
****************************************************************************************************
page_content='Dependent 
 
a. A Membe

# 5. Setting Up the Language Model (LLM) Chain

## 5.1. Initializing the LLM

In [None]:

# Gemini chat model used for answer generation, configured with a low
# temperature (0.1) and an output cap of 1000 tokens.
llm = ChatGoogleGenerativeAI(
    google_api_key=os.environ.get("GOOGLE_API_KEY"),
    model="gemini-1.5-flash",
    max_output_tokens=1000,
    temperature=0.1,
)



## 5.2. Defining the RAG Prompt Template

In [None]:
# Parser used as the last step of the chain so the result is a plain string.
output_parser = StrOutputParser()

# Instruction template for the model; the {context} and {question}
# placeholders are filled in when the chain runs.
# NOTE(review): the rule text ("Sentense") is sent to the model verbatim.
prompt_temlate = """
You are a helpful assistant. Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Rules:
1. Complete Sentense
2. Structure response
3. Contextual relevant response

Context: {context}

Question: {question}
Helpful Answer:
"""

PROMPT = ChatPromptTemplate.from_template(prompt_temlate)

## 5.3. Building the Full RAG Chain

In [None]:
def format_docs(docs):
    """Join the text of the retrieved documents into one context string.

    Only ``page_content`` is kept, so the prompt is not filled with the
    ``Document(metadata=...)`` repr of every hit. Chunks are separated by a
    blank line.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        A single string containing the chunk texts.
    """
    return '\n\n'.join(doc.page_content for doc in docs)


# RAG pipeline: retrieve chunks for the question, flatten them to text, fill
# the prompt, call the model and parse the reply to a string. The question is
# passed through unchanged to the {question} placeholder.
chain = (
    {'context': faiss_retriever | format_docs, 'question': RunnablePassthrough()}
    | PROMPT
    | llm
    | output_parser
)

# 6. Running the Full RAG Pipeline

## 6.1. Executing Queries

In [None]:
# Send each sample question through the RAG chain and print the answer,
# followed by a dashed separator line.
separator = '-' * 50
for query in queries:
    print(f'Query: {query}')
    response = chain.invoke(query)
    print(f'Response: {response}')
    print(separator)

Query: What is a "Member" according to this policy?
Response: Based on the provided text, a "Member" is an individual who has completed 30 consecutive days of continuous Active Work with the Policyholder.  A Member is eligible for Member Life Insurance and potentially Member Accidental Death and Dismemberment Insurance, depending on their eligibility date.  Members are not required to contribute to the premium for their own insurance, but may be required to contribute to their dependents' insurance.
--------------------------------------------------
Query: What is the grace period for paying premiums?
Response: The grace period for paying premiums is 31 days.
--------------------------------------------------
Query: Who is considered a "Dependent"?
Response: Based on the provided text, a dependent is defined in several ways:

*   **A Member's spouse:**  Must be legally married to the Member, not in the Armed Forces of any country, and not insured under the Group Policy as a Member. A C