In [9]:
# This notebook will generate vector stores which consists of two data 
# 1. FAQ generated using llm to handle generic questions.
# 2. Pdf of data to handle context based questions specific to that pdf

# Adding both in vector store so that later used in integration

# 1. FAQ generated using llm to handle generic questions.


In [10]:
from langchain_community.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain_ollama.llms import OllamaLLM
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document  # Import Document schema
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.document_loaders import PyPDFLoader



In [11]:
# to check the version of libraries installed

import pkg_resources

# List of packages to check
packages = [
    "langchain_community", 
    "langchain", 
    "langchain_ollama", 
    "pandas", 
    "langchain_google_genai"
]

# Print package versions
for package in packages:
    try:
        version = pkg_resources.get_distribution(package).version
        print(f"{package} version: {version}")
    except pkg_resources.DistributionNotFound:
        print(f"{package} is not installed.")

langchain_community version: 0.2.10
langchain version: 0.2.12
langchain_ollama version: 0.1.3
pandas version: 2.2.2
langchain_google_genai version: 1.0.10


In [12]:
llm = OllamaLLM(model="gemma")

In [13]:
prompt_template = """
You are an expert analyzer. Create total {count} multiple FAQs in the context of {context_name} insurance policies. 
Provide the output in the following JSON format, not string format:
[
    {{
        "id": 1,
        "question": "What is company insurance?",
        "answer": "Company insurance is..."
    }},
    {{
        "id": 2,
        "question": "How does liability insurance work?",
        "answer": "Liability insurance works..."
    }}
]
"""

prompt = PromptTemplate(input_variables=["context_name","count"], template=prompt_template)

print(prompt)

input_variables=['context_name', 'count'] template='\nYou are an expert analyzer. Create total {count} multiple FAQs in the context of {context_name} insurance policies. \nProvide the output in the following JSON format, not string format:\n[\n    {{\n        "id": 1,\n        "question": "What is company insurance?",\n        "answer": "Company insurance is..."\n    }},\n    {{\n        "id": 2,\n        "question": "How does liability insurance work?",\n        "answer": "Liability insurance works..."\n    }}\n]\n'


In [14]:
# Create an LLMChain for processing the prompt

chain = prompt | llm

In [15]:
print(chain)

first=PromptTemplate(input_variables=['context_name', 'count'], template='\nYou are an expert analyzer. Create total {count} multiple FAQs in the context of {context_name} insurance policies. \nProvide the output in the following JSON format, not string format:\n[\n    {{\n        "id": 1,\n        "question": "What is company insurance?",\n        "answer": "Company insurance is..."\n    }},\n    {{\n        "id": 2,\n        "question": "How does liability insurance work?",\n        "answer": "Liability insurance works..."\n    }}\n]\n') last=OllamaLLM(model='gemma', _client=<ollama._client.Client object at 0x000001BEF8DB3E60>, _async_client=<ollama._client.AsyncClient object at 0x000001BEFC069130>)


In [16]:
response = chain.invoke(input={"context_name": "Tech Mahindra employees insurance policy", "count": 5})
print(response)

[
    {
        "id": 1,
        "question": "What is the purpose of Tech Mahindra's employee insurance policy?",
        "answer": "The policy provides financial protection to employees in the event of illness, injury, or other covered events."
    },
    {
        "id": 2,
        "question": "What types of coverage are typically included in the policy?",
        "answer": "The policy covers hospitalization, surgical expenses, maternity expenses, critical illnesses, and other specified benefits."
    },
    {
        "id": 3,
        "question": "How do employees qualify for coverage under the policy?",
        "answer": "Employees are eligible for coverage based on their employment status and company policy guidelines."
    },
    {
        "id": 4,
        "question": "What is the claims process for the employee insurance policy?",
        "answer": "Employees should submit a claim form along with necessary documentation to the insurance provider."
    },
    {
        "id": 5,
   

In [17]:
# storing above response in a json file

import json

try:
    # Parse the string response to valid JSON format
    parsed_output = json.loads(response)
except json.JSONDecodeError as e:
    print(f"Error decoding JSON: {e}")
    parsed_output = {}

# Save the output to a JSON file
output_file_path = "output_faqs.json"

with open(output_file_path, "w") as json_file:
    json.dump(parsed_output, json_file, indent=4)

print(f"Output saved to {output_file_path}")

Output saved to output_faqs.json


In [18]:
# converting to a dataframe
df = pd.DataFrame(parsed_output)

df.to_csv("faqs.csv")

In [19]:
df = pd.read_csv("faqs.csv",usecols=["id","question","answer"] )
df

Unnamed: 0,id,question,answer
0,1,What is the purpose of Tech Mahindra's employe...,The policy provides financial protection to em...
1,2,What types of coverage are typically included ...,"The policy covers hospitalization, surgical ex..."
2,3,How do employees qualify for coverage under th...,Employees are eligible for coverage based on t...
3,4,What is the claims process for the employee in...,Employees should submit a claim form along wit...
4,5,Where can employees find more information abou...,Employees can refer to the company's intranet ...


In [20]:
df['content'] = df['question'] + "\n" + df['answer']
df

Unnamed: 0,id,question,answer,content
0,1,What is the purpose of Tech Mahindra's employe...,The policy provides financial protection to em...,What is the purpose of Tech Mahindra's employe...
1,2,What types of coverage are typically included ...,"The policy covers hospitalization, surgical ex...",What types of coverage are typically included ...
2,3,How do employees qualify for coverage under th...,Employees are eligible for coverage based on t...,How do employees qualify for coverage under th...
3,4,What is the claims process for the employee in...,Employees should submit a claim form along wit...,What is the claims process for the employee in...
4,5,Where can employees find more information abou...,Employees can refer to the company's intranet ...,Where can employees find more information abou...


In [21]:
documents = [Document(page_content=row['answer'], metadata={"id": row['id']}) for _, row in df.iterrows()]
documents

[Document(metadata={'id': 1}, page_content='The policy provides financial protection to employees in the event of illness, injury, or other covered events.'),
 Document(metadata={'id': 2}, page_content='The policy covers hospitalization, surgical expenses, maternity expenses, critical illnesses, and other specified benefits.'),
 Document(metadata={'id': 3}, page_content='Employees are eligible for coverage based on their employment status and company policy guidelines.'),
 Document(metadata={'id': 4}, page_content='Employees should submit a claim form along with necessary documentation to the insurance provider.'),
 Document(metadata={'id': 5}, page_content="Employees can refer to the company's intranet or contact the HR department for detailed policy information and guidelines.")]

In [22]:
splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=1)
split_docs = splitter.split_documents(documents)
split_docs 

[Document(metadata={'id': 1}, page_content='The policy provides financial protection to employees in the event of illness, injury, or other'),
 Document(metadata={'id': 1}, page_content='covered events.'),
 Document(metadata={'id': 2}, page_content='The policy covers hospitalization, surgical expenses, maternity expenses, critical illnesses, and'),
 Document(metadata={'id': 2}, page_content='other specified benefits.'),
 Document(metadata={'id': 3}, page_content='Employees are eligible for coverage based on their employment status and company policy guidelines.'),
 Document(metadata={'id': 4}, page_content='Employees should submit a claim form along with necessary documentation to the insurance provider.'),
 Document(metadata={'id': 5}, page_content="Employees can refer to the company's intranet or contact the HR department for detailed policy"),
 Document(metadata={'id': 5}, page_content='information and guidelines.')]

In [23]:
# before passing it to embedding model we have to convert it to strings

texts = [doc.page_content for doc in split_docs]
metadata = [doc.metadata for doc in split_docs]  #  metadata (e.g., id)


In [24]:
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
vectors = embeddings.embed_documents(texts)
len(vectors)

8

In [25]:
# in FAISS , we will strore below created vector_store in local to use it later
# we are not using or saving above created vectors as FAISS is capable of storing and embedding both

vector_store = FAISS.from_texts(
    texts=texts, 
    embedding=embeddings, 
    metadatas=metadata
)

In [26]:
# Path to save the FAISS index
faiss_index_path = "faiss_index.bin"

# Save the FAISS index to disk
vector_store.save_local(faiss_index_path)
print("FAISS index saved to", faiss_index_path)

FAISS index saved to faiss_index.bin


In [27]:
# loading stored FAISS  back here

loaded_vector_store = FAISS.load_local(faiss_index_path, embeddings, allow_dangerous_deserialization=True)


In [28]:
query = "What is included in the employee insurance policy?"
query_embedding = embeddings.embed_query(query)
results = loaded_vector_store.similarity_search(query, k=1) 
results

[Document(metadata={'id': 1}, page_content='The policy provides financial protection to employees in the event of illness, injury, or other')]

In [29]:
len(query_embedding)

768

In [30]:
# Display the results
for result in results:
    print("Matched Text:", result.page_content)
    print("Metadata:", result.metadata)

Matched Text: The policy provides financial protection to employees in the event of illness, injury, or other
Metadata: {'id': 1}


# 2. Insurance pdf to handle context based questions


In [31]:
from langchain_community.document_loaders import PyPDFLoader
pdf_file = PyPDFLoader(file_path = r"D:\GenAI/COVERSATIONAL_AI/EnrolPolicyConditions_24.pdf")

In [32]:
loader = pdf_file.load()

In [33]:
splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=5)
split_docs_pdf = splitter.split_documents(loader)


In [34]:
texts_pdf = [doc.page_content for doc in split_docs_pdf]
pdf_metadata = [{"source": "pdf", "page": i} for i in range(len(split_docs_pdf))]  # Example metadata


embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
vectors_pdf = embeddings.embed_documents(texts_pdf)
len(vectors_pdf)

448

In [35]:
vector_store.add_texts(texts=texts_pdf, metadatas=pdf_metadata)


['e1cbe4b3-b1c6-4d72-94ab-78b4b3a2954a',
 'a7913a1d-e7c3-4a3a-891b-c0bc39fbbc61',
 '074cbe58-142c-4c29-9c03-d09253db1d84',
 '6bf1cf8d-07df-4822-bba8-a737d51b7f60',
 '2ebf7097-2e42-4e8e-b3a0-cddcfc0c9c06',
 '141a6904-4547-4ea8-975b-eb35e1f1298f',
 'd93593f3-62fa-4a0e-834e-9bac88d0a9e2',
 'e26c2208-b78b-4f69-84ef-7f4d19f96050',
 '95da97f0-6b63-4b78-89ac-81ccd1be4f02',
 '5b28931a-222d-4eb2-b283-d308cba43253',
 '3001c8af-ebde-4687-83ea-09c65bfb5831',
 '5ce8495d-8562-4aa2-a22b-543bb8d5128e',
 'ba236c45-2f3b-4b7f-91df-c3c8bad548a0',
 'f0a1d42e-a240-47bf-b17d-88229a99bf4b',
 '2830a694-d03f-4f56-953e-2bc74629b6df',
 'bd2c0b0e-e7ea-45a1-9afb-b206e403b26b',
 'd2186f5a-2357-4869-a0e8-dcc5b4e85bf9',
 '8ace82b7-ac83-459a-a436-1c0d1551259a',
 '9e15d777-aadf-419e-ad51-4b14153d1da8',
 'b21de7e9-e0d8-499b-9229-1914c30a4534',
 '8c730927-ef84-48cb-823d-d503440a2cc6',
 '262b06e1-8947-4601-a20e-30284a9b034b',
 'e5a3695f-4cc9-4800-9b16-53fdfbb13db8',
 '12db486e-c0e1-42a4-a3fa-3ca21f6f9786',
 '77ba2597-1b53-

In [36]:
# saving again updated vector data base

# Path to save the FAISS index
faiss_index_path = "faiss_index.bin"

# Save the FAISS index to disk
vector_store.save_local(faiss_index_path)
print("FAISS index saved to", faiss_index_path)

FAISS index saved to faiss_index.bin


In [53]:
query = "What is cataract policy"
results = vector_store.similarity_search(query, k=1) 
results

[Document(metadata={'source': 'pdf', 'page': 132}, page_content='Cataract: 10% of the sum insured or INR 25,000, whichever is less and 10% co -pay for all dependents  \nin Base Policy and 15% co -pay in Top up policy for parents /In-laws  only .')]

In [41]:
# Display the results
for result in results:
    print("Matched Text:", result.page_content)
    print("Metadata:", result.metadata)

Matched Text: Cataract: 10% of the sum insured or INR 25,000, whichever is less and 10% co -pay for all dependents  
in Base Policy and 15% co -pay in Top up policy for parents /In-laws  only .
Metadata: {'source': 'pdf', 'page': 132}


In [44]:
result.page_content

'Cataract: 10% of the sum insured or INR 25,000, whichever is less and 10% co -pay for all dependents  \nin Base Policy and 15% co -pay in Top up policy for parents /In-laws  only .'

In [55]:
# above now we can see faiss vector store capable to answer either from faq or pdf

In [52]:
# as we can see that output is not that much human readable to taking help of llm to do so

new_prompt = """You are an expert conversational assistant. Based on the question {user_query} 
and the response {faiss_response}, summarize and refine the answer into a more concise, human-readable, and 
conversational format. Ensure the response is clear, engaging, and uses minimal words while maintaining the 
essence of the original answer.
"""

prompt_structure = PromptTemplate(input_variables=["user_query","faiss_response"], template = new_prompt)


chain = prompt_structure | llm
human_response = chain.invoke(input = {"user_query":query,"faiss_response":result.page_content})

print(human_response)


** Cataract coverage:**

* In the base policy, you're covered for **10% of your sum insured or INR 25,000, whichever is less**. 
* There's a **10% co-pay** for dependents.

* In the top-up policy, the co-pay for parents/in-laws is slightly higher at **15%**.


In [None]:
# now above answer seems perfect

# Auto completion

In [56]:
# auto completion

llm = OllamaLLM(model="gemma")


embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
faiss_index_path = "faiss_index.bin"


final_vectors = FAISS.load_local(faiss_index_path, embeddings, allow_dangerous_deserialization=True)

In [57]:
prompt_template = """
User's partial query {partial_query} and {faiss_context}

Try to create three user question starts with {partial_query}

that an user can ask with the mentioned context.Response should not contain any sepcial line chacter and 

all user question should be point wise with numberings
"""

# Create the PromptTemplate object
prompt = PromptTemplate(input_variables=["partial_query","faiss_context"], template=prompt_template)


# Set up LLMChain
chain = prompt | llm

In [58]:
partial_query = "what is my insurance"
faiss_context = final_vectors.similarity_search(partial_query,k = 1)
faiss_context_1 = " ".join([i.page_content for i in faiss_context])


In [59]:
faiss_context_1

'X-rays and such similar expens es that are medically necessary, subject to Policy T&C.'

In [60]:
response = chain.invoke(input={"partial_query":partial_query,"faiss_context":faiss_context_1})

In [61]:
print(response)

1. What is my insurance coverage for medically necessary expenses like X-rays?


2. What is my insurance policy's coverage for routine X-rays and other similar expenses?


3. Can you explain what my insurance covers for medically necessary expenses, such as X-rays?
