In [117]:
# import Libraries

import openai
import langchain
import pinecone 
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.llms import OpenAI

In [118]:
from dotenv import load_dotenv
# Load variables from a .env file into the process environment, so the
# os.environ[...] API-key lookups later in this notebook can find them.
load_dotenv()

True

In [119]:
import os

In [120]:
## Read the PDF documents from a directory
def read_doc(directory):
    """Load the PDFs under ``directory`` via ``PyPDFDirectoryLoader``.

    Args:
        directory: Folder path handed straight to ``PyPDFDirectoryLoader``.

    Returns:
        The result of the loader's ``load()`` call (the loaded documents).
    """
    return PyPDFDirectoryLoader(directory).load()

In [121]:
# Load everything under the relative "documents/" folder; `doc` holds the
# loader's result and is the input to the chunking step below.
doc = read_doc("documents/")

In [122]:
# Sanity check: how many documents the loader returned
len(doc)

13

In [123]:
def chunk_data(docs, chunk_size=800, chunk_overlap=50):
    """Split documents into smaller chunks.

    Args:
        docs: Documents passed to the splitter's ``split_documents``.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` (default 800).
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter`` (default 50).

    Returns:
        The chunks produced by ``split_documents``.
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    ).split_documents(docs)

In [124]:
# Chunk the loaded documents using chunk_data's defaults
# (chunk_size=800, chunk_overlap=50).
documents = chunk_data(docs=doc)

In [125]:
# Sanity check: number of chunks produced by the splitter
len(documents)

47

In [None]:
# Build the OpenAI embeddings client with the key read from the
# OPENAI_API_KEY environment variable (a KeyError is raised if it is unset).
embeddings= OpenAIEmbeddings(api_key=os.environ['OPENAI_API_KEY'])
# Echo the configured object as the cell output
embeddings

In [128]:
# print(os.environ['OPENAI_API_KEY'])

In [129]:
# Smoke test: embed one short query and inspect the vector length.
# This length has to match the dimension of the Pinecone index the vectors
# are upserted into.
vectors = embeddings.embed_query("how are you?")
len(vectors)

1536

In [130]:
# NOTE: this import rebinds the name `Pinecone`, shadowing the
# `langchain.vectorstores.Pinecone` class imported at the top of the notebook.
from pinecone import Pinecone
from langchain_pinecone import PineconeVectorStore
from langchain.embeddings.openai import OpenAIEmbeddings
import os

# Resolve the Pinecone API key. Prefer the conventionally spelled
# PINECONE_API_KEY; fall back to the misspelled PINCONE_API_KEY this notebook
# originally read, so existing .env files keep working. If neither is set the
# same KeyError('PINCONE_API_KEY') as before is raised.
pinecone_api_key = os.environ.get("PINECONE_API_KEY") or os.environ["PINCONE_API_KEY"]

# Initialize Pinecone client
pc = Pinecone(api_key=pinecone_api_key)

# Connect to the existing index
index = pc.Index("insurance")



In [139]:
# print(documents)

In [140]:
# Embed all chunks with one batched embed_documents call instead of issuing a
# separate embed_query request per chunk.
chunk_texts = [chunk.page_content for chunk in documents]
# Skip the call entirely when there is nothing to embed.
chunk_embeddings = embeddings.embed_documents(chunk_texts) if chunk_texts else []

# Every chunk must have exactly one embedding, otherwise the zip below would
# silently drop records.
assert len(chunk_embeddings) == len(documents), "embedding count does not match chunk count"

# Build the Pinecone upsert records: a positional ID, the embedding values,
# and metadata carrying the chunk text plus its source (or "unknown").
documents_with_embeddings = [
    {
        "id": f"doc_{i}",
        "values": embedding,
        "metadata": {
            "text": chunk.page_content,
            "source": chunk.metadata.get("source", "unknown"),
        },
    }
    for i, (chunk, embedding) in enumerate(zip(documents, chunk_embeddings))
]

In [141]:
# Sanity check: number of upsert records that were built
len(documents_with_embeddings)

47

In [142]:
# Send the records to the "ns1" namespace in slices of at most batch_size
batch_size = 100
batch_starts = range(0, len(documents_with_embeddings), batch_size)
for batch_number, start in enumerate(batch_starts, start=1):
    index.upsert(
        vectors=documents_with_embeddings[start:start + batch_size],
        namespace="ns1",
    )
    print(f"Upserted batch {batch_number}")

Upserted batch 1


In [143]:
# Initialize the LangChain vector store on top of the existing Pinecone index
from langchain_pinecone import PineconeVectorStore

# The API key is read from the conventionally spelled PINECONE_API_KEY first,
# falling back to the misspelled PINCONE_API_KEY this notebook originally used
# so existing .env files keep working (same KeyError as before if neither is set).
# NOTE(review): retrieval presumably relies on the store's default text key
# matching the "text" metadata field written during upsert — confirm.
vectorstore = PineconeVectorStore(
    index_name="insurance",
    embedding=embeddings,
    namespace="ns1",
    pinecone_api_key=(
        os.environ.get("PINECONE_API_KEY") or os.environ["PINCONE_API_KEY"]
    ),
)

In [146]:
# Search for similar documents with a query
query = "I want to dispute my claim denial"
results_with_score = vectorstore.similarity_search_with_score(query, k=2)

# Display the results.
# The loop variable is deliberately NOT named `doc`: that name holds the list
# of loaded PDF documents from earlier in the notebook, and rebinding it here
# would break re-running the chunking cell (`chunk_data(docs=doc)`).
for rank, (result_doc, score) in enumerate(results_with_score, start=1):
    print(f"Result {rank} (Score: {score}):\n")
    print(result_doc.page_content)
    print("\n---\n")

Result 1 (Score: 0.804803848):

annual Premium. 
F 6 When we will contest the validity of this Policy 
We have the right to contest the validity of this Policy, or the payment of the Death Benefit or any other Policy 
benefits, if you or any Life Insured under this Policy have incorrectly stated, misrepresented or failed to disclose a 
material fact in the application for insurance, or on any medical examination, or in any written or electronic 
statements or answers provided as evidence of insurability. 
Except in the case of fraud, we will not contest this Policy for misrepresentation after it has been in force for two (2) 
years during the lifetime of every Life Insured, from the later of the Coverage Date or the last date of reinstatement.

---

Result 2 (Score: 0.776828408):

applicants and whether or not an offer of insurance coverage can be made and under what terms Coverage is 
available. Such terms may include any combination of the following: the payment of an extra premium, 

In [108]:
# Check index stats to confirm documents were uploaded
# Re-acquire a handle to the "insurance" index from the existing client `pc`.
index = pc.Index("insurance")
# NOTE(review): the printed stats list every namespace plus the index-wide
# total, so the `namespace` argument may not actually filter — confirm
# against the Pinecone SDK.
stats = index.describe_index_stats(namespace="ns1")
print(stats)

{'dimension': 1536,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'ns1': {'vector_count': 47}},
 'total_vector_count': 47,
 'vector_type': 'dense'}


In [109]:
# Query Pinecone directly (bypassing LangChain): embed the question, then
# ask the index for the two closest vectors in namespace "ns1".
query_embedding = embeddings.embed_query(
    "What are the exclusions in this insurance policy?"
)
query_results = index.query(
    vector=query_embedding,
    top_k=2,
    namespace="ns1",
    include_metadata=True,
)

# Show each match's ID and score, then the first 200 characters of its text
for hit in query_results["matches"]:
    print(f"ID: {hit['id']}, Score: {hit['score']}")
    preview = hit["metadata"]["text"][:200]
    print(f"Content: {preview}...")
    print("\n---\n")

ID: doc_45, Score: 0.830493271
Content: Beneficiary restrictions: 
Your policy contains a provision restricting or removing your right to designate a beneficiary to receive any 
insurance money payable under the contract if, 
this coverage ...

---

ID: doc_44, Score: 0.825645924
Content: Specimen
Provincial amendments 
This policy contract is amended by adding the following provisions: 
Limitation of Actions: 
Every action or proceeding against an insurer for the recovery of insurance...

---

