In [1]:
import os
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# SECURITY: API keys must never be committed in a notebook. The key that was
# previously hardcoded on this line has been exposed and should be revoked and
# rotated in the Pinecone console.
# Export PINECONE_API_KEY in the environment before starting the kernel.
pinecone_api_key = os.environ.get("PINECONE_API_KEY")
if not pinecone_api_key:
    # Fail early with a clear message instead of an opaque auth error later.
    raise RuntimeError("PINECONE_API_KEY environment variable is not set.")
pc = Pinecone(api_key=pinecone_api_key)

In [11]:
# Handle to the Pinecone index named "loan-laws"; the upsert, stats, and query
# cells below all go through this object.
index = pc.Index("loan-laws")

In [12]:
# Directory holding the source PDFs. Override with the LOAN_PDF_DIR environment
# variable on other machines; the default keeps the original location so the
# generated "source" metadata and chunk IDs are unchanged.
PDF_DIR = os.environ.get("LOAN_PDF_DIR", "/Users/haggarwal/Documents/legal-analyzer-app/temp")
pdf_files = [os.path.join(PDF_DIR, name) for name in ("loan_1.pdf", "loan_2.pdf", "loan_3.pdf")]

# The splitter is stateless configuration, so build it once instead of once per page.
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

docs = []
for pdf in pdf_files:
    pages = PyPDFLoader(pdf).load()
    # Pages are numbered from 1 in the metadata and in the chunk IDs.
    for page_num, page in enumerate(pages, start=1):
        # Each page is split on its own, so a chunk never spans two pages.
        for i, chunk in enumerate(splitter.split_documents([page])):
            chunk.metadata.update({
                "source": pdf,
                "page": page_num,
                # Unique per (file, page, chunk); used later as the vector ID.
                "chunk_id": f"{pdf}_p{page_num}_c{i}"
            })
            docs.append(chunk)

print(f"Total chunks: {len(docs)}")

Total chunks: 804


In [13]:
# Display the first chunk to sanity-check the extracted text and its metadata.
docs[0]

Document(metadata={'producer': 'Microsoft® Word 2019', 'creator': 'Microsoft® Word 2019', 'creationdate': '2021-06-08T14:50:53-07:00', 'author': 'HP', 'moddate': '2021-06-08T14:50:53-07:00', 'source': '/Users/haggarwal/Documents/legal-analyzer-app/temp/loan_1.pdf', 'total_pages': 106, 'page': 1, 'page_label': '1', 'chunk_id': '/Users/haggarwal/Documents/legal-analyzer-app/temp/loan_1.pdf_p1_c0'}, page_content='1 \n \nTHE BANKING REGULATION ACT, 1949 \n__________ \nARRANGEMENT OF SECTIONS \n____________ \nPART I \nPRELIMINARY \nSECTIONS \n1. Short title, extent and commencement. \n2. Application of other laws not barred. \n3. Act not to apply to certain co-operative societies. \n4. Power to suspend operation of Act. \n5. Interpretation. \n5A. Act to override memorandum, articles, etc. \nPART II \nBUSINESS OF BANKING COMPANIES \n6. Form of business in which banking companies may engage. \n7. Use of words “bank”, “banker”, “banking” or “banking company”. \n8. Prohibition of trading. \n9. 

In [14]:
# Embedding model shared by the document chunks and the search query, so both
# are encoded with the same model.
embedder = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")  

In [15]:
# Encode all chunks in one batched call: encode() accepts a list of texts and
# batches internally, which is much faster than one call per chunk.
texts = [doc.page_content for doc in docs]
# Skip the model call entirely when there is nothing to embed.
embeddings = embedder.encode(texts, batch_size=32, show_progress_bar=True) if texts else []

# One Pinecone record per chunk: the chunk ID, its embedding, and the metadata
# the query cell needs to show a result (source file, page, raw text).
vectors = [
    {
        "id": doc.metadata["chunk_id"],
        "values": emb.tolist(),  # numpy -> list
        "metadata": {
            "source": doc.metadata["source"],
            "page": doc.metadata["page"],
            "text": doc.page_content,
        },
    }
    for doc, emb in zip(docs, embeddings)
]

In [25]:
# A single index.upsert(vectors) call with every vector fails once the request
# grows past the upsert size limit (about 2 MB per the original note), so the
# vectors are uploaded in batches instead.

from tqdm import tqdm

def batch_upsert(index, vectors, namespace=None, batch_size=100):
    """Upsert ``vectors`` into ``index`` in fixed-size batches.

    Args:
        index: Pinecone index handle exposing ``upsert(vectors=..., namespace=...)``.
        vectors: Sequence of record dicts (``id``, ``values``, ``metadata``).
        namespace: Target namespace. ``None`` leaves the argument out so the
            index's default namespace is used.
        batch_size: Maximum number of records per upsert request; must be >= 1.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A negative step would yield an empty range and silently upload nothing.
        raise ValueError("batch_size must be a positive integer")

    # Forward the namespace only when one was requested. Previously the
    # parameter was accepted but never passed on, so it had no effect.
    upsert_kwargs = {} if namespace is None else {"namespace": namespace}

    for start in tqdm(range(0, len(vectors), batch_size)):
        index.upsert(vectors=vectors[start:start + batch_size], **upsert_kwargs)
    print("✅ Data upserted successfully in batches.")

# Write to the default namespace: the query in this notebook does not pass a
# namespace, so it reads from the default one.
batch_upsert(index, vectors, batch_size=100)


100%|██████████| 9/9 [00:55<00:00,  6.16s/it]

✅ Data upserted successfully in batches.





In [26]:
# Verify the upload: the reported total and per-namespace vector counts should
# match the number of chunks that were upserted.
print(index.describe_index_stats())

{'dimension': 768,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 804}},
 'total_vector_count': 804,
 'vector_type': 'dense'}


In [27]:
query = "What if I delay my loan EMI by 1 month?"
# Embed the question with the same model that produced the stored vectors.
q_emb = embedder.encode(query).tolist()

# Fetch the five nearest chunks together with their stored metadata.
results = index.query(
    vector=q_emb,
    top_k=5,
    include_metadata=True
)

for match in results["matches"]:
    metadata = match["metadata"]
    # The stored integer page number comes back as a float (e.g. 24.0), so
    # cast it to int for display.
    page = int(metadata["page"])
    print(f"Score: {match['score']:.3f}")
    print(f"Source: {metadata['source']} (p{page})")
    print(f"Text: {metadata['text'][:200]}...")
    print("---")


Score: 0.348
Source: /Users/haggarwal/Documents/legal-analyzer-app/temp/loan_2.pdf (p24.0)
Text: sub-section (1). 
(6) If the application is not disposed of by the Debts Recovery Tribunal within the period of four 
months as specified in sub-section (5), any part to the application may make an ap...
---
Score: 0.343
Source: /Users/haggarwal/Documents/legal-analyzer-app/temp/loan_2.pdf (p29.0)
Text: (b) that on other grounds, it is just and equitable to grant relief, 
may, on the appli cation of a secured creditor or 5[asset reconstruction company ] or any other person 
interested on such terms a...
---
Score: 0.343
Source: /Users/haggarwal/Documents/legal-analyzer-app/temp/loan_1.pdf (p26.0)
Text: which the loan or advance was made, or is granted by a banking company after the commencement of 
section 5 of the Banking Laws (Amendment) Act, 1968 (58 of 1968), but in pursuance of a commitment 
en...
---
Score: 0.342
Source: /Users/haggarwal/Documents/legal-analyzer-app/temp/loan_2.pdf (p