Documentaization

In [40]:
from langchain_core.documents import Document
from pathlib import Path
import os
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path

In [32]:
doc=Document(page_content="This is a test document", metadata={"source": "test"})


In [41]:
### Read all the pdf's inside the directory
def process_all_pdfs(pdf_directory):
    """Process all PDF files in a directory"""
    all_documents = []
    pdf_dir = Path(pdf_directory)
    
    # Find all PDF files recursively
    pdf_files = list(pdf_dir.glob("**/*.pdf"))
    
    print(f"Found {len(pdf_files)} PDF files to process")
    
    for pdf_file in pdf_files:
        print(f"\nProcessing: {pdf_file.name}")
        try:
            loader = PyPDFLoader(str(pdf_file))
            documents = loader.load()
            
            # Add source information to metadata
            for doc in documents:
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type'] = 'pdf'
            
            all_documents.extend(documents)
            print(f"  ✓ Loaded {len(documents)} pages")
            
        except Exception as e:
            print(f"  ✗ Error: {e}")
    
    print(f"\nTotal documents loaded: {len(all_documents)}")
    return all_documents

# Process all PDFs in the data directory
all_pdf_documents = process_all_pdfs("../data")

Found 2 PDF files to process

Processing: Employee Handbook_Bhavna Corp_2.0  (1).pdf
  ✓ Loaded 20 pages

Processing: Leave Policy _V4 (1).pdf
  ✓ Loaded 10 pages

Total documents loaded: 30


Creating Chunks

In [42]:
### Text splitting get into chunks

def split_documents(documents,chunk_size=1000,chunk_overlap=200):
    """Split documents into smaller chunks for better RAG performance"""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )
    split_docs = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(split_docs)} chunks")
    
    # Show example of a chunk
    if split_docs:
        print(f"\nExample chunk:")
        print(f"Content: {split_docs[0].page_content[:200]}...")
        print(f"Metadata: {split_docs[0].metadata}")
    
    return split_docs

In [43]:
chunks=split_documents(all_pdf_documents)
chunks

Split 30 documents into 107 chunks

Example chunk:
Content: BSIPL-ISMS-POL-HREHB (Human Resource Employee Handbook)      
                            
© 2026, <Bhavna Software India Pvt Ltd >. All Rights Reserved 
Highly Confidential 
 
 
  
 
 
 
 
 
 
 
 
 
...
Metadata: {'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2025-12-31T11:31:12+05:30', 'author': 'Swetha Seethiraju', 'moddate': '2025-12-31T11:31:12+05:30', 'source': '..\\data\\Employee Handbook_Bhavna Corp_2.0  (1).pdf', 'total_pages': 20, 'page': 0, 'page_label': '1', 'source_file': 'Employee Handbook_Bhavna Corp_2.0  (1).pdf', 'file_type': 'pdf'}


[Document(metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2025-12-31T11:31:12+05:30', 'author': 'Swetha Seethiraju', 'moddate': '2025-12-31T11:31:12+05:30', 'source': '..\\data\\Employee Handbook_Bhavna Corp_2.0  (1).pdf', 'total_pages': 20, 'page': 0, 'page_label': '1', 'source_file': 'Employee Handbook_Bhavna Corp_2.0  (1).pdf', 'file_type': 'pdf'}, page_content='BSIPL-ISMS-POL-HREHB (Human Resource Employee Handbook)      \n                            \n© 2026, <Bhavna Software India Pvt Ltd >. All Rights Reserved \nHighly Confidential \n \n \n  \n \n \n \n \n \n \n \n \n \nEmployee Handbook \nDocument Code: BSIPL-ISMS-POL-HREHB \nVersion: 2.0'),
 Document(metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2025-12-31T11:31:12+05:30', 'author': 'Swetha Seethiraju', 'moddate': '2025-12-31T11:31:12+05:30', 'source': '..\\data\\Employee Han

Embedding and VectorStoreDb

In [44]:
import numpy as np
import os
from sentence_transformers import SentenceTransformer
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity
import faiss
# import chromadb                                      breaks on Python version:3.14.0
# from chromadb.config import Settings

In [72]:
import truststore
truststore.inject_into_ssl()


class EmbeddingManager:
    """Handles document embedding generation using SentenceTransformer"""
    
    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """
        Initialize the embedding manager
        
        Args:
            model_name: HuggingFace model name for sentence embeddings
        """
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """Load the SentenceTransformer model"""
        try:
            print(f"Loading embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model loaded successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            raise

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """
        Generate embeddings for a list of texts
        
        Args:
            texts: List of text strings to embed
            
        Returns:
            numpy array of embeddings with shape (len(texts), embedding_dim)
        """
        if not self.model:
            raise ValueError("Model not loaded")
        
        print(f"Generating embeddings for {len(texts)} texts...")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"Generated embeddings with shape: {embeddings.shape}")
        return embeddings
    


    ##initilize embedding manager
    embedding_manager = EmbeddingManager()
    embedding_manager

Loading embedding model: all-MiniLM-L6-v2


Loading weights: 100%|██████████| 103/103 [00:00<00:00, 561.84it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Model loaded successfully. Embedding dimension: 384


Vector Store

In [68]:
class VectorStore:
    """Manages document embeddings in a FAISS index with local file persistence"""
    
    def __init__(self, index_name: str = "pdf_documents", persist_directory: str = "../data/vector_store"):
        """
        Initialize the FAISS vector store
        
        Args:
            index_name: Base name for the FAISS and metadata files
            persist_directory: Directory to save/load the index files
        """
        self.index_name = index_name
        self.persist_directory = persist_directory
        self.index_path = os.path.join(persist_directory, f"{index_name}.index")
        self.metadata_path = os.path.join(persist_directory, f"{index_name}.pkl")
        
        self.index = None
        self.metadata = []  # To store the actual text chunks and IDs
        self._initialize_store()

    def _initialize_store(self):
        """Initialize FAISS index by loading from disk or creating a new one"""
        try:
            os.makedirs(self.persist_directory, exist_ok=True)
            
            if os.path.exists(self.index_path) and os.path.exists(self.metadata_path):
                # Load existing index and metadata
                self.index = faiss.read_index(self.index_path)
                with open(self.metadata_path, 'rb') as f:
                    self.metadata = pickle.load(f)
                print(f"Vector store loaded. Index: {self.index_name}")
                print(f"Existing documents in index: {len(self.metadata)}")
            else:
                # We don't initialize the index yet because we don't know the dimension
                # until the first document is added.
                print(f"New vector store created at {self.persist_directory}")
                
        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise


    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """
        Add documents and their embeddings to the FAISS index
        """
        if len(documents) != len(embeddings):
            raise ValueError("Document count must match embedding count")
        
        # FAISS requires float32
        vectors = embeddings.astype('float32')
        
        # Initialize the index if it's the first add
        if self.index is None:
            self.index = faiss.IndexFlatL2(vectors.shape[1])
        
        # Add to index
        self.index.add(vectors)
        
        # Save metadata and text to our local docstore
        for i, doc in enumerate(documents):
            self.docstore.append({
                "id": f"doc_{uuid.uuid4().hex[:8]}",
                "text": doc.page_content if hasattr(doc, 'page_content') else str(doc),
                "metadata": getattr(doc, 'metadata', {})
            })
            
        self._save()
        print(f"✅ Added {len(documents)} documents. Total: {len(self.docstore)}")

    def similarity_search(self, query_embedding: np.ndarray, k: int = 3) -> List[Dict]:
        """
        Search for relevant snippets
        """
        if self.index is None:
            return []
            
        # Reshape and search
        query_vector = query_embedding.astype('float32').reshape(1, -1)
        distances, indices = self.index.search(query_vector, k)
        
        results = []
        for i, idx in enumerate(indices[0]):
            if idx != -1 and idx < len(self.docstore):
                results.append({
                    "content": self.docstore[idx]["text"],
                    "metadata": self.docstore[idx]["metadata"],
                    "score": float(distances[0][i])
                })
        return results
        
vectorstore = VectorStore()
vectorstore

New vector store created at ../data/vector_store


<__main__.VectorStore at 0x1d1a32fc1a0>

In [47]:
chunks

[Document(metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2025-12-31T11:31:12+05:30', 'author': 'Swetha Seethiraju', 'moddate': '2025-12-31T11:31:12+05:30', 'source': '..\\data\\Employee Handbook_Bhavna Corp_2.0  (1).pdf', 'total_pages': 20, 'page': 0, 'page_label': '1', 'source_file': 'Employee Handbook_Bhavna Corp_2.0  (1).pdf', 'file_type': 'pdf'}, page_content='BSIPL-ISMS-POL-HREHB (Human Resource Employee Handbook)      \n                            \n© 2026, <Bhavna Software India Pvt Ltd >. All Rights Reserved \nHighly Confidential \n \n \n  \n \n \n \n \n \n \n \n \n \nEmployee Handbook \nDocument Code: BSIPL-ISMS-POL-HREHB \nVersion: 2.0'),
 Document(metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2025-12-31T11:31:12+05:30', 'author': 'Swetha Seethiraju', 'moddate': '2025-12-31T11:31:12+05:30', 'source': '..\\data\\Employee Han

In [71]:
# convert the text into embeddings 
texts=[doc.page_content for doc in chunks]

# generating embeddings 
embeddings=EmbeddingManager.generate_embeddings(texts)

# stroring the embeddings to the vector store
# vectorstore.add_documents(chunks, embeddings)


TypeError: EmbeddingManager.generate_embeddings() missing 1 required positional argument: 'texts'