### Rag pipeline-Data Ingestion to vectorDB

In [8]:
import os
from langchain_community.document_loaders import PyPDFLoader , PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path


In [10]:
### read all pdfs in the directory


def process_all_pdfs(pdf_directory):
    """process all pdfs in the given directory"""
    all_documents = []
    pdf_dir = Path(pdf_directory)

    # find all pdf files recursively
    pdf_files = list(pdf_dir.glob("**/*.pdf"))
    print(f"Found {len(pdf_files)} PDF files to process")
    for pdf_file in pdf_files:
        print(f"\nProcessing file: {pdf_file.name}")
        try:
            loader = PyMuPDFLoader(str(pdf_file))
            documents = loader.load()

            # add source information to metadata
            for doc in documents:
                doc.metadata["source_file"] = pdf_file.name
                doc.metadata['file_type'] = 'pdf'
            all_documents.extend(documents)
            print(f"Loaded {len(documents)} pages")
        except Exception as e:
            print(f"Error processing file {e}")
    print(f"\nTotal documents loaded from all PDFs: {len(all_documents)}")
    return all_documents

# process all pdfs in the data directory
all_pdf_documents = process_all_pdfs("../data")


Found 2 PDF files to process

Processing file: equal-employment-opportunity-policy.pdf
Loaded 6 pages

Processing file: UST.pdf
Loaded 1 pages

Total documents loaded from all PDFs: 7


In [11]:
all_pdf_documents

[Document(metadata={'producer': 'PDFium', 'creator': 'PDFium', 'creationdate': 'D:20260119121503', 'source': '..\\data\\pdf\\equal-employment-opportunity-policy.pdf', 'file_path': '..\\data\\pdf\\equal-employment-opportunity-policy.pdf', 'total_pages': 6, 'format': 'PDF 1.7', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '', 'trapped': '', 'modDate': '', 'creationDate': 'D:20260119121503', 'page': 0, 'source_file': 'equal-employment-opportunity-policy.pdf', 'file_type': 'pdf'}, page_content='People Policy Document: \nEqual Employment \nOpportunity Policy \nVersion 1.0 \n \nust.com'),
 Document(metadata={'producer': 'PDFium', 'creator': 'PDFium', 'creationdate': 'D:20260119121503', 'source': '..\\data\\pdf\\equal-employment-opportunity-policy.pdf', 'file_path': '..\\data\\pdf\\equal-employment-opportunity-policy.pdf', 'total_pages': 6, 'format': 'PDF 1.7', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '', 'trapped': '', 'modDate': '', 'creat

In [16]:
### text splitting get into chunks
def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    """split documents into smaller chunks"""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", " ", ""]
    )
    split_docs = text_splitter.split_documents(documents)
    print(f"Split  {len(documents)} documents into {len(split_docs)} chunks")
    
    ## example of a chunk
    if split_docs:
        print("\nExample chunk:")
        print(f"Content : {split_docs[0].page_content[:200]}...")  # this prints first 200 characters of the first chunk
        print(f"Metadata: {split_docs[0].metadata}")
    return split_docs

In [17]:
chunks= split_documents(all_pdf_documents)
chunks

Split  7 documents into 12 chunks

Example chunk:
Content : People Policy Document: 
Equal Employment 
Opportunity Policy 
Version 1.0 
 
ust.com...
Metadata: {'producer': 'PDFium', 'creator': 'PDFium', 'creationdate': 'D:20260119121503', 'source': '..\\data\\pdf\\equal-employment-opportunity-policy.pdf', 'file_path': '..\\data\\pdf\\equal-employment-opportunity-policy.pdf', 'total_pages': 6, 'format': 'PDF 1.7', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '', 'trapped': '', 'modDate': '', 'creationDate': 'D:20260119121503', 'page': 0, 'source_file': 'equal-employment-opportunity-policy.pdf', 'file_type': 'pdf'}


[Document(metadata={'producer': 'PDFium', 'creator': 'PDFium', 'creationdate': 'D:20260119121503', 'source': '..\\data\\pdf\\equal-employment-opportunity-policy.pdf', 'file_path': '..\\data\\pdf\\equal-employment-opportunity-policy.pdf', 'total_pages': 6, 'format': 'PDF 1.7', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '', 'trapped': '', 'modDate': '', 'creationDate': 'D:20260119121503', 'page': 0, 'source_file': 'equal-employment-opportunity-policy.pdf', 'file_type': 'pdf'}, page_content='People Policy Document: \nEqual Employment \nOpportunity Policy \nVersion 1.0 \n \nust.com'),
 Document(metadata={'producer': 'PDFium', 'creator': 'PDFium', 'creationdate': 'D:20260119121503', 'source': '..\\data\\pdf\\equal-employment-opportunity-policy.pdf', 'file_path': '..\\data\\pdf\\equal-employment-opportunity-policy.pdf', 'total_pages': 6, 'format': 'PDF 1.7', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '', 'trapped': '', 'modDate': '', 'creat

### converting text into vectors and embedding

In [None]:
import sys
!{sys.executable} -m pip install sentence-transformers chromadb


In [50]:
import sys
!{sys.executable} -m pip install --upgrade certifi




In [51]:
import ssl
import certifi

ssl_context = ssl.create_default_context(cafile=certifi.where())


In [54]:
import ssl
import ssl
ssl._create_default_https_context = ssl._create_unverified_context



In [43]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any,Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [57]:
class EmbeddingManager:
    """This Handles document embedding generation using SentenceTransformer"""
    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        """this initializes the embedding manager and that str name at the end is the hugging face model name that i have used for sentence embeddings"""
        self.model_name = model_name
        self.model = None
        self._load_model()
    
    def _load_model(self):
        """load the sentence transformer model"""
        try:
            print(f"Loading embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model loaded successfully Embedding dimention: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model {self.model_name} : {e}")
            raise

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """generate embeddings for a list of texts
        texts: List of strings to generate embeddings for
        returns: numpy array of embeddings"""
        if not self.model:
            raise ValueError("Model not loaded")
        print(f"Generating embeddings for {len(texts)} texts")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"Generated embeddings with shape: {embeddings.shape}")
        return embeddings
    

# initializing the embedding manager
embedding_manager = EmbeddingManager()
embedding_manager

Loading embedding model: all-MiniLM-L6-v2


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Model loaded successfully Embedding dimention: 384


<__main__.EmbeddingManager at 0x1997d4cb020>

### VectorDBStore

In [82]:
class VectorStore:
    """This class manages a vector store using ChromaDB for storing and querying document embeddings."""
    def __init__(self, collection_name: str = "pdf_documents", persist_directory: str = "../data/vector_store/"):

        """Initialize the vector store and create a collection.
        collection_name: Name of the collection to create/use
        persist_directory: Directory to persist the vector store data
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self._initialize_store()


    def _initialize_store(self):
        """Initialize the ChromaDB client and create/get the collection."""
        try:
            #creating a ChromaDB client
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)
          
          #get or create the collection
            self.collection = self.client.get_or_create_collection(
                 name=self.collection_name,
                 metadata={"description":"PDF document embeddings for RAG"}
                 )
            print(f"Vector store initialized with collection: {self.collection_name}")
            print(f"existing number of documents in the collection: {self.collection.count()}")
        
        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise
    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """Add documents to the vector store after generating embeddings.
        
        Arguments:
        documents: List of document dicts with 'id', 'text', and 'metadata'
        embeddings: Numpy array of embeddings corresponding to the documents
        """
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents and embeddings must match")
        print(f"Adding {len(documents)} documents to the vector store")

        #prepare data for chromadb
        ids = []
        documents_text = []
        metadatas= []
        embeddings_list=[]

        for i,(doc,embedding) in enumerate(zip(documents,embeddings)):
            
            #generating a unique ID
            doc_id=f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)
        #prepare metadata
        metadata=dict(doc.metadata)
        metadata['doc_index']=i
        metadata['content_length']=len(doc.page_content)
        metadatas.append(metadata)

        #document content
        documents_text.append(doc.page_content)

        #embedding list

        embeddings_list.append(embedding.tolist())

        #add to collection
        try:
            self.collection.add(
                ids=ids,
                embeddings=embeddings_list,
                metadatas= metadatas,
                documents= documents_text,
                
                
            )
            print(f"Successfully added {len(documents)} documents to the vector store")
            print(f"Total documents in the collection now: {self.collection.count()}")
        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise
VectorStore= VectorStore()
VectorStore








Vector store initialized with collection: pdf_documents
existing number of documents in the collection: 0


<__main__.VectorStore at 0x1997d5f4740>

In [83]:
### converting the text to embeddings
texts=[doc.page_content for doc in chunks]


### generating embeddings
embeddings = embedding_manager.generate_embeddings(texts)

### storing embeddings in the vector database

VectorStore.add_documents(chunks, embeddings)

Generating embeddings for 12 texts


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches: 100%|██████████| 1/1 [00:01<00:00,  1.24s/it]

Generated embeddings with shape: (12, 384)
Adding 12 documents to the vector store
Error adding documents to vector store: Unequal lengths for fields: ids: 12, metadatas: 1, embeddings: 1, documents: 1 in add.





ValueError: Unequal lengths for fields: ids: 12, metadatas: 1, embeddings: 1, documents: 1 in add.