# RAG Pipeline - Data Ingestion to Vector DB

In [1]:
import os
from langchain_community.document_loaders import PyMuPDFLoader, PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path

In [4]:
# !pip install pypdf

In [5]:
### Read all the pdfs inside the directory
def process_all_pdfs(pdf_directory):
    """Load every PDF found under ``pdf_directory`` into one flat list.

    The directory is searched recursively. Each PDF is read with
    ``PyPDFLoader`` and every document the loader returns gets two extra
    metadata keys: ``source_file`` (the PDF's file name) and ``file_type``
    (always ``'pdf'``).

    Args:
        pdf_directory: Path (``str`` or ``Path``) of the directory to scan.

    Returns:
        A list holding the documents of all PDFs that loaded successfully,
        ordered by sorted file path. A file that fails to load is reported
        and skipped. The list is empty when the directory does not exist or
        contains no PDFs.
    """
    all_documents = []
    pdf_dir = Path(pdf_directory)

    # Without this check a mistyped path looks exactly like an empty folder.
    if not pdf_dir.is_dir():
        print(f"Warning: '{pdf_dir}' is not an existing directory; no PDFs loaded.")
        return all_documents

    # Find all pdf files recursively. The extension is compared
    # case-insensitively so the same files are found on every platform, and
    # the result is sorted so the document order is reproducible between runs.
    pdf_files = sorted(
        path
        for path in pdf_dir.rglob("*")
        if path.is_file() and path.suffix.lower() == ".pdf"
    )

    print(f"Found {len(pdf_files)} PDF file to process.")

    for pdf_file in pdf_files:
        print(f"\n Processing {pdf_file.name}")
        try:
            loader = PyPDFLoader(str(pdf_file))
            documents = loader.load()

            # Add source information to metadata
            for doc in documents:
                doc.metadata["source_file"] = pdf_file.name
                doc.metadata["file_type"] = "pdf"

            all_documents.extend(documents)
            print(f"Loaded {len(documents)} pages.")

        except Exception as e:
            # Best effort: one unreadable PDF must not abort the whole run.
            print(f"Error: {e}")

    print(f"\nTotal Documents loaded: {len(all_documents)}")
    return all_documents

# Load the documents of every PDF found (recursively) under the "data" directory.
all_pdf_documents = process_all_pdfs("data")

Found 3 PDF file to process.

 Processing A Major Project Progress Report on Implementing ReadWrite Overlay Filesystem for a Microkernel Based Operating System.pdf
Loaded 13 pages.

 Processing Minor_Project_Report_Formatting_20241218T145305Z_001__3_.pdf
Loaded 36 pages.

 Processing Which_kind_of_research_papers_influence_policymaki.pdf
Loaded 30 pages.

Total Documents loaded: 79


In [6]:
# Preview only the first few documents; rendering the whole list dumps every
# page's metadata and text into the notebook output.
all_pdf_documents[:3]

[Document(metadata={'producer': 'Canva', 'creator': 'Canva', 'creationdate': '2025-08-12T04:37:00+00:00', 'title': 'A Major Project Progress Report on Implementing Read/Write Overlay Filesystem for a Microkernel Based Operating System', 'moddate': '2025-08-12T04:36:59+00:00', 'keywords': 'DAGvvONU8lA,BAFjjl8uqmo,0', 'author': 'Spandan Guragain', 'source': 'data\\pdfs\\A Major Project Progress Report on Implementing ReadWrite Overlay Filesystem for a Microkernel Based Operating System.pdf', 'total_pages': 13, 'page': 0, 'page_label': '1', 'source_file': 'A Major Project Progress Report on Implementing ReadWrite Overlay Filesystem for a Microkernel Based Operating System.pdf', 'file_type': 'pdf'}, page_content='A Major Project Progress Report\non Implementing Read/Write Overlay\nFilesystem Support In A Microkernel\nBased Operating System\nRijan Karki      Saurav Khanal      Spandan Guragain      Sudesh Subedi\n1'),
 Document(metadata={'producer': 'Canva', 'creator': 'Canva', 'creationdat

### Text splitting - break documents into chunks

In [9]:
def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    """Break documents into smaller chunks for better RAG performance.

    Args:
        documents: Documents to hand to the text splitter.
        chunk_size: Chunk size passed to the splitter (lengths are measured
            with ``len``).
        chunk_overlap: Overlap between chunks passed to the splitter.

    Returns:
        The chunk documents produced by ``RecursiveCharacterTextSplitter``.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": len,
        # Separators the splitter is configured with, coarsest first.
        "separators": ["\n\n", "\n", " ", ""],
    }
    chunker = RecursiveCharacterTextSplitter(**splitter_options)

    pieces = chunker.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(pieces)} chunks")

    # Nothing to preview when the splitter produced no chunks.
    if not pieces:
        return pieces

    # Show the first chunk as an example.
    first_piece = pieces[0]
    print("\nExample chunk: ")
    print(f"Content: {first_piece.page_content[:200]}...")
    print(f"Metadata: {first_piece.metadata}")
    return pieces


In [10]:
chunks = split_documents(all_pdf_documents)
# Preview only the first few chunks; rendering the whole list floods the
# notebook output with every chunk's metadata and text.
chunks[:3]

Split 79 documents into 179 chunks

Example chunk: 
Content: A Major Project Progress Report
on Implementing Read/Write Overlay
Filesystem Support In A Microkernel
Based Operating System
Rijan Karki      Saurav Khanal      Spandan Guragain      Sudesh Subedi
1...
Metadata: {'producer': 'Canva', 'creator': 'Canva', 'creationdate': '2025-08-12T04:37:00+00:00', 'title': 'A Major Project Progress Report on Implementing Read/Write Overlay Filesystem for a Microkernel Based Operating System', 'moddate': '2025-08-12T04:36:59+00:00', 'keywords': 'DAGvvONU8lA,BAFjjl8uqmo,0', 'author': 'Spandan Guragain', 'source': 'data\\pdfs\\A Major Project Progress Report on Implementing ReadWrite Overlay Filesystem for a Microkernel Based Operating System.pdf', 'total_pages': 13, 'page': 0, 'page_label': '1', 'source_file': 'A Major Project Progress Report on Implementing ReadWrite Overlay Filesystem for a Microkernel Based Operating System.pdf', 'file_type': 'pdf'}


[Document(metadata={'producer': 'Canva', 'creator': 'Canva', 'creationdate': '2025-08-12T04:37:00+00:00', 'title': 'A Major Project Progress Report on Implementing Read/Write Overlay Filesystem for a Microkernel Based Operating System', 'moddate': '2025-08-12T04:36:59+00:00', 'keywords': 'DAGvvONU8lA,BAFjjl8uqmo,0', 'author': 'Spandan Guragain', 'source': 'data\\pdfs\\A Major Project Progress Report on Implementing ReadWrite Overlay Filesystem for a Microkernel Based Operating System.pdf', 'total_pages': 13, 'page': 0, 'page_label': '1', 'source_file': 'A Major Project Progress Report on Implementing ReadWrite Overlay Filesystem for a Microkernel Based Operating System.pdf', 'file_type': 'pdf'}, page_content='A Major Project Progress Report\non Implementing Read/Write Overlay\nFilesystem Support In A Microkernel\nBased Operating System\nRijan Karki      Saurav Khanal      Spandan Guragain      Sudesh Subedi\n1'),
 Document(metadata={'producer': 'Canva', 'creator': 'Canva', 'creationdat

### Embedding and Vector Store DB

In [14]:
# !pip install sentence-transformers
# !pip install chromadb
# !pip install faiss-cpu

In [15]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Tuple, Dict, Any
from sklearn.metrics.pairwise import cosine_similarity

In [16]:
class EmbeddingManager:
    """Wrapper around a ``SentenceTransformer`` model for embedding text.

    The model is loaded inside the constructor, so a successfully constructed
    instance is ready to embed.
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-V2"):
        """Initialize the embedding manager and load the model.

        Args:
            model_name: Model name handed to ``SentenceTransformer``.

        Raises:
            Exception: Whatever the model loader raises is re-raised.
        """
        # NOTE(review): the published model id appears to be spelled
        # "all-MiniLM-L6-v2" (lowercase "v") - confirm that this capitalised
        # default resolves in every environment the notebook runs in.
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """Load the sentence transformer model named by ``self.model_name``.

        Raises:
            Exception: Any loading error is printed and then re-raised.
        """
        try:
            print(f"Loading the model {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model successfully loaded. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            raise

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """Generate embeddings for a list of texts.

        Args:
            texts: List of text strings to embed.

        Returns:
            numpy array of embeddings with shape (len(texts), embedding_dim).
            An empty list yields an array with zero rows.

        Raises:
            ValueError: If no model has been loaded.
        """
        # Compare against None explicitly: truthiness of a model object can
        # depend on its __len__ and says nothing about whether it was loaded.
        if self.model is None:
            raise ValueError("Model not loaded!")
        print(f"Generating embeddings for {len(texts)} texts.")

        if not isinstance(texts, str) and len(texts) == 0:
            # Skip the model call for empty input so the result is guaranteed
            # to keep the documented 2-D shape.
            embedding_dim = self.model.get_sentence_embedding_dimension() or 0
            embeddings = np.empty((0, embedding_dim), dtype=np.float32)
        else:
            embeddings = self.model.encode(texts, show_progress_bar=True)

        print(f"Generated embeddings with shape: {embeddings.shape}")
        return embeddings


# Create the embedding manager with its default model name; the constructor
# loads the model right away.

embedding_manager = EmbeddingManager()
embedding_manager



Loading the model all-MiniLM-L6-V2
Model successfully loaded. Embedding dimension: 384


<__main__.EmbeddingManager at 0x22998a6fc90>