## RAG Pipeline: Data ingestion to vector db pipeline

In [1]:
import os
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path

In [2]:
### Read all the pdf's inside the directory
def process_all_pdfs(pdf_directory):
    """Process all PDF files in a directory"""
    all_documents = []
    pdf_dir = Path(pdf_directory)
    
    # Find all PDF files recursively
    pdf_files = list(pdf_dir.glob("**/*.pdf"))
    
    print(f"Found {len(pdf_files)} PDF files to process")
    
    for pdf_file in pdf_files:
        print(f"\nProcessing: {pdf_file.name}")
        try:
            loader = PyPDFLoader(str(pdf_file))
            documents = loader.load()
            
            # Add source information to metadata
            for doc in documents:
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type'] = 'pdf'
            
            all_documents.extend(documents)
            print(f"  ✓ Loaded {len(documents)} pages")
            
        except Exception as e:
            print(f"  ✗ Error: {e}")
    
    print(f"\nTotal documents loaded: {len(all_documents)}")
    return all_documents

# Process all PDFs in the data directory
all_pdf_documents = process_all_pdfs("../data")

Found 2 PDF files to process

Processing: Comcast_Sr Lead Solutions Engineer.pdf
  ✓ Loaded 6 pages

Processing: Ali_Fouladgar_CoverLetter_Metronome_Infrastructure Engineer.pdf
  ✓ Loaded 1 pages

Total documents loaded: 7


In [None]:
print(all_pdf_documents)

[Document(metadata={'producer': 'Skia/PDF m144 Google Docs Renderer', 'creator': 'PyPDF', 'creationdate': '', 'title': 'Comcast_Sr Lead Solutions Engineer', 'source': '../data/pdf/Comcast_Sr Lead Solutions Engineer.pdf', 'total_pages': 6, 'page': 0, 'page_label': '1', 'source_file': 'Comcast_Sr Lead Solutions Engineer.pdf', 'file_type': 'pdf'}, page_content='Sr.  Lead  Solutions  Engineer  Location  Philadelphia,  Pennsylvania,  California,  New  York,  New  York,  New  York,  \nPennsylvania\n \nReq  ID  R419927  \nJob  Type  Full  Time  \nCategory  Support  \nDate  posted  09/24/2025  \nApply  Now \nUniversal  Ads,  a  part  of  Comcast,  enables  any  brand,  of  any  size,  to  seamlessly  make  \nand\n \nbuy\n \ncommercials\n \nacross\n \npremium\n \nvideo\n \nreaching\n \nnew\n \nqualified\n \naudiences\n \nat\n \nscale.\n \nUniversal\n \nAds\n \ncombines\n \npremium\n \nand\n \nbrand-safe\n \nvideo\n \ncontent\n \ndirectly\n \nfrom\n \nthe\n \nmost\n \ninfluential\n \nmedia\n \nc

In [5]:
def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    """Split documents into smaller chunks for better RAG performance"""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )
    split_docs = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(split_docs)} chunks!")

    if split_docs:
        print(f"\nExample chunk:")
        print(f"Content: {split_docs[0].page_content[:200]}...")
        print(f"Metadata: {split_docs[0].metadata}")
    
    return split_docs


In [6]:
chunks=split_documents(all_pdf_documents)
chunks

Split 7 documents into 14 chunks!

Example chunk:
Content: Sr.  Lead  Solutions  Engineer  Location  Philadelphia,  Pennsylvania,  California,  New  York,  New  York,  New  York,  
Pennsylvania
 
Req  ID  R419927  
Job  Type  Full  Time  
Category  Support  
...
Metadata: {'producer': 'Skia/PDF m144 Google Docs Renderer', 'creator': 'PyPDF', 'creationdate': '', 'title': 'Comcast_Sr Lead Solutions Engineer', 'source': '../data/pdf/Comcast_Sr Lead Solutions Engineer.pdf', 'total_pages': 6, 'page': 0, 'page_label': '1', 'source_file': 'Comcast_Sr Lead Solutions Engineer.pdf', 'file_type': 'pdf'}


[Document(metadata={'producer': 'Skia/PDF m144 Google Docs Renderer', 'creator': 'PyPDF', 'creationdate': '', 'title': 'Comcast_Sr Lead Solutions Engineer', 'source': '../data/pdf/Comcast_Sr Lead Solutions Engineer.pdf', 'total_pages': 6, 'page': 0, 'page_label': '1', 'source_file': 'Comcast_Sr Lead Solutions Engineer.pdf', 'file_type': 'pdf'}, page_content='Sr.  Lead  Solutions  Engineer  Location  Philadelphia,  Pennsylvania,  California,  New  York,  New  York,  New  York,  \nPennsylvania\n \nReq  ID  R419927  \nJob  Type  Full  Time  \nCategory  Support  \nDate  posted  09/24/2025  \nApply  Now \nUniversal  Ads,  a  part  of  Comcast,  enables  any  brand,  of  any  size,  to  seamlessly  make  \nand\n \nbuy\n \ncommercials\n \nacross\n \npremium\n \nvideo\n \nreaching\n \nnew\n \nqualified\n \naudiences\n \nat\n \nscale.\n \nUniversal\n \nAds\n \ncombines\n \npremium\n \nand\n \nbrand-safe\n \nvideo\n \ncontent\n \ndirectly\n \nfrom\n \nthe\n \nmost\n \ninfluential\n \nmedia\n \nc