## RAG Pipelines - Data Ingestion to Vector DB Pipeline

In [4]:
import os
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path

In [7]:
### Read all the PDFs inside the directory
def process_all_pdfs(pdf_directory):
    """Load every PDF found under a directory into a flat list of documents.

    The directory is searched recursively. Each PDF is loaded with
    ``PyPDFLoader`` and every document it yields is tagged with two extra
    metadata keys: ``source_file`` (the PDF's file name) and ``file_type``
    (always ``'pdf'``).

    Args:
        pdf_directory: Directory to search, as a string or ``Path``.

    Returns:
        A list of all documents loaded from all PDFs. A file that fails to
        load is reported on stdout and skipped, so the list can be empty
        (it is also empty when the directory does not exist or has no PDFs).
    """
    pdf_dir = Path(pdf_directory)

    # Compare the extension case-insensitively so "report.PDF" is picked up,
    # ignore directories that merely end in ".pdf", and sort so the processing
    # order does not depend on the filesystem's listing order.
    pdf_files = sorted(
        path
        for path in pdf_dir.rglob("*")
        if path.suffix.lower() == ".pdf" and path.is_file()
    )

    print(f"Found {len(pdf_files)} PDF files to process")

    all_documents = []
    for pdf_file in pdf_files:
        print(f"\nProcessing: {pdf_file.name}")
        try:
            loader = PyPDFLoader(str(pdf_file))
            documents = loader.load()

            # Add source information to metadata
            for doc in documents:
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type'] = 'pdf'

            all_documents.extend(documents)
            print(f"✓ Loaded {len(documents)} pages")
        except Exception as e:
            # Best-effort ingestion: one unreadable file must not abort the
            # whole run, so report the failure and move on to the next file.
            print(f"✗ Error: {e}")

    print(f"\nTotal documents loaded: {len(all_documents)}")
    return all_documents

# Ingest every PDF found (recursively) under the data directory
all_pdf_documents = process_all_pdfs(pdf_directory="../data")

Found 2 PDF files to process

Processing: Machine Learning Basics.pdf
✓ Loaded 1 pages

Processing: Python Programming Introduction.pdf
✓ Loaded 1 pages

Total documents loaded: 2


In [8]:
# Preview only the first few documents so a large corpus does not flood the cell output
all_pdf_documents[:3]

[Document(metadata={'producer': 'PyPDF', 'creator': 'Microsoft Word', 'creationdate': '2025-10-21T13:51:10+00:00', 'author': 'aayush', 'moddate': '2025-10-21T13:51:10+00:00', 'source': '../data/pdf_files/Machine Learning Basics.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1', 'source_file': 'Machine Learning Basics.pdf', 'file_type': 'pdf'}, page_content='Machine Learning Basics \n \nMachine learning is a subset of artificial intelligence that enables systems to learn and \nimprove \nfrom experience without being explicitly programmed. It focuses on developing \ncomputer programs \nthat can access data and use it to learn for themselves. \n \nTypes of Machine Learning: \n1. Supervised Learning: Learning with labeled data \n2. Unsupervised Learning: Finding patterns in unlabeled data \n3. Reinforcement Learning: Learning through rewards and penalties \n \nApplications include image recognition, speech processing, and recommendation \nsystems.'),
 Document(metadata={'producer': 'PyP

In [9]:
### Text splitting: break the documents into chunks

def split_documents(documents, chunk_size = 1000, chunk_overlap = 200):
    """Split documents into smaller chunks for better RAG performance.

    Builds a ``RecursiveCharacterTextSplitter`` from the given settings, runs
    it over ``documents``, prints a short summary and, when at least one chunk
    was produced, a preview of the first chunk.

    Args:
        documents: Sized collection of documents to split (``len()`` is taken
            for the summary line).
        chunk_size: Chunk size handed to the splitter; lengths are measured
            with ``len``.
        chunk_overlap: Overlap between chunks handed to the splitter, measured
            the same way.

    Returns:
        The list of chunk documents produced by the splitter.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        # Separators given to the splitter: paragraph break, line break,
        # space, and finally the empty string.
        separators=["\n\n", "\n", " ", ""],
    )
    split_docs = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(split_docs)} chunks")

    # Show example of a chunk
    if split_docs:
        preview_limit = 200
        first_chunk = split_docs[0]
        content = first_chunk.page_content
        # Only mark the preview as truncated when text was actually cut off.
        ellipsis = "..." if len(content) > preview_limit else ""
        print("\nExample chunk:")
        print(f"Content: {content[:preview_limit]}{ellipsis}")
        print(f"Metadata: {first_chunk.metadata}")

    return split_docs

In [10]:
chunks = split_documents(all_pdf_documents)
# Preview only the first few chunks so a large corpus does not flood the cell output
chunks[:3]

Split 2 documents into 2 chunks

Example chunk:
Content: Machine Learning Basics 
 
Machine learning is a subset of artificial intelligence that enables systems to learn and 
improve 
from experience without being explicitly programmed. It focuses on develo...
Metadata: {'producer': 'PyPDF', 'creator': 'Microsoft Word', 'creationdate': '2025-10-21T13:51:10+00:00', 'author': 'aayush', 'moddate': '2025-10-21T13:51:10+00:00', 'source': '../data/pdf_files/Machine Learning Basics.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1', 'source_file': 'Machine Learning Basics.pdf', 'file_type': 'pdf'}


[Document(metadata={'producer': 'PyPDF', 'creator': 'Microsoft Word', 'creationdate': '2025-10-21T13:51:10+00:00', 'author': 'aayush', 'moddate': '2025-10-21T13:51:10+00:00', 'source': '../data/pdf_files/Machine Learning Basics.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1', 'source_file': 'Machine Learning Basics.pdf', 'file_type': 'pdf'}, page_content='Machine Learning Basics \n \nMachine learning is a subset of artificial intelligence that enables systems to learn and \nimprove \nfrom experience without being explicitly programmed. It focuses on developing \ncomputer programs \nthat can access data and use it to learn for themselves. \n \nTypes of Machine Learning: \n1. Supervised Learning: Learning with labeled data \n2. Unsupervised Learning: Finding patterns in unlabeled data \n3. Reinforcement Learning: Learning through rewards and penalties \n \nApplications include image recognition, speech processing, and recommendation \nsystems.'),
 Document(metadata={'producer': 'PyP

### Embedding and Vector Store DB
