## RAG Pipeline: Data Ingestion to vector DB Pipeline

In [1]:
import os
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read all PDFs inside the directory
def process_all_pdfs(pdf_directory):
    """Process all PDF files in a directory"""
    all_documents = []
    pdf_dir = Path(pdf_directory)
    
    # Find all PDF files recursively
    pdf_files = list(pdf_dir.glob("**/*.pdf"))
    
    print(f"Found {len(pdf_files)} PDF files to process")
    
    for pdf_file in pdf_files:
        print(f"\nProcessing: {pdf_file.name}")
        try:
            loader = PyPDFLoader(str(pdf_file))
            documents = loader.load()
            
            # Add source information to metadata
            for doc in documents:
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type'] = 'pdf'
            
            all_documents.extend(documents)
            print(f"  ✓ Loaded {len(documents)} pages")
            
        except Exception as e:
            print(f"  ✗ Error: {e}")
    
    print(f"\nTotal documents loaded: {len(all_documents)}")
    return all_documents

# Process all PDFs in the data directory
all_pdf_documents = process_all_pdfs("../data")

Found 4 PDF files to process

Processing: finetune.pdf
  ✓ Loaded 15 pages

Processing: incontext_learning.pdf
  ✓ Loaded 19 pages

Processing: metrics.pdf
  ✓ Loaded 12 pages

Processing: modular_rag.pdf
  ✓ Loaded 17 pages

Total documents loaded: 63


In [3]:
all_pdf_documents

[Document(metadata={'producer': 'pikepdf 8.15.1', 'creator': 'arXiv GenPDF (tex2pdf:4177c2c)', 'creationdate': '', 'author': 'Zhan Peng Lee; Andre Lin; Calvin Tan', 'doi': 'https://doi.org/10.48550/arXiv.2505.10792', 'license': 'http://arxiv.org/licenses/nonexclusive-distrib/1.0/', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.28 (TeX Live 2025) kpathsea version 6.4.1', 'title': 'Finetune-RAG: Fine-Tuning Language Models to Resist Hallucination in Retrieval-Augmented Generation', 'trapped': '/False', 'arxivid': 'https://arxiv.org/abs/2505.10792v3', 'source': '..\\data\\pdf_files\\finetune.pdf', 'total_pages': 15, 'page': 0, 'page_label': '1', 'source_file': 'finetune.pdf', 'file_type': 'pdf'}, page_content='Finetune-RAG: Fine-Tuning Language Models to\nResist Hallucination in Retrieval-Augmented\nGeneration\nZhan Peng Lee\nPints AI Labs\nzhanpeng.lee@pints.co\nAndre Lin∗\nPints AI Labs\nandre_lin@u.nus.edu\nandrelim444@gmail.com\nCalvin Tan\nPints AI Labs\ncalvin@pin

In [4]:
# Text splitting get into chunks

def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", " ", ""]
    )
    split_docs=text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(split_docs)} chunks")
    if split_docs:
        print(f"Example chunk:")
        print(f"{split_docs[0].page_content[200]}...")
        print(f"Metadata: {split_docs[0].metadata}")
    return split_docs

In [5]:
chunks=split_documents(all_pdf_documents)
chunks

Split 63 documents into 320 chunks
Example chunk:
l...
Metadata: {'producer': 'pikepdf 8.15.1', 'creator': 'arXiv GenPDF (tex2pdf:4177c2c)', 'creationdate': '', 'author': 'Zhan Peng Lee; Andre Lin; Calvin Tan', 'doi': 'https://doi.org/10.48550/arXiv.2505.10792', 'license': 'http://arxiv.org/licenses/nonexclusive-distrib/1.0/', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.28 (TeX Live 2025) kpathsea version 6.4.1', 'title': 'Finetune-RAG: Fine-Tuning Language Models to Resist Hallucination in Retrieval-Augmented Generation', 'trapped': '/False', 'arxivid': 'https://arxiv.org/abs/2505.10792v3', 'source': '..\\data\\pdf_files\\finetune.pdf', 'total_pages': 15, 'page': 0, 'page_label': '1', 'source_file': 'finetune.pdf', 'file_type': 'pdf'}


[Document(metadata={'producer': 'pikepdf 8.15.1', 'creator': 'arXiv GenPDF (tex2pdf:4177c2c)', 'creationdate': '', 'author': 'Zhan Peng Lee; Andre Lin; Calvin Tan', 'doi': 'https://doi.org/10.48550/arXiv.2505.10792', 'license': 'http://arxiv.org/licenses/nonexclusive-distrib/1.0/', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.28 (TeX Live 2025) kpathsea version 6.4.1', 'title': 'Finetune-RAG: Fine-Tuning Language Models to Resist Hallucination in Retrieval-Augmented Generation', 'trapped': '/False', 'arxivid': 'https://arxiv.org/abs/2505.10792v3', 'source': '..\\data\\pdf_files\\finetune.pdf', 'total_pages': 15, 'page': 0, 'page_label': '1', 'source_file': 'finetune.pdf', 'file_type': 'pdf'}, page_content='Finetune-RAG: Fine-Tuning Language Models to\nResist Hallucination in Retrieval-Augmented\nGeneration\nZhan Peng Lee\nPints AI Labs\nzhanpeng.lee@pints.co\nAndre Lin∗\nPints AI Labs\nandre_lin@u.nus.edu\nandrelim444@gmail.com\nCalvin Tan\nPints AI Labs\ncalvin@pin

## Embedding and vectorStoreDB

In [6]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [9]:
class EmbeddingManager:
    def __init__(self, model_name: str="all-MiniLM-L6-v2"):
        '''
        Initialize the embedding manager 
        
        Args:
            model_name (str): HuggingFace model name for sentence embeddings
        '''
        self.model_name = model_name
        self.model=None
        self._load_model() 
    
    def _load_model(self):
        '''Load the SentenceTransformer model'''
        try:
            print(f"Loading embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model loaded successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            raise e
    
    def generate_embeddings(self, texts:List[str]) -> np.ndarray:
        '''
        Generate embeddings for a list of texts
        
        Args:
            texts (List[str]): List of text strings to embed
        
        Returns:
            numpy array of embeddings with shape (len(texts), embedding_dim)
        '''
        if not self.model:
            raise ValueError("Embedding model is not loaded.")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"Generated embeddings with shape: {embeddings.shape}")
        return embeddings
    
# Initialize embedding manager
embedding_manager = EmbeddingManager()

Loading embedding model: all-MiniLM-L6-v2
Model loaded successfully. Embedding dimension: 384


In [10]:
embedding_manager

<__main__.EmbeddingManager at 0x1f21dfa4890>

## VectorStore

In [11]:
class VectorStore:
    '''Manages document embeddings in chromaDB vector store'''
    
    def __init__(self, collection_name:str='pdf_documents', persist_directory:str='../data/vector_store'):
        '''
        Initialize the vector store
        
        Args:
            collection_name (str): Name of the chromaDB collection
            persist_directory (str): Directory to persist chromaDB data
        '''
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()
    
    def _initialize_store(self):
        '''Initialize chromaDB client and collection'''
        try:
            # Create persistwnt ChromaDB client
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)
            
            # Get or create collection
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "PDF document embeddings for RAG"}
            )
            print(f"Vector store initialized with collection: {self.collection_name}")
            print(f"Existing documents in store: {self.collection.count()}")
        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise e
        
    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """
        Add documents and their embeddings to the vector store
        
        Args:
            documents: List of LangChain documents
            embeddings: Corresponding embeddings for the documents
        """
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match number of embeddings")
        
        print(f"Adding {len(documents)} documents to vector store...")
        
        # Prepare data for ChromaDB
        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []
        
        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            # Generate unique ID
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)
            
            # Prepare metadata
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)
            
            # Document content
            documents_text.append(doc.page_content)
            
            # Embedding
            embeddings_list.append(embedding.tolist())
        
        # Add to collection
        try:
            self.collection.add(
                ids=ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=documents_text
            )
            print(f"Successfully added {len(documents)} documents to vector store")
            print(f"Total documents in collection: {self.collection.count()}")
            
        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise

vectorstore=VectorStore()
vectorstore        
    

Vector store initialized with collection: pdf_documents
Existing documents in store: 0


<__main__.VectorStore at 0x1f21dc516d0>

In [13]:
chunks

[Document(metadata={'producer': 'pikepdf 8.15.1', 'creator': 'arXiv GenPDF (tex2pdf:4177c2c)', 'creationdate': '', 'author': 'Zhan Peng Lee; Andre Lin; Calvin Tan', 'doi': 'https://doi.org/10.48550/arXiv.2505.10792', 'license': 'http://arxiv.org/licenses/nonexclusive-distrib/1.0/', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.28 (TeX Live 2025) kpathsea version 6.4.1', 'title': 'Finetune-RAG: Fine-Tuning Language Models to Resist Hallucination in Retrieval-Augmented Generation', 'trapped': '/False', 'arxivid': 'https://arxiv.org/abs/2505.10792v3', 'source': '..\\data\\pdf_files\\finetune.pdf', 'total_pages': 15, 'page': 0, 'page_label': '1', 'source_file': 'finetune.pdf', 'file_type': 'pdf'}, page_content='Finetune-RAG: Fine-Tuning Language Models to\nResist Hallucination in Retrieval-Augmented\nGeneration\nZhan Peng Lee\nPints AI Labs\nzhanpeng.lee@pints.co\nAndre Lin∗\nPints AI Labs\nandre_lin@u.nus.edu\nandrelim444@gmail.com\nCalvin Tan\nPints AI Labs\ncalvin@pin

In [16]:
# Convert the text to embeddings
texts = [doc.page_content for doc in chunks]

# Generate embeddings
embeddings = embedding_manager.generate_embeddings(texts)

# Store in vector store
vectorstore.add_documents(chunks, embeddings)

Batches: 100%|██████████| 10/10 [00:11<00:00,  1.15s/it]


Generated embeddings with shape: (320, 384)
Adding 320 documents to vector store...
Successfully added 320 documents to vector store
Total documents in collection: 320


In [17]:
class RAGRetriever:
    '''Handles query-based retrieval from the vector store'''
    
    def __init__(self, vector_store: VectorStore, embedding_manager: EmbeddingManager):
        '''
        Initialize the RAG retriever
        
        Args:
            vector_store (VectorStore): Instance of the vector store
            embedding_manager (EmbeddingManager): Instance of the embedding manager
        '''
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager
    
    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.0) -> List[Dict[str, Any]]:
        """
        Retrieve relevant documents for a query
        
        Args:
            query: The search query
            top_k: Number of top results to return
            score_threshold: Minimum similarity score threshold
            
        Returns:
            List of dictionaries containing retrieved documents and metadata
        """
        print(f"Retrieving documents for query: '{query}'")
        print(f"Top K: {top_k}, Score threshold: {score_threshold}")
        
        # Generate query embedding
        query_embedding = self.embedding_manager.generate_embeddings([query])[0]
        
        # Search in vector store
        try:
            results = self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k
            )
            
            # Process results
            retrieved_docs = []
            
            if results['documents'] and results['documents'][0]:
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids = results['ids'][0]
                
                for i, (doc_id, document, metadata, distance) in enumerate(zip(ids, documents, metadatas, distances)):
                    # Convert distance to similarity score (ChromaDB uses cosine distance)
                    similarity_score = 1 - distance
                    
                    if similarity_score >= score_threshold:
                        retrieved_docs.append({
                            'id': doc_id,
                            'content': document,
                            'metadata': metadata,
                            'similarity_score': similarity_score,
                            'distance': distance,
                            'rank': i + 1
                        })
                
                print(f"Retrieved {len(retrieved_docs)} documents (after filtering)")
            else:
                print("No documents found")
            
            return retrieved_docs
            
        except Exception as e:
            print(f"Error during retrieval: {e}")
            return []

rag_retriever=RAGRetriever(vectorstore,embedding_manager)

In [23]:
rag_retriever.retrieve("What is In context learning?")

Retrieving documents for query: 'What is In context learning?'
Top K: 5, Score threshold: 0.0


Batches: 100%|██████████| 1/1 [00:00<00:00, 16.59it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)





[{'id': 'doc_2d9d7fcd_60',
  'content': 'poses better ways of formulating the problem (Zhao\net al., 2021; Holtzman et al., 2021; Min et al.,\n2021a), better ways of choosing labeled exam-\nples for the demonstrations (Liu et al., 2021; Lu\net al., 2021; Rubin et al., 2021), meta-training\nwith an explicit in-context learning objective (Chen\net al., 2021; Min et al., 2021b), and learning to\nfollow instructions as a variant of in-context learn-\ning (Mishra et al., 2021b; Efrat and Levy, 2020;\nWei et al., 2022a; Sanh et al., 2022). At the\nsame time, some work reports brittleness and over-\nsensitivity for in-context learning (Lu et al., 2021;\nZhao et al., 2021; Mishra et al., 2021a).\nRelatively less work has been done to understand\nwhy in-context learning works. Xie et al. (2022)\nprovide theoretical analysis that in-context learn-\ning can be formalized as Bayesian inference that\nCirculation revenue has increased by 5% in Finland.         \\n    Positive \nPanostaja did not dis