#### RAG Pipeline - Data Ingestion to Vector DB

In [4]:
import os 
from langchain_community.document_loaders import PyMuPDFLoader , PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path

In [5]:
# Reading PDF files

def process_pdf(pdf_dir):
    '''Process PDF files in a directory'''
    
    all_doc = []
    pdf_directory = Path(pdf_dir)
    
    pdf_files = list(pdf_directory.glob("**/*.pdf"))
    print(f"Found {len(pdf_files)} PDF files")
    
    for pdf_file in pdf_files:
        print(f"Processing {pdf_file.name}")
        try:
            loader = PyPDFLoader(str(pdf_file))
            documents = loader.load()
            
            # Add Source info
            for doc in documents:
                doc.metadata["source_file"] = pdf_file.name
                doc.metadata["file_type"] = 'pdf'
                
            all_doc.extend(documents)
            print(f"Loaded {len(documents)} pages")
        
        except Exception as e:
            print(f"Error processing {pdf_file.name}: {e}")
    
    print(f"Total documents: {len(all_doc)}")
    return all_doc

# Process PDF files
all_doc = process_pdf("../data/pdf_files")

Found 2 PDF files
Processing machine_learning.pdf
Loaded 1 pages
Processing python_intro.pdf
Loaded 1 pages
Total documents: 2


In [6]:
all_doc

[Document(metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2025-10-30T18:01:39+05:30', 'author': 'HINATA SHOYO', 'moddate': '2025-10-30T18:01:39+05:30', 'source': '..\\data\\pdf_files\\machine_learning.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1', 'source_file': 'machine_learning.pdf', 'file_type': 'pdf'}, page_content='Machine Learning Basics \n \nMachine learning is a subset of artificial intelligence that enables systems to learn and \nimprove \nfrom experience without being explicitly programmed. It focuses on developing computer \nprograms \nthat can access data and use it to learn for themselves. \n \nTypes of Machine Learning: \n1. Supervised Learning: Learning with labeled data \n2. Unsupervised Learning: Finding patterns in unlabeled data \n3. Reinforcement Learning: Learning through rewards and penalties \n \nApplications include image recognition, speech processing, and recommendation systems'

#### Text Splitting - get into CHUNKS

In [7]:
def split_documents(documents , chunk_size=1000 , chunk_overlap=200):
    '''Splits doc into chunks for better performance'''
    
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )
    
    split_docs = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(split_docs)} chunks")
    
    # Simple example
    if split_docs:
        print(f"Content: {split_docs[0].page_content}")
        print(f"Metadata: {split_docs[0].metadata}")
        
    return split_docs

In [8]:
chunks = split_documents(all_doc)

Split 2 documents into 2 chunks
Content: Machine Learning Basics 
 
Machine learning is a subset of artificial intelligence that enables systems to learn and 
improve 
from experience without being explicitly programmed. It focuses on developing computer 
programs 
that can access data and use it to learn for themselves. 
 
Types of Machine Learning: 
1. Supervised Learning: Learning with labeled data 
2. Unsupervised Learning: Finding patterns in unlabeled data 
3. Reinforcement Learning: Learning through rewards and penalties 
 
Applications include image recognition, speech processing, and recommendation systems
Metadata: {'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2025-10-30T18:01:39+05:30', 'author': 'HINATA SHOYO', 'moddate': '2025-10-30T18:01:39+05:30', 'source': '..\\data\\pdf_files\\machine_learning.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1', 'source_file': 'machine_learning.pdf', 'file_type': 'pd

#### Embedding and Vector Store DB

In [9]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List , Dict , Any , Tuple
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
class EmbeddingManager:
    '''Handles document embeddings generation using SentenceTransformers'''
    
    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        '''
        Intialize it
        Args : model_name : HuggingFace model for sentence embedding
        '''
        self.model_name = model_name
        self.model = None
        self._load_model()
        
    def _load_model(self):
        '''Protected function to load the model'''
        try:
            print(f"Loading model {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model {self.model_name} loaded successfully\nEmbedding Dimensions: {self.model.get_sentence_embedding_dimension()}")
        
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            raise
    
    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        '''
        Generate embeddings for a list of texts
        Args : texts : List of texts to generate embeddings for
        Returns : np.ndarray : Embeddings for the texts
        '''
        if not self.model:
            raise ValueError("Model not loaded")
        
        print(f"Generating embeddings for {len(texts)} texts")
        embeddings = self.model.encode(texts , show_progress_bar=True)
        print(f"Embeddings generated successfully with shape {embeddings.shape}")
        return embeddings
    
# Intialize the EmbeddingManager
embedding_manager = EmbeddingManager()
embedding_manager

Loading model all-MiniLM-L6-v2


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Model all-MiniLM-L6-v2 loaded successfully
Embedding Dimensions: 384


<__main__.EmbeddingManager at 0x23bc48b7380>

#### Vector Store

In [14]:
class VectorStore:
    '''Manages doc embedding in a Chroma DB vector store'''
    
    def __init__(self, collection_name: str = "pdf_documents",  persist_directory: str = "../data/vector_store"):
        '''
        Intializes the Chroma DB vector store
        Args:
            collection_name (str): Name of the collection to be created
            persist_directory (str): Directory to persist the collection            
        '''
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._intialize_store()
    
    def _intialize_store(self):
        '''Intializes the Chroma DB client and collection'''
        
        try:
            '''Create persist directory if it doesn't exist'''
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)
            
            # Get or create collection
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata = {'description' : "PDF Document embeddings for RAG"}
                )
            
            print(f"Vector store initialized at {self.persist_directory}")
            print(f"Collection name: {self.collection_name}")
            print(f"Existing documents in collection: {self.collection.count()}")
        
        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise
        
    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """
        Add documents and their embeddings to the vector store
        
        Args:
            documents: List of LangChain documents
            embeddings: Corresponding embeddings for the documents
        """
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match number of embeddings")
        
        print(f"Adding {len(documents)} documents to vector store...")
        
        # Prepare data for ChromaDB
        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []
        
        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            # Generate unique ID
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)
            
            # Prepare metadata
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)
            
            # Document content
            documents_text.append(doc.page_content)
            
            # Embedding
            embeddings_list.append(embedding.tolist())
        
        # Add to collection
        try:
            self.collection.add(
                ids=ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=documents_text
            )
            print(f"Successfully added {len(documents)} documents to vector store")
            print(f"Total documents in collection: {self.collection.count()}")
            
        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise

vectorstore=VectorStore()
vectorstore    

Vector store initialized at ../data/vector_store
Collection name: pdf_documents
Existing documents in collection: 0


<__main__.VectorStore at 0x23bc5196e40>

In [15]:
# Converting Text to Embeddings

texts = [doc.page_content for doc in chunks]
texts 

['Machine Learning Basics \n \nMachine learning is a subset of artificial intelligence that enables systems to learn and \nimprove \nfrom experience without being explicitly programmed. It focuses on developing computer \nprograms \nthat can access data and use it to learn for themselves. \n \nTypes of Machine Learning: \n1. Supervised Learning: Learning with labeled data \n2. Unsupervised Learning: Finding patterns in unlabeled data \n3. Reinforcement Learning: Learning through rewards and penalties \n \nApplications include image recognition, speech processing, and recommendation systems',
 'Python Programming Introduction \n \nPython is a high-level, interpreted programming language known for its simplicity and \nreadability. \nCreated by Guido van Rossum and first released in 1991, Python has become one of the \nmost popular \nprogramming languages in the world. \n \nKey Features: \n- Easy to learn and use \n- Extensive standard library \n- Cross-platform compatibility \n- Strong c

In [16]:
# Generate the Embeddings
embeddings = embedding_manager.generate_embeddings(texts)

# Store in VectorDB
vectorstore.add_documents(chunks, embeddings)

Generating embeddings for 2 texts


Batches: 100%|██████████| 1/1 [00:00<00:00,  4.16it/s]


Embeddings generated successfully with shape (2, 384)
Adding 2 documents to vector store...
Successfully added 2 documents to vector store
Total documents in collection: 2


### Retriever Pipleine from VectorStore

In [17]:
class RAGRetriever:
    '''Handles query-based retrieval from vector store'''
    
    def __init__(self, vector_store: VectorStore , embedding_manager: EmbeddingManager):
        '''
        Intialize the retriever
        Args:
            vector_store: VectorStore containing the embeddings
            embedding_manager: EmbeddingManager to handle embeddings
        '''
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager
    
    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.0) -> List[Dict[str, Any]]:
        """
        Retrieve relevant documents for a query
        
        Args:
            query: The search query
            top_k: Number of top results to return
            score_threshold: Minimum similarity score threshold
            
        Returns:
            List of dictionaries containing retrieved documents and metadata
        """
        print(f"Retrieving documents for query: '{query}'")
        print(f"Top K: {top_k}, Score threshold: {score_threshold}")
        
        # Generate query embedding
        query_embedding = self.embedding_manager.generate_embeddings([query])[0]
        
        # Search in vector store
        try:
            results = self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k
            )
            
            # Process results
            retrieved_docs = []
            
            if results['documents'] and results['documents'][0]:
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids = results['ids'][0]
                
                for i, (doc_id, document, metadata, distance) in enumerate(zip(ids, documents, metadatas, distances)):
                    # Convert distance to similarity score (ChromaDB uses cosine distance)
                    similarity_score = 1 - distance
                    
                    if similarity_score >= score_threshold:
                        retrieved_docs.append({
                            'id': doc_id,
                            'content': document,
                            'metadata': metadata,
                            'similarity_score': similarity_score,
                            'distance': distance,
                            'rank': i + 1
                        })
                
                print(f"Retrieved {len(retrieved_docs)} documents (after filtering)")
            else:
                print("No documents found")
            
            return retrieved_docs
            
        except Exception as e:
            print(f"Error during retrieval: {e}")
            return []

rag_retriever=RAGRetriever(vectorstore,embedding_manager)
rag_retriever    

<__main__.RAGRetriever at 0x23bab114d70>

In [18]:
rag_retriever.retrieve(query="What is the python?")

Retrieving documents for query: 'What is the python?'
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts


Batches: 100%|██████████| 1/1 [00:00<00:00,  1.59it/s]

Embeddings generated successfully with shape (1, 384)
Retrieved 1 documents (after filtering)





[{'id': 'doc_96f4ab67_1',
  'content': 'Python Programming Introduction \n \nPython is a high-level, interpreted programming language known for its simplicity and \nreadability. \nCreated by Guido van Rossum and first released in 1991, Python has become one of the \nmost popular \nprogramming languages in the world. \n \nKey Features: \n- Easy to learn and use \n- Extensive standard library \n- Cross-platform compatibility \n- Strong community support \n \nPython is widely used in web development, data science, artificial intelligence, and \nautomation.',
  'metadata': {'total_pages': 1,
   'page_label': '1',
   'creator': 'Microsoft® Word for Microsoft 365',
   'producer': 'Microsoft® Word for Microsoft 365',
   'file_type': 'pdf',
   'source': '..\\data\\pdf_files\\python_intro.pdf',
   'creationdate': '2025-10-30T18:02:12+05:30',
   'doc_index': 1,
   'author': 'HINATA SHOYO',
   'content_length': 504,
   'moddate': '2025-10-30T18:02:12+05:30',
   'page': 0,
   'source_file': 'pytho

In [21]:
rag_retriever.retrieve(query="machine learning")

Retrieving documents for query: 'machine learning'
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts


Batches: 100%|██████████| 1/1 [00:00<00:00, 33.46it/s]

Embeddings generated successfully with shape (1, 384)
Retrieved 1 documents (after filtering)





[{'id': 'doc_76e7c3a2_0',
  'content': 'Machine Learning Basics \n \nMachine learning is a subset of artificial intelligence that enables systems to learn and \nimprove \nfrom experience without being explicitly programmed. It focuses on developing computer \nprograms \nthat can access data and use it to learn for themselves. \n \nTypes of Machine Learning: \n1. Supervised Learning: Learning with labeled data \n2. Unsupervised Learning: Finding patterns in unlabeled data \n3. Reinforcement Learning: Learning through rewards and penalties \n \nApplications include image recognition, speech processing, and recommendation systems',
  'metadata': {'producer': 'Microsoft® Word for Microsoft 365',
   'creator': 'Microsoft® Word for Microsoft 365',
   'doc_index': 0,
   'content_length': 581,
   'page': 0,
   'file_type': 'pdf',
   'source_file': 'machine_learning.pdf',
   'moddate': '2025-10-30T18:01:39+05:30',
   'total_pages': 1,
   'source': '..\\data\\pdf_files\\machine_learning.pdf',
  

#### Integration VectorDB + Context pipeline with LLM Output

In [24]:
# Simple RAG pipeline with Groq LLM

from langchain_groq import ChatGroq
import os
from dotenv import load_dotenv
load_dotenv()

# Intialize the Groq LLM
groq_api_key = os.getenv("GROQ_API_KEY")

llm = ChatGroq(groq_api_key=groq_api_key , model = "llama-3.1-8b-instant" , temperature=0.1 , max_tokens=1024)

# RAG function ( retreive context + generate answer )
def rag_simple(query , retriever , llm , top_k = 3):
    
    # Retrieve relevant context
    results = retriever.retrieve(query , top_k = top_k)
    context = "\n\n".join([doc['content'] for doc in results]) if results else ""
    if not context:
        return "No relevant context found"
    
    # Generate answer
    prompt=f"""Use the following context to answer the question concisely.
        Context:
        {context}

        Question: {query}

        Answer:"""
    
    response = llm.invoke([prompt.format(context=context,query=query)])
    return response.content

In [25]:
answer = rag_simple("What is python?" , rag_retriever , llm)
print(answer)

Retrieving documents for query: 'What is python?'
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts


Batches: 100%|██████████| 1/1 [00:00<00:00, 47.80it/s]

Embeddings generated successfully with shape (1, 384)
Retrieved 1 documents (after filtering)





Python is a high-level, interpreted programming language known for its simplicity and readability.


#### Enhanced RAG Pipeline Features

In [26]:
# --- Enhanced RAG Pipeline Features ---
def rag_advanced(query, retriever, llm, top_k=5, min_score=0.2, return_context=False):
    """
    RAG pipeline with extra features:
    - Returns answer, sources, confidence score, and optionally full context.
    """
    results = retriever.retrieve(query, top_k=top_k, score_threshold=min_score)
    if not results:
        return {'answer': 'No relevant context found.', 'sources': [], 'confidence': 0.0, 'context': ''}
    
    # Prepare context and sources
    context = "\n\n".join([doc['content'] for doc in results])
    sources = [{
        'source': doc['metadata'].get('source_file', doc['metadata'].get('source', 'unknown')),
        'page': doc['metadata'].get('page', 'unknown'),
        'score': doc['similarity_score'],
        'preview': doc['content'][:300] + '...'
    } for doc in results]
    confidence = max([doc['similarity_score'] for doc in results])
    
    # Generate answer
    prompt = f"""Use the following context to answer the question concisely.\nContext:\n{context}\n\nQuestion: {query}\n\nAnswer:"""
    response = llm.invoke([prompt.format(context=context, query=query)])
    
    output = {
        'answer': response.content,
        'sources': sources,
        'confidence': confidence
    }
    if return_context:
        output['context'] = context
    return output

# Example usage:
result = rag_advanced("Machine Learning", rag_retriever, llm, top_k=3, min_score=0.1, return_context=True)
print("Answer:", result['answer'])
print("Sources:", result['sources'])
print("Confidence:", result['confidence'])
print("Context Preview:", result['context'][:300])


Retrieving documents for query: 'Machine Learning'
Top K: 3, Score threshold: 0.1
Generating embeddings for 1 texts


Batches: 100%|██████████| 1/1 [00:00<00:00, 21.51it/s]

Embeddings generated successfully with shape (1, 384)
Retrieved 1 documents (after filtering)





Answer: Machine learning is a subset of artificial intelligence that enables systems to learn and improve from experience without being explicitly programmed.
Sources: [{'source': 'machine_learning.pdf', 'page': 0, 'score': 0.14411473274230957, 'preview': 'Machine Learning Basics \n \nMachine learning is a subset of artificial intelligence that enables systems to learn and \nimprove \nfrom experience without being explicitly programmed. It focuses on developing computer \nprograms \nthat can access data and use it to learn for themselves. \n \nTypes of Machin...'}]
Confidence: 0.14411473274230957
Context Preview: Machine Learning Basics 
 
Machine learning is a subset of artificial intelligence that enables systems to learn and 
improve 
from experience without being explicitly programmed. It focuses on developing computer 
programs 
that can access data and use it to learn for themselves. 
 
Types of Machin
