## Data ingestion Pipeline

In [23]:
import os
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path
from sentence_transformers import SentenceTransformer
import uuid
import chromadb
from groq import Groq

#### Document loading

In [24]:
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader

In [25]:
directory_path = r"D:\RAG\project\data\pdfs"
loader = DirectoryLoader(
    directory_path,
    glob="**/*.pdf",  # matches all PDFs in directory and subdirectories
    loader_cls=PyPDFLoader,
    show_progress=False  # optional: shows loading progress
)

# Load all documents
documents = loader.load()

In [26]:
docs = loader.load()
docs

[Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with acmart 2020/02/22 v1.70 Typesetting articles for the Association for Computing Machinery and hyperref 2023-04-22 v7.00x Hypertext links for LaTeX', 'creationdate': '2024-05-29T00:16:52+00:00', 'author': 'Rishi Kesav Mohan, Risheek Rakshit Sukumar Kanmani, Krishna Anandan Ganesan, and Nisha Ramasubramanian', 'keywords': '', 'moddate': '2024-05-29T00:16:52+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': 'Evaluating NoSQL Databases for OLAP Workloads: A Benchmarking Study of MongoDB, Redis, Kudu and ArangoDB', 'trapped': '/False', 'source': 'D:\\RAG\\project\\data\\pdfs\\[Big data]_Evaluating NoSQL Databases for OLAP Workloads.pdf', 'total_pages': 7, 'page': 0, 'page_label': '1'}, page_content='Evaluating NoSQL Databases for OLAP Workloads: A\nBenchmarking Study of MongoDB, Redis, Kudu and ArangoDB\nRishi Kesav Mohan\nrkmohan

#### Adding two new fields in metadata of each loaded document

In [27]:
for i, doc in enumerate(documents):
    doc.metadata["file_type"] = "pdf"
    doc.metadata["file_name"] = doc.metadata.get("source", "").split("\\")[-1]

documents

[Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with acmart 2020/02/22 v1.70 Typesetting articles for the Association for Computing Machinery and hyperref 2023-04-22 v7.00x Hypertext links for LaTeX', 'creationdate': '2024-05-29T00:16:52+00:00', 'author': 'Rishi Kesav Mohan, Risheek Rakshit Sukumar Kanmani, Krishna Anandan Ganesan, and Nisha Ramasubramanian', 'keywords': '', 'moddate': '2024-05-29T00:16:52+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': 'Evaluating NoSQL Databases for OLAP Workloads: A Benchmarking Study of MongoDB, Redis, Kudu and ArangoDB', 'trapped': '/False', 'source': 'D:\\RAG\\project\\data\\pdfs\\[Big data]_Evaluating NoSQL Databases for OLAP Workloads.pdf', 'total_pages': 7, 'page': 0, 'page_label': '1', 'file_type': 'pdf', 'file_name': '[Big data]_Evaluating NoSQL Databases for OLAP Workloads.pdf'}, page_content='Evaluating NoSQL Databases for OLAP 

#### Chunking

In [28]:
def chunk_documents(documents, chunk_size=1000, chunk_overlap=200):
    """
    Split documents into chunks using RecursiveCharacterTextSplitter.

    Args:
        documents: List of LangChain Document objects
        chunk_size: Maximum size of each chunk (in characters)
        chunk_overlap: Number of characters to overlap between chunks

    Returns:
        List of chunked Document objects
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", ". ", " ", ""]
    )

    return splitter.split_documents(documents)

In [29]:
chunks = chunk_documents(docs, chunk_size=1000, chunk_overlap=200)
print(f"Total chunks: {len(chunks)} created from Total documents: {len(docs)}")

Total chunks: 559 created from Total documents: 159


In [30]:
chunks

[Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with acmart 2020/02/22 v1.70 Typesetting articles for the Association for Computing Machinery and hyperref 2023-04-22 v7.00x Hypertext links for LaTeX', 'creationdate': '2024-05-29T00:16:52+00:00', 'author': 'Rishi Kesav Mohan, Risheek Rakshit Sukumar Kanmani, Krishna Anandan Ganesan, and Nisha Ramasubramanian', 'keywords': '', 'moddate': '2024-05-29T00:16:52+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': 'Evaluating NoSQL Databases for OLAP Workloads: A Benchmarking Study of MongoDB, Redis, Kudu and ArangoDB', 'trapped': '/False', 'source': 'D:\\RAG\\project\\data\\pdfs\\[Big data]_Evaluating NoSQL Databases for OLAP Workloads.pdf', 'total_pages': 7, 'page': 0, 'page_label': '1'}, page_content='Evaluating NoSQL Databases for OLAP Workloads: A\nBenchmarking Study of MongoDB, Redis, Kudu and ArangoDB\nRishi Kesav Mohan\nrkmohan

#### Embedding and vectorStoreDB

In [31]:
def make_embeddings(chunks):
    """
    Generate embeddings for document chunks using all-MiniLM-L6-v2.

    Returns:
        List of embeddings (numpy arrays)
    """
    model = SentenceTransformer("all-MiniLM-L6-v2")
    
    texts = [chunk.page_content for chunk in chunks]
    embeddings = model.encode(texts, show_progress_bar=True)
    
    return embeddings

In [32]:
embeddings = make_embeddings(chunks)

Loading weights: 100%|██████████| 103/103 [00:00<00:00, 816.55it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m
Batches: 100%|██████████| 18/18 [00:18<00:00,  1.05s/it]


In [33]:
def store_in_chromadb(chunks, embeddings, collection_name="mit_documents"):
    """
    Store chunks and embeddings in ChromaDB.
    """

    client = chromadb.PersistentClient("../data/vector_store")
    collection = client.get_or_create_collection(name=collection_name)

    ids        = [str(uuid.uuid4()) for _ in chunks]
    metadatas  = [chunk.metadata for chunk in chunks]
    documents  = [chunk.page_content for chunk in chunks]
    embeds     = [embedding.tolist() for embedding in embeddings]  # chromadb expects plain list, not numpy array

    collection.add(
        ids        = ids,
        metadatas  = metadatas,
        documents  = documents,
        embeddings = embeds
    )

    print(f"Successfully stored {len(chunks)} chunks in ChromaDB collection: '{collection_name}'")
    return collection

In [34]:
collection = store_in_chromadb(chunks, embeddings)

Successfully stored 559 chunks in ChromaDB collection: 'mit_documents'


### Retriever pipeline from Vectorstore (chromaDB)

In [45]:
class Retriever:
    
    def __init__(self, collection, model_name="all-MiniLM-L6-v2", groq_api_key="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"):
        self.collection = collection
        self.model      = SentenceTransformer(model_name)
        self.groq       = Groq(api_key=groq_api_key)
    
    def retrieve(self, user_query: str, top_k: int = 5, threshold: float = 0.6) -> list:
        query_embedding = self.model.encode(user_query).tolist()
        
        results = self.collection.query(
            query_embeddings = [query_embedding],
            n_results         = top_k,
            include           = ["documents", "metadatas", "distances"]
        )
        
        filtered = []
        for i in range(len(results["documents"][0])):
            if results["distances"][0][i] < threshold:
                filtered.append({
                    "content"  : results["documents"][0][i],
                    "metadata" : results["metadatas"][0][i],
                    "distance" : results["distances"][0][i]
                })
        
        return filtered

    def ask(self, user_query: str, top_k: int = 5) -> str:
        """
        Retrieve relevant chunks and pass them as context to Groq LLM.

        Args:
            user_query: The user's question
            top_k: Number of chunks to retrieve

        Returns:
            LLM's response as a string
        """
        # step 1: retrieve relevant chunks
        retrieved_docs = self.retrieve(user_query, top_k=top_k)
        
        if not retrieved_docs:
            return "I could not find any relevant information in the documents to answer your question."
        
        # step 2: build context from retrieved chunks
        context = "\n\n".join([doc["content"] for doc in retrieved_docs])
        
        # step 3: build prompt
        prompt = f"""You are a helpful assistant. Answer the user's question based only on the context provided below.
            If the answer is not in the context, say "I don't know based on the provided documents."

            Context:
            {context}

            Question: {user_query}
            Answer:"""
        
        # step 4: call groq LLM
        response = self.groq.chat.completions.create(
            model    = "openai/gpt-oss-20b",
            messages = [{"role": "user", "content": prompt}]
        )
        
        return response.choices[0].message.content

In [46]:
retriever = Retriever(collection=collection)
similarity_threshold = 0.5
retrieved_docs = []

result = retriever.ask("What is React Virtual DOM?", top_k=3)

print(f"Answer is {result}")

Loading weights: 100%|██████████| 103/103 [00:00<00:00, 987.06it/s, Materializing param=pooler.dense.weight]                              
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Answer is **React Virtual DOM** is a lightweight, in‑memory representation of the real DOM that React maintains. It has the same structure and characteristics as a real DOM, but it is not rendered to the screen immediately. React updates this virtual DOM first, determines the minimal set of changes needed, and then applies those changes to the actual DOM. This process is much faster than manipulating the real DOM directly, improving rendering performance.
