## Data ingestion Pipeline

In [1]:
import os
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path
from sentence_transformers import SentenceTransformer
import uuid
import chromadb
import chromadb

  from .autonotebook import tqdm as notebook_tqdm


#### Document loading

In [2]:
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader

In [3]:
directory_path = r"D:\RAG\project\data\pdfs"
loader = DirectoryLoader(
    directory_path,
    glob="**/*.pdf",  # matches all PDFs in directory and subdirectories
    loader_cls=PyPDFLoader,
    show_progress=False  # optional: shows loading progress
)

# Load all documents
documents = loader.load()

In [4]:
docs = loader.load()
docs

[Document(metadata={'producer': 'Acrobat Distiller 9.5.2 (Windows)', 'creator': 'Microsoft PowerPoint', 'creationdate': '2012-12-07T16:50:08+05:30', 'author': 'Kocur, George | Cassa, Chris | Gonzalez, Marta', 'moddate': '2012-12-11T16:55:17+05:30', 'subject': '', 'title': 'Arrays, Arraylists', 'source': 'D:\\RAG\\project\\data\\pdfs\\arrays_and_arraylists.pdf', 'total_pages': 13, 'page': 0, 'page_label': '1'}, page_content='1 \n1.00 Lecture 11 \nArrays and ArrayLists \nReading for next time: Big Java: sections 13.1-13.4 \nArrays \n•\u202f Arrays are a simple data structure \n•\u202f Arrays store a set of values of the same type \n–\u202f Built-in types (int, double, etc.) or  \n–\u202f Objects (Students, Engines, etc.) \n•\u202f Arrays are part of the Java language  \n–\u202f Arrays are objects, not primitives like iint or double. \n–\u202f They are declared in the same way as other objects \nint[] intArray= new int[20];      //\x02Irregular verb\x02\n–\u202f The array object has an in

#### Adding two new fields in metadata of each loaded document

In [5]:
for i, doc in enumerate(documents):
    doc.metadata["file_type"] = "pdf"
    doc.metadata["file_name"] = doc.metadata.get("source", "").split("\\")[-1]

documents

[Document(metadata={'producer': 'Acrobat Distiller 9.5.2 (Windows)', 'creator': 'Microsoft PowerPoint', 'creationdate': '2012-12-07T16:50:08+05:30', 'author': 'Kocur, George | Cassa, Chris | Gonzalez, Marta', 'moddate': '2012-12-11T16:55:17+05:30', 'subject': '', 'title': 'Arrays, Arraylists', 'source': 'D:\\RAG\\project\\data\\pdfs\\arrays_and_arraylists.pdf', 'total_pages': 13, 'page': 0, 'page_label': '1', 'file_type': 'pdf', 'file_name': 'arrays_and_arraylists.pdf'}, page_content='1 \n1.00 Lecture 11 \nArrays and ArrayLists \nReading for next time: Big Java: sections 13.1-13.4 \nArrays \n•\u202f Arrays are a simple data structure \n•\u202f Arrays store a set of values of the same type \n–\u202f Built-in types (int, double, etc.) or  \n–\u202f Objects (Students, Engines, etc.) \n•\u202f Arrays are part of the Java language  \n–\u202f Arrays are objects, not primitives like iint or double. \n–\u202f They are declared in the same way as other objects \nint[] intArray= new int[20];    

#### Chunking

In [6]:
def chunk_documents(documents, chunk_size=1000, chunk_overlap=200):
    """
    Split documents into chunks using RecursiveCharacterTextSplitter.

    Args:
        documents: List of LangChain Document objects
        chunk_size: Maximum size of each chunk (in characters)
        chunk_overlap: Number of characters to overlap between chunks

    Returns:
        List of chunked Document objects
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", ". ", " ", ""]
    )

    return splitter.split_documents(documents)

In [7]:
chunks = chunk_documents(docs, chunk_size=1000, chunk_overlap=200)
print(f"Total chunks: {len(chunks)} created from Total documents: {len(docs)}")

Total chunks: 79 created from Total documents: 68


In [8]:
chunks

[Document(metadata={'producer': 'Acrobat Distiller 9.5.2 (Windows)', 'creator': 'Microsoft PowerPoint', 'creationdate': '2012-12-07T16:50:08+05:30', 'author': 'Kocur, George | Cassa, Chris | Gonzalez, Marta', 'moddate': '2012-12-11T16:55:17+05:30', 'subject': '', 'title': 'Arrays, Arraylists', 'source': 'D:\\RAG\\project\\data\\pdfs\\arrays_and_arraylists.pdf', 'total_pages': 13, 'page': 0, 'page_label': '1'}, page_content='1 \n1.00 Lecture 11 \nArrays and ArrayLists \nReading for next time: Big Java: sections 13.1-13.4 \nArrays \n•\u202f Arrays are a simple data structure \n•\u202f Arrays store a set of values of the same type \n–\u202f Built-in types (int, double, etc.) or  \n–\u202f Objects (Students, Engines, etc.) \n•\u202f Arrays are part of the Java language  \n–\u202f Arrays are objects, not primitives like iint or double. \n–\u202f They are declared in the same way as other objects \nint[] intArray= new int[20];      //\x02Irregular verb\x02\n–\u202f The array object has an in

#### Embedding and vectorStoreDB

In [9]:
def make_embeddings(chunks):
    """
    Generate embeddings for document chunks using all-MiniLM-L6-v2.

    Returns:
        List of embeddings (numpy arrays)
    """
    model = SentenceTransformer("all-MiniLM-L6-v2")
    
    texts = [chunk.page_content for chunk in chunks]
    embeddings = model.encode(texts, show_progress_bar=True)
    
    return embeddings

In [10]:
embeddings = make_embeddings(chunks)

Loading weights: 100%|██████████| 103/103 [00:00<00:00, 1036.49it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m
Batches: 100%|██████████| 3/3 [00:01<00:00,  1.72it/s]


In [None]:
def store_in_chromadb(chunks, embeddings, collection_name="mit_documents"):
    """
    Store chunks and embeddings in ChromaDB.
    """

    client = chromadb.PersistentClient("../data/vector_store")
    collection = client.get_or_create_collection(name=collection_name)

    ids        = [str(uuid.uuid4()) for _ in chunks]
    metadatas  = [chunk.metadata for chunk in chunks]
    documents  = [chunk.page_content for chunk in chunks]
    embeds     = [embedding.tolist() for embedding in embeddings]  # chromadb expects plain list, not numpy array

    collection.add(
        ids        = ids,
        metadatas  = metadatas,
        documents  = documents,
        embeddings = embeds
    )

    print(f"Successfully stored {len(chunks)} chunks in ChromaDB collection: '{collection_name}'")
    return collection

In [12]:
collection = store_in_chromadb(chunks, embeddings)

Successfully stored 79 chunks in ChromaDB collection: 'mit_documents'


### Retriever pipeline from Vectorstore (chromaDB)

In [None]:
class Retriever:
    
    def __init__(self, collection, model_name="all-MiniLM-L6-v2"):
        self.collection = collection
        self.model = SentenceTransformer(model_name)
    
    def retrieve(self, user_query: str, top_k: int = 5) -> list:
        """
        Convert user query to embedding and retrieve top_k similar chunks from ChromaDB.

        Args:
            user_query: The user's question/query string
            top_k: Number of top results to retrieve

        Returns:
            List of top_k matching documents with their metadata and distances
        """
        query_embedding = self.model.encode(user_query).tolist()
        
        results = self.collection.query(
            query_embeddings = [query_embedding],
            n_results         = top_k,
            include           = ["documents", "metadatas", "distances", "embeddings"]
        )
        
        return results

In [25]:
retriever = Retriever(collection=collection)
similarity_threshold = 0.55
retrieved_docs = []

results = retriever.retrieve("What are the phases of project management?", top_k=3)

# accessing results
for i in range(len(results["documents"][0])):
    if((1 - results['distances'][0][i]) >= similarity_threshold):
        print(f"--- Result {i+1} ---")
        print(f"Content  : {results['documents'][0][i]}")
        print(f"Metadata : {results['metadatas'][0][i]}")
        print(f"Distance : {results['distances'][0][i]}")
        print(f"Similarity score : {1 - results['distances'][0][i]}")
        retrieved_docs.append(results['documents'][0][i])

Loading weights: 100%|██████████| 103/103 [00:00<00:00, 937.53it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


--- Result 1 ---
Content  : Project ManagementProject Management
1. The phases of1. The phases of
-- DevelopmentDevelopment
-- Close OutClose Out
Resource Scheduling
Simulation
Project Monitoring and Control
Changes and Claims
Earned Value Analysis
Quality Reviews and Audits
Project Monitoring and Control
-How to track your project costs, schedule (time), and other resources
- Helps ascertain whether targets are being met
- Needed so that due changes can be made to schedule as and when necessary
Metadata : {'moddate': '2009-09-03T12:03:52+05:30', 'company': 'MIT', 'creationdate': '2009-09-03T11:59:35+05:30', 'creator': 'Acrobat PDFMaker 7.0 for PowerPoint', 'keywords': '', 'manager': '', 'page': 16, 'comments': '', 'subject': '', 'source': 'D:\\RAG\\project\\data\\pdfs\\project_management.pdf', 'category': '', 'page_label': '17', 'total_pages': 21, 'producer': 'Acrobat Distiller 7.0 (Windows)', 'title': 'Lecture 1'}
Distance : 0.43159639835357666
Similarity score : 0.5684036016464233
-

In [26]:
retrieved_docs

['Project ManagementProject Management\n1. The phases of1. The phases of\n-- DevelopmentDevelopment\n-- Close OutClose Out\nResource Scheduling\nSimulation\nProject Monitoring and Control\nChanges and Claims\nEarned Value Analysis\nQuality Reviews and Audits\nProject Monitoring and Control\n-How to track your project costs, schedule (time), and other resources\n- Helps ascertain whether targets are being met\n- Needed so that due changes can be made to schedule as and when necessary',
 'Project Management Project Management \n1. The phases of1. The phases of\n-- DevelopmentDevelopment\n-- Close OutClose Out\nResource Scheduling\nSimulation\nBasics of Project Monitoring and Control\nChanges and Claims\nEarned Value Analysis\nQuality Reviews and Audits']