In [45]:
from langchain_core.documents import Document

In [46]:
doc=Document(
    page_content="this is the main text content I am using to create RAG",
    metadata={
        "source":"exmaple.txt",
        "pages":1,
        "author":"Krish Naik",
        "date_created":"2025-01-01"
    }
)
doc

Document(metadata={'source': 'exmaple.txt', 'pages': 1, 'author': 'Krish Naik', 'date_created': '2025-01-01'}, page_content='this is the main text content I am using to create RAG')

In [47]:
## Create a simple txt file
import os
os.makedirs("../data/text_files",exist_ok=True)

In [48]:
sample_texts={
    "../data/text_files/python_intro.txt":"""Python Programming Introduction

Python is a high-level, interpreted programming language known for its simplicity and readability.
Created by Guido van Rossum and first released in 1991, Python has become one of the most popular
programming languages in the world.

Key Features:
- Easy to learn and use
- Extensive standard library
- Cross-platform compatibility
- Strong community support

Python is widely used in web development, data science, artificial intelligence, and automation.""",
    
    "../data/text_files/machine_learning.txt": """Machine Learning Basics

Machine learning is a subset of artificial intelligence that enables systems to learn and improve
from experience without being explicitly programmed. It focuses on developing computer programs
that can access data and use it to learn for themselves.

Types of Machine Learning:
1. Supervised Learning: Learning with labeled data
2. Unsupervised Learning: Finding patterns in unlabeled data
3. Reinforcement Learning: Learning through rewards and penalties

Applications include image recognition, speech processing, and recommendation systems
    
    
    """

}

for filepath,content in sample_texts.items():
    with open(filepath,'w',encoding="utf-8") as f:
        f.write(content)

print("✅ Sample text files created!")

✅ Sample text files created!


In [49]:

from langchain.document_loaders import TextLoader

from langchain_community.document_loaders import TextLoader

loader=TextLoader("../data/text_files/python_intro.txt",encoding="utf-8")
document=loader.load()
print(document)

[Document(metadata={'source': '../data/text_files/python_intro.txt'}, page_content='Python Programming Introduction\n\nPython is a high-level, interpreted programming language known for its simplicity and readability.\nCreated by Guido van Rossum and first released in 1991, Python has become one of the most popular\nprogramming languages in the world.\n\nKey Features:\n- Easy to learn and use\n- Extensive standard library\n- Cross-platform compatibility\n- Strong community support\n\nPython is widely used in web development, data science, artificial intelligence, and automation.')]


In [51]:
from langchain_community.document_loaders import DirectoryLoader

## load all the text files from the directory
dir_loader=DirectoryLoader(
    "../data/text_files",
    glob="**/*.txt", ## Pattern to match files  
    loader_cls= TextLoader, ##loader class to use
    loader_kwargs={'encoding': 'utf-8'},
    show_progress=False

)

documents=dir_loader.load()
documents

[Document(metadata={'source': '..\\data\\text_files\\machine_learning.txt'}, page_content='Machine Learning Basics\n\nMachine learning is a subset of artificial intelligence that enables systems to learn and improve\nfrom experience without being explicitly programmed. It focuses on developing computer programs\nthat can access data and use it to learn for themselves.\n\nTypes of Machine Learning:\n1. Supervised Learning: Learning with labeled data\n2. Unsupervised Learning: Finding patterns in unlabeled data\n3. Reinforcement Learning: Learning through rewards and penalties\n\nApplications include image recognition, speech processing, and recommendation systems\n\n\n    '),
 Document(metadata={'source': '..\\data\\text_files\\python_intro.txt'}, page_content='Python Programming Introduction\n\nPython is a high-level, interpreted programming language known for its simplicity and readability.\nCreated by Guido van Rossum and first released in 1991, Python has become one of the most popu

In [50]:
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader

## load all the text files from the directory
dir_loader=DirectoryLoader(
    "../data/pdf",
    glob="**/*.pdf", ## Pattern to match files  
    loader_cls= PyMuPDFLoader, ##loader class to use
    show_progress=False

)

pdf_documents=dir_loader.load()
pdf_documents

[Document(metadata={'producer': 'Skia/PDF m138', 'creator': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36', 'creationdate': '2025-07-29T05:09:41+00:00', 'source': '..\\data\\pdf\\SkinGPT_ AI-Powered Skin Disease Medication Report Generator - Google Docs.pdf', 'file_path': '..\\data\\pdf\\SkinGPT_ AI-Powered Skin Disease Medication Report Generator - Google Docs.pdf', 'total_pages': 6, 'format': 'PDF 1.4', 'title': 'SkinGPT: AI-Powered Skin Disease Medication Report Generator - Google Docs', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2025-07-29T05:09:41+00:00', 'trapped': '', 'modDate': "D:20250729050941+00'00'", 'creationDate': "D:20250729050941+00'00'", 'page': 0}, page_content='\u202dSkinGPT: AI-Powered Skin Disease\u202c\n\u202dMedication Report Generator\u202c\n\u202dSIGN:\u202c'),
 Document(metadata={'producer': 'Skia/PDF m138', 'creator': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 

In [52]:
type(pdf_documents[0])

langchain_core.documents.base.Document

In [53]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List,Dict,Any,Tuple
from sklearn.metrics.pairwise import cosine_similarity


In [54]:
class EmbeddingsManger:
    def __init__(self,model_name:str="all-MiniLM-L6-v2"):
        self.model_name=model_name
        self.model=None
        self._load_model()
    def _load_model(self):
        try:
            self.model=SentenceTransformer(self.model_name)
            print(f"✅ Model {self.model_name} loaded successfully!")
        except Exception as e:
            print(f"❌ Error loading model {self.model_name}: {e}")
    def get_embeddings(self,texts:List[str])->np.ndarray:
        if not self.model:
            raise ValueError("Model is not loaded.")
        embeddings=self.model.encode(texts,show_progress_bar=True)
        return embeddings
embeding=EmbeddingsManger()
embeding

✅ Model all-MiniLM-L6-v2 loaded successfully!


<__main__.EmbeddingsManger at 0x228666895e0>

In [55]:
class SimpleVectorStore:
    def __init__(self,collection_name:str="my_collection",persist_directory:str="../data/vector_db"):
        self.collection_name=collection_name
        self.persist_directory=persist_directory
        self.client=None
        self.collection=None
        self._initialize_store()
    def _initialize_store(self):
        try:
            os.makedirs(self.persist_directory,exist_ok=True)
            self.client=chromadb.PersistentClient(
                path=self.persist_directory)
            self.collection=self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description":"A simple vector store"}
            )
            print(f"✅ Vector store {self.collection_name} initialized successfully!")
        except Exception as e:
            print(f"❌ Error initializing vector store {self.collection_name}: {e}")
    def add_documents(self,documents:List[Any],embeddings:np.ndarray):
        if not self.collection:
            raise ValueError("Collection is not initialized.")
        if len(documents)!=len(embeddings):
            raise ValueError("Number of documents and embeddings must be the same.")
        ids=[]
        metadatas=[]  
        doc_test=[]
        emmbeddings_list=[]
        for i,(doc,embeddings) in enumerate(zip(documents,embeddings)):
            docids=f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(docids)
            metadata=dict(doc.metadata)
            metadata['doc_index'] = i
            metadatas.append(metadata)
            doc_test.append(doc.page_content)
            emmbeddings_list.append(embeddings.tolist())
        try:
            self.collection.add(
                ids=ids,
                metadatas=metadatas,
                documents=doc_test,
                embeddings=emmbeddings_list
            )
            print(f"✅ Added {len(documents)} documents to the vector store.")
        except Exception as e:
            print(f"❌ Error adding documents to the vector store: {e}")
            raise 
vectorstore=SimpleVectorStore()
vectorstore

✅ Vector store my_collection initialized successfully!


<__main__.SimpleVectorStore at 0x2280e9b56d0>

In [56]:
texts=[doc.page_content for doc in documents]
embedings=EmbeddingsManger().get_embeddings(texts)
vectorstore.add_documents(documents,embedings)



✅ Model all-MiniLM-L6-v2 loaded successfully!


Batches: 100%|██████████| 1/1 [00:00<00:00, 16.05it/s]

✅ Added 2 documents to the vector store.





In [57]:
class RAGRetriver:
    def __init__(self,vector_store:SimpleVectorStore,embeddings_manager:EmbeddingsManger):
        self.vector_store=vector_store
        self.embeddings_manager=embeddings_manager
    def retrieve(self,query:str,top_k:int=3,score_threshold:float=0.0)->List[Dict[str,Any]]:
        query_embedding=self.embeddings_manager.get_embeddings([query])[0]
        try:
            results=self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k,
            )
            retrived_docs=[]
            if results['documents'] and results['documents'][0]:
                documents=results['documents'][0]
                metadatas=results['metadatas'][0]
                distances=results['distances'][0]
                for i,(doc_id,document,metadata,distances) in enumerate(zip(results['ids'][0],documents,metadatas,distances)):
                    score=1 - distances
                    if score>=score_threshold:
                        retrived_docs.append({
                            "id":doc_id,
                            "document":document,
                            "metadata":metadata,
                            "score":score,
                            "distance":distances,
                            "rank":i+1
                        })
                print(f"✅ Retrieved {len(retrived_docs)}")
                      
            else:
                print("⚠️ No documents found.")
            return retrived_docs
        except Exception as e:
            print(f"❌ Error during retrieval: {e}")
            return []
ragretriver=RAGRetriver(vectorstore,embeding)
ragretriver

            

                        




<__main__.RAGRetriver at 0x2280e941880>

In [59]:
ragretriver.retrieve("what is machinelearning",top_k=2,score_threshold=0.1)

Batches: 100%|██████████| 1/1 [00:00<00:00, 58.82it/s]

✅ Retrieved 2





[{'id': 'doc_86fec92c_0',
  'document': 'Machine Learning Basics\n\nMachine learning is a subset of artificial intelligence that enables systems to learn and improve\nfrom experience without being explicitly programmed. It focuses on developing computer programs\nthat can access data and use it to learn for themselves.\n\nTypes of Machine Learning:\n1. Supervised Learning: Learning with labeled data\n2. Unsupervised Learning: Finding patterns in unlabeled data\n3. Reinforcement Learning: Learning through rewards and penalties\n\nApplications include image recognition, speech processing, and recommendation systems\n\n\n    ',
  'metadata': {'doc_index': 0,
   'source': '..\\data\\text_files\\machine_learning.txt'},
  'score': 0.15277695655822754,
  'distance': 0.8472230434417725,
  'rank': 1},
 {'id': 'doc_b8618a23_0',
  'document': 'Machine Learning Basics\n\nMachine learning is a subset of artificial intelligence that enables systems to learn and improve\nfrom experience without being