## Data Ingestion

 LangChain Document Loader

In [None]:
from  langchain_core.documents import Document


In [None]:
from langchain_core.documents import Document

# Build a Document by hand: `page_content` carries the text and `metadata`
# carries descriptive fields about it.
doc =Document(
    page_content="this is the official page content.",
    metadata={
        "source":"example.txt",
        "pages":1,
        "author":"Krish Naik",
        "date_created":"2025-01-01"
    }
)

# Show the resulting Document.
print(doc)

In [None]:
## create a simple txt file

import os 
# Create the sample-text directory; exist_ok=True keeps re-runs of this cell
# from failing when the directory is already there.
os.makedirs("../updatedgenerativeai/text_files",exist_ok=True)

In [None]:
# Maps each output file path to the text that will be written there.
# NOTE(review): every line after the first inside the triple-quoted string
# keeps its four-space indent, so that indent is part of the stored text.
sample_text = {
    "../updatedgenerativeai/text_files/python.txt":"""python is a high level interpreted programming language
    key Feature of python:
    1. Easy to learn and use
    2. Large standard library
    3. Platform independent
    4. Open source
    5. Large community
    6. Dynamic typing
    7. Automatic memory management
    8. Object oriented
    9. Interpreted
    10. High level
    """
}

In [None]:
# Write each sample text to its path as UTF-8 ("w" mode replaces any existing
# file) and report the file that was written.
for filepath,content  in sample_text.items():
    with open(filepath,"w",encoding="utf-8") as f:
        f.write(content)
    print(f"Created {filepath}")

In [None]:
from langchain_community.document_loaders import TextLoader

# Load the sample file as LangChain documents. The file was written as UTF-8,
# so read it back with the same encoding instead of the platform default.
loader = TextLoader("../updatedgenerativeai/text_files/python.txt", encoding="utf-8")
docs = loader.load()
docs

In [None]:
### Directory Loader
from langchain_community.document_loaders import DirectoryLoader
# Load every .txt file in the directory. The sample files were written as
# UTF-8, so pass that encoding through to each TextLoader rather than relying
# on the platform default.
loader = DirectoryLoader(
    "../updatedgenerativeai/text_files",
    glob="*.txt",
    loader_cls=TextLoader,
    loader_kwargs={"encoding": "utf-8"},
    show_progress=True,
)
dir_docs = loader.load()
dir_docs

In [None]:
from langchain_community.document_loaders import PyPDFLoader,PyMuPDFLoader
# Load every file matching **/*.pdf under the pdf directory, using
# PyMuPDFLoader for each file.
# NOTE: this rebinds `loader` and `docs`, replacing the values assigned by the
# TextLoader cell.
loader = DirectoryLoader("../updatedgenerativeai/pdf",
glob="**/*.pdf",
loader_cls=PyMuPDFLoader,
show_progress=True)
docs = loader.load()
docs

RAG System Pipeline

In [None]:


import os
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

from pathlib import Path


In [None]:
## Read All The pdf inside the directory

def process_all_pdf(pdf_directory):
    """Load every PDF found under ``pdf_directory`` into documents.

    The directory is searched with the ``**/*.pdf`` glob. Each match is loaded
    with ``PyMuPDFLoader``; every document it yields gets its ``source``
    metadata set to the bare file name and ``file_type`` set to ``"pdf"``.
    A file that raises while loading is reported and skipped, so one bad PDF
    does not stop the remaining files.

    Args:
        pdf_directory: Path of the directory to scan.

    Returns:
        A list holding the documents of all files that loaded successfully.
    """
    pdf_paths = [*Path(pdf_directory).glob("**/*.pdf")]
    print(f"Found {len(pdf_paths)} PDF files")

    collected = []
    for path in pdf_paths:
        print(f"Processing {path.name}")
        try:
            pages = PyMuPDFLoader(str(path)).load()
            for page in pages:
                # Replace the loader-provided source with the short file name.
                page.metadata.update(source=path.name, file_type="pdf")
            collected += pages
            print(f"Processed {path.name} ({len(pages)} pages)")
        except Exception as e:
            # Best effort: report the failure and continue with the next file.
            print(f"Error processing {path.name}: {e}")

    print(f"\nTotal documents processed: {len(collected)}")
    return collected
            
            

In [None]:
# Load all PDFs from the project's pdf directory into one list.
all_documents = process_all_pdf("../updatedgenerativeai/pdf")

In [None]:
# Bare expression: shows the loaded documents as the cell output.
all_documents

In [None]:
### Text Splitter

def split_documents(documents,chunk_size=1000,chunk_overlap=200):
    """Break documents into overlapping chunks.

    Uses ``RecursiveCharacterTextSplitter`` with character-count lengths and
    the separators ``"\\n\\n"``, ``"\\n"``, ``" "`` and ``""``, in that order.
    Prints how many chunks were produced and, if there is at least one, a
    preview of the first chunk.

    Args:
        documents: Documents to split.
        chunk_size: Maximum chunk length, measured with ``len``.
        chunk_overlap: Overlap between consecutive chunks.

    Returns:
        The list of chunk documents produced by the splitter.
    """
    splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", " ", ""],
        length_function=len,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    pieces = splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(pieces)} chunks")

    if not pieces:
        return pieces

    # Preview the first chunk so the split can be checked by eye.
    first = pieces[0]
    print("\nExample chunk:")
    print(f"Content: {first.page_content[:200]}...")
    print(f"Metadata: {first.metadata}")
    return pieces

In [None]:
# Split the loaded documents using the default chunk size and overlap.
chunks=split_documents(all_documents)

In [None]:
# Bare expression: shows the chunks as the cell output.
chunks

In [None]:
## Lets do The Embedding

import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
class EmbeddingManager:
    """Loads a SentenceTransformer model and turns texts into embeddings."""

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """Initialize the embedding manager and load the model.

        Args:
            model_name (str, optional): The name of the model to use.
                Defaults to "all-MiniLM-L6-v2".

        Raises:
            Exception: Whatever SentenceTransformer raises if the model
                cannot be loaded (re-raised by ``_load_model``).
        """
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """Load the SentenceTransformer model named by ``self.model_name``.

        Prints the embedding dimension on success. On failure the error is
        reported and re-raised so the caller never gets a half-built manager.
        """
        try:
            self.model = SentenceTransformer(self.model_name)
            print(f"Model loaded successfully: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Failed to load model: {self.model_name}: {e}")
            raise

    def get_embedding(self, text: List[str]):
        """Generate embeddings for the given texts.

        Args:
            text: List of text strings to embed.

        Returns:
            Whatever ``self.model.encode`` returns for the list; with
            SentenceTransformer this is a numpy array of shape
            (num_texts, embedding_dim).

        Raises:
            ValueError: If no model has been loaded.
        """
        # Compare against None explicitly: the truthiness of a model object
        # says nothing about whether it was loaded.
        if self.model is None:
            raise ValueError("Model not loaded. Please load the model first.")

        # Use the argument itself. The previous body read an undefined name
        # `texts`, which either raised NameError or silently picked up an
        # unrelated module-level variable of that name.
        print(f"Generating embeddings for {len(text)} texts...")
        embeddings = self.model.encode(text, show_progress_bar=True)
        print(f"Embeddings shape: {embeddings.shape}")
        return embeddings




In [None]:
## Initialize Embedding Manager

# Uses the default model name; the constructor loads the model immediately.
embedding_manager=EmbeddingManager()
embedding_manager

## Vector Store


In [None]:
import os
import uuid
import chromadb
import numpy as np
from typing import List, Any


class VectorStore:
    """Manages document embeddings and retrieval using ChromaDB.

    A persistent Chroma client is opened on ``persist_directory`` and the
    collection ``collection_name`` is fetched, or created if it is missing.
    """

    def __init__(
        self,
        collection_name: str = "pdf_documents",
        persist_directory: str = "../updatedgenerativeai/vector_store",
        distance_metric: str = "cosine",
    ):
        """Open (or create) the persistent store and its collection.

        Args:
            collection_name: Name of the Chroma collection to use.
            persist_directory: Directory where Chroma keeps its data.
            distance_metric: Value requested for the collection's
                ``hnsw:space`` setting ("cosine", "l2" or "ip").

        Raises:
            Exception: Any error raised while initializing the store is
                printed and re-raised.
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.distance_metric = distance_metric
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        """Initialize the ChromaDB client and collection."""
        try:
            os.makedirs(self.persist_directory, exist_ok=True)

            self.client = chromadb.PersistentClient(
                path=self.persist_directory
            )

            # Request the metric explicitly: code that turns a distance into
            # a similarity with `1 - distance` is only meaningful for cosine
            # distance, and Chroma's default space is squared L2.
            # NOTE(review): this is expected to apply only when the collection
            # is created; an already-persisted collection keeps the metric it
            # was built with — recreate it to switch metrics.
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={
                    "description": "PDF documents embeddings for RAG",
                    "hnsw:space": self.distance_metric,
                },
            )

            print(f"Vector store initialized: {self.collection_name}")
            print(f"Existing documents: {self.collection.count()}")

        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise

    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """Add documents and their embeddings to the vector store.

        Each document is stored under a fresh id together with a copy of its
        metadata extended by ``doc_index`` (position in this batch) and
        ``content_length`` (length of ``page_content``).

        Args:
            documents: Objects exposing ``page_content`` and ``metadata``.
            embeddings: One embedding per document, in the same order.

        Raises:
            ValueError: If the number of documents and embeddings differ.
            Exception: Any error from the collection's ``add`` call is
                printed and re-raised.
        """
        if len(documents) != len(embeddings):
            raise ValueError("Documents and embeddings count mismatch")

        # Nothing to store; skip the call instead of sending empty lists.
        if len(documents) == 0:
            print("No documents to add")
            return

        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []

        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            # A random prefix plus the batch index keeps ids unique across
            # repeated calls.
            ids.append(f"doc_{uuid.uuid4().hex[:8]}_{i}")

            # Copy so the caller's metadata dict is not mutated.
            metadata = dict(doc.metadata)
            metadata["doc_index"] = i
            metadata["content_length"] = len(doc.page_content)
            metadatas.append(metadata)

            documents_text.append(doc.page_content)
            embeddings_list.append(embedding.tolist())

        try:
            self.collection.add(
                ids=ids,
                embeddings=embeddings_list,
                documents=documents_text,
                metadatas=metadatas,
            )
            print(f"Successfully added {len(ids)} documents")

        except Exception as e:
            print(f"Error adding documents: {e}")
            raise

    def query(self, query_embedding: np.ndarray, k: int = 5):
        """Query the collection for the ``k`` nearest stored documents.

        Args:
            query_embedding: Embedding of the query text.
            k: Number of results to request.

        Returns:
            The raw result of the collection's ``query`` call.

        Raises:
            Exception: Any query error is printed and re-raised.
        """
        try:
            return self.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=k,
            )
        except Exception as e:
            print(f"Error querying vector store: {e}")
            raise


In [None]:
# Open the vector store with the default collection name and directory.
vector=VectorStore()

In [None]:
## converts chunks to embeddings

# Collect the raw text of every chunk, in chunk order.
texts=[doc.page_content for doc in chunks]

## Generate embeddings

# Embed the chunk texts with the embedding manager.
embeddings=embedding_manager.get_embedding(texts)

print(embeddings.shape)


In [None]:
## Store the chunks and their embeddings in the vector store

vector.add_documents(chunks,embeddings)


# Retriever Pipeline from the Vector Store

In [None]:
from typing import List, Dict, Any


class RAGRetriever:
    """Handles query-based retrieval of relevant documents from a vector store."""

    def __init__(self, vector_store, embedding_manager):
        """Keep references to the store to search and the embedder to use.

        Args:
            vector_store: Object exposing a ``collection`` with a ``query``
                method.
            embedding_manager: Object exposing ``get_embedding(list_of_str)``.
        """
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager

    def retrieve(
        self,
        query: str,
        top_k: int = 5,
        score_threshold: float = 0.0,
    ) -> List[Dict[str, Any]]:
        """Retrieve relevant documents based on a query.

        Args:
            query: The text to search for.
            top_k: Maximum number of results requested from the store.
            score_threshold: Results whose similarity (``1 - distance``) is
                below this value are dropped.

        Returns:
            A list of dicts with the keys ``id``, ``document``, ``metadata``,
            ``similarity_score``, ``distance`` and ``rank`` (1-based position
            in the store's result order). The list is empty when nothing is
            found or when the store query fails.
        """
        print(f"Retrieving documents for query: {query}")
        print(f"Top k: {top_k}, Score threshold: {score_threshold}")

        # Embed the query; get_embedding takes a list, so wrap and unwrap.
        query_embedding = self.embedding_manager.get_embedding([query])[0]

        try:
            # "ids" must not be listed in `include`: Chroma always returns
            # ids and rejects "ids" as an include field. Listing it made the
            # query raise, and the handler below turned every search into an
            # empty result.
            results = self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k,
                include=["documents", "metadatas", "distances"],
            )
        except Exception as e:
            # Best effort: report the failure and return no results.
            print(f"Error querying vector store: {e}")
            return []

        if not results["documents"] or not results["documents"][0]:
            print("No documents retrieved")
            return []

        # Results are nested one level per query; only one query was sent.
        documents = results["documents"][0]
        metadatas = results["metadatas"][0]
        distances = results["distances"][0]
        ids = results["ids"][0]

        retrieved_docs = []
        for rank, (doc_id, doc, metadata, distance) in enumerate(
            zip(ids, documents, metadatas, distances), start=1
        ):
            # Assumes cosine distance
            similarity = 1.0 - distance

            if similarity >= score_threshold:
                retrieved_docs.append(
                    {
                        "id": doc_id,
                        "document": doc,
                        "metadata": metadata,
                        "similarity_score": similarity,
                        "distance": distance,
                        "rank": rank,
                    }
                )

        print(f"Retrieved {len(retrieved_docs)} documents")
        return retrieved_docs


In [None]:
from typing import List, Dict, Any

class RAGRetriever:
    """Retriever that prints diagnostics for every step of a lookup."""

    def __init__(self, vector_store, embedding_manager):
        """Store the vector store to search and the embedder for queries."""
        self.embedding_manager = embedding_manager
        self.vector_store = vector_store

    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.0) -> List[Dict[str, Any]]:
        """Embed ``query``, search the collection, and return the matches.

        Prints the query, the collection size, the raw result summary and the
        distance/similarity of every hit. Hits whose similarity
        (``1 - distance``) is below ``score_threshold`` are left out.

        Returns:
            A list of dicts with the keys ``id``, ``document``, ``metadata``,
            ``similarity_score``, ``distance`` and ``rank`` (1-based).
        """
        print("\n--- RETRIEVE DEBUG ---")
        print("Query:", query)
        collection = self.vector_store.collection
        print("Collection count:", collection.count())

        vector = self.embedding_manager.get_embedding([query])[0]
        print("Query embedding dim:", len(vector))

        results = collection.query(
            query_embeddings=[vector.tolist()],
            n_results=top_k,
            include=["documents", "metadatas", "distances"],
        )

        def first_batch_size(key):
            # Size of the first (only) result batch, or None if the key is
            # missing or empty.
            batches = results.get(key)
            return len(batches[0]) if batches else None

        # Summary of the raw response.
        print("Raw keys:", results.keys())
        print("IDs returned:", first_batch_size("ids"))
        print("Docs returned:", first_batch_size("documents"))
        raw_distances = results.get("distances")
        print("Distances:", raw_distances[0] if raw_distances else None)

        rows = zip(
            results["ids"][0],
            results["documents"][0],
            results["metadatas"][0],
            results["distances"][0],
        )
        hits = []
        for rank, (doc_id, text, meta, dist) in enumerate(rows, start=1):
            # Distance-to-similarity conversion, printed for inspection.
            score = 1.0 - dist
            print(f"Rank {rank}: distance={dist:.4f}, similarity={score:.4f}")

            if score >= score_threshold:
                hits.append({
                    "id": doc_id,
                    "document": text,
                    "metadata": meta,
                    "similarity_score": score,
                    "distance": dist,
                    "rank": rank,
                })

        print("Returned after threshold:", len(hits))
        print("--- END DEBUG ---\n")
        return hits


In [None]:
# Open the store used for retrieval. The arguments equal VectorStore's own
# defaults, so this points at the same collection and directory as a
# default-constructed VectorStore.
vector_store = VectorStore(
    collection_name="pdf_documents",
    persist_directory="../updatedgenerativeai/vector_store"
)


In [None]:
# Rebinds `embedding_manager` to a new instance, which loads the model again.
embedding_manager = EmbeddingManager()


In [None]:
# Wire the retriever to the store to search and the embedder for queries.
rag_retriever = RAGRetriever(
    vector_store=vector_store,
    embedding_manager=embedding_manager
)


In [None]:
# Example lookup with the default top_k and score_threshold; the returned
# list is shown as the cell output.
rag_retriever.retrieve("what  is his strong skill Asfand?")

# Integrating the vector DB context pipeline with an LLM

In [None]:
### simple RAG system pipeline

from langchain_groq import ChatGroq
import os
from dotenv import load_dotenv

# Load variables from a .env file — presumably where GROQ_API_KEY is defined.
load_dotenv()

# Chat model; the API key is read from the GROQ_API_KEY environment variable
# (os.getenv returns None if it is not set).
llm = ChatGroq(model_name="llama-3.3-70b-versatile", groq_api_key=os.getenv("GROQ_API_KEY"))

# Retriever that the RAG function below uses to fetch context.
retriever = RAGRetriever(vector_store, embedding_manager)

In [None]:
## Simple RAG function: retrieve context + generate response

def reg_simple(query,retriever,llm,top_k=3):
    """Answer ``query`` with the LLM, grounded in retrieved context.

    Args:
        query: The user's question.
        retriever: Object whose ``retrieve(query, top_k=...)`` returns a list
            of dicts that each hold the chunk text under ``"document"``.
        llm: Object whose ``invoke(...)`` returns a response with a
            ``content`` attribute.
        top_k: Number of chunks to request from the retriever.

    Returns:
        The ``content`` of the LLM response.
    """
    # Retrieve the context; fall back to a fixed marker when nothing matches.
    results = retriever.retrieve(query,top_k=top_k)
    context = "\n\n".join(doc['document'] for doc in results) if results else "No context found"

    # The f-string already substitutes context and query. Do not call
    # .format() on the result: the retrieved text is arbitrary and any "{" or
    # "}" in it would be parsed as a replacement field and raise.
    prompt = f"""Answer the following question based on the context provided:
    Context: {context}
    Question: {query}
    Answer: """

    response = llm.invoke([prompt])
    return response.content

In [None]:
# Run the simple RAG function end to end and print the model's answer.
anse=reg_simple("who is asfand? his strongest quality",retriever,llm)
print(anse)