In [1]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def process_all_pdfs(pdf_dir):
    all_documents = []
    pdf_dir = Path(pdf_dir)
    
    pdf_files = list(pdf_dir.glob("**/*.pdf"))
    
    print(f"Found {len(pdf_files)} file to process")
    
    for pdf_file in pdf_files:
        print(f"filename {pdf_file.name}")
        
        try:
            loader = PyPDFLoader(str(pdf_file))
            documents = loader.load()
            
            for doc  in documents:
                doc.metadata["source_file"] = pdf_file.name
                doc.metadata["file_type"] = 'pdf'
                
            all_documents.extend(documents)
            
            print(f"Loaded {len(documents)} pages")
            
        except Exception as e:
            print("error ", e)
    print(f"\n Total document loaded: {len(all_documents)}")
    return all_documents


In [3]:
all_pdf_docs = process_all_pdfs("../data")

Found 3 file to process
filename Deep learning_251010_124945.pdf
Loaded 58 pages
filename disease-handbook-complete.pdf
Loaded 86 pages
filename District wise providers list in Male-Female Sterlisation.pdf
Loaded 19 pages

 Total document loaded: 163


# chunking

In [4]:
def split_documents(documents):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, 
                                                   chunk_overlap=20, 
                                                   length_function=len, 
                                                   separators=["\n\n", "\n", " ", ""])
    
    split_doc = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(split_doc)} chunks")
    
    if split_doc:
        print("\n Exmaple chunks")
        print(f"Content: {split_doc[0].page_content[:200]}....")
        print(f"meta: {split_doc[0].metadata}....")
    
    return split_doc

In [5]:
chunks = split_documents(all_pdf_docs)

Split 163 documents into 434 chunks

 Exmaple chunks
Content: Disease Handbook 
for 
Childcare Providers 
 
 
 
 
 
New Hampshire Department of Health and Human Services 
Division of Public Health Services 
Bureau of Infectious Disease Control 
29 Hazen Drive 
C....
meta: {'producer': 'Adobe PDF Library 11.0', 'creator': 'Acrobat PDFMaker 11 for Word', 'creationdate': '2018-05-15T17:07:23-04:00', 'author': 'Jane Bertolone', 'company': 'State of New Hampshire', 'keywords': '', 'moddate': '2018-05-30T15:36:28-04:00', 'title': '', 'source': '..\\data\\pdf\\disease-handbook-complete.pdf', 'total_pages': 86, 'page': 0, 'page_label': '1', 'source_file': 'disease-handbook-complete.pdf', 'file_type': 'pdf'}....


# embeddings

In [6]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [8]:
class EmbeddingManager:
    def __init__(self, model_name:str="all-MiniLM-L6-v2"):
        self.model_name = model_name
        self.model = None
        self._load_model()
        
    def _load_model(self):
        try:
            print("model name : ", self.model_name)
            self.model = SentenceTransformer(self.model_name)
            print(f"Model load sucessfully. Embedding dim: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print("errror", e)
            
    def generate_embeddings(self, texts:List[str]) -> np.array:
        if not self.model:
            raise ValueError("Model not found")
        
        print(f"Generating embeddings for {len(texts)} texts...")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"Generating embeddings dim :  {embeddings.shape} ")
        
        return embeddings


embedding_manager = EmbeddingManager()
embedding_manager
        

model name :  all-MiniLM-L6-v2
Model load sucessfully. Embedding dim: 384


<__main__.EmbeddingManager at 0x27ecf87ae90>

# vector store

In [15]:
import os

class VectorStore:
    def __init__(self, collection_name: str="pdf_documents", persistant_dir:str="../data/vector_store"):
        self.collection_name = collection_name
        self.persistant_dir = persistant_dir
        self.client = None
        self.collection = None
        self._initialize_store()
        
    def _initialize_store(self):
        try:
            # create a persistant chroma client
            os.makedirs(self.persistant_dir, exist_ok= True)
            self.client = chromadb.PersistentClient(path=self.persistant_dir)
            
            #create collection
            self.collection = self.client.get_or_create_collection(
                name= self.collection_name,
                metadata={"description" : "pdf document embedding for rag"}
            )
            
            print(f"Vector initioalized. collection: {self.collection_name}")
            print(f"Exsition doc in collection: {self.collection.count()}")
            
        except Exception as e:
            print("error" ,e)
            
            
    def add_documents(self, documents: List[Any], embedding: np.array):
        if len(documents) !=len(embedding):
            raise ValueError("mismatch in count")
        
        print(f"adding {len(documents)} documents to vector")
        
        
        #preare db
        # Prepare data for ChromaDB
        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []

        for i, (doc, embedding) in enumerate(zip(documents, embedding)):
            # Generate unique ID
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)

            # Prepare metadata
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)

            # Document content
            documents_text.append(doc.page_content)

            # Embedding
            embeddings_list.append(embedding.tolist())

        # Add to collection
        try:
            self.collection.add(
                ids=ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=documents_text
            )

            print(f"‚úÖ Successfully added {len(documents)} documents to vector store")
            print(f"üìö Total documents in collection: {self.collection.count()}")

        except Exception as e:
            print(f"‚ùå Error adding documents to vector store: {e}")
            raise


In [16]:
vector_store = VectorStore()
vector_store

Vector initioalized. collection: pdf_documents
Exsition doc in collection: 0


<__main__.VectorStore at 0x27ecfdff280>

In [17]:
## convert text to embeddings
texts = [doc.page_content for doc in chunks]

#generate embedding
embeddings = embedding_manager.generate_embeddings(texts)

#store in vectore DB
vector_store.add_documents(chunks, embeddings)

Generating embeddings for 434 texts...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 14/14 [00:11<00:00,  1.23it/s]


Generating embeddings dim :  (434, 384) 
adding 434 documents to vector
‚úÖ Successfully added 434 documents to vector store
üìö Total documents in collection: 434


# Rag retrival

In [18]:
class RAGRetriever:
    """Handles query-based retrieval from the vector store"""
    
    def __init__(self, vector_store: VectorStore, embedding_manager: EmbeddingManager):
        """
        Initialize the retriever
        
        Args:
            vector_store: Vector store containing document embeddings
            embedding_manager: Manager for generating query embeddings
        """
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager

    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.0) -> List[Dict[str, Any]]:
        """
        Retrieve relevant documents for a query
        
        Args:
            query: The search query
            top_k: Number of top results to return
            score_threshold: Minimum similarity score threshold
            
        Returns:
            List of dictionaries containing retrieved documents and metadata
        """
        print(f"Retrieving documents for query: '{query}'")
        print(f"Top K: {top_k}, Score threshold: {score_threshold}")
        
        # Generate query embedding
        query_embedding = self.embedding_manager.generate_embeddings([query])[0]
        
        # Search in vector store
        try:
            results = self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k
            )
            
            # Process results
            retrieved_docs = []
            
            if results['documents'] and results['documents'][0]:
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids = results['ids'][0]
                
                for i, (doc_id, document, metadata, distance) in enumerate(zip(ids, documents, metadatas, distances)):
                    # Convert distance to similarity score (ChromaDB uses cosine distance)
                    similarity_score = 1 - distance
                    
                    if similarity_score >= score_threshold:
                        retrieved_docs.append({
                            'id': doc_id,
                            'content': document,
                            'metadata': metadata,
                            'similarity_score': similarity_score,
                            'distance': distance,
                            'rank': i + 1
                        })
                
                print(f"Retrieved {len(retrieved_docs)} documents (after filtering)")
            else:
                print("No documents found")
            
            return retrieved_docs
            
        except Exception as e:
            print(f"Error during retrieval: {e}")
            return []

rag_retriever=RAGRetriever(vector_store,embedding_manager)

In [19]:
rag_retriever

<__main__.RAGRetriever at 0x27ecfdfa680>

In [None]:
rag_retriever.retrieve("What is Lyme Disease ?")

Retrieving documents for query: 'What is Lyme Disease  ?'
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 96.08it/s]

Generating embeddings dim :  (1, 384) 
Retrieved 4 documents (after filtering)





[{'id': 'doc_09d8f7ff_131',
  'content': 'LYME DISEASE (cont.) \nDivision of Public Health Services  Disease Handbook for Childcare Providers \nBureau of Infectious Disease Control   REVISED ‚ÄìJanuary 2018 \nLYME DISEASE \n \nLyme disease is caused by a bacterium, Borrelia \nburgdorferi.  In New England it is transmitted by \na certain type of tick, commonly called the deer \ntick or black -legged tick (scientific name: \nIxodes \nscapularis).  Lyme disease may cause sympt oms \naffecting the skin, nervous system, heart and/or \njoints of an individual.  The NH Department of \nHealth & Health Services made Lyme disease \nreportable in October 1990.  During recent years, \nthe incidence of Lyme disease has increased in \nNew Hampshire. \n \nWho gets this disease? \nThe bacterium that causes Lyme disease is \ntransmitted within the natural cycle of the deer \ntick, which feed on animals such as mice, \nopossums, dogs and deer.  Certain stages of the \ntick ‚Äì especially the nymph and a

In [None]:
import os
from dotenv import load_dotenv
load_dotenv()
api_key = "xxxxxxxxxxxxx"


In [41]:
from langchain_groq import ChatGroq
from langchain_core.prompts import PromptTemplate
from langchain_core.messages import HumanMessage, SystemMessage


In [None]:
### Simple RAG pipeline with Groq LLM
from langchain_groq import ChatGroq
import os
from dotenv import load_dotenv
load_dotenv()

### Initialize the Groq LLM (set your GROQ_API_KEY in environment)
groq_api_key = "xxxxxxxxxxxxxxxxxxxxx"

llm=ChatGroq(groq_api_key=groq_api_key,model_name="llama-3.3-70b-versatile",temperature=0.1,max_tokens=1024)

## 2. Simple RAG function: retrieve context + generate response
def rag_simple(query,retriever,llm,top_k=3):
    ## retriever the context
    results=retriever.retrieve(query,top_k=top_k)
    context="\n\n".join([doc['content'] for doc in results]) if results else ""
    if not context:
        return "No relevant context found to answer the question."
    
    ## generate the answwer using GROQ LLM
    prompt=f"""Use the following context to answer the question concisely.
        Context:
        {context}

        Question: {query}

        Answer:"""
    
    response=llm.invoke([prompt.format(context=context,query=query)])
    return response.content

In [46]:
answer=rag_simple("What is Lyme?",rag_retriever,llm)
print(answer)

Retrieving documents for query: 'What is Lyme?'
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 98.12it/s]

Generating embeddings dim :  (1, 384) 
Retrieved 3 documents (after filtering)





Lyme disease is a bacterial illness caused by Borrelia burgdorferi, transmitted by the deer tick or black-legged tick, affecting the skin, nervous system, heart, and/or joints.


In [None]:
### get the context from the retriever and pass it to the LLM

rag_retriever.retrieve("Unified Multi-task Learning Framework")

Retrieving documents for query: 'Unified Multi-task Learning Framework'
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 100.01it/s]

Generating embeddings dim :  (1, 384) 
Retrieved 0 documents (after filtering)





[]