In [None]:
from langchain_community.document_loaders import DirectoryLoader,PyMuPDFLoader

# Recursively pick up every PDF below the papers directory, reading each one
# with PyMuPDFLoader.
loader = DirectoryLoader(
    "../Single-Source/papers",
    glob="**/*.pdf",
    loader_cls=PyMuPDFLoader,
)
docs = loader.load()

print(f"Loaded {len(docs)} documents")

In [None]:
## Read all the documents from the directory

import os
from dotenv import load_dotenv
load_dotenv()
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path


def read_all_docs(directory="../Single-Source/papers"):
    """Load every PDF found (recursively) under ``directory``.

    Each PDF is read with ``PyMuPDFLoader`` and every document it yields is
    tagged with the file it came from before being collected.

    Args:
        directory: Folder that is searched recursively for ``*.pdf`` files.

    Returns:
        A flat list of the loaded documents. ``metadata["source"]`` holds the
        PDF path as a string and ``metadata["file_name"]`` its base name.
        Files that fail to load are reported on stdout and skipped.
    """
    all_documents = []
    # Sort so the document order is reproducible across runs and platforms
    # (glob order is otherwise filesystem dependent).
    pdf_files = sorted(Path(directory).glob("**/*.pdf"))

    for pdf_file in pdf_files:
        try:
            loaded = PyMuPDFLoader(str(pdf_file)).load()
        except Exception as e:
            # Best effort: one unreadable PDF must not abort the whole run.
            print(f"Error reading {pdf_file}: {e}")
            continue
        for doc in loaded:
            # Store the path as a plain string: a Path object is not a valid
            # metadata value for the vector store and is not JSON-serializable.
            doc.metadata["source"] = str(pdf_file)
            doc.metadata["file_name"] = pdf_file.name
            all_documents.append(doc)
    return all_documents

In [None]:
# Load the PDFs from the default papers directory and report how many
# documents came back.
all_documents = read_all_docs()
print(f"\nRead {len(all_documents)} documents")

In [None]:
## Split the documents into chunks

def make_chunks(document,chunk_size=1000,chunk_overlap=200):
    """Split the given documents into overlapping text chunks.

    Args:
        document: Documents handed to the splitter's ``split_documents``.
        chunk_size: Passed to ``RecursiveCharacterTextSplitter`` as ``chunk_size``.
        chunk_overlap: Passed to ``RecursiveCharacterTextSplitter`` as ``chunk_overlap``.

    Returns:
        The chunks produced by the splitter. When at least one chunk exists,
        the chunk count plus the first and last chunk texts are printed.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    pieces = splitter.split_documents(document)

    # Nothing to preview when the splitter produced no output.
    if not pieces:
        return pieces

    print(f"Document split into {len(pieces)} chunks")
    print(f"First chunk: {pieces[0].page_content}")
    print(f"Last chunk: {pieces[-1].page_content}")
    return pieces

In [None]:
# Chunk everything that was loaded, using the default chunk size and overlap.
chunkss = make_chunks(document=all_documents)

In [None]:
## Create the embeddings
import numpy as np
from sentence_transformers import SentenceTransformer
from  typing import List,Dict,Any,Tuple
import chromadb
import uuid
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
class EmbeddingManager:
    """Owns a SentenceTransformer model and uses it to embed text."""

    def __init__(self,model_name="all-MiniLM-L6-v2"):
        # The handle stays None until _load_model() has succeeded.
        self.model = None
        self.model_name = model_name
        self._load_model()

    def _load_model(self):
        """Instantiate the SentenceTransformer named by ``self.model_name``.

        A failure is reported on stdout and then re-raised to the caller.
        """
        try:
            self.model = SentenceTransformer(self.model_name)
            print(f"Model {self.model_name} loaded successfully")
        except Exception as exc:
            print(f"Error loading model: {exc}")
            raise

    def get_embeddings(self,text:List[str]):
        """Encode the given strings with the loaded model.

        Args:
            text: List of text strings to embed.

        Returns:
            Whatever ``self.model.encode`` returns for ``text``; expected to be
            a NumPy array of shape (num_texts, embedding_dim).

        Raises:
            ValueError: If no model is loaded.
        """
        if self.model is not None:
            return self.model.encode(text, show_progress_bar=True)
        raise ValueError("Model not loaded. Please load the model first")


In [None]:

# Build the shared embedding manager with its default model, then show it as
# the cell output.
embedding_manager = EmbeddingManager()
embedding_manager

In [None]:
## Create the vector store
import chromadb
import os
import uuid
import numpy as np
from typing import List, Any

class VectorBase:
    """Persistent Chroma collection used to store and query document embeddings.

    Attributes are set in ``__init__``:
        collection_name: Name of the Chroma collection.
        persist_directory: Directory the persistent client writes to.
        client: The ``chromadb.PersistentClient`` instance.
        collection: The collection obtained from the client.
    """

    def __init__(self,collection_name:str="pdf_documents",persist_directory:str="./vector_store"):
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_client()

    def _initialize_client(self):
        """Create the persistent client and open (or create) the collection.

        Raises:
            Exception: Any initialization error is printed and re-raised, so a
                half-initialized store (``collection is None``) is never
                handed back to the caller.
        """
        try:
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)
            # get_or_create_collection is a method of the client (not of the
            # chromadb module) and its result must be stored on
            # self.collection, which every other method reads.
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "PDF documents embeddings for RAG"},
            )
            print(f"Vector store initialized: {self.collection_name}")
            print(f"Existing documents: {self.collection.count()}")
        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise

    @staticmethod
    def _clean_metadata(metadata):
        """Return a copy of ``metadata`` that only holds Chroma-safe values.

        ``None`` values are dropped and anything that is not a str, int, float
        or bool (for example a ``pathlib.Path``) is stored as its ``str()``.
        """
        cleaned = {}
        for key, value in dict(metadata).items():
            if value is None:
                continue
            if isinstance(value, (str, int, float, bool)):
                cleaned[key] = value
            else:
                cleaned[key] = str(value)
        return cleaned

    def add_documents(self,documents:List[Any],embeddings:np.ndarray):
        """Add documents and their embeddings to the collection.

        Args:
            documents: Objects exposing ``page_content`` and ``metadata``.
            embeddings: One embedding per document, in the same order.

        Raises:
            ValueError: If the number of documents and embeddings differ.
            Exception: Errors raised by the collection are printed and re-raised.
        """
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match number of embeddings")
        if len(documents) == 0:
            # Nothing to store; avoid calling the collection with empty lists.
            print("No documents to add to vector store")
            return

        ids = []
        documents_text = []
        embeddings_list = []
        metadatas = []
        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            ids.append(f"doc_{uuid.uuid4().hex[:8]}_{i}")

            metadata = self._clean_metadata(doc.metadata)
            metadata["doc_index"] = i
            metadata["content_length"] = len(doc.page_content)
            metadatas.append(metadata)

            documents_text.append(doc.page_content)
            # Store this document's own vector (not the whole matrix) as a
            # plain Python list.
            embeddings_list.append(np.asarray(embedding).tolist())

        try:
            self.collection.add(
                ids=ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=documents_text,
            )
            print(f"Added {len(documents)} documents to vector store")
        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise

    def query(self,query:np.ndarray,k:int=5):
        """Query the collection for the ``k`` entries closest to ``query``.

        Args:
            query: Query embedding(s); NumPy arrays are converted to lists.
            k: Number of results requested.

        Returns:
            The result of ``collection.query``, or ``None`` if the query
            failed (the error is printed).
        """
        try:
            query_embeddings = query.tolist() if isinstance(query, np.ndarray) else query
            return self.collection.query(
                query_embeddings=query_embeddings,
                n_results=k,
            )
        except Exception as e:
            print(f"Error querying vector store: {e}")
            return None

In [None]:
## Retrieve and Generate

class RAGSystem:
    """Retriever: embeds a query and looks up the closest stored documents."""

    def __init__(self,VectorBase,EmbeddingManager):
        # The parameters are *instances*; their names shadow the classes of
        # the same name inside this method only.
        self.vector_base = VectorBase
        self.embedding_manager = EmbeddingManager

    def retrieve(self,query:str,top_k:int=5,score_threshold:float=0.0)->List[Dict[str, Any]]:
        """Return the stored documents most similar to ``query``.

        Args:
            query: Natural-language query text.
            top_k: Maximum number of results requested from the collection.
            score_threshold: Minimum ``similarity_score`` a result needs to be kept.

        Returns:
            A list of dicts with the keys ``id``, ``document``, ``metadata``,
            ``similarity_score``, ``distance`` and ``rank`` (1-based position
            in the raw result list, assigned before threshold filtering).

        Raises:
            ValueError: If the vector store has no collection.
        """
        collection = self.vector_base.collection
        if collection is None:
            raise ValueError("Vector store collection is not initialized")

        # The embedding manager exposes get_embeddings (plural); it takes a
        # list of texts, so take the first row for the single query.
        query_embedding = self.embedding_manager.get_embeddings([query])[0]
        results = collection.query(
            query_embeddings=[query_embedding.tolist()],
            n_results=top_k,
            include=["metadatas", "documents", "distances"],
        )

        # Each field is a list with one entry per query embedding; only one
        # query was sent, so read index 0. Ids are returned under "ids".
        ids = (results.get("ids") or [[]])[0]
        documents = (results.get("documents") or [[]])[0]
        metadatas = (results.get("metadatas") or [[]])[0]
        distances = (results.get("distances") or [[]])[0]

        retrieved_docs = []
        for i, (doc_id, doc, metadata, distance) in enumerate(zip(ids, documents, metadatas, distances)):
            # NOTE(review): 1 - distance is a true cosine similarity only if
            # the collection uses cosine distance -- confirm the collection's
            # distance metric.
            similarity = 1.0 - distance
            print(f"Rank {i+1}: distance={distance:.4f}, similarity={similarity:.4f}")

            if similarity >= score_threshold:
                retrieved_docs.append({
                    "id": doc_id,
                    "document": doc,
                    "metadata": metadata,
                    "similarity_score": similarity,
                    "distance": distance,
                    "rank": i + 1,
                })

        return retrieved_docs


In [None]:
### simple RAG system pipeline

from langchain_groq import ChatGroq
import os
from dotenv import load_dotenv

load_dotenv()

llm = ChatGroq(model_name="llama-3.3-70b-versatile", groq_api_key=os.getenv("GROQ_API_KEY"))

# The notebook defines the store class as `VectorBase` and the retriever class
# as `RAGSystem`; `vector_store` has to be created here because no earlier
# cell instantiates it.
# NOTE(review): no visible cell embeds `chunkss` and adds them to the store --
# retrieval returns nothing until that has been done at least once.
vector_store = VectorBase()
retriever = RAGSystem(vector_store, embedding_manager)

In [None]:
## Simple RAG function: retrieve context + generate response

def reg_simple(query,retriever,llm,top_k=3):
    """Answer ``query`` with the LLM, grounded in retrieved context.

    Args:
        query: The user's question.
        retriever: Object whose ``retrieve(query, top_k=...)`` returns a list
            of dicts that each carry the chunk text under ``"document"``.
        llm: Chat model whose ``invoke(prompt)`` returns an object with a
            ``content`` attribute.
        top_k: Number of chunks to request from the retriever.

    Returns:
        The ``content`` of the LLM response.
    """
    # Retrieve the context.
    results = retriever.retrieve(query, top_k=top_k)
    context = "\n\n".join(doc['document'] for doc in results) if results else "No context found"

    # Build the prompt. The f-string already interpolates context and query;
    # it must not be passed through str.format() again, because any "{" or "}"
    # in the retrieved text (code, LaTeX, JSON) would raise KeyError/ValueError.
    prompt = f"""Answer the following question based on the context provided:
    Context: {context}
    Question: {query}
    Answer: """

    # Generate the response.
    response = llm.invoke(prompt)
    return response.content