In [2]:
from langchain_core.documents import Document
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import PyPDFLoader,PyMuPDFLoader
from pathlib import Path
import os

def process_all_pdfs(pdf_directory):
    """Load every PDF found under a directory into a flat list of documents.

    The directory is searched recursively for ``*.pdf`` files. Files are
    processed in sorted path order so that the order of the returned
    documents (and anything derived from it, such as chunk indices) is the
    same on every run and on every platform.

    Args:
        pdf_directory: Directory to scan (anything ``pathlib.Path`` accepts).

    Returns:
        list: The documents returned by ``PyPDFLoader(...).load()`` for every
        file that loaded successfully, concatenated in processing order. Each
        document's metadata gains ``source_file`` (the PDF's file name) and
        ``file_type`` (``'pdf'``).

    Notes:
        A file that fails to load is reported on stdout and skipped; it does
        not abort the run, so the result may cover fewer files than were found.
    """
    all_documents = []
    pdf_dir = Path(pdf_directory)

    # Find all PDF files recursively. glob() yields entries in filesystem
    # order, which is not stable across platforms, so sort for reproducibility.
    pdf_files = sorted(pdf_dir.glob("**/*.pdf"))

    print(f"Found {len(pdf_files)} PDF files to process")

    for pdf_file in pdf_files:
        print(f"\nProcessing: {pdf_file.name}")
        try:
            loader = PyPDFLoader(str(pdf_file))
            documents = loader.load()

            # Add source information to metadata
            for doc in documents:
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type'] = 'pdf'

            all_documents.extend(documents)
            print(f"  ✓ Loaded {len(documents)} pages")

        except Exception as e:
            # Best effort: report the failure and continue with the next file.
            print(f"  ✗ Error: {e}")

    print(f"\nTotal documents loaded: {len(all_documents)}")
    return all_documents

# Load every PDF under the relative "data" directory; the result feeds the
# chunking step in a later cell.
all_pdfs = process_all_pdfs("data")

  from .autonotebook import tqdm as notebook_tqdm


Found 5 PDF files to process

Processing: cellular_biology.pdf
  ✓ Loaded 23 pages

Processing: exvivo.pdf
  ✓ Loaded 19 pages

Processing: mice_in_bion.pdf
  ✓ Loaded 15 pages

Processing: microgravity_pelvic_bone.pdf
  ✓ Loaded 15 pages

Processing: stem_cells_in_microgravity.pdf
  ✓ Loaded 18 pages

Total documents loaded: 90


In [3]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
def split_documents(documents,chunk_size=1000,chunk_overlap=200):
    """Break documents into overlapping chunks with a recursive character splitter.

    Args:
        documents: Documents to split; passed straight to the splitter's
            ``split_documents``.
        chunk_size: Maximum chunk length, measured with ``len``.
        chunk_overlap: Overlap between neighbouring chunks, measured with ``len``.

    Returns:
        The list of chunk documents produced by the splitter.

    Prints a one-line summary and, when at least one chunk was produced, a
    short preview of the first chunk and its metadata.
    """
    splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", " ", ""],
        length_function=len,
        chunk_overlap=chunk_overlap,
        chunk_size=chunk_size,
    )
    pieces = splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(pieces)} chunks")

    # Nothing to preview when the splitter produced no chunks.
    if not pieces:
        return pieces

    # Preview only the first chunk, with its content truncated to 200 characters.
    first_piece = pieces[0]
    print("\nExample chunk:")
    print(f"Content: {first_piece.page_content[:200]}...")
    print(f"Metadata: {first_piece.metadata}")
    return pieces

In [4]:
# Chunk the loaded PDF documents using split_documents' default
# chunk_size / chunk_overlap.
chunks = split_documents(all_pdfs)


Split 90 documents into 484 chunks

Example chunk:
Content: Academic Editors: John Lawler and
Khaled Kamal
Received: 21 February 2025
Revised: 21 March 2025
Accepted: 25 March 2025
Published: 27 March 2025
Citation: López Garzón, N.A.;
Pinzón-Fernández, M.V .;...
Metadata: {'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2025-03-27T16:39:39+08:00', 'author': 'Nelson Adolfo López Garzón, María Virginia Pinzón-Fernández, Jhan S. Saavedra T., Humberto A. Nati-Castillo, Marlon Arias-Intriago, Camila Salazar-Santoliva and Juan S. Izquierdo-Condoy', 'keywords': 'microgravity; tissue effects; immune system; cardiomyocytes; cancer biology; human health', 'moddate': '2025-03-27T09:43:26+01:00', 'subject': 'Microgravity, defined by minimal gravitational forces, represents a unique environment that profoundly influences biological systems, including human cells. This review examines the effects of microgravity on biological processes and their implications for hum

In [5]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List,Dict,Any,Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [6]:
class EmbeddingManager:
    """Owns a SentenceTransformer model and turns lists of texts into embeddings."""

    def __init__(self,model_name: str = 'all-MiniLM-L6-v2'):
        """Remember the model name and load the model right away.

        Args:
            model_name: Name handed to ``SentenceTransformer``.

        Raises:
            ValueError: If the model cannot be constructed.
        """
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """Construct the model named by ``self.model_name`` and store it on ``self.model``.

        Raises:
            ValueError: Wrapping whatever exception the constructor raised.
        """
        try:
            loaded = SentenceTransformer(self.model_name)
        except Exception as exc:
            raise ValueError(f"Error loading model {self.model_name}: {exc}")
        self.model = loaded

    def generate_embeddings(self,texts:List[str]) -> np.ndarray:
        """Encode ``texts`` with the loaded model.

        Args:
            texts: Strings to embed.

        Returns:
            Whatever ``model.encode(texts, convert_to_numpy=True)`` returns.

        Raises:
            ValueError: If no model is loaded, or if encoding fails (the
                original error message is embedded in the new one).
        """
        if self.model:
            try:
                return self.model.encode(texts, convert_to_numpy=True)
            except Exception as exc:
                raise ValueError(f"Error generating embeddings: {exc}")
        raise ValueError("Model is not loaded.")

# Build the embedding manager with the default model name; the bare `em` on
# the last line only echoes the object's repr as the cell output.
em = EmbeddingManager()
em            

<__main__.EmbeddingManager at 0x27e927e81a0>

In [7]:
import os
class VectorStore:
    """Persistent ChromaDB collection that stores chunk text, metadata and embeddings.

    NOTE(review): no distance metric is configured on the collection, so
    ChromaDB's default applies — confirm it matches how callers interpret the
    distances returned by queries. Ids are random per call, so adding the same
    documents twice stores them twice.
    """

    def __init__(self,collection_name:str="documents",persist_directory:str="data/vector_db"):
        """Record the settings and open (or create) the collection immediately.

        Args:
            collection_name: Name of the collection to get or create.
            persist_directory: Directory used for ChromaDB's on-disk storage;
                created if missing.
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        """Create the storage directory, the client and the collection.

        Prints a short status report including the current collection size.
        Any failure is printed and then re-raised unchanged.
        """
        try:
            os.makedirs(self.persist_directory,exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)
            self.collection = self.client.get_or_create_collection(
                metadata={"description": "pdf embeddings for rag"},
                name=self.collection_name,
            )
            print(f"Vector store initialized at {self.persist_directory}")
            print(f"Collection '{self.collection_name}' is ready.")
            print(f"Existing Documents in collection: {self.collection.count()}")
        except Exception as exc:
            print(f"Error initializing vector store: {exc}")
            raise

    def add_documents(self,documents:List[Any],embeddings:np.ndarray):
        """Store documents together with their precomputed embeddings.

        Args:
            documents: Objects exposing ``metadata`` and ``page_content``.
            embeddings: One embedding per document, in the same order; each
                row must provide ``tolist()``.

        Raises:
            ValueError: If the two inputs differ in length.
            Exception: Whatever the collection raises while adding; it is
                printed first and then re-raised.
        """
        total = len(documents)
        if total != len(embeddings):
            raise ValueError("Number of documents and embeddings must match.")
        print(f"Adding {total} documents to the vector store...")

        # Random 8-hex-digit prefix plus the position within this batch.
        ids = [f"doc_{uuid.uuid4().hex[:8]}_{position}" for position in range(total)]
        # Copy each metadata mapping so the caller's documents are not mutated.
        metadatas = [
            dict(doc.metadata, doc_index=position, content_length=len(doc.page_content))
            for position, doc in enumerate(documents)
        ]
        documents_text = [doc.page_content for doc in documents]
        embeddings_list = [vector.tolist() for vector in embeddings]

        try:
            self.collection.add(
                ids=ids,
                metadatas=metadatas,
                documents=documents_text,
                embeddings=embeddings_list
            )
            print(f"Successfully added {len(documents)} documents.")
            print(f"Total Documents in collection: {self.collection.count()}")
        except Exception as exc:
            print(f"Error adding documents: {exc}")
            raise

# Open (or create) the default collection under the default persist directory;
# the bare `vectorstore` on the last line only echoes the object's repr.
vectorstore = VectorStore()
vectorstore

: 

In [1]:
# Embed the text of every chunk and store the chunks with their vectors.
# Depends on `chunks`, `em` and `vectorstore` created by earlier cells: the
# NameError recorded for this cell shows it was executed in a kernel where
# `chunks` did not exist yet, so run the preceding cells first.
text = [doc.page_content for doc in chunks]
embeddings = em.generate_embeddings(text)
vectorstore.add_documents(chunks,embeddings)

NameError: name 'chunks' is not defined

In [11]:
class RAGRetriever:
    """Embeds a query and looks up the closest stored chunks in a vector store."""

    def __init__(self,vectorstore,embeddingmanager):
        """Keep references to the collaborators used by ``retrieve``.

        Args:
            vectorstore: Object exposing ``collection.query(query_embeddings=..., n_results=...)``.
            embeddingmanager: Object exposing ``generate_embeddings(texts)``.
        """
        self.vectorstore = vectorstore
        self.embeddingmanager = embeddingmanager

    def retrieve(self,query:str,top_k:int = 5, score_threshold:float = 0.0)->List[Dict[str,Any]]:
        """Return the stored chunks closest to ``query``.

        Args:
            query: Text to search for.
            top_k: Number of results requested from the collection.
            score_threshold: Minimum ``similarity_score`` a hit needs to be kept.

        Returns:
            A list of dicts with keys ``id``, ``content``, ``metadata``,
            ``similarity_score`` and ``rank``. ``rank`` is the 1-based position
            in the raw query result, so it may have gaps after filtering.
            An empty list is returned when nothing matches or when the
            collection query fails (the error is printed, not raised).

        Raises:
            Whatever ``generate_embeddings`` raises; embedding happens outside
            the guarded section.
        """
        qemb = self.embeddingmanager.generate_embeddings([query])[0]
        try:
            results = self.vectorstore.collection.query(
                query_embeddings=[qemb.tolist()],
                n_results=top_k
            )
            retrieved = []
            # The result key is 'distances' (plural). Reading 'distance' raised
            # KeyError, which the handler below swallowed, so every call
            # returned an empty list.
            distance_batches = results['distances']
            if distance_batches and distance_batches[0]:
                # One query embedding was sent, so only the first batch is used.
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = distance_batches[0]
                ids = results['ids'][0]
                for i,(doc_id,doc,meta,dist) in enumerate(zip(ids,documents,metadatas,distances)):
                    # NOTE(review): 1 - distance is a cosine similarity only if
                    # the collection uses cosine distance — confirm the
                    # collection's metric before relying on score_threshold.
                    score = 1 - dist
                    if score >= score_threshold:
                        retrieved.append({
                            'id': doc_id,
                            'content': doc,
                            'metadata': meta,
                            'similarity_score': score,
                            'rank': i+1
                        })
                print(f"Retrieved {len(retrieved)} documents for the query.")
            else:
                print("No documents retrieved.")
            return retrieved
        except Exception as e:
            # Best effort: report the failure and fall back to "no results".
            print(f"Error during retrieval: {e}")
            return []
            

In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI
import os
from dotenv import load_dotenv
# Load settings from a .env file (if present) before reading the API key.
load_dotenv()
# os.getenv returns None when GOOGLE_API_KEY is not set.
# NOTE(review): consider failing fast here instead of passing None to the client.
gemini_api_key = os.getenv("GOOGLE_API_KEY")

# NOTE(review): the langchain-google-genai examples pass the model id as
# `model=...`; confirm that `model_name=` is accepted by the installed version.
llm = ChatGoogleGenerativeAI(model_name="gemini-1.5-flash",temperature=0.1,max_output_tokens=1024,google_api_key=gemini_api_key)

def rag_func(query,retriever,llm,top_k=3):
    """Answer ``query`` with the LLM, grounded in chunks fetched by ``retriever``.

    Args:
        query: The user's question.
        retriever: Object exposing ``retrieve(query, top_k=...)`` that returns
            dicts with ``content`` and ``metadata`` keys.
        llm: Object exposing ``invoke(prompt)`` whose result has ``.content``.
        top_k: Number of chunks to request from the retriever.

    Returns:
        dict: ``{'answer': <llm text>, 'sources': [...]}`` when context was
        found. Each source has ``source``, ``page`` and a ``preview`` (the
        first 300 characters of the chunk followed by ``...``).
        str: A fixed "no documents" message when the retrieved context is empty.
    """
    res = retriever.retrieve(query,top_k=top_k)
    context = "\n\n".join(doc['content'] for doc in res) if res else ""
    if not context:
        return "No relevant documents found in the knowledge base."

    sources = [{
        'source': doc['metadata'].get('source_file',doc['metadata'].get('source','unknown')),
        'page': doc['metadata'].get('page', 'unknown'),
        'preview': doc['content'][:300]+'...' if doc['content'] else ''
    } for doc in res]

    # The f-string already interpolates context and query. Calling
    # str.format() on the result again would treat any '{' or '}' in the
    # retrieved text or the question as a placeholder and raise.
    prompt = f"Use the following context to answer the question:\n\nContext:\n{context}\n\nQuestion: {query}\n\nAnswer:"
    response = llm.invoke(prompt)
    output = {
        'answer':response.content,
        'sources': sources
    }
    return output
