In [3]:
import numpy as np
import pandas as pd
import os

from dotenv import load_dotenv
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain_google_genai import GoogleGenerativeAI
from langchain_google_genai import ChatGoogleGenerativeAI

In [10]:
# Pull settings from a local .env file (if present) into the process environment.
load_dotenv()
gemini_api_key = os.getenv("GEMINI_API_KEY")
if not gemini_api_key:
    # os.environ only accepts strings, so assigning a missing (None) key would
    # fail with an opaque TypeError. Stop here with an actionable message instead.
    raise RuntimeError(
        "GEMINI_API_KEY is not set. Add it to your .env file or export it "
        "before running this notebook."
    )
# The Gemini chat model is later constructed without an explicit key, so the
# key is re-exported under GOOGLE_API_KEY for the client to pick up.
os.environ["GOOGLE_API_KEY"] = gemini_api_key

This notebook builds a Retrieval-Augmented Generation (RAG) application in Python, using LangChain to query PDF documents. It involves the following steps:

1.  **Loading PDF Documents:** We'll use `PyPDFLoader` from Langchain to load the PDF files.
2.  **Text Splitting:** We'll split the loaded text into smaller chunks using `RecursiveCharacterTextSplitter` to manage context size for the language model.
3.  **Embedding Generation:** We'll generate embeddings for these chunks using `HuggingFaceEmbeddings` with the `all-MiniLM-L6-v2` model.
4.  **Vector Store Creation:** We'll store the embeddings in a vector database (e.g., Chroma).
5.  **Retrieval:**  We'll retrieve relevant chunks from the vector store based on a user's query.
6.  **Language Model Integration:** We'll use a language model (Google's Gemini, via `ChatGoogleGenerativeAI`) to generate an answer based on the retrieved context and the user's query.

In [5]:
# ----------------------------------------------------------------------------
# INDEXING
# ----------------------------------------------------------------------------

def load_pdf(pdf_path):
    """Read a PDF through ``PyPDFLoader`` and return the loaded documents.

    Args:
        pdf_path: Location of the PDF; passed unchanged to ``PyPDFLoader``.

    Returns:
        Whatever ``PyPDFLoader(pdf_path).load()`` produces.
    """
    return PyPDFLoader(pdf_path).load()

def split_text(documents, chunk_size=1000, chunk_overlap=200):
    """Break loaded documents into overlapping chunks.

    Args:
        documents: Documents to split (e.g. the output of ``load_pdf``).
        chunk_size: Chunk size handed to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: Overlap between consecutive chunks, handed to the splitter.

    Returns:
        The chunks returned by the splitter's ``split_documents``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(documents)

def create_embeddings():
    """Build the embedding model used for both indexing and querying.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``all-MiniLM-L6-v2`` model.
    """
    # Change the model name here to try a different embedding model; an index
    # built with one model must be queried with the same one.
    return HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

def create_vectorstore(texts, embeddings, persist_directory="chroma_db"):
    """Create a Chroma vectorstore from document chunks and persist it to disk.

    Args:
        texts: Document chunks to index (e.g. the output of ``split_text``).
        embeddings: Embedding model used to vectorise the chunks.
        persist_directory: Directory handed to Chroma for on-disk storage.

    Returns:
        The ``Chroma`` vectorstore built from ``texts``.

    Raises:
        ValueError: If ``texts`` is empty, i.e. there is nothing to index.
    """
    if not texts:
        # Stop before Chroma is involved at all, so a build with nothing to
        # index fails with a clear message instead of a backend-specific error.
        raise ValueError(
            "No text chunks to index; the source document produced no extractable text."
        )

    vectordb = Chroma.from_documents(documents=texts, embedding=embeddings, persist_directory=persist_directory)

    # The explicit persist() hook is deprecated in recent Chroma releases (which
    # persist automatically) and is absent from newer integrations, so call it
    # only when the store still provides it.
    persist = getattr(vectordb, "persist", None)
    if callable(persist):
        persist()
    return vectordb

def build_index(pdf_path, chunk_size=1000, chunk_overlap=200, persist_directory="chroma_db"):
    """Run the full indexing pipeline for one PDF: load, split, embed, store.

    Args:
        pdf_path: Path of the PDF to index.
        chunk_size: Chunk size forwarded to ``split_text``.
        chunk_overlap: Chunk overlap forwarded to ``split_text``.
        persist_directory: Directory forwarded to ``create_vectorstore``.

    Returns:
        The vectorstore returned by ``create_vectorstore``.
    """
    chunks = split_text(load_pdf(pdf_path), chunk_size, chunk_overlap)
    embedding_model = create_embeddings()
    return create_vectorstore(chunks, embedding_model, persist_directory)


In [12]:
# ----------------------------------------------------------------------------
# QUERYING
# ----------------------------------------------------------------------------

def load_existing_index(persist_directory="chroma_db"):
    """Open the Chroma vectorstore stored under ``persist_directory``.

    Args:
        persist_directory: Directory the vectorstore was persisted to.

    Returns:
        A ``Chroma`` instance bound to that directory.
    """
    # The embedding model must be the same one that was used during indexing,
    # so it comes from the shared create_embeddings() factory.
    return Chroma(
        embedding_function=create_embeddings(),
        persist_directory=persist_directory,
    )

def query_index(vectordb, query, chain_type="stuff", k=4, model_name="gemini-2.0-flash"):
    """Answer ``query`` with a RetrievalQA chain over ``vectordb``.

    Args:
        vectordb: Vectorstore exposing ``as_retriever`` (e.g. the Chroma store
            built by ``build_index``).
        query: The question to answer.
        chain_type: RetrievalQA chain type; forwarded to ``from_chain_type``.
        k: Number of chunks the retriever is asked to fetch per query.
        model_name: Gemini model name passed to ``ChatGoogleGenerativeAI``.

    Returns:
        The chain's result mapping. The generated answer is under ``"result"``;
        since ``return_source_documents=True`` the retrieved chunks are expected
        under ``"source_documents"`` (per the LangChain docs; confirm for your version).
    """
    llm = ChatGoogleGenerativeAI(model=model_name, temperature=0.3)
    retriever = vectordb.as_retriever(search_kwargs={"k": k})
    qa = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type=chain_type,
        retriever=retriever,
        return_source_documents=True,  # also return the chunks the answer was based on
    )
    # Calling the chain object directly (qa({...})) is deprecated in current
    # LangChain; invoke() is the supported entry point and returns the same mapping.
    return qa.invoke({"query": query})

In [8]:
# ----------------------------------------------------------------------------
# MAIN EXECUTION
# ----------------------------------------------------------------------------

if __name__ == "__main__":
    pdf_file_path = "./data/murli-2025-02-21.pdf"  # Replace with your PDF file path.  Example:  "my_document.pdf"
    persist_directory = "chroma_db"  # Directory to store the Chroma database

    # 1. Indexing (create or reuse the vectorstore)
    #    - Building only needs to happen once (or when the PDF content changes;
    #      delete the directory to force a rebuild).
    #    - A path that merely exists is not proof of an index: an empty
    #      directory, or a stray file with the same name, holds nothing to
    #      load. Only a non-empty directory is treated as an existing index.
    index_exists = os.path.isdir(persist_directory) and bool(os.listdir(persist_directory))
    if index_exists:
        print("Loading existing index...")
        vectordb = load_existing_index(persist_directory)
        print("Index loaded.")
    else:
        print("Creating new index...")
        vectordb = build_index(pdf_file_path, persist_directory=persist_directory)
        print("Index creation complete.")

Creating new index...


  embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")  # Or another suitable model


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Index creation complete.


  vectordb.persist()  # Persist the vectorstore to disk


In [13]:
# 2. Querying (use the vectorstore to answer questions)
# `vectordb` is created by the indexing cell, so that cell must have run first.
query = "What are the main concepts discussed in the document?"  # Replace with your query
result = query_index(vectordb, query)

# query_index returns a mapping; the generated answer is stored under "result".
print("Question:", query)
print("Answer:", result["result"])

Question: What are the main concepts discussed in the document?
Answer: The document discusses the following main concepts:

*   **The Father as the giver of inheritance and the one who takes everyone back home:** The Father is the one who provides the inheritance and guides souls back to the "home of the Father" and the "home of souls," which is the abode of silence.
*   **Earning an imperishable income:** The Father enables individuals to earn an imperishable income for the future, contrasting with the perishable income provided by gurus and holy men.
*   **Understanding and imbibing knowledge:** Sensible children are able to imbibe the knowledge given by the Father.
*   **Remembering the Father and the inheritance:** The main thing is to remember the Father and the inheritance. The essence that remains is “Manmanabhav”.
*   **The Father as the Ocean of Knowledge and the Creator:** The Father is the Ocean of Knowledge and the Creator, not Shri Krishna.
*   **Establishing a kingdom an