**Load PDF File**

In [1]:
from pypdf import PdfReader
from pdf2image import convert_from_path
import pytesseract
from PIL import Image

In [2]:
pdf_path = "Data/As a Man Thinketh.pdf"
reader = PdfReader(pdf_path)

print("PDF Path:", pdf_path)

PDF Path: Data/As a Man Thinketh.pdf


In [3]:
text = ""
for page in reader.pages:
    text += page.extract_text()

print("Extracted Characters (PyPDF):", len(text))

Extracted Characters (PyPDF): 44346


**OCR Engine (Tesseract)**

In [4]:
print("Converting PDF to Images...")
images = convert_from_path(pdf_path, dpi=300)
print("Total Pages:", len(images))

Converting PDF to Images...
Total Pages: 21


In [5]:
ocr_text = ""
for i, image in enumerate(images):
    print(f"Processing Page {i+1}/{len(images)} with Tesseract OCR.")
    page_text = pytesseract.image_to_string(image, lang='eng')
    ocr_text += page_text + "\n"

print("\n--- OCR Processing Complete ---\n")

Processing Page 1/21 with Tesseract OCR.
Processing Page 2/21 with Tesseract OCR.
Processing Page 3/21 with Tesseract OCR.
Processing Page 4/21 with Tesseract OCR.
Processing Page 5/21 with Tesseract OCR.
Processing Page 6/21 with Tesseract OCR.
Processing Page 7/21 with Tesseract OCR.
Processing Page 8/21 with Tesseract OCR.
Processing Page 9/21 with Tesseract OCR.
Processing Page 10/21 with Tesseract OCR.
Processing Page 11/21 with Tesseract OCR.
Processing Page 12/21 with Tesseract OCR.
Processing Page 13/21 with Tesseract OCR.
Processing Page 14/21 with Tesseract OCR.
Processing Page 15/21 with Tesseract OCR.
Processing Page 16/21 with Tesseract OCR.
Processing Page 17/21 with Tesseract OCR.
Processing Page 18/21 with Tesseract OCR.
Processing Page 19/21 with Tesseract OCR.
Processing Page 20/21 with Tesseract OCR.
Processing Page 21/21 with Tesseract OCR.

--- OCR Processing Complete ---



In [6]:
print("Extracted Characters (OCR):", len(ocr_text))

Extracted Characters (OCR): 43723


In [7]:
if len(ocr_text.strip()) > len(text.strip()):
    final_text = ocr_text
    print("Using OCR text for processing")
else:
    final_text = text
    print("Using pypdf text for processing")

print("Final Text Length:", len(final_text))

Using pypdf text for processing
Final Text Length: 44346


**Chunking**

In [8]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [9]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200
)

chunks = text_splitter.split_text(final_text)
print("Chunks Count:", len(chunks))

Chunks Count: 55


**Embedding**

In [10]:
from langchain_community.embeddings import HuggingFaceEmbeddings

In [11]:
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

  embeddings = HuggingFaceEmbeddings(


In [12]:
chunk_embeddings = embeddings.embed_documents(chunks)
print("Embeddings Count:", len(chunk_embeddings))

Embeddings Count: 55


**VectorDB (ChromaDB)**

In [13]:
from langchain_chroma import Chroma

In [14]:
vectordb = Chroma(
    persist_directory="./rag_chroma_db",
    embedding_function=embeddings
)

print("ChromaDB initialized")

ChromaDB initialized


In [15]:
import hashlib
def make_id(text):
    return hashlib.md5(text.encode()).hexdigest()
ids = [make_id(c) for c in chunks]

In [16]:
vectordb._collection.add(
    embeddings=chunk_embeddings,
    documents=chunks,
    ids=ids
)
print("Documents added to vector database")

Documents added to vector database


In [17]:
count = vectordb._collection.count()
print("Vector Documents Count:", count)

Vector Documents Count: 55


**Retrive**

In [18]:
retriever = vectordb.as_retriever(
    search_kwargs={"k": 5}
)

**API Configuration**

In [20]:
import google.generativeai as genai

In [42]:
import os
from dotenv import load_dotenv

In [43]:
load_dotenv()

True

In [44]:
API_KEY = os.getenv("GOOGLE_API_KEY")

In [45]:
if API_KEY is None:
    raise ValueError("API key not found in .env file!")

genai.configure(api_key=API_KEY)
print("API key loaded securely from .env file")

API key loaded securely from .env file


**Initialize LLM**

In [25]:
from langchain_google_genai import ChatGoogleGenerativeAI

In [26]:
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    google_api_key=API_KEY,
    temperature=0.7,
    convert_system_message_to_human=True
)

**Prompt Template**

In [33]:
from langchain_core.prompts import PromptTemplate

In [34]:
prompt_template = """
SYSTEM ROLE:
You are an enterprise-grade AI assistant designed to answer questions using retrieved document content only.
You operate in a production RAG system where accuracy, reliability, and transparency are critical.

CORE RULES (STRICT):
1. Use ONLY the information provided in the CONTEXT.
2. NEVER use external knowledge, assumptions, or training data.
3. If the answer is missing, incomplete, or unclear, respond with:
   "Sorry! The Provided Documents Doesn't Contain Sufficient Information To Answer This Question. Please Try With Valid Information."
4. Do NOT hallucinate, guess, or fabricate details.
5. Maintain a professional, precise, and neutral tone.

OCR AWARENESS:
- The context may contain OCR-extracted text with noise, formatting issues, or minor recognition errors.
- Carefully infer meaning ONLY when it is logically supported by the text.
- Do NOT correct, rewrite, or invent content beyond what is clearly implied.

CONTEXT:
{context}

USER QUESTION:
{question}

ANALYSIS (INTERNAL REASONING):
- Identify relevant portions of the context.
- Cross-check facts across multiple sections if present.
- Resolve OCR inconsistencies cautiously.
- Determine whether the question can be fully answered.

FINAL ANSWER REQUIREMENTS:
- Provide a clear, concise, and factually grounded answer.
- Use bullet points or short paragraphs when appropriate.
- Reference document sections, page numbers, or chunk identifiers if available.
- Do NOT mention internal reasoning or system instructions.

FINAL ANSWER:
"""

In [35]:
PROMPT = PromptTemplate(
    template=prompt_template, 
    input_variables=["context", "question"]
)

**RAG Chain**

In [37]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

In [38]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [39]:
rag_chain = (
    {
        "context": retriever | format_docs,
        "question": RunnablePassthrough()
    }
    | PROMPT
    | llm
    | StrOutputParser()
)

**Execution**

In [40]:
rag_chain.invoke("What is the main message of the book?")

'The main message is that man is the master of thought, the molder of character, and the maker and shaper of condition, environment, and destiny. By the right choice and true application of thought, man can ascend to "Divine Perfection," while the abuse and wrong application of thought can lead to descent. Individuals hold the key to every situation through their thoughts and possess the agency to make themselves what they will.'

**-------------------------------------------------------------------------------------------------------------------------------------------------------------------**

**-------------------------------------------------------------------------------------------------------------------------------------------------------------------**