In [1]:
import os
import getpass
import warnings
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.prompts import PromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI
warnings.filterwarnings("ignore")

print("--- Initializing Setup ---")


--- Initializing Setup ---


In [2]:
# Provide the Gemini API key without ever storing it in the notebook.
#
# The key is taken from the GOOGLE_API_KEY environment variable when it is
# already set; otherwise the user is prompted for it with hidden input, so the
# value never appears in the cell source or in the saved output.
#
# SECURITY: a key was previously hardcoded in this cell. Treat that key as
# leaked: revoke it and create a new one before sharing this notebook.
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter your Google Gemini API key: ")

In [3]:
# Folder containing the product-manual PDFs. This is a machine-specific
# absolute path; change it when running the notebook on another machine.
pdf_dir = r"C:\Users\agama\OneDrive\Desktop\manuals"

# isdir (rather than exists) also rejects a path that points at a regular
# file, which would otherwise only fail later when the folder is listed.
if not os.path.isdir(pdf_dir):
    raise FileNotFoundError(f"Directory not found: {pdf_dir}")

print("PDF directory set to:", pdf_dir)

PDF directory set to: C:\Users\agama\OneDrive\Desktop\manuals


In [4]:
print("\n--- Starting Document Ingestion and Preprocessing ---")

# Load every PDF in `pdf_dir` into `all_docs`, one Document per PDF page.
# `pdf_dir` is defined and validated in the previous cell, so it is not
# repeated here; a second copy of the same path can silently drift apart.

# Match the extension case-insensitively (".PDF" is common on Windows) and sort
# the paths so the load order does not depend on os.listdir's arbitrary order.
pdf_docs_paths = sorted(
    os.path.join(pdf_dir, f)
    for f in os.listdir(pdf_dir)
    if f.lower().endswith(".pdf")
)

if not pdf_docs_paths:
    raise ValueError(f"No PDF documents found in '{pdf_dir}'. Please check the folder.")

all_docs = []
for doc_path in pdf_docs_paths:
    loader = PyPDFLoader(doc_path)
    # load() yields one Document per page. load_and_split() would additionally
    # run a default text splitter, so the count printed below would no longer
    # be a page count and the text would be split twice (chunking is done
    # explicitly in the next cell).
    pages = loader.load()
    all_docs.extend(pages)

if not all_docs:
    raise ValueError(f"No pages could be read from the PDF documents in '{pdf_dir}'.")

print(f"Loaded {len(all_docs)} pages from {len(pdf_docs_paths)} PDF documents.")


--- Starting Document Ingestion and Preprocessing ---
Loaded 142 pages from 3 PDF documents.


In [5]:
# Cut the loaded pages into overlapping chunks for embedding.
# chunk_size / chunk_overlap are measured by the splitter's length function
# (presumably characters with the library default; confirm if changed).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunked_docs = text_splitter.split_documents(all_docs)
print(f"Split {len(all_docs)} pages into {len(chunked_docs)} chunks.")


Split 142 pages into 294 chunks.


In [6]:
# Section banner: marks the start of the embedding / vector-index cells.
print("\n--- Initializing Indexing Process ---")


--- Initializing Indexing Process ---


In [7]:
# Embedding model used both to build the vector index and to embed queries.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
model_kwargs = dict(device="cpu")  # switch to "cuda" to run on a GPU
encode_kwargs = dict(normalize_embeddings=False)  # vectors are left un-normalized
embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs)




In [8]:
# Status message only; it does not itself check that `embeddings` exists.
print("Embedding model loaded.")

Embedding model loaded.


In [9]:
# Build the FAISS index once and reuse it on later runs.
# The decision is based only on whether `db_path` exists: an existing index is
# loaded as-is and `chunked_docs` is ignored, so delete the folder to re-index
# after the manuals change.
db_path = "faiss_index"
if not os.path.exists(db_path):
    print("Creating new vector store...")
    db = FAISS.from_documents(chunked_docs, embeddings)
    db.save_local(db_path)
    print(f"Vector store created and saved to {db_path}.")
else:
    print("Loading existing vector store...")
    # NOTE(review): allow_dangerous_deserialization=True opts in to loading
    # serialized (pickle-style) data from disk. Only load an index folder that
    # this notebook created itself; never one obtained from an untrusted source.
    db = FAISS.load_local(db_path, embeddings, allow_dangerous_deserialization=True)
    print("Vector store loaded.")

Loading existing vector store...
Vector store loaded.


In [10]:
# Expose the vector store as a retriever; k=4 is passed through as a search
# argument (the number of chunks fetched per query).
retriever = db.as_retriever(search_kwargs=dict(k=4))

In [11]:
# Section banner: marks the start of the LLM / memory / chain cells.
print("\n--- Building the Conversational RAG Chain ---")


--- Building the Conversational RAG Chain ---


In [12]:
# Gemini chat model that writes the answers ("gemini-1.5-pro" is the
# alternative model name). A low temperature keeps answers close to the
# retrieved text.
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash", temperature=0.1, convert_system_message_to_human=True)

# Conversation memory. The key "chat_history" matches the {chat_history}
# placeholder used by the prompt template; history is kept as message objects.
memory = ConversationBufferMemory(return_messages=True, memory_key="chat_history")

print("✅ Gemini initialized successfully")

✅ Gemini initialized successfully


In [13]:
# Prompt text for the answer-generation step of the RAG chain.
# Placeholders: {context} (retrieved manual text), {chat_history} (previous
# turns) and {question} (the user's question).
# NOTE(review): the citation guideline can only be followed if the text placed
# in {context} actually carries each chunk's document name and page number.
custom_prompt_template = """
You are a helpful assistant that answers questions using the provided context from product manuals.

Guidelines:
- Only use the given context to answer. Do not use outside knowledge.
- If the answer is not contained in the context, say: "I don’t know based on the available manuals."
- Always cite your source at the end of the answer in the format: [SOURCE: document_name, PAGE: page_number].
- If multiple sources are relevant, include all of them.
- Keep answers clear, concise, and user-friendly.

Context:
{context}

Chat History:
{chat_history}

Question:
{question}

Answer:
"""

In [14]:
# Prompt used to produce the final answer from the retrieved context.
CUSTOM_QUESTION_PROMPT = PromptTemplate.from_template(custom_prompt_template)

# Template applied to each retrieved chunk before it is joined into {context}.
# Without it, only the chunk text is inserted, so the model never sees the file
# name or page number that the answer prompt tells it to cite and can only
# invent citations. `source` and `page` are metadata fields set by the PDF
# loader (`page` is zero-based); every stored chunk must carry both fields.
DOCUMENT_PROMPT = PromptTemplate.from_template(
    "[SOURCE: {source}, PAGE: {page}]\n{page_content}"
)

print("\n--- Building the Conversational RAG Chain ---")

# Conversational retrieval chain: retrieves chunks for the question, then
# answers with the custom prompt while tracking history in `memory`.
qa_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    combine_docs_chain_kwargs={
        "prompt": CUSTOM_QUESTION_PROMPT,
        "document_prompt": DOCUMENT_PROMPT,
    },
)

print("--- RAG Chain Ready ---")


--- Building the Conversational RAG Chain ---
--- RAG Chain Ready ---


In [15]:
# Smoke test: send one question through the chain and show the "answer" field.
# The question is unrelated to the manuals, so the prompt's fallback reply is
# the expected result.
query = "How are you?"
result = qa_chain.invoke(dict(question=query))

print("\nAnswer:", result["answer"])


Answer: I don’t know based on the available manuals.
