<a href="https://colab.research.google.com/github/amalks02/finwise-genai-capstone/blob/task-04-rag-advanced-memory/task_4_rag_memory_retrieval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install dependencies
!pip install langchain langchain-google-genai faiss-cpu sentence-transformers pypdf langchain-community

import os
from pypdf import PdfReader   # Better PDF parser
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.retrievers.multi_query import MultiQueryRetriever
import google.generativeai as genai

# ============= 1. Load API Key (never hard-code secrets) =============
# SECURITY: a literal API key used to be committed on this line. Anything that
# has been pushed to a public repository must be treated as leaked — revoke that
# key and issue a new one.
from getpass import getpass

# Prefer the GOOGLE_API_KEY environment variable; otherwise prompt without
# echoing so the key never ends up in the notebook source or its saved output.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY") or getpass("Enter your Gemini API key: ")
if not GOOGLE_API_KEY:
    raise ValueError("GOOGLE_API_KEY is not set and no key was entered.")

# Configure Google GenAI SDK
genai.configure(api_key=GOOGLE_API_KEY)

# ============= 2. Load PDF and split text =============
def get_pdf_text(pdf_docs):
    """Extract the text of every page of every PDF and return it as one string.

    Args:
        pdf_docs: Iterable of PDF sources; each item is passed unchanged to
            ``PdfReader``.

    Returns:
        The extracted page texts joined with newlines, in document and page
        order. Pages that yield no text are skipped. Returns ``""`` when
        nothing could be extracted.
    """
    page_texts = []
    for pdf in pdf_docs:
        reader = PdfReader(pdf)
        for page in reader.pages:
            extracted = page.extract_text()
            if extracted:
                page_texts.append(extracted)
    # Join with a newline so the last word of one page is not fused with the
    # first word of the next page before whitespace-based chunking.
    return "\n".join(page_texts)

def split_text(text, chunk_size=500, overlap=50):
    """Split *text* into overlapping chunks of whitespace-separated words.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared between consecutive chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings (words re-joined with single spaces). Empty
        or whitespace-only text yields ``[]``.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        # Otherwise the window would never advance (or would skip words).
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # This window already reached the end of the text; any further window
        # would contain only words that are already in this chunk.
        if start + chunk_size >= len(words):
            break
    return chunks

# ============= 3. Build FAISS Vector DB =============
# Shared embedding model used to vectorize text chunks for the FAISS index.
# Created once at module level so create_vector_store() reuses the same instance.
embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

def create_vector_store(chunks):
    """Build a FAISS vector store from text chunks using the shared ``embedder``.

    Args:
        chunks: List of text chunks to embed and index.

    Returns:
        The vector store produced by ``FAISS.from_texts``.

    Raises:
        ValueError: If ``chunks`` is empty, so the caller gets a clear message
            instead of a failure from inside the index builder.
    """
    if not chunks:
        raise ValueError("Cannot build a vector store from an empty list of chunks.")
    return FAISS.from_texts(chunks, embedding=embedder)

# ============= 4. Setup LLM (Gemini 2.0 Flash, API key passed explicitly) =============
# Chat model shared by build_retriever() and build_conversational_chain().
llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    temperature=0.3,
    google_api_key=GOOGLE_API_KEY   # pass the key explicitly; per the original author's note this avoids RefreshError
)

# ============= 5. Add MultiQueryRetriever =============
def build_retriever(vectorstore):
    """Wrap *vectorstore* in a ``MultiQueryRetriever`` driven by the shared ``llm``.

    The underlying retriever is obtained from ``vectorstore.as_retriever`` with
    ``search_kwargs={"k": 3}``.
    """
    top_k_retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
    multi_query = MultiQueryRetriever.from_llm(retriever=top_k_retriever, llm=llm)
    return multi_query

# ============= 6. Add Memory =============
# Conversation history handed to the chain in build_conversational_chain().
# Defined at module level, so every chain built in this session shares it.
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True,
    output_key='answer' # store only the "answer" output; presumably required because the chain also returns source documents — confirm
)

# ============= 7. Build Conversational Chain =============
def build_conversational_chain(vectorstore):
    """Assemble the conversational retrieval chain for *vectorstore*.

    Combines the shared ``llm``, the retriever from ``build_retriever`` and the
    module-level ``memory``; the chain is asked to return source documents.
    """
    chain_options = {
        "llm": llm,
        "retriever": build_retriever(vectorstore),
        "memory": memory,
        "return_source_documents": True,
    }
    return ConversationalRetrievalChain.from_llm(**chain_options)

# ============= 8. Run Q&A Session =============
# Load the PDF(s), chunk the text, index it, and build the chain.
pdf_docs = ["kyc_compliance_report.pdf"]   # replace with your PDF file
raw_text = get_pdf_text(pdf_docs)
chunks = split_text(raw_text)

if not chunks:
    raise ValueError("⚠️ No extractable text found in the PDF. If it's scanned, use OCR.")

vectorstore = create_vector_store(chunks)
qa_chain = build_conversational_chain(vectorstore)

print("✅ PDF ready! Start asking questions...\n")

# Interactive loop: read a question, run the chain, print the answer.
while True:
    try:
        query = input("Your Question (or 'exit'): ").strip()
    except (EOFError, KeyboardInterrupt):
        # End the session cleanly on Ctrl-D / Ctrl-C instead of a traceback.
        print()
        break
    if not query:
        # Don't spend an LLM call on a blank line.
        continue
    if query.lower() in ("exit", "quit"):
        break
    # invoke() is the supported entry point; calling the chain object directly
    # is deprecated in current LangChain releases.
    result = qa_chain.invoke({"question": query})
    print("\nAnswer:", result["answer"])
    print("-" * 60)