In [None]:

# RAG Chatbot on Google Colab (Demo Version)

# ✅ STEP 1: Install Dependencies
!pip install faiss-cpu scikit-learn PyPDF2 langchain transformers streamlit-ngrok -q

# ✅ STEP 2: Upload your PDF
from google.colab import files
uploaded = files.upload()

import PyPDF2
# Collect the text of every page of every uploaded PDF into `pdf_text`.
# Pages are gathered in a list and joined once at the end, with a newline
# between pages so the last word of one page is not fused with the first
# word of the next.
page_texts = []
for filename in uploaded:
    # The context manager closes the file handle once its pages are read;
    # extraction happens inside the `with` while the file is still open.
    with open(filename, "rb") as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        for page in reader.pages:
            # Defensive: treat a page that yields no text as empty.
            page_texts.append(page.extract_text() or "")
pdf_text = "\n".join(page_texts)

# ✅ STEP 3: Chunk the text
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Cut the extracted text into chunks of at most 500 characters, with
# consecutive chunks sharing up to 50 characters of overlap.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
)
chunks = splitter.split_text(text=pdf_text)

# ✅ STEP 4: Embed with TF-IDF + Index with FAISS
from sklearn.feature_extraction.text import TfidfVectorizer
import faiss
import numpy as np

# Fit TF-IDF on the chunks and keep the fitted vectorizer so queries can be
# projected into the same feature space later.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(chunks)

# Densify the sparse matrix and cast to float32 before handing it to FAISS.
vectors = tfidf_matrix.toarray().astype("float32")

# Exact (brute-force) L2 index with one dimension per TF-IDF feature.
embedding_dim = vectors.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(vectors)

# ✅ STEP 5: Define query+retrieval+mock LLM response
def get_top_k_chunks(query, k=3):
    """Return up to ``k`` chunks closest to ``query`` in the FAISS index.

    The query is vectorised with the module-level ``vectorizer`` and searched
    against the module-level ``index``; the matching entries of ``chunks`` are
    returned in the order FAISS reports them.

    Args:
        query: The question text to search for.
        k: Maximum number of chunks to return.

    Returns:
        A list of at most ``k`` chunk strings. The list is shorter than ``k``
        when fewer chunks are indexed, and empty when ``k`` is not positive
        or the index holds no vectors.
    """
    # Never ask for more neighbours than are indexed: FAISS pads missing
    # results with the label -1, and chunks[-1] would silently hand back the
    # last chunk instead of "no result".
    k = min(k, index.ntotal)
    if k <= 0:
        return []

    query_vec = vectorizer.transform([query]).toarray().astype("float32")
    _distances, neighbour_ids = index.search(query_vec, k)

    # Belt and braces: drop any -1 padding that still comes back.
    return [chunks[i] for i in neighbour_ids[0] if i >= 0]

def mock_generate_response(context_chunks, query):
    """Build a placeholder Markdown answer standing in for a real LLM call.

    Args:
        context_chunks: Retrieved text chunks; they are joined with blank
            lines and only the first 300 characters are shown.
        query: The user's question, echoed back at the top of the response.

    Returns:
        A Markdown string with the query, the truncated context preview
        (always followed by ``...``), and a closing note that the response is
        simulated.
    """
    preview = "\n\n".join(context_chunks)[:300]
    sections = (
        f"**Your Query:** {query}",
        f"**Contextual Answer:**\n- {preview}...",
        "(Structured response simulated here)",
    )
    return "\n\n".join(sections)

# ✅ STEP 6: Interactive input
# Read questions until the user types 'exit' (any letter case, surrounding
# whitespace ignored) or the input stream ends or is interrupted.
while True:
    try:
        query = input("Ask a question (or type 'exit'): ").strip()
    except (EOFError, KeyboardInterrupt):
        # End the session cleanly instead of surfacing a traceback.
        print()
        break
    if query.lower() == "exit":
        break
    if not query:
        # A blank query has no terms to match, so ask again.
        continue
    top_chunks = get_top_k_chunks(query)
    print(mock_generate_response(top_chunks, query))

# ✅ BONUS: Run Streamlit in Colab (Optional)
