In [None]:
# ------------------ INSTALL DEPENDENCIES ------------------
# Notebook shell command: installs the packages imported by this cell and by
# the generated app.py. The -q flag keeps pip's output quiet.
!pip install streamlit PyPDF2 faiss-cpu sentence-transformers python-dotenv google-generativeai pyngrok -q

# ------------------ STREAMLIT APP FILE ------------------
streamlit_code = """
import os
import faiss
import numpy as np
import PyPDF2
import streamlit as st
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
import google.generativeai as genai

# Load environment variables
load_dotenv()
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))

# Initialize embedder
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# ---------------- PDF Processing ----------------
def get_pdf_text(pdf_docs):
    \"\"\"Extract text from uploaded PDFs.\"\"\"
    text = ""
    for pdf in pdf_docs:
        try:
            pdf_reader = PyPDF2.PdfReader(pdf)
            for page in pdf_reader.pages:
                text += page.extract_text() or ""
        except Exception as e:
            st.error(f"Error reading {pdf.name}: {e}")
    return text

def split_text(text, chunk_size=500, overlap=50):
    '''Split text into overlapping chunks of whitespace-separated words.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared by consecutive chunks; must satisfy
            0 <= overlap < chunk_size.

    Returns:
        A list of chunk strings, empty when text contains no words.

    Raises:
        ValueError: If chunk_size or overlap is outside the ranges above.
    '''
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Once a chunk reaches the last word, any further window would only
        # repeat words that are already covered by this chunk.
        if start + chunk_size >= len(words):
            break
    return chunks

# ---------------- FAISS Indexing ----------------
@st.cache_resource
def create_vector_store(chunks):
    \"\"\"Create and cache FAISS index for text chunks.\"\"\"
    embeddings = embedder.encode(chunks)
    dim = embeddings.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(np.array(embeddings).astype('float32'))
    return index, chunks

def search_chunks(index, chunks, query, top_k=3):
    \"\"\"Retrieve top matching chunks for a query.\"\"\"
    query_embedding = embedder.encode([query])
    D, I = index.search(np.array(query_embedding).astype('float32'), k=top_k)
    return [chunks[i] for i in I[0]]

# ---------------- Gemini Query ----------------
def generate_answer(context, question):
    \"\"\"Generate an answer using Gemini model.\"\"\"
    full_prompt = f\"\"\"
    Answer the question using only the information provided in the context.
    Be accurate and detailed. If the answer is not in the context, say:
    'The answer is not provided in the context.'

    Context:
    {context}

    Question:
    {question}

    Answer:
    \"\"\"
    model = genai.GenerativeModel(model_name="gemini-2.5-flash")
    response = model.generate_content(full_prompt)
    return response.text.strip()

# ---------------- Streamlit UI ----------------
def main():
    st.set_page_config("Chat with PDF - Gemini 2.5", layout="wide")
    st.title("📄 Chat with your PDF using Gemini 2.5 Flash")

    with st.sidebar:
        st.header("📤 Upload PDFs")
        pdf_docs = st.file_uploader("Upload PDF files", type=["pdf"], accept_multiple_files=True)

        if st.button("📚 Process PDFs"):
            if not pdf_docs:
                st.warning("Please upload at least one PDF file.")
            else:
                with st.spinner("Reading and indexing..."):
                    raw_text = get_pdf_text(pdf_docs)
                    if not raw_text.strip():
                        st.error("No extractable text found in the uploaded PDFs.")
                    else:
                        chunks = split_text(raw_text)
                        index, chunk_list = create_vector_store(chunks)
                        st.session_state.index = index
                        st.session_state.chunks = chunk_list
                        st.success("✅ PDFs processed! You can now ask questions.")

    user_question = st.text_input("💬 Ask a question about your PDFs:")
    if user_question:
        if "index" not in st.session_state:
            st.warning("Please upload and process a PDF first.")
        else:
            with st.spinner("Thinking..."):
                top_chunks = search_chunks(st.session_state.index, st.session_state.chunks, user_question)
                context_text = " ".join(top_chunks)
                answer = generate_answer(context_text, user_question)
                st.subheader("📝 Answer")
                st.write(answer)

# Script entry point: build the UI when this file is executed directly.
if __name__ == "__main__":
    main()
"""

# Write the app source as UTF-8 explicitly: it contains emoji, which cannot
# be encoded where the platform's default text encoding is not UTF-8.
with open("app.py", "w", encoding="utf-8") as f:
    f.write(streamlit_code)

# ------------------ RUN STREAMLIT + NGROK ------------------
from pyngrok import ngrok

# Set your Gemini API key here in Colab - Replace with your actual key or use Colab Secrets
os.environ["GOOGLE_API_KEY"] = "YOUR_GEMINI_API_KEY" # Consider using Colab secrets for this

# Start ngrok tunnel
# Replace YOUR_AUTHTOKEN with your actual ngrok authtoken if you have one for persistent URLs
# ngrok.set_auth_token("YOUR_AUTHTOKEN") # Uncomment and replace if needed
ngrok.set_auth_token("315HtXLPcxhoHCUGkFoRzHEJqXI_7vZJiGch18iDAc5d62pYU") # ADD YOUR NGROK AUTHTOKEN HERE
public_url = ngrok.connect(8501)
print(f"Streamlit app is running here: {public_url}")

# Run Streamlit
!streamlit run app.py --server.port 8501