<a href="https://colab.research.google.com/github/abubakarkhanlakhwera/GenAI/blob/main/pdf_reader/RAg_pdf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pypdf sentence-transformers faiss-cpu groq streamlit


Collecting pypdf
  Downloading pypdf-5.3.0-py3-none-any.whl.metadata (7.2 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting groq
  Downloading groq-0.18.0-py3-none-any.whl.metadata (14 kB)
Collecting streamlit
  Downloading streamlit-1.42.0-py2.py3-none-any.whl.metadata (8.9 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloadi

In [15]:
import os
import pickle
import faiss
import numpy as np
import streamlit as st
from pypdf import PdfReader
from sentence_transformers import SentenceTransformer
from nltk.tokenize import sent_tokenize
import nltk
from groq import Groq

# Streamlit re-executes this script on every widget interaction, so one-time
# setup is wrapped in st.cache_resource to run once per server process instead
# of on every rerun.
@st.cache_resource
def _ensure_nltk_data():
    """Download the NLTK resources required by sent_tokenize (cached after the first call)."""
    nltk.download("punkt")
    nltk.download("punkt_tab")  # Added to ensure the 'punkt_tab' resource is available
    return True


@st.cache_resource
def _load_embedding_model():
    """Load the sentence-embedding model once and reuse it across reruns."""
    return SentenceTransformer("all-MiniLM-L6-v2")


_ensure_nltk_data()

# Shared embedding model used both for indexing chunks and for embedding queries
embedding_model = _load_embedding_model()

# The Groq API key is read from the GROQ_API_KEY environment variable.
# Set it outside the code (e.g. in the shell or a secrets manager) before
# launching the app; do not hard-code the key in this file.
client = Groq(api_key=os.getenv("GROQ_API_KEY"))

# Function to extract text from a PDF file-like object
def extract_text_from_pdf(pdf_file):
    """Extract the text of every page of a PDF and join the pages with single spaces.

    Args:
        pdf_file: The PDF source handed to ``PdfReader`` (the app passes the
            object returned by the Streamlit file uploader).

    Returns:
        str: The concatenated page texts. Pages whose extraction yields an
        empty/falsy result are skipped; the result is ``""`` when no page
        yields text.
    """
    reader = PdfReader(pdf_file)
    # Extract each page exactly once: text extraction is the expensive step,
    # so the result is reused for both the emptiness check and the join.
    page_texts = (page.extract_text() for page in reader.pages)
    return " ".join(page_text for page_text in page_texts if page_text)

# Function to create text chunks from the extracted text
def create_chunks(text, chunk_size=5):
    """Split text into sentences and group them into space-joined chunks.

    Args:
        text: The text to split; sentence boundaries come from ``sent_tokenize``.
        chunk_size: Number of consecutive sentences per chunk (default 5).

    Returns:
        list[str]: Chunks in document order. The last chunk may hold fewer
        than ``chunk_size`` sentences.
    """
    sentence_list = sent_tokenize(text)
    grouped = []
    for start in range(0, len(sentence_list), chunk_size):
        window = sentence_list[start:start + chunk_size]
        grouped.append(" ".join(window))
    return grouped

# Function to create a FAISS index from the text chunks
def create_faiss_index(chunks):
    """Embed the text chunks and store the vectors in a flat L2 FAISS index.

    Args:
        chunks: Non-empty list of text chunks. The position of a chunk in this
            list is the id of its vector in the returned index.

    Returns:
        A ``faiss.IndexFlatL2`` containing one vector per chunk.

    Raises:
        ValueError: If ``chunks`` is empty (e.g. the PDF had no extractable
            text), since there is nothing to index and no embedding dimension
            to derive.
    """
    if not chunks:
        raise ValueError("Cannot build a FAISS index from an empty list of chunks.")

    # Encode all chunks in a single batched call instead of one call per chunk,
    # and hand FAISS a contiguous float32 matrix.
    embeddings = np.ascontiguousarray(embedding_model.encode(chunks), dtype=np.float32)
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)
    return index

# Function to search the FAISS index for relevant chunks given a query
def search_faiss(query, index, chunks, top_k=3):
    """Return the chunks whose embeddings are nearest to the query embedding.

    Args:
        query: The question text to embed and search with.
        index: FAISS index whose vector ids are positions in ``chunks``.
        chunks: The text chunks the index was built from.
        top_k: Maximum number of chunks to return (default 3).

    Returns:
        list[str]: Up to ``top_k`` chunks in the order returned by the index
        search. Empty when there are no chunks or ``top_k`` is not positive.
    """
    # Never ask for more neighbours than there are chunks: FAISS pads missing
    # results with id -1, and chunks[-1] would silently return the last chunk.
    k = min(top_k, len(chunks))
    if k <= 0:
        return []

    query_embedding = np.asarray(embedding_model.encode(query), dtype=np.float32).reshape(1, -1)
    _distances, indices = index.search(query_embedding, k)
    # Drop any padded (-1) or otherwise out-of-range ids instead of indexing with them.
    return [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]

# Function to get an answer from the Groq API using the retrieved chunks as context
def get_groq_response(query, index, chunks):
    """Answer a question with the Groq chat API, using retrieved chunks as context.

    Args:
        query: The user's question.
        index: FAISS index passed through to ``search_faiss``.
        chunks: Text chunks passed through to ``search_faiss``.

    Returns:
        The message content of the first choice in the completion response.
    """
    # Retrieved chunks are flattened into one space-separated context string
    context = " ".join(search_faiss(query, index, chunks))
    prompt = f"Context: {context}\nQuestion: {query}"
    completion = client.chat.completions.create(
        model="llama-3.3-70b-versatile",
        messages=[{"role": "user", "content": prompt}],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

# Streamlit App UI
st.title("RAG-based QA App")

# The index and chunks are kept in session_state so the PDF is processed only
# once per session; the uploader is shown only until both are available.
if "index" not in st.session_state or "chunks" not in st.session_state:
    uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")
    if uploaded_file is not None:
        with st.spinner("Processing PDF..."):
            # Extract text and create chunks
            pdf_text = extract_text_from_pdf(uploaded_file)
            chunks = create_chunks(pdf_text)
            if chunks:
                # Create a FAISS index for the chunks and store both for reuse
                st.session_state["index"] = create_faiss_index(chunks)
                st.session_state["chunks"] = chunks
        if chunks:
            st.success("PDF processed successfully!")
        else:
            # A PDF with no extractable text yields no chunks, and an index
            # cannot be built from nothing.
            st.error("No extractable text was found in this PDF. Please upload a different file.")
            st.stop()
    else:
        st.info("Please upload a PDF file to continue.")
        st.stop()  # Stop execution until a file is uploaded

# Input for the user's query
query = st.text_input("Enter your question:")

if st.button("Get Answer"):
    # Treat a whitespace-only question as empty rather than sending it to the LLM
    question = query.strip()
    if question:
        with st.spinner("Fetching answer..."):
            answer = get_groq_response(question, st.session_state["index"], st.session_state["chunks"])
        st.write("### Answer:")
        st.write(answer)
    else:
        st.warning("Please enter a question.")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [16]:
# Print this runtime's public IPv4 address to the cell output.
# NOTE(review): presumably this is the value the localtunnel landing page asks for
# before forwarding to the app — confirm.
!wget -qO- ipv4.icanhazip.com

34.91.223.35


In [None]:
# @title Launch the Streamlit app and expose it through localtunnel
# Starts `streamlit run app.py` in the background and opens a localtunnel to port 8501.
# NOTE(review): this expects an app.py in the working directory, but no visible cell
# writes one — the app code presumably has to be saved as app.py first (e.g. with a
# %%writefile app.py line at the top of the app cell); confirm.
!streamlit run app.py & npx  localtunnel --port 8501


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.91.223.35:8501[0m
[0m
[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0Kyour url is: https://short-guests-design.loca.lt
2025-02-11 16:09:04.549153: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1739290144.579634   12229 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1739290144.588887   12229 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when on