<a href="https://colab.research.google.com/github/aiswarya-1422/AI-SMART-SEARCH-for-Kerala-GOVT-/blob/main/pdf_upload.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install required packages
# (IPython/Colab shell escape: the leading "!" runs pip in the notebook's shell.)
# NOTE(review): pytesseract is only a Python wrapper around the Tesseract
# executable; the OCR fallback in extract_text_from_pdf presumably also needs
# the system package (e.g. `apt-get install tesseract-ocr`) — confirm it is
# present in the runtime.
!pip install gradio pdfplumber PyMuPDF transformers sentence-transformers faiss-cpu pytesseract

import gradio as gr
import pdfplumber
import pytesseract
from PIL import Image
import fitz  # PyMuPDF
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Initialize models (both are created once, when this cell runs)
# Summarization pipeline backed by the facebook/bart-large-cnn checkpoint;
# called by summarize_text().
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
# Sentence-embedding model; used to embed both document lines and user queries.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Store document texts and embeddings
# Module-level state: written by process_pdf(), read by answer_query().
# Text chunks (lines) of the most recently processed PDF.
document_texts = []
# float32 embedding matrix for document_texts; None until a PDF has been processed.
document_embeddings = None

# Function to extract text from PDF (both scanned and text-based)
def extract_text_from_pdf(pdf_file):
    """Extract the text of a PDF, falling back to OCR for scanned documents.

    The embedded text layer is read first with pdfplumber. Only when that
    yields nothing but whitespace is every page rendered with PyMuPDF and
    run through Tesseract.

    Args:
        pdf_file: Path to the PDF file.

    Returns:
        The extracted text with a newline appended after each page's text;
        an empty string when nothing could be extracted.
    """
    # Try the embedded text layer first (text-based PDFs).
    with pdfplumber.open(pdf_file) as pdf:
        page_texts = [page.extract_text() for page in pdf.pages]
    # Pages without a text layer yield None/"" and are skipped.
    text = "".join(f"{page_text}\n" for page_text in page_texts if page_text)

    if text.strip():
        return text

    # No usable text layer: rasterize each page and OCR it.
    ocr_texts = []
    # The context manager closes the document even if OCR raises,
    # so the file handle is not leaked.
    with fitz.open(pdf_file) as doc:
        for page in doc:
            pix = page.get_pixmap()
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            ocr_texts.append(pytesseract.image_to_string(img))
    return text + "".join(f"{page_text}\n" for page_text in ocr_texts)

# Function to summarize text
def summarize_text(text, max_chunk=1000):
    """Summarize *text* piece by piece and join the partial summaries.

    The text is cut into consecutive slices of at most ``max_chunk``
    characters; each non-blank slice is summarized independently with the
    module-level ``summarizer`` pipeline.

    Args:
        text: The text to summarize.
        max_chunk: Maximum number of characters per slice (default 1000).

    Returns:
        The partial summaries joined by single spaces; an empty string when
        *text* is empty or contains only whitespace.

    Raises:
        ValueError: If ``max_chunk`` is smaller than 1.
    """
    if max_chunk < 1:
        raise ValueError("max_chunk must be a positive integer")

    summaries = []
    for start in range(0, len(text), max_chunk):
        chunk = text[start:start + max_chunk]
        # Whitespace-only slices carry nothing to summarize; skip them
        # instead of asking the model to generate from blank input.
        if not chunk.strip():
            continue
        result = summarizer(
            chunk,
            max_length=150,
            min_length=40,
            do_sample=False,
            # Character-based slicing gives no guarantee about token count;
            # let the tokenizer clip inputs that exceed the model's limit.
            truncation=True,
        )
        summaries.append(result[0]['summary_text'])
    return " ".join(summaries)

# Function to handle PDF upload and summary
def process_pdf(pdf_file):
    """Index an uploaded PDF for question answering and return its summary.

    Extracts the PDF text, stores its non-blank lines and their embeddings
    in the module-level ``document_texts`` / ``document_embeddings``, and
    summarizes the full text.

    Args:
        pdf_file: The uploaded file: either a path string or an object
            exposing the path as ``.name``. ``None`` when nothing was
            uploaded.

    Returns:
        The summary text, or a short user-facing message when no file was
        supplied or no text could be extracted.
    """
    global document_texts, document_embeddings

    # The button can be clicked before any file is uploaded.
    if pdf_file is None:
        return "Please upload a PDF first!"

    # Accept a plain path string as well as a file wrapper with ``.name``.
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
    text = extract_text_from_pdf(pdf_path)

    # Blank lines have no content and would only pollute the search index.
    lines = [line for line in text.split("\n") if line.strip()]
    if not lines:
        # Reset state so answer_query() does not search a previous document.
        document_texts = []
        document_embeddings = None
        return "No readable text could be extracted from this PDF."

    embeddings = embedding_model.encode(lines)
    document_texts = lines
    document_embeddings = np.array(embeddings).astype("float32")
    return summarize_text(text)

# Function to answer user query
def answer_query(user_query):
    """Return the stored document line most similar to *user_query*.

    The query is embedded with ``embedding_model`` and compared against the
    module-level ``document_embeddings`` by cosine similarity (inner product
    over L2-normalized vectors).

    Args:
        user_query: The question typed by the user.

    Returns:
        The best-matching entry of ``document_texts``, or a short
        user-facing message when the query is blank, no document has been
        processed, or the search returns no hit.
    """
    global document_texts, document_embeddings

    if not user_query or not user_query.strip():
        return "Please enter a question."
    if not document_texts or document_embeddings is None:
        return "Please upload a PDF first!"

    query_emb = np.ascontiguousarray(
        embedding_model.encode([user_query]), dtype="float32"
    )
    # Work on a copy: normalize_L2 operates in place and the stored
    # embeddings must stay untouched.
    doc_emb = np.array(document_embeddings, dtype="float32", order="C")

    # Inner product equals cosine similarity only for unit-length vectors.
    faiss.normalize_L2(query_emb)
    faiss.normalize_L2(doc_emb)

    # Search for most relevant chunk using cosine similarity
    index = faiss.IndexFlatIP(doc_emb.shape[1])
    index.add(doc_emb)
    _, neighbours = index.search(query_emb, k=1)

    best = int(neighbours[0][0])
    # FAISS reports -1 for a missing result; as a Python index that would
    # silently select the last line.
    if best < 0:
        return "No relevant passage found."
    return document_texts[best]

# Gradio interface
# Builds a two-part page: PDF upload + summary on top, question answering below.
with gr.Blocks() as demo:
    gr.Markdown("## Kerala Finance Department PDF Smart Search & Summarizer")
    # Upload widget and summary box are placed in the same row.
    with gr.Row():
        pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
        summary_output = gr.Textbox(label="PDF Summary", lines=10)
    pdf_btn = gr.Button("Generate Summary")

    gr.Markdown("### Ask Questions about PDF")
    query_input = gr.Textbox(label="Enter your question")
    answer_output = gr.Textbox(label="Answer", lines=5)
    query_btn = gr.Button("Get Answer")

    # Wire the buttons: process_pdf fills the summary box (and the shared
    # document state); answer_query fills the answer box from that state.
    pdf_btn.click(process_pdf, inputs=pdf_input, outputs=summary_output)
    query_btn.click(answer_query, inputs=query_input, outputs=answer_output)

# Start the app; runs at cell/module execution time.
demo.launch()


Collecting pdfplumber
  Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting PyMuPDF
  Downloading pymupdf-1.26.4-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting pdfminer.six==20250506 (from pdfplumber)
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cpu


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://fc28b47ca264f371c3.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


