# Project — Part 2: Gradio: Single Image

Abhinav Kumar
12/7/2025

In [12]:
import cv2
import pytesseract
from PIL import Image
import numpy as np
import requests
import textwrap
import gradio as gr
import re

# Base URL of the Ollama HTTP server; call_qwen posts to "<base>/api/generate".
# NOTE(review): "host.docker.internal" presumably means this notebook runs
# inside a Docker container with Ollama on the host machine — confirm.
OLLAMA_BASE_URL = "http://host.docker.internal:11434"
# Model tag sent as the "model" field of every generate request.
MODEL_NAME = "qwen2.5:latest"


In [13]:
def ocr_page_image(pil_image: Image.Image, scale: float = 1.5) -> str:
    """Run Tesseract OCR on a full paper page screenshot (PIL image).

    The image is upscaled, converted to grayscale and binarized with Otsu's
    threshold before being handed to Tesseract.

    Args:
        pil_image: Page screenshot. PIL images in any mode are accepted;
            ``None`` yields an empty string.
        scale: Upscaling factor applied to both dimensions before
            thresholding.

    Returns:
        The text Tesseract extracted using its English language data.
    """
    if pil_image is None:
        return ""

    # Grayscale, palette, bilevel and RGBA images do not produce the
    # 3-channel array the colour conversions below expect, so normalize
    # every PIL image to RGB first.
    if isinstance(pil_image, Image.Image) and pil_image.mode != "RGB":
        pil_image = pil_image.convert("RGB")

    img = np.array(pil_image)
    if img.ndim == 3 and img.shape[2] == 3:
        img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)

    height, width = img.shape[:2]
    img = cv2.resize(
        img,
        (int(width * scale), int(height * scale)),
        interpolation=cv2.INTER_CUBIC,
    )

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # With THRESH_OTSU the threshold value passed in (0) is ignored and
    # chosen automatically from the image histogram.
    _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_OTSU + cv2.THRESH_BINARY)

    return pytesseract.image_to_string(thresh, lang="eng")


In [14]:
def call_qwen(system_prompt: str, user_prompt: str, timeout: float = 300.0) -> str:
    """
    Call local Qwen via Ollama's /api/generate endpoint.

    Args:
        system_prompt: Instructions placed on the "System:" line of the prompt.
        user_prompt: The request placed on the "User:" line; may span
            several lines.
        timeout: Seconds passed to ``requests.post`` as its timeout, so an
            unresponsive server raises instead of blocking the caller
            forever.

    Returns:
        The "response" field of the JSON reply with surrounding whitespace
        removed, or an empty string when that field is missing.

    Raises:
        requests.HTTPError: When the server answers with an error status
            (the status and the first 500 characters of the body are
            printed first).
    """
    # Assemble the prompt explicitly instead of dedenting an f-string:
    # dedent runs after interpolation, and any multi-line prompt whose
    # lines start at column 0 would leave the whole text indented.
    prompt = (
        f"\nSystem: {system_prompt}\n"
        f"\nUser: {user_prompt}\n"
        "\nAssistant:\n"
    )

    payload = {
        "model": MODEL_NAME,
        "prompt": prompt,
        "stream": False,
    }

    resp = requests.post(
        f"{OLLAMA_BASE_URL}/api/generate",
        json=payload,
        timeout=timeout,
    )
    if not resp.ok:
        print("Ollama error status:", resp.status_code)
        print("Ollama error body:", resp.text[:500])
        resp.raise_for_status()

    data = resp.json()
    return (data.get("response") or "").strip()

# Smoke test: one round-trip to the model to confirm the endpoint answers.
# This runs at cell/module execution time and needs the Ollama server up.
test_answer = call_qwen(
    system_prompt="You are a concise assistant.",
    user_prompt="Reply with the single word OK.",
)
print(f"Qwen test: {test_answer!r}")


Qwen test: 'OK'


In [23]:
def ocr_table_image(image: Image.Image) -> str:
    """
    Run OCR on an image that primarily contains a table.

    The preprocessing and Tesseract call are the same as for a full page,
    so this delegates to ocr_page_image rather than keeping a second copy
    of that pipeline.

    Args:
        image: Table screenshot as a PIL image, or ``None``.

    Returns:
        The OCR text, or an empty string when no image was given.
    """
    if image is None:
        return ""

    return ocr_page_image(image)


def explain_table_from_image(image: Image.Image) -> str:
    """
    Given an image containing (mostly) a table, OCR it and ask Qwen
    to explain what the table is telling us.

    Args:
        image: Table screenshot as a PIL image, or ``None``.

    Returns:
        The model's explanation, or a short guidance message when no image
        was supplied or OCR produced no text.
    """
    if image is None:
        return "Please upload an image that clearly shows the table."

    table_text = ocr_table_image(image)
    if not table_text.strip():
        return "I couldn't read any text from the table image. Try a clearer or closer crop."

    system_prompt = (
        "You are an AI tutor helping a student interpret tables in research papers. "
        "You focus on trends, comparisons, and the main message of the table."
    )

    # Dedent the template BEFORE inserting the OCR text. OCR output is
    # multi-line and starts at column 0, which would otherwise give the
    # text no common indentation and turn dedent into a no-op.
    prompt_template = textwrap.dedent("""
    Here is noisy OCR text from a table in a research paper. It may have imperfect alignment
    and formatting. Cells are mostly separated by spaces, and rows by newlines.

    ---- TABLE OCR TEXT ----
    {table_text}
    ---- END TABLE OCR TEXT ----

    Task:
    1. Reconstruct in your head what this table likely looks like (rows, columns).
    2. Explain, in clear language, what this table is telling us:
       - What quantities or metrics are being compared?
       - Which rows/conditions/models perform best?
       - What patterns or trade-offs are visible?
    3. If you can infer it, say what the takeaway is in the context of a model or method comparison.

    Be concise but specific: 2–4 short paragraphs max.
    """)
    user_prompt = prompt_template.format(table_text=table_text)

    return call_qwen(system_prompt, user_prompt)


In [26]:
def most_detrimental_ablation_from_image(image: Image.Image) -> str:
    """
    Given an image containing an ablation table, OCR it and ask Qwen
    to identify which ablation hurts performance the most and why.

    Args:
        image: Ablation-table screenshot as a PIL image, or ``None``.

    Returns:
        The model's analysis, or a short guidance message when no image
        was supplied or OCR produced no text.
    """
    if image is None:
        return "Please upload an image of the ablation table."

    table_text = ocr_table_image(image)
    if not table_text.strip():
        return "I couldn't read any text from the ablation table. Try a clearer or closer crop."

    system_prompt = (
        "You are an AI researcher analyzing ablation studies in computer vision papers. "
        "You are careful and explicit about how you interpret the numbers."
    )

    # Dedent the template BEFORE inserting the OCR text. OCR output is
    # multi-line and starts at column 0, which would otherwise give the
    # text no common indentation and turn dedent into a no-op.
    prompt_template = textwrap.dedent("""
    Below is noisy OCR text from an ablation study table in a research paper.
    It may have imperfect alignment and formatting, but rows generally correspond
    to different variants of a model, and columns to metrics.

    ---- ABLATION TABLE OCR TEXT ----
    {table_text}
    ---- END ABLATION TABLE OCR TEXT ----

    Task:
    1. Reconstruct in your head which row is the 'full model' or baseline
       (the configuration with all components turned on, if possible).
    2. For each ablated variant (rows where one or more components are removed
       or changed), look at the main performance metric (e.g., mIoU, mAP, PSNR).
    3. Identify which ablation (i.e., which removed/changed component) causes
       the largest drop in that main metric compared to the full model.
    4. Explain in clear language:
       - Which ablation is the most detrimental (name the component / setting).
       - How large the performance drop is (roughly, from X to Y).
       - Why this suggests that component is especially important for the method.

    If there is ambiguity (e.g., multiple metrics or no clear baseline), explain
    how you resolved it and what reasonable assumption you made.

    Keep the answer to 2–4 short paragraphs.
    """)
    user_prompt = prompt_template.format(table_text=table_text)

    return call_qwen(system_prompt, user_prompt)


In [15]:
def split_into_paragraphs(text: str, min_length: int = 60) -> list[str]:
    """
    Split OCR text into coarse 'paragraphs' based on blank lines.

    Args:
        text: Raw OCR text; empty or ``None`` input yields an empty list.
        min_length: Minimum number of characters (after stripping) a
            fragment needs in order to be kept.

    Returns:
        The stripped paragraphs, in their original order, that are at
        least ``min_length`` characters long.
    """
    if not text:
        return []

    # A paragraph break is a newline, optional whitespace, then another newline.
    stripped_chunks = (chunk.strip() for chunk in re.split(r"\n\s*\n", text))
    return [chunk for chunk in stripped_chunks if len(chunk) >= min_length]


def heuristic_score(paragraph: str) -> float:
    """
    Estimate how important a paragraph is with a simple additive heuristic.

    The score is the sum of three parts, all matched case-insensitively
    as plain substrings:
    - a fixed weight for every section-like keyword that occurs,
    - 3 points for every contribution phrase ('we propose', ...) that occurs,
    - one point per 100 characters of paragraph length.
    """
    keyword_weights = {
        "abstract": 3.0,
        "introduction": 3.0,
        "method": 2.0,
        "approach": 2.0,
        "efficientvit": 2.0,
        "attention": 1.5,
        "multi-scale": 1.5,
        "dense prediction": 1.5,
        "segmentation": 1.0,
        "experiment": 1.0,
        "results": 1.0,
        "conclusion": 2.0,
    }
    contribution_phrases = (
        "we propose",
        "we present",
        "in this paper",
        "our method",
        "our approach",
        "we introduce",
    )

    lowered = paragraph.lower()

    keyword_points = sum(
        (weight for term, weight in keyword_weights.items() if term in lowered),
        0.0,
    )
    phrase_hits = sum(1 for phrase in contribution_phrases if phrase in lowered)
    length_points = len(paragraph) / 100.0

    return keyword_points + 3.0 * phrase_hits + length_points


In [16]:
def explain_highlighted_text_from_image(image: Image.Image, highlighted_text: str) -> str:
    """
    Given a page image and a user-provided highlighted passage (as text),
    run OCR for context and ask Qwen to explain that passage in a tutorial style.

    Args:
        image: Page screenshot as a PIL image, or ``None``.
        highlighted_text: The passage the user wants explained.

    Returns:
        The model's explanation, or a short guidance message when the image
        or the highlighted text is missing.
    """
    if image is None:
        return "Please upload a page image first."

    if not highlighted_text or not highlighted_text.strip():
        return "Please paste or type the highlighted text you want explained."

    page_text = ocr_page_image(image)

    system_prompt = (
        "You are an AI tutor helping a graduate student understand an AI research paper. "
        "You explain ideas clearly, with intuition and a simple example when useful."
    )

    # Dedent the template BEFORE inserting the page and highlight text.
    # Both are usually multi-line with lines at column 0, which would
    # otherwise leave no common indentation and turn dedent into a no-op.
    prompt_template = textwrap.dedent("""
    Here is OCR text from a page of a research paper. It may contain noise; ignore formatting
    mistakes and focus on the main ideas.

    ---- PAGE TEXT (CONTEXT) ----
    {page_text}
    ---- END PAGE TEXT ----

    The user highlighted this specific passage on the page:

    ---- HIGHLIGHTED PASSAGE ----
    {highlighted_text}
    ---- END HIGHLIGHTED PASSAGE ----

    Task:
    1. Explain what this highlighted passage is saying in simple terms.
    2. Give the intuition behind it and why it matters in the context of the paper.
    3. Provide a small concrete example if it helps.

    Keep the explanation within 2–4 short paragraphs.
    """)
    user_prompt = prompt_template.format(
        page_text=page_text,
        highlighted_text=highlighted_text,
    )

    return call_qwen(system_prompt, user_prompt)


In [17]:
def answer_question(image: Image.Image, question: str) -> str:
    """
    Given a page image and a natural language question,
    run OCR then ask Qwen to answer using the page text.

    Args:
        image: Page screenshot as a PIL image, or ``None``.
        question: The user's question about the page.

    Returns:
        The model's answer, or a short guidance message when the image or
        the question is missing.
    """
    if image is None:
        return "Please upload a page image first."

    # Reject blank questions up front, as the highlight handler does for
    # blank passages, instead of spending an OCR pass and a model call.
    if not question or not question.strip():
        return "Please type a question about this page first."

    page_text = ocr_page_image(image)

    system_prompt = (
        "You are an AI tutor helping a student understand an AI research paper. "
        "You only use information that can reasonably be inferred from the OCR text."
    )

    # Dedent the template BEFORE inserting the OCR text. OCR output is
    # multi-line and starts at column 0, which would otherwise give the
    # text no common indentation and turn dedent into a no-op.
    prompt_template = textwrap.dedent("""
    Here is OCR text from a page of a research paper. It may be noisy; ignore formatting
    mistakes and focus on the main ideas.

    ---- OCR TEXT START ----
    {page_text}
    ---- OCR TEXT END ----

    Question: {question}
    """)
    user_prompt = prompt_template.format(page_text=page_text, question=question)

    return call_qwen(system_prompt, user_prompt)


In [18]:
def auto_highlight_sections_from_image(image: Image.Image, top_k: int = 3) -> str:
    """
    Given a page image, OCR it, pick top-k important paragraphs using a heuristic,
    and then ask Qwen to justify why each one is important.

    Args:
        image: Page screenshot as a PIL image, or ``None``.
        top_k: Maximum number of paragraphs to select; values below 1 are
            treated as 1.

    Returns:
        The selected snippets followed by the model's explanations, or a
        short guidance message when there is no image or too little text.
    """
    if image is None:
        return "Please upload a page image first."

    page_text = ocr_page_image(image)
    paragraphs = split_into_paragraphs(page_text)
    if not paragraphs:
        return "Could not find enough text on this page to highlight."

    # Guard the slice below: 0 would select nothing and a negative value
    # would silently drop paragraphs from the end of the ranking.
    top_k = max(1, int(top_k))

    # sorted() is stable, so paragraphs with equal scores keep page order.
    ranked = sorted(paragraphs, key=heuristic_score, reverse=True)
    top = ranked[:top_k]

    numbered_paras = "\n\n".join(
        f"[{i}] {p}" for i, p in enumerate(top, start=1)
    )

    system_prompt = (
        "You are an AI tutor helping a student quickly understand which parts "
        "of a research paper page are most important."
    )

    # Dedent the template BEFORE inserting the snippets. They are
    # multi-line OCR text starting at column 0, which would otherwise
    # leave no common indentation and turn dedent into a no-op.
    prompt_template = textwrap.dedent("""
    We OCR'd a page from a computer vision paper (e.g., EfficientViT).
    Below are {snippet_count} candidate important snippets selected by a heuristic.

    For each snippet:
    - Explain in 1–2 sentences what it is talking about.
    - Explain in 1–2 sentences why it is important for understanding the paper.
    - Be concise.

    ---- CANDIDATE 'PURPLE HIGHLIGHTS' ----
    {numbered_paras}
    ---- END SNIPPETS ----

    Produce an answer in this format:

    [1] <short explanation of content>
        <why it is important>

    [2] ...
    """)
    # Report the number of snippets actually listed; the page may hold
    # fewer paragraphs than top_k.
    user_prompt = prompt_template.format(
        snippet_count=len(top),
        numbered_paras=numbered_paras,
    )

    explanation = call_qwen(system_prompt, user_prompt)

    return (
        "Auto-selected important snippets (simulated purple highlights):\n\n"
        + numbered_paras
        + "\n\n"
        + "Explanations:\n\n"
        + explanation
    )


In [27]:
def build_demo():
    """
    Assemble the Gradio Blocks UI and return it without launching.

    The left column holds the image upload, the question and highlight
    text boxes and the five action buttons; the right column holds one
    shared output box that every button writes to.
    """
    with gr.Blocks() as demo:
        gr.Markdown("## CUA v0 – Ask Questions and Explain Highlights for a Paper Page")

        with gr.Row():
            with gr.Column():
                page_image = gr.Image(type="pil", label="Paper page or table screenshot")

                question_box = gr.Textbox(
                    label="Question about this page",
                    placeholder="e.g., 'What is the main idea of this page?'",
                )

                highlight_box = gr.Textbox(
                    label="Highlighted text (paste what you highlighted on the page)",
                    placeholder="Paste the yellow-highlighted sentence or paragraph here.",
                    lines=4,
                )

                with gr.Row():
                    ask_button = gr.Button("Answer Question")
                    explain_button = gr.Button("Explain Highlighted Text")

                highlight_button = gr.Button("Auto-highlight important sections (purple)")

                table_button = gr.Button("Explain table (upload/crop the table)")

                ablation_button = gr.Button("Find most detrimental ablation")

            with gr.Column():
                result_box = gr.Textbox(
                    label="Answer / Highlights / Table Explanation",
                    lines=20,
                )

        # (button, handler, inputs the handler takes besides the image)
        click_bindings = [
            (ask_button, answer_question, [question_box]),
            (explain_button, explain_highlighted_text_from_image, [highlight_box]),
            (highlight_button, auto_highlight_sections_from_image, []),
            (table_button, explain_table_from_image, []),
            (ablation_button, most_detrimental_ablation_from_image, []),
        ]
        for button, handler, extra_inputs in click_bindings:
            button.click(
                fn=handler,
                inputs=[page_image, *extra_inputs],
                outputs=[result_box],
            )

    return demo


In [28]:
demo = build_demo()

# NOTE(review): share=True publishes the app under a public gradio.live URL
# and server_name "0.0.0.0" serves it on port 7865 beyond localhost — confirm
# this exposure is intended before running with sensitive documents.
launch_options = {
    "server_name": "0.0.0.0",
    "server_port": 7865,
    "share": True,
    "inbrowser": False,
}
demo.launch(**launch_options)


* Running on local URL:  http://0.0.0.0:7865
* Running on public URL: https://f3cd21bead6361b760.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


