# Project — Part 4: Gradio App

Abhinav Kumar
12/11/2025

In [None]:
from pathlib import Path
from datetime import datetime
import json
import traceback

import requests
from PIL import Image

import gradio as gr

# pymongo is an optional dependency: record whether the import succeeded so
# later cells can decide whether MongoDB logging is possible at all.
try:
    from pymongo import MongoClient
except ImportError:
    MONGO_ENABLED = False
else:
    MONGO_ENABLED = True

# Location of the frame image that load_latest_frame() reads from disk.
FRAME_PATH = Path("/workspaces/eng-ai-agents/project/frames/latest.png")


In [None]:
# Collection used for interaction logging; stays None whenever logging is
# unavailable so callers can simply test `mongo_collection is not None`.
mongo_collection = None

if MONGO_ENABLED:
    try:
        # MongoClient connects lazily: the constructor returns immediately and
        # does not raise when the server is down. Use a short server-selection
        # timeout and an explicit ping so an unreachable server is detected
        # here instead of stalling the first insert_one() later on.
        mongo_client = MongoClient(
            "mongodb://localhost:27017",
            serverSelectionTimeoutMS=2000,
        )
        mongo_client.admin.command("ping")
        mongo_db = mongo_client["cua"]
        mongo_collection = mongo_db["interactions"]
        print("Mongo logging enabled.")
    except Exception as e:
        # Logging is best-effort: any failure just disables it.
        print("Mongo not available, logging disabled:", e)
        mongo_collection = None
else:
    print("pymongo not installed, Mongo logging disabled.")


In [None]:
def load_latest_frame() -> Image.Image:
    """
    Return the image stored at FRAME_PATH, converted to RGB.

    Raises:
        FileNotFoundError: if nothing exists at FRAME_PATH yet.
    """
    if FRAME_PATH.exists():
        frame = Image.open(FRAME_PATH)
        return frame.convert("RGB")
    raise FileNotFoundError(f"Frame not found at: {FRAME_PATH}")

# try:
#     img = load_latest_frame()
#     print("Loaded latest frame:", img.size)
# except Exception as e:
#     print("Could not load latest frame yet:", e)


In [None]:
import requests
import textwrap

# Base URL of the Ollama HTTP server and the model tag requested from it.
OLLAMA_BASE_URL = "http://host.docker.internal:11434"
MODEL_NAME = "qwen2.5:latest"


def call_qwen(system_prompt: str, user_prompt: str, timeout: float = 300.0) -> str:
    """
    Call local Qwen via Ollama's /api/generate endpoint (non-streaming).

    Args:
        system_prompt: Instructions placed under the "System:" heading.
        user_prompt: User content placed under the "User:" heading.
        timeout: Seconds passed to ``requests.post`` as its timeout, so a
            stalled server cannot block the caller forever.

    Returns:
        The stripped ``"response"`` field of the JSON reply, or ``""`` when
        that field is absent.

    Raises:
        requests.HTTPError: on a non-OK HTTP status (after printing the
            status code and the first 1000 characters of the body).
        requests.RequestException: on connection failures or timeouts.
    """
    # Build the prompt explicitly rather than dedenting an f-string:
    # textwrap.dedent runs *after* interpolation, so any multi-line prompt
    # (e.g. OCR text starting at column 0) removes the common margin and the
    # template's own indentation leaks into the text sent to the model.
    prompt = (
        f"\nSystem: {system_prompt}\n\n"
        f"User: {user_prompt}\n\n"
        "Assistant:\n"
    )

    payload = {
        "model": MODEL_NAME,
        "prompt": prompt,
        "stream": False,
    }

    resp = requests.post(
        f"{OLLAMA_BASE_URL}/api/generate",
        json=payload,
        timeout=timeout,
    )
    if not resp.ok:
        # Surface the server's explanation before raising; the body is
        # truncated to keep the output readable.
        print("Ollama error status:", resp.status_code)
        print("Ollama error body:", resp.text[:1000])
        resp.raise_for_status()

    data = resp.json()
    return data.get("response", "").strip()


In [None]:
import cv2
import pytesseract
import numpy as np
from PIL import Image

def ocr_image_to_text(image: Image.Image, scale: float = 1.5, lang: str = "eng") -> str:
    """
    Run Tesseract OCR on a screenshot and return the recognised text.

    The image is upscaled, converted to grayscale and binarised with Otsu's
    threshold before being handed to Tesseract.

    Args:
        image: PIL image in any mode; it is converted to RGB when needed.
        scale: Upscaling factor applied to width and height before OCR.
        lang: Tesseract language code passed to ``image_to_string``.

    Returns:
        The extracted text with leading/trailing whitespace stripped.
    """
    # Normalise the mode first: single-channel or palette images ("L", "P",
    # "1") would otherwise fail in the colour conversion below.
    if image.mode != "RGB":
        image = image.convert("RGB")

    rgb = np.array(image)
    height, width = rgb.shape[:2]
    target_size = (int(width * scale), int(height * scale))  # cv2 wants (w, h)
    enlarged = cv2.resize(rgb, target_size, interpolation=cv2.INTER_CUBIC)

    # Go straight from RGB to gray; an intermediate BGR copy adds nothing.
    gray = cv2.cvtColor(enlarged, cv2.COLOR_RGB2GRAY)
    # With THRESH_OTSU the threshold argument (0) is ignored and chosen
    # automatically from the image histogram.
    _, binary = cv2.threshold(
        gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU
    )

    text = pytesseract.image_to_string(binary, lang=lang)
    return text.strip()


In [None]:
def answer_question_about_current_frame(question: str) -> str:
    """
    Answer a question about the most recent frame on disk.

    Steps:
      1. Load the latest frame via ``load_latest_frame``.
      2. Extract its text via ``ocr_image_to_text``.
      3. Ask Qwen (``call_qwen``) to answer the question from that text.
      4. Log the interaction to MongoDB when ``mongo_collection`` is set.

    Args:
        question: The user's question about the current screen.

    Returns:
        The model's answer, or a human-readable error message (including a
        traceback for OCR/LLM failures) when a step fails. Failures are
        returned as text rather than raised.
    """
    # Function-scope import keeps this block self-contained; the file's
    # top-level import only brings in the `datetime` class.
    from datetime import timezone

    try:
        image = load_latest_frame()
    except Exception as e:
        return f"Error loading latest frame: {e}"

    try:
        ocr_text = ocr_image_to_text(image)
    except Exception as e:
        tb = traceback.format_exc()
        return f"Error during OCR/VLM processing:\n{e}\n\n{tb}"

    system_prompt = (
        "You are a helpful assistant reading a screenshot of an AI research paper. "
        "You only see the OCR text I give you. Answer the user's question clearly and concisely. "
        "If the answer is not visible in the text, say that you are not sure."
    )

    user_prompt = (
        "Here is the OCR text extracted from the current screen:\n\n"
        f"{ocr_text}\n\n"
        f"User question: {question}\n\n"
        "Please answer in 2–5 sentences."
    )

    try:
        answer = call_qwen(system_prompt, user_prompt)
    except Exception as e:
        tb = traceback.format_exc()
        return f"Error calling Qwen LLM:\n{e}\n\n{tb}"

    if mongo_collection is not None:
        try:
            mongo_collection.insert_one(
                {
                    # Timezone-aware UTC timestamp; datetime.utcnow() is
                    # deprecated and returns a naive value.
                    "timestamp": datetime.now(timezone.utc),
                    "question": question,
                    "answer": answer,
                    "ocr_text": ocr_text,
                }
            )
        except Exception as e:
            # Logging is best-effort and must never hide the answer.
            print("Mongo logging error:", e)

    return answer


In [None]:
def refresh_frame():
    """
    Callback for the Gradio refresh button: reload the latest frame.

    Returns the loaded image, or None when loading fails for any reason
    (the error is printed to stdout).
    """
    try:
        frame = load_latest_frame()
    except Exception as e:
        # None is returned on failure instead of propagating the error.
        print("Error refreshing frame:", e)
        return None
    return frame

def gradio_answer(question: str):
    """
    Thin wrapper around answer_question_about_current_frame for Gradio.

    Args:
        question: Text from the question box. ``None``, an empty string or
            a whitespace-only string are all treated as "no question".

    Returns:
        A prompt asking for a question when none was given, otherwise the
        result of ``answer_question_about_current_frame(question)``.
    """
    # Guard against None as well as blank text so the callback never fails
    # with AttributeError on `.strip()`.
    if question is None or not question.strip():
        return "Please enter a question."
    return answer_question_about_current_frame(question)


# Build the Gradio UI: usage instructions, a row holding the frame preview
# next to the controls, and an "Ask" button created after the row.
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # Computer Using Agent — Demo

        1. Make sure your WebRTC receiver is running: `make webrtc`  
        2. Open the screen share page in your browser and start sharing the PDF window.  
        3. Click **Refresh frame** to load the current screen.  
        4. Ask a question about what is visible on the page.
        """
    )

    with gr.Row():
        # Image component that receives the PIL image returned by refresh_frame.
        frame_img = gr.Image(label="Current Screen Frame", type="pil")
        with gr.Column():
            refresh_btn = gr.Button("Refresh frame from latest.png")
            question_box = gr.Textbox(
                label="Question about the current page",
                placeholder="e.g., Explain the highlighted paragraph in simple terms.",
                lines=3,
            )
            answer_box = gr.Markdown(label="Answer")

    # Event wiring: the refresh button updates the image; submitting the
    # textbox or clicking "Ask" both send the question to gradio_answer and
    # write the result into answer_box.
    refresh_btn.click(fn=refresh_frame, outputs=frame_img)
    question_box.submit(fn=gradio_answer, inputs=question_box, outputs=answer_box)
    # Created after the Row context has exited, i.e. inside Blocks but
    # outside the row/column above.
    ask_btn = gr.Button("Ask")
    ask_btn.click(fn=gradio_answer, inputs=question_box, outputs=answer_box)

# Start the app on port 7861 with server_name "0.0.0.0", without a public
# share link and without opening a browser window.
demo.launch(
    server_name="0.0.0.0",
    server_port=7861,
    share=False,
    inbrowser=False,
)
