In [28]:
import gradio as gr
import pygetwindow as gw
import pyautogui
import numpy as np
from PIL import Image, ImageDraw
import easyocr
import json
import time

# Initialize OCR Reader
reader = easyocr.Reader(['en'])

# Function to capture screenshot of a selected window
def capture_screenshot(window_title):
    windows = gw.getWindowsWithTitle(window_title)
    if not windows:
        raise ValueError(f"Window with title '{window_title}' not found.")
    
    window = windows[0]
    if window.isMinimized:
        window.restore()
        time.sleep(0.5)
    
    try:
        window.activate()
        time.sleep(0.5)
        x, y, width, height = window.left, window.top, window.width, window.height
        screenshot = pyautogui.screenshot(region=(x, y, width, height))
        return screenshot, (x, y, width, height)
    except gw.PyGetWindowException as e:
        raise RuntimeError(f"Failed to activate window '{window_title}': {e}")

# Function to apply OCR and get initial bounding boxes
def apply_ocr(img, use_ocr=True):
    if use_ocr:
        img_np = np.array(img)
        bounds = reader.readtext(img_np)
        boxes = []
        for bound in bounds:
            top_left, bottom_right = tuple(bound[0][0]), tuple(bound[0][2])
            boxes.append({
                "label": "OCR Box",
                "x": top_left[0],
                "y": top_left[1],
                "width": bottom_right[0] - top_left[0],
                "height": bottom_right[1] - top_left[1]
            })
        return img, boxes
    else:
        return img, []

# Function to update bounding boxes and redraw them on the image
def update_boxes(image, boxes):
    img = image.copy()
    draw = ImageDraw.Draw(img)
    for box in boxes:
        x, y, width, height = box["x"], box["y"], box["width"], box["height"]
        draw.rectangle([x, y, x + width, y + height], outline="red", width=2)
    return img

# Function to retrieve open windows and load initial screenshot with OCR boxes
def configure_window(window_title, use_ocr):
    screenshot, window_size = capture_screenshot(window_title)
    processed_img, boxes = apply_ocr(screenshot, use_ocr)
    return processed_img, boxes

# Save configuration function
def save_configuration(boxes, filename):
    config = {
        "boxes": boxes
    }
    with open(filename, 'w') as f:
        json.dump(config, f)
    return f"Configuration saved to {filename}"

# Gradio Interface
with gr.Blocks() as demo:
    with gr.Tab("Bounding Box Annotation"):
        # Dropdown to select open windows
        window_dropdown = gr.Dropdown(choices=[w.title for w in gw.getAllTitles() if w], label="Select a Window")
        use_ocr = gr.Checkbox(label="Use OCR for Bounding Boxes", value=True)
        load_btn = gr.Button("Load Screenshot")

        # AnnotatedImage to display and interact with bounding boxes
        annotated_img = gr.AnnotatedImage(label="Screenshot with Bounding Boxes")

        # Button to save configuration
        save_btn = gr.Button("Save Configuration")
        save_output = gr.Textbox(label="Save Output")

        # Load screenshot and apply OCR bounding boxes
        load_btn.click(
            fn=configure_window,
            inputs=[window_dropdown, use_ocr],
            outputs=[annotated_img]
        )

        # Update bounding boxes when user edits
        annotated_img.change(
            fn=update_boxes,
            inputs=[annotated_img, annotated_img],
            outputs=annotated_img
        )

        # Save configuration on button click
        save_btn.click(
            fn=save_configuration,
            inputs=[annotated_img, gr.Textbox(value="config.json")],
            outputs=save_output
        )

demo.launch()


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


AttributeError: 'AnnotatedImage' object has no attribute 'change'

In [39]:
import gradio as gr
from gradio_image_annotation import image_annotator
import easyocr
import pygetwindow as gw
import json
from PIL import ImageGrab
import numpy as np

# Initialize EasyOCR reader
reader = easyocr.Reader(['en'])

# Capture screenshot of a selected window
def capture_screenshot(window_title):
    window = gw.getWindowsWithTitle(window_title)[0]
    window.activate()
    window_size = (window.left, window.top, window.width, window.height)
    screenshot = ImageGrab.grab(window_size)
    return screenshot, window_size

# Run OCR and create bounding boxes
def generate_bounding_boxes(image, use_ocr):
    image_np = np.array(image)
    boxes = []

    if use_ocr:
        results = reader.readtext(image_np)
        for result in results:
            top_left, bottom_right = result[0][0], result[0][2]
            box = {
                'xmin': int(top_left[0]),
                'ymin': int(top_left[1]),
                'xmax': int(bottom_right[0]),
                'ymax': int(bottom_right[1]),
                'label': result[1]
            }
            boxes.append(box)

    return {'image': image_np, 'boxes': boxes}

# Save configuration to a file
def save_configuration(window_title, window_size, annotations, filename):
    configurations = {
        window_title: {
            'window_size': window_size,
            'annotations': annotations['boxes']
        }
    }
    with open(filename, 'w') as f:
        json.dump(configurations, f)
    return f"Configuration saved to {filename}"

# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("## Windows Screenshot Annotation and Keystroke Execution")

    # Step 1: Select Window and Capture Screenshot
    window_dropdown = gr.Dropdown(
        choices=[w.title for w in gw.getAllWindows() if w.title],
        label="Select Window"
    )
    capture_button = gr.Button("Capture Screenshot")
    image_output = gr.Image(label="Captured Screenshot", type="pil")

    # Step 2: Display and Annotate Image
    use_ocr = gr.Checkbox(label="Use OCR to Detect Text")
    annotator = image_annotator(label="Annotate Screenshot", label_list=["Text Region"], label_colors=[(0, 255, 0)])

    # Hidden state for storing window size
    window_size_state = gr.State()

    # Step 3: Save Configuration
    filename_input = gr.Textbox(label="Filename to Save Configuration")
    save_button = gr.Button("Save Configuration")
    save_output = gr.Textbox(label="Save Output")

    # Capture screenshot and populate window size state
    def capture_and_process(window_title, use_ocr):
        screenshot, window_size = capture_screenshot(window_title)
        image_data = generate_bounding_boxes(screenshot, use_ocr)
        return screenshot, image_data, window_size

    def save_annotations(annotations, filename, window_title, window_size):
        return save_configuration(window_title, window_size, annotations, filename)

    capture_button.click(
        fn=capture_and_process,
        inputs=[window_dropdown, use_ocr],
        outputs=[image_output, annotator, window_size_state]
    )

    save_button.click(
        fn=save_annotations,
        inputs=[annotator, filename_input, window_dropdown, window_size_state],
        outputs=save_output
    )

demo.launch()

Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


* Running on local URL:  http://127.0.0.1:7884

To create a public link, set `share=True` in `launch()`.


