## RegionFocus: A Python Implementation of a Visual Grounding Framework

This notebook is an implementation of the research paper "Visual Test-time Scaling for GUI Agent Grounding" ([arXiv:2505.00684](https://arxiv.org/pdf/2505.00684)). The goal is to replicate the paper's core logic (test-time scaling, dynamic zooming, and an 'image-as-map' landmark system) and to test its effectiveness on a general-purpose Vision-Language Model.

In [None]:
# System packages: download and install Google Chrome (stable) together with
# chromium-chromedriver, which the Selenium cell later drives in headless mode.
!wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
!apt-get install -y ./google-chrome-stable_current_amd64.deb chromium-chromedriver

# Python packages: browser automation (selenium), the LLaVA model stack
# (transformers, torch, accelerate, bitsandbytes for 4-bit loading) and image handling (pillow).
# NOTE(review): versions are unpinned, so a fresh install may pull mutually incompatible releases.
!pip install selenium transformers torch pillow accelerate bitsandbytes

In [None]:
# Demo webpage: two visually different buttons that both read "Submit".
# Only the button with id "submit-btn" is the intended target; "cancel-btn"
# is the decoy the grounding model is expected to confuse it with.

html_content = """
<style>
  body { padding: 40px; font-family: sans-serif; }
  .btn {
    padding: 20px;
    font-size: 24px;
    border-radius: 8px;
    margin: 20px;
    cursor: pointer;
  }
  /* This is the FAKE button */
  #cancel-btn { background-color: #f77; border: 2px solid #a00; }

  /* This is the REAL button */
  #submit-btn { background-color: #7f7; border: 2px solid #0a0; }
</style>

<body>
  <h1>Submit Your Report</h1>
  <p>Please confirm your submission. One button is fake.</p>

  <button id="cancel-btn" class="btn" onclick="alert('Fake Button Clicked!')">Submit</button>

  <button id="submit-btn" class="btn" onclick="alert('Real Button Clicked!')">Submit</button>

</body>
"""

# Write with an explicit encoding so the file content does not depend on the
# platform's default locale encoding.
with open("index.html", "w", encoding="utf-8") as f:
    f.write(html_content)

print("Challenge file 'index.html' created.")

In [None]:
import torch
from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration, BitsAndBytesConfig
import re
from PIL import Image, ImageDraw, ImageFont

# --- 1. Define 4-bit Quantization Config ---
# The model weights are loaded in 4-bit (NF4, with double quantization) and
# computation is carried out in float16, as configured below.
print("Setting up 4-bit quantization config...")
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

# --- 2. Load Model and Processor ---
# `processor` and `model` are module-level globals used by get_vlm_coordinates.
print("Loading LLaVA model in 4-bit... (This may take a few minutes)")
model_id = "llava-hf/llava-v1.6-vicuna-7b-hf"

processor = LlavaNextProcessor.from_pretrained(model_id)

# NOTE(review): no device_map is passed here, while get_vlm_coordinates moves its
# inputs to "cuda" -- a CUDA GPU is therefore assumed; confirm for your runtime.
model = LlavaNextForConditionalGeneration.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    low_cpu_mem_usage=True
)
print("LLaVA model loaded successfully in 4-bit.")


In [None]:
# --- 3. VLM Inference Helper Function ---
# A non-negative integer or decimal, optionally padded with whitespace.
_COORD_NUMBER = r'\s*(\d+\.?\d*)\s*'
# [x1, y1, x2, y2]
_BBOX_PATTERN = re.compile(r'\[' + ','.join([_COORD_NUMBER] * 4) + r'\]')
# [x, y]
_CENTER_PATTERN = re.compile(r'\[' + ','.join([_COORD_NUMBER] * 2) + r'\]')


def _all_relative(values):
    """Return True when every value lies in [0, 1] (normalised coordinates)."""
    return all(0.0 <= value <= 1.0 for value in values)


def _parse_vlm_coordinates(answer_text, img_width, img_height):
    """Extract an absolute (x, y) pixel point from the model's answer text, or None."""
    bbox_match = _BBOX_PATTERN.search(answer_text)
    if bbox_match:
        # --- HANDLE BOUNDING BOX ---
        x1, y1, x2, y2 = (float(group) for group in bbox_match.groups())
        print(f"VLM responded with BBOX: [{x1}, {y1}, {x2}, {y2}]")

        # Treat the box as normalised only when ALL four values are in [0, 1];
        # checking just the first corner misreads absolute boxes that start
        # at (0, 0) or (1, 1) and scales them far outside the image.
        if _all_relative((x1, y1, x2, y2)):
            x1, x2 = x1 * img_width, x2 * img_width
            y1, y2 = y1 * img_height, y2 * img_height

        center_x = int((x1 + x2) / 2)
        center_y = int((y1 + y2) / 2)
        print(f"Calculated center from bbox: [{center_x}, {center_y}]")
        return (center_x, center_y)

    center_match = _CENTER_PATTERN.search(answer_text)
    if center_match:
        # --- HANDLE CENTER POINT ---
        x_val, y_val = (float(group) for group in center_match.groups())
        print(f"VLM responded with CENTER: [{x_val}, {y_val}]")

        if _all_relative((x_val, y_val)):
            x_abs = int(x_val * img_width)
            y_abs = int(y_val * img_height)
            print(f"Converted relative coords to absolute: [{x_abs}, {y_abs}]")
            return (x_abs, y_abs)

        print("VLM gave absolute coords.")
        return (int(x_val), int(y_val))

    # --- HANDLE FAILURE ---
    print(f"VLM response was unparsable: {answer_text.strip()}")
    return None


# --- 3. VLM Inference Helper Function ---
def get_vlm_coordinates(image, prompt_text, img_width, img_height):
    """
    Ask the VLM to locate an element in `image` and return its pixel position.

    The model's answer may be a center point `[x, y]` or a bounding box
    `[x1, y1, x2, y2]` (reduced to its center). Values that all lie in [0, 1]
    are interpreted as fractions of the image and scaled by
    `img_width` / `img_height`; anything else is taken as absolute pixels.

    Args:
        image: Image passed to the processor alongside the prompt.
        prompt_text: Task description; the answer-format instruction is appended here.
        img_width: Width in pixels used to scale relative x values.
        img_height: Height in pixels used to scale relative y values.

    Returns:
        An `(x, y)` tuple of ints, or None when no coordinates could be parsed.

    Relies on the module-level `processor` and `model` globals.
    """
    # NOTE(review): the bracketed [USER]/[ASSISTANT] markers are kept as written;
    # confirm against the model card whether a different chat template is expected.
    prompt = f"[USER]: <image>\n{prompt_text} Please respond *only* with the coordinates of the center of this element in the format [x, y].\n[ASSISTANT]:"

    # Follow the model's own device instead of hard-coding "cuda".
    inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device)

    output = model.generate(**inputs, max_new_tokens=100)

    # generate() returns prompt tokens followed by the new tokens. Decode only
    # the new ones so the echoed prompt is neither parsed for coordinates nor
    # printed in the failure message.
    prompt_length = inputs["input_ids"].shape[1]
    response_text = processor.decode(output[0][prompt_length:], skip_special_tokens=True)

    return _parse_vlm_coordinates(response_text, img_width, img_height)

In [None]:
import os
from IPython.display import display

# --- 1. Image-as-Map Drawing Function ---
# Marker font: prefer a sized TrueType face, fall back to Pillow's built-in
# default font, and finally to None if even that cannot be loaded.
font_path = "/usr/share/fonts/truetype/liberation/LiberationMono-Regular.ttf" # Example font path
font_size = 30
try:
    font = ImageFont.truetype(font_path, font_size)
except OSError:  # IOError is an alias of OSError
    print("Font not found. Using default font (no size/thickness control).")
    try:
        font = ImageFont.load_default()
    except OSError:
        font = None
else:
    print(f"Loaded font: {font_path} with size {font_size}")

# Glyph drawn at every landmark position.
marker_char = "o"

def add_landmarks_to_image(base_image, coordinates_list, save_path):
    """
    Draw a pink marker at each coordinate on a *copy* of an image and save it.

    The marker glyph (`marker_char`) is stamped five times with 1-pixel
    offsets to simulate a thicker stroke.

    Args:
        base_image: Source image; it is copied and left unmodified.
        coordinates_list: Iterable of `(x, y)` points. `None` entries (for
            example an unparsable VLM answer passed straight through) are
            skipped instead of raising.
        save_path: Where the annotated copy is written.

    Returns:
        The annotated in-memory RGB image.
    """
    image = base_image.copy().convert("RGB")
    draw = ImageDraw.Draw(image)

    # Drop missing points so callers can forward VLM results without guarding.
    points = [point for point in (coordinates_list or []) if point is not None]

    offsets = [(-1, -1), (-1, 1), (1, -1), (1, 1), (0, 0)] # Draw center and corners
    for (x, y) in points:
        for ox, oy in offsets:
            # NOTE(review): the -15/-25 shift looks tuned to centre the glyph at
            # font size 30; with the fallback default font it may be off-centre.
            # The colour is pink to match what the docstring and the prompts
            # tell the model to look for.
            draw.text((x - 15 + ox, y - 25 + oy), marker_char, fill="deeppink", font=font)

    image.save(save_path)
    print(f"Saved map with {len(points)} landmark(s) to {save_path}")
    # Return the in-memory image rather than re-opening the file, which would
    # leave a lazily-loaded image holding an open file handle.
    return image

# --- 2. Bounding Box Proposal Function ---
def get_fixed_bboxes(focal_point, img_width, img_height, ratios=None):
    """
    Build fixed-ratio candidate regions centred on a focal point.

    Each region's size is a fraction of the full image; regions that would
    extend past an image edge are truncated at that edge.

    Args:
        focal_point: `(x, y)` centre in image pixels. A point outside the image
            is clamped to the nearest edge first, so every returned box is
            valid (x1 < x2, y1 < y2) and can be passed to `Image.crop`.
        img_width: Image width in pixels.
        img_height: Image height in pixels.
        ratios: Optional iterable of `(width_ratio, height_ratio)` pairs.
            Defaults to the four ratios originally hard-coded here.

    Returns:
        A list of `(x1, y1, x2, y2)` integer tuples, one per ratio.
    """
    if ratios is None:
        ratios = (
            (0.5, 0.5),
            (0.3, 0.3),
            (0.4, 0.8),
            (0.8, 0.4),
        )

    cx, cy = focal_point
    # The focal point comes from the VLM and may lie outside the image; without
    # clamping, max()/min() below would yield an inverted box.
    cx = min(max(cx, 0), img_width)
    cy = min(max(cy, 0), img_height)

    bboxes = []
    for w_ratio, h_ratio in ratios:
        half_w = w_ratio * img_width / 2
        half_h = h_ratio * img_height / 2

        x1 = max(0, cx - half_w)
        y1 = max(0, cy - half_h)
        x2 = min(img_width, cx + half_w)
        y2 = min(img_height, cy + half_h)

        bboxes.append((int(x1), int(y1), int(x2), int(y2)))

    print(f"Generated {len(bboxes)} fixed-ratio bounding boxes around {focal_point}")
    return bboxes

# --- Draw Bounding Boxes ---
def draw_bboxes_on_image(base_image, bboxes, save_path):
    """
    Outline each proposed bounding box on a copy of `base_image`.

    Boxes are coloured red, green, blue, yellow in order, cycling if there are
    more than four. The annotated copy is saved to `save_path` and returned;
    `base_image` itself is not modified.
    """
    palette = ["red", "green", "blue", "yellow"]

    canvas = base_image.copy().convert("RGB")
    pen = ImageDraw.Draw(canvas)

    for position, rect in enumerate(bboxes):
        outline_colour = palette[position % len(palette)]
        pen.rectangle(rect, outline=outline_colour, width=3)

    canvas.save(save_path)
    print(f"Saved bbox visualization to {save_path}")
    return canvas

In [None]:
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
import time

# --- 1. Setup Selenium Driver ---
# Headless Chrome with sandboxing and /dev/shm usage disabled.
# NOTE(review): these flags are presumably chosen for a containerised/Colab runtime -- confirm.
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

# `driver` is a module-level global used by check_click_success and the steps below.
driver = webdriver.Chrome(options=options)

# NOTE(review): the screenshot taken later defines the coordinate space given to the VLM;
# its size is read from the saved image rather than assumed to equal this window size.
driver.set_window_size(1024, 768)

# Load the locally generated demo page through a file:// URL.
html_file_path = "file://" + os.path.abspath("index.html")
driver.get(html_file_path)
# Short fixed pause before the page is screenshotted.
time.sleep(1)

print("Browser loaded.")

# --- 2. Helper to Check Clicks ---
def check_click_success(coords):
    """
    Report which page element a click at `coords` would land on.

    Args:
        coords: `(x, y)` point, or a falsy value when no prediction exists.

    Returns:
        The `id` attribute of the element found by `document.elementFromPoint`
        at that point, or None when `coords` is falsy, no element is found,
        or the lookup raises (the error is printed, not propagated).

    Uses the module-level `driver`.
    """
    if not coords:
        return None

    try:
        hit = driver.execute_script(
            "return document.elementFromPoint(arguments[0], arguments[1]);",
            coords[0], coords[1]
        )
        return hit.get_attribute("id") if hit else None
    except Exception as err:
        # Best-effort probe: a failed lookup is reported and treated as "no element".
        print(f"Error checking click: {err}")
    return None

# The whole flow runs inside try/finally so the headless browser is always shut
# down, even if one of the steps raises.
try:
    # =======================================================
    # STEP A: INITIAL PREDICTION
    # =======================================================
    print("\n--- STEP A: Initial Prediction ---")
    driver.save_screenshot("screenshot_1.png")
    img_1 = Image.open("screenshot_1.png")
    # All VLM coordinates are interpreted in the screenshot's pixel space.
    img_width, img_height = img_1.size

    initial_prompt = "Find the 'Submit' button. It's next to another 'Submit' button."
    coords_1 = get_vlm_coordinates(img_1, initial_prompt, img_width, img_height)

    # --- VISUAL 1: Show the first (wrong) click ---
    if coords_1:
        print("\nVISUAL: Initial VLM Prediction")
        display(add_landmarks_to_image(img_1, [coords_1], "initial_prediction_map.png"))
    else:
        print("No initial coordinates found.")

    # =======================================================
    # STEP B: TRIGGER CONDITION
    # =======================================================
    print("\n--- STEP B: Trigger Condition Check ---")
    clicked_element_id = check_click_success(coords_1)
    print(f"Initial click at {coords_1} would hit element: '{clicked_element_id}'")

    if clicked_element_id == "submit-btn":
        print("🎉 SUCCESS on the first try! (This is unlikely)")

    elif clicked_element_id is None:
        print("FAILURE: VLM did not provide clickable coordinates.")

    else:
        print(f"FAILURE: VLM clicked the wrong button ('{clicked_element_id}').")
        print("🔥 REGIONFOCUS ACTIVATED 🔥")

        # =======================================================
        # STEP C: REGIONFOCUS PROCESS
        # =======================================================

        # --- C.1: Image-as-Map (History) ---
        # Mark the failed click on the screenshot so the model can see it.
        print("\n--- C.1: Marking history on map... ---")
        history_map = add_landmarks_to_image(img_1, [coords_1], "history_map.png")

        # --- VISUAL 2: Show the history map ---
        print("\nVISUAL: History Map (Input for C.2)")
        display(history_map)

        # --- C.2: Focal Point Proposal ---
        print("\n--- C.2: Proposing new focal point... ---")
        focal_point_prompt = f"I tried clicking the 'Submit' button at the pink marker ({marker_char}) and it was wrong. Find the *other* 'Submit' button and give its [x, y] coordinates as a new focal point."
        focal_point = get_vlm_coordinates(history_map, focal_point_prompt, img_width, img_height)

        if focal_point:
            # --- C.3: Bounding Box Proposal ---
            print("\n--- C.3: Proposing fixed-ratio bboxes... ---")
            bboxes = get_fixed_bboxes(focal_point, img_width, img_height)

            # --- VISUAL 3: Show the proposed regions ---
            print("\nVISUAL: Proposed Bounding Boxes (Input for C.4)")
            bbox_map = draw_bboxes_on_image(img_1, bboxes, "bbox_map.png")
            display(bbox_map)

            # --- C.4: Candidate Prediction (The "Zoom") ---
            print("\n--- C.4: Predicting action for each region... ---")
            colors = ["red", "green", "blue", "yellow"]
            candidates = []
            for i, box in enumerate(bboxes):
                region_crop = img_1.crop(box)
                region_crop.save(f"region_{i}.png")

                crop_width, crop_height = region_crop.size

                print(f"  Analyzing region {i} ({crop_width}x{crop_height})...")

                region_prompt = "Find the 'Submit' button in this small image. Give its *local* coordinates [x, y]."
                local_coords = get_vlm_coordinates(region_crop, region_prompt, crop_width, crop_height)

                # --- VISUAL 4: Show the crop being analyzed ---
                # The frame is drawn only after inference, so the model saw the clean crop.
                print(f"  VISUAL: Region {i} Crop")
                draw = ImageDraw.Draw(region_crop)
                draw.rectangle((0, 0, crop_width - 1, crop_height - 1), outline=colors[i % len(colors)], width=5)
                # An unparsable answer yields None; show the crop without a marker
                # instead of passing None on to the drawing helper.
                region_marks = [local_coords] if local_coords else []
                display(add_landmarks_to_image(region_crop, region_marks, f"Region_{i}_Crop.png"))

                if local_coords:
                    # Translate crop-local coordinates back into screenshot coordinates.
                    global_x = box[0] + local_coords[0]
                    global_y = box[1] + local_coords[1]
                    candidates.append((global_x, global_y))
                    print(f"  Found candidate at {global_x, global_y}")

            # --- C.5: Action Aggregation ---
            print("\n--- C.5: Aggregating final action... ---")
            if candidates:
                aggregation_map = add_landmarks_to_image(img_1, candidates, "aggregation_map.png")

                # --- VISUAL 5: Show the aggregation map ---
                print("\nVISUAL: Aggregation Map (Input for C.5)")
                display(aggregation_map)

                agg_prompt = f"Here are {len(candidates)} candidates for the *correct* 'Submit' button, marked with {marker_char}. Which one is the right one to click? Respond with its [x, y] coordinates."
                final_coords = get_vlm_coordinates(aggregation_map, agg_prompt, img_width, img_height)

                # =======================================================
                # STEP D: EXECUTE FINAL ACTION
                # =======================================================
                print("\n--- STEP D: Executing Final Action ---")
                final_clicked_id = check_click_success(final_coords)
                print(f"Final click at {final_coords} would hit element: '{final_clicked_id}'")

                # --- VISUAL 6: Show the final action ---
                # final_coords is None when the aggregation answer was unparsable.
                print("\nVISUAL: Final Chosen Action")
                final_marks = [final_coords] if final_coords else []
                display(add_landmarks_to_image(img_1, final_marks, "final_action_map.png"))

                if final_clicked_id == "submit-btn":
                    print("🎉🎉🎉 REGIONFOCUS SUCCESS! 🎉🎉🎉")
                else:
                    print("😥 REGIONFOCUS FAILED. Final click was also wrong.")
            else:
                print("😥 REGIONFOCUS FAILED. No candidates found in any region.")
        else:
            print("😥 REGIONFOCUS FAILED. VLM could not propose a new focal point.")
finally:
    # --- 6. Cleanup ---
    print("\n--- Cleanup ---")
    driver.quit()