In [None]:
import base64
import json
from io import BytesIO

import matplotlib.pyplot as plt
import numpy as np
import requests
from PIL import Image
from datasets import load_dataset


# Chat-completions endpoint that call_model() POSTs every request to.
SERVER_URL = "http://localhost:8000/v1/chat/completions"

# Names must match MODEL_CONFIGS keys in ui_tars_server.py
# (each value is sent verbatim as the "model" field of the request payload).
UGROUND_MODEL = "uground-2b"
QWEN_MODEL = "qwen2-vl-2b"


def pil_to_data_url(img: Image.Image, fmt: str = "PNG") -> str:
    """Serialize a PIL image into a base64 ``data:`` URL.

    Args:
        img: Image to encode.
        fmt: Format name handed to ``img.save``; its lower-cased form is also
            used as the ``image/<subtype>`` part of the URL.

    Returns:
        A string shaped like ``data:image/<fmt>;base64,<payload>``.
    """
    with BytesIO() as stream:
        img.save(stream, format=fmt)
        encoded = base64.b64encode(stream.getvalue())
    return "data:image/" + fmt.lower() + ";base64," + encoded.decode("utf-8")


def parse_xy_from_string(text: str) -> tuple[int, int]:
    """Extract the first ``(x, y)`` pair of non-negative integers from model output.

    Whitespace around the numbers and the comma is tolerated, and any text
    surrounding the parenthesized pair is ignored.

    Args:
        text: Raw text returned by the model, e.g. ``'(123, 456)'``.

    Returns:
        The matched coordinates as ``(x, y)`` integers.

    Raises:
        ValueError: If ``text`` contains no such pair.
    """
    import re

    pair_pattern = re.compile(r"\(\s*(\d+)\s*,\s*(\d+)\s*\)")
    found = pair_pattern.search(text)
    if found is None:
        raise ValueError(f"Could not parse coordinates from: {text!r}")

    x_str, y_str = found.groups()
    return int(x_str), int(y_str)


def call_model(image: Image.Image, description: str, model_name: str) -> tuple[str, int, int]:
    """Query the FastAPI server with one screenshot and one instruction.

    The image is embedded in the request as a base64 data URL and sent,
    together with ``description``, as a single user message.

    Args:
        image: Screenshot to ground the instruction on.
        description: Text prompt sent alongside the image.
        model_name: Value of the request's ``"model"`` field.

    Returns:
        ``(raw_text, x, y)`` where ``raw_text`` is the model's reply and
        ``(x, y)`` are in the original image pixel space (the server already
        performs the [0,1000) -> pixel scaling).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If no ``(x, y)`` pair can be parsed from the reply.
    """
    user_message = {
        "role": "user",
        "content": [
            {"type": "text", "text": description},
            {"type": "image_url", "image_url": {"url": pil_to_data_url(image)}},
        ],
    }
    request_body = {
        "model": model_name,
        "messages": [user_message],
        "temperature": 0.0,
        "max_tokens": 64,
    }

    response = requests.post(SERVER_URL, json=request_body, timeout=300)
    response.raise_for_status()

    # Only the first choice's message content is used.
    reply_text = response.json()["choices"][0]["message"]["content"]
    return (reply_text, *parse_xy_from_string(reply_text))


# Status line confirming that the setup cell ran to completion.
print("Notebook setup complete. Ready to load Click-100k and query models.")


In [None]:
# Load first 100 samples from Click-100k
# You can adjust the split or subset size as needed.

dataset = load_dataset("mlfoundations/Click-100k", split="train[:100]")
print(dataset)

# Fetch the first sample once and reuse it: every dataset[0] access
# materializes the whole row again (including its screenshot), so repeated
# indexing inside the loop below would redo that work for each candidate.
first_sample = dataset[0]
print("Available keys in first sample:", first_sample.keys())

# Heuristic: use the first candidate field that is present as the text prompt
PROMPT_FIELD_CANDIDATES = [
    "easyr1_prompt",
    "instruction",
    "query",
]

for cand in PROMPT_FIELD_CANDIDATES:
    if cand in first_sample:
        PROMPT_FIELD = cand
        break
else:
    # No candidate matched: fail loudly instead of leaving PROMPT_FIELD undefined.
    raise KeyError(
        f"Could not find a suitable text prompt field in dataset sample: {first_sample.keys()}"
    )

print("Using prompt field:", PROMPT_FIELD)



In [None]:
# Run both models on the first N samples and collect predictions

# Cap at the number of loaded samples so the loop cannot index past the end
# of a smaller split/subset.
N = min(100, len(dataset))

results = []

for idx in range(N):
    sample = dataset[idx]

    # Use .get() so a missing column reaches the explicit error below instead
    # of failing with a bare KeyError('images').
    img = sample.get("images")
    # NOTE(review): Click-100k may store the screenshot as a list under
    # "images" -- TODO confirm. If so, take its first element so call_model
    # receives a single PIL image.
    if isinstance(img, (list, tuple)):
        img = img[0] if img else None
    if img is None:
        raise KeyError(
            f"Expected a non-empty 'images' column in Click-100k sample {idx}."
        )

    description = sample[PROMPT_FIELD]

    print(f"Sample {idx}: calling UGround and Qwen2-VL ...")
    raw_ug, x_ug, y_ug = call_model(img, description, UGROUND_MODEL)
    raw_qw, x_qw, y_qw = call_model(img, description, QWEN_MODEL)

    record = {
        "idx": idx,
        "description": description,
        "uground_raw": raw_ug,
        "uground_xy": (x_ug, y_ug),
        "qwen_raw": raw_qw,
        "qwen_xy": (x_qw, y_qw),
    }

    # Optionally keep a reference to the image for plotting later
    record["image"] = img

    results.append(record)

print(f"Completed inference for {len(results)} samples.")


In [None]:
# Show the first few screenshots with each model's predicted click overlaid.

num_examples = min(4, len(results))
fig, axes = plt.subplots(
    1, num_examples, figsize=(5 * num_examples, 5), squeeze=False
)
# squeeze=False always returns a 2-D grid of axes; flatten it so the code
# below can treat one panel and several panels the same way.
axes = axes.ravel()

# (record key, color, marker, legend label) for each model, in drawing order.
click_styles = (
    ("uground_xy", "red", "x", "UGround 2B"),
    ("qwen_xy", "blue", "o", "Qwen2-VL-2B"),
)

for panel, rec in zip(axes, results[:num_examples]):
    panel.imshow(rec["image"])
    for xy_key, color, marker, label in click_styles:
        click_x, click_y = rec[xy_key]
        panel.scatter([click_x], [click_y], c=color, s=50, marker=marker, label=label)
    panel.set_title(f"Sample {rec['idx']}")
    panel.axis("off")

# One shared legend, built from the first panel's scatter handles.
handles, labels = axes[0].get_legend_handles_labels()
fig.legend(handles, labels, loc="upper right")
fig.suptitle("UGround 2B vs Qwen2-VL-2B: predicted click locations")
fig.tight_layout()

plot_path = "click100k_uground_qwen_examples.png"
fig.savefig(plot_path, dpi=200, bbox_inches="tight")
print(f"Saved example comparison plot to {plot_path}")


In [None]:
# Optional: per-sample disagreement between the two models' predicted clicks

coords_ug = np.array([rec["uground_xy"] for rec in results])
coords_qw = np.array([rec["qwen_xy"] for rec in results])

# Euclidean distance between the UGround and Qwen prediction of each sample
offsets = coords_ug - coords_qw
diffs = np.linalg.norm(offsets, axis=1)

# Explicit figure/axes pair instead of the implicit pyplot "current figure".
fig_dist, ax_dist = plt.subplots(figsize=(8, 4))
ax_dist.plot(range(len(diffs)), diffs, marker="o")
ax_dist.set(
    xlabel="Sample index",
    ylabel="||UGround - Qwen|| (pixels)",
    title="Per-sample disagreement between UGround 2B and Qwen2-VL-2B",
)
ax_dist.grid(True)

plot_path2 = "click100k_uground_qwen_distance.png"
fig_dist.savefig(plot_path2, dpi=200, bbox_inches="tight")
print(f"Saved distance comparison plot to {plot_path2}")
