In [1]:
from pipelines import get_gemini_pipes, get_gpt4o_pipes
from utils import *
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
# NumPy is imported under its conventional alias. The previous form,
# `from numpy import np`, fails on a fresh kernel because the numpy package
# does not export a name called `np`.
import numpy as np
from region_traverser import *

# Checkpoint used for grounding. The weights and the matching processor are
# loaded from the same identifier so tokenisation/image preprocessing agree
# with the model.
model_name = "OS-Copilot/OS-Atlas-Base-7B"
# torch_dtype="auto" / device_map="auto" leave dtype selection and device
# placement to the library.
model = Qwen2VLForConditionalGeneration.from_pretrained(model_name, torch_dtype="auto", device_map="auto")
processor = AutoProcessor.from_pretrained(model_name)

# `load_screenspot_ds` is not defined in this notebook; presumably it comes
# from one of the star imports above (verify). Later cells index the result
# as ds[<device>][<ui type>].
ds = load_screenspot_ds()

2024-11-14 04:33:45.145245: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-14 04:33:45.170833: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [20]:
def _parse_bbox(coords_str):
    """Turn the decoded box text into ``[x1, y1, x2, y2]`` integers.

    Parentheses and square brackets are discarded, the remainder is split on
    commas, and exactly four integer fields are required.

    Raises:
        ValueError: if the text does not contain exactly four comma-separated
            integers. The message includes the raw text to make bad model
            outputs easy to diagnose.
    """
    cleaned = coords_str.translate(str.maketrans("", "", "()[]"))
    parts = [part.strip() for part in cleaned.split(",")]
    if len(parts) != 4:
        raise ValueError(
            f"Expected 4 bbox coordinates but found {len(parts)} in model output: {coords_str!r}"
        )
    try:
        return [int(part) for part in parts]
    except ValueError as exc:
        raise ValueError(
            f"Non-integer bbox coordinate in model output: {coords_str!r}"
        ) from exc


def get_coordinate_prediction(images, target):
    """Ask the grounding model for the bounding box of ``target``.

    Parameters:
        images: Sequence of images placed in the user message, in order. When
            more than one image is supplied, the prompt describes the first as
            zoomed-out context and the second as the zoomed-in focus.
        target (str): The command / element description to locate.

    Returns:
        tuple: ``((mid_x, mid_y), [x1, y1, x2, y2])`` where the midpoint is the
        centre of the predicted box. Values are the integers emitted by the
        model; this function does not rescale them.

    Raises:
        ValueError: if the generated text cannot be parsed as four integers.
    """
    multi_image = len(images) > 1

    content = [{"type": "image", "image": image} for image in images] + [
        {"type": "text", "text": f"In the attached UI screenshot, what is the bbox of the element corresponding to the command \"{target}\"? Write your final answer in the following format (x1, y1, x2, y2)"},
    ]
    if multi_image:
        content[-1]["text"] = "You are given a screenshot represented as two images. The first is fully zoomed out only for context. The second image is zoomed to the area of interest and should be the focus. Write coordinates relative to the first image. " + content[-1]["text"]
    messages = [
        {
            "role": "user",
            "content": content,
        }
    ]

    # Preparation for inference
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    # Pre-fill the beginning of the reply (object reference + opening of the
    # box) so that generation continues with the coordinates themselves.
    # The same `multi_image` flag drives both the prompt preamble and this
    # prefix, so the two can no longer disagree (e.g. for an empty `images`).
    prefix = "Second image: " if multi_image else ""
    text += f"<|object_ref_start|>{prefix}{target}<|object_ref_end|><|box_start|>("
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's actual placement instead of assuming CUDA: the model
    # is loaded with device_map="auto", so the device is chosen at load time.
    inputs = inputs.to(model.device)
    generated_ids = model.generate(**inputs, max_new_tokens=128)
    # Drop the prompt tokens so that only the newly generated continuation is
    # decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]

    # Special tokens are kept on purpose: <|box_end|> marks where the
    # coordinates stop.
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=False, clean_up_tokenization_spaces=False
    )
    coords_str = output_text[0].split("<|box_end|>")[0]
    x1, y1, x2, y2 = _parse_bbox(coords_str)
    bbox = [x1, y1, x2, y2]

    midpoint_x = (x1 + x2) / 2
    midpoint_y = (y1 + y2) / 2

    return (midpoint_x, midpoint_y), bbox

In [5]:
# Working subset for quick experiments: the first 100 entries of ds['web']['icon'].
test_set = ds['web']['icon'][:100]

In [226]:
# Display how many entries ds['desktop']['text'] contains.
len(ds['desktop']['text'])

194

In [80]:
from tqdm import tqdm

# Run eval_row on each entry of `test_set`, in order, with a progress bar.
# Outcomes are appended one at a time so that partial results remain
# available if a row fails part-way through.
results = []
for row in tqdm(test_set):
    results.append(eval_row(row))

100%|█████████████████████████████████████████████████████████████████████████████████| 100/100 [04:35<00:00,  2.75s/it]


In [91]:
# Evaluate the web/icon entries from index 100 onward and append their
# outcomes to the existing `results` list.
# NOTE: this cell mutates `results` in place, so running it twice appends the
# same rows twice.
results.extend(eval_row(row) for row in tqdm(ds['web']['icon'][100:]))

100%|█████████████████████████████████████████████████████████████████████████████████| 106/106 [04:53<00:00,  2.77s/it]


In [92]:
# Share of entries in `results` that are truthy.
sum(1 for outcome in results if outcome) / len(results)

0.7572815533980582

In [65]:
# Positions in `results` whose outcome compares equal to False.
# The `== False` comparison is kept as-is: unlike `not outcome`, it does not
# select entries such as None.
wrong_indices = [idx for idx, outcome in enumerate(results) if outcome == False]

In [83]:
# Display the indices collected in `wrong_indices`.
wrong_indices

[1,
 4,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 23,
 24,
 29,
 30,
 36,
 48,
 49,
 71,
 73,
 76,
 77,
 84,
 85,
 86,
 89,
 90,
 96,
 97]

In [67]:
from tqdm import tqdm

# Run eval_row_baseline on each entry of `test_set`, in order, with a
# progress bar, collecting one outcome per row.
baseline_results = []
for row in tqdm(test_set):
    baseline_results.append(eval_row_baseline(row))

100%|█████████████████████████████████████████████████████████████████████████████████| 100/100 [02:17<00:00,  1.38s/it]


In [90]:
# Share of entries in `baseline_results` that are truthy.
sum(1 for outcome in baseline_results if outcome) / len(baseline_results)

0.6990291262135923

In [84]:
# Positions in `baseline_results` whose outcome compares equal to False.
wrong_indices_baseline = [
    idx for idx, outcome in enumerate(baseline_results) if outcome == False
]
# Indices present in `wrong_indices` but absent from the baseline's list.
[idx for idx in wrong_indices if idx not in wrong_indices_baseline]

[36, 85, 86]

In [48]:
# Display the `weights` mapping (entry count per "<device> - <ui type>" key).
weights

{'web - text': 230,
 'web - icon': 206,
 'mobile - text': 273,
 'mobile - icon': 229,
 'desktop - text': 194,
 'desktop - icon': 140}

In [22]:
from tqdm import tqdm

devices = ["web", "mobile", "desktop"]
ui_types = ["text", "icon"]

# Entry count of every ds[device][ui_type] subset, keyed "<device> - <ui type>".
weights = {f"{d} - {t}": len(ds[d][t]) for d in devices for t in ui_types}

# For every subset: the fraction of rows for which eval_row returned a truthy
# value, stored under the same key format as `weights`.
# NOTE: `test_set` and `results` are reassigned on every iteration, so after
# this cell they refer to the last subset processed.
eval_result = {}
for d in devices:
    for t in ui_types:
        test_set = ds[d][t]
        results = []
        for row in tqdm(test_set):
            results.append(eval_row(row))
        n_truthy = sum(1 for outcome in results if outcome)
        eval_result[f"{d} - {t}"] = n_truthy / len(test_set)

100%|█████████████████████████████████████████████████████████████████████████████████| 230/230 [15:55<00:00,  4.16s/it]
100%|█████████████████████████████████████████████████████████████████████████████████| 206/206 [14:18<00:00,  4.17s/it]
100%|█████████████████████████████████████████████████████████████████████████████████| 273/273 [18:52<00:00,  4.15s/it]
100%|█████████████████████████████████████████████████████████████████████████████████| 229/229 [15:49<00:00,  4.14s/it]
100%|█████████████████████████████████████████████████████████████████████████████████| 194/194 [13:22<00:00,  4.14s/it]
100%|█████████████████████████████████████████████████████████████████████████████████| 140/140 [09:34<00:00,  4.11s/it]


In [1]:
# Display the `eval_result` mapping. The cell that builds `eval_result` must
# have been executed in the current kernel first; otherwise this raises
# NameError.
eval_result

NameError: name 'eval_result' is not defined

In [24]:
import numpy as np

# Average of the per-key scores in `eval_result`, weighted by the matching
# entry counts in `weights`.
# Scores and weights are looked up by key rather than paired by the iteration
# order of two separately built dicts, so a missing or reordered key raises
# KeyError instead of silently weighting the wrong score.
np.average(
    [eval_result[key] for key in eval_result],
    weights=[weights[key] for key in eval_result],
)

0.8411949685534591

In [53]:
# Unweighted mean of the per-key scores in `eval_result`.
# The divisor is taken from the mapping itself instead of a hard-coded 6, so
# the value stays correct if the number of keys changes.
sum(eval_result.values()) / len(eval_result)

0.8268014648235166

In [22]:
import json

def render_crosshair(image, x, y):
    """
    Return a copy of ``image`` with a red crosshair through the point (x, y).

    The crosshair consists of one vertical and one horizontal line, each
    2 pixels wide, spanning the full height and width of the image. The input
    image is left unmodified.

    Parameters:
        image (PIL.Image): Image to annotate; it must provide ``copy()`` and
            ``size``.
        x (int): Horizontal position of the vertical line.
        y (int): Vertical position of the horizontal line.

    Returns:
        PIL.Image: The annotated copy.
    """
    # `ImageDraw` is not imported in this cell; presumably it is PIL's
    # ImageDraw module made available by an earlier import (verify).
    canvas = image.copy()
    pen = ImageDraw.Draw(canvas)

    img_width, img_height = image.size
    stroke = {"fill": "red", "width": 2}

    # Vertical line first, then the horizontal one.
    segments = (
        [(x, 0), (x, img_height)],
        [(0, y), (img_width, y)],
    )
    for segment in segments:
        pen.line(segment, **stroke)

    return canvas

#r = await image_pipe([image], "Determine the x, y coordinates of the New Folder button. Think step-by-step carefully in a series of numbered steps. Return your answer as JSON with two keys: thoughts (string), x (int) and y (int)", schema=1)
#r = json.loads(r)
#print(r)
#render_crosshair(image, r["x"], r["y"]).show()