In [1]:
from pipelines import get_gemini_pipes, get_gpt4o_pipes
from utils import *
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
from region_traverser import *

model_name = "Qwen/Qwen2-VL-7B-Instruct"
model = Qwen2VLForConditionalGeneration.from_pretrained(model_name, torch_dtype="auto", device_map="auto")
processor = AutoProcessor.from_pretrained(model_name)

ds = load_screenspot_ds()

2024-11-12 02:14:48.012788: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-12 02:14:48.039257: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

In [70]:
def get_coordinate_prediction(images, target):
    content = [{"type": "image", "image": image} for image in images] + [
        {"type": "text", "text": f"In the attached UI screenshot, calculate the exact position of the element corresponding to the command \"{target}\". Write your answer in the form of (x, y) where each x and y is normalized between 0 and 1. Examples: (0.25, 0.25) is the top-left, (0.75, 0.75) is the bottom-right."},
    ]
    if len(images) > 1:
        content[-1]["text"] = "You are given a screenshot represented as two images. The second image is fully zoomed out only for context. Write coordinates relative to the first image. " + content[-1]["text"]
    messages = [
        {
            "role": "user",
            "content": content,
        }
    ]
    
    # Preparation for inference
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    #print(text)
    prefix = "" if len(images) == 1 else "Second image: "
    #text += f"<|object_ref_start|>{prefix}{target}<|object_ref_end|><|box_start|>"
    text += f"The exact coordinate of the element corresponding to the command \"{target}\" is (x,y) = ("
    #text += f"To get the exact position of the element corresponding to the command \"{target}\", let's first think step by step:"
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,

        return_tensors="pt",
    )
    inputs = inputs.to("cuda")
    generated_ids = model.generate(**inputs, max_new_tokens=2048)
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=False, clean_up_tokenization_spaces=False
    )
    #print(output_text[0])
    #coords_str = output_text[0].split("(")[1].split(")")[0]
    #print(coords_str)
    #coords = coords_str.replace('\'', '').replace('(', '').replace('(', '').replace(')', '').replace('[', '').replace(']', '').split(',')
    x, y = extract_tuple_from_string(output_text[0])#map(int, coords)
    
    result = (x*999, y*999)
    return result

In [95]:
test_set = ds['web']['icon']

In [226]:
len(ds['desktop']['text'])

194

In [103]:
from utils import *
def get_quadrant(midpoint, prediction_coord):
    mid_x, mid_y = midpoint
    pred_x, pred_y = prediction_coord

    if pred_x < mid_x and pred_y < mid_y:
        return 0
    elif pred_x >= mid_x and pred_y < mid_y:
        return 1
    elif pred_x < mid_x and pred_y >= mid_y:
        return 2
    elif pred_x >= mid_x and pred_y >= mid_y:
        return 3

dim = 999

def eval_row_baseline(row, verbose=False):
    target = row['target']
    og_size = row['image'].size
    current_region = row['image'].resize((dim,dim))
    x, y = get_coordinate_prediction([current_region], target)
    x = x * current_region.size[0]
    x = y * current_region.size[1]
    porp_x = x / dim
    porp_y = y / dim

    x = og_size[0] * porp_x
    y = og_size[1] * porp_y

    if verbose:
        print(target)
        print(current_region.size)
        print(x,y)
        render_crosshair(row['image'], x, y).convert("RGB").show()
    return is_in_bbox(row['bbox'], x, y)

def eval_row(row, verbose=False):
    target = row['target']
    current_region = row['image'].copy()

    traverser = RegionTraverser(current_region)
    if verbose: print(target)
    k = 3
    #render_crosshair_center(current_region).show()
    for i in range(k):
        images_prompt = [render_crosshair_center(current_region.resize((dim, dim)))]
        #if i > 0:
            #images_prompt = images_prompt + [result_image.resize((dim, dim))]  
        prediction_coord = get_coordinate_prediction(images_prompt, target)
        pred_x, pred_y = prediction_coord
        #if i == 0: images_prompt[0].show()
        #if verbose and i == 0: draw_bbox_on_image(current_region.resize((dim, dim)), pred_bbox).convert("RGB").show()
        #print(list(pred_bbox))
        #render_crosshair(current_region, pred_x, pred_y).convert("RGB").show()
        
        if verbose: print(prediction_coord)
        if i != k-1:
            traverser.consume_coordinate(pred_x, pred_y)#, (pred_bbox[2] - pred_bbox[0])*16, (pred_bbox[3] - pred_bbox[1])*16)
            result_image = traverser.get_highlighted_image()
            current_region = traverser.get_cropped_image().resize((dim,dim))

        if verbose: result_image.convert("RGB").show()
    final_bbox = traverser.get_bounding_box()

    last_porp_x = pred_x / dim
    last_porp_y = pred_y / dim
    
    delta_x = (final_bbox[2] - final_bbox[0]) * last_porp_x
    delta_y = (final_bbox[3] - final_bbox[1]) * last_porp_y

    x,y = final_bbox[0] + delta_x, final_bbox[1] + delta_y
    if verbose:
        render_crosshair(current_region, pred_x, pred_y).convert("RGB").show()
        render_crosshair(row['image'], x, y).convert("RGB").show()
        #draw_bbox_on_image(row['image'], pred_bbox).convert("RGB").show()
    return is_in_bbox(row['bbox'], x, y)

#eval_row(test_set[14], verbose=True)

In [58]:
from tqdm import tqdm

results = []
for row in tqdm(test_set):
    res = eval_row(row)
    results.append(res)

100%|█████████████████████████████████████████████████████████████████████████████████| 206/206 [13:09<00:00,  3.83s/it]


In [59]:
len([x for x in results if x]) / len(test_set)

0.5242718446601942

In [440]:
wrong_indices = [i for i in range(len(results)) if results[i] == False]

In [8]:
results

[False, True, True, False, True, True, True, False, False, True]

In [60]:
from tqdm import tqdm
baseline_results = []
for row in tqdm(test_set):
    res = eval_row_baseline(row)
    baseline_results.append(res)

100%|█████████████████████████████████████████████████████████████████████████████████| 206/206 [04:25<00:00,  1.29s/it]


In [61]:
len([x for x in baseline_results if x]) / len(test_set)

0.11650485436893204

In [441]:
wrong_indices_baseline = [i for i in range(len(baseline_results)) if baseline_results[i] == False]
[x for x in wrong_indices if x not in wrong_indices_baseline]

[7, 99]

In [48]:
weights

{'web - text': 230,
 'web - icon': 206,
 'mobile - text': 273,
 'mobile - icon': 229,
 'desktop - text': 194,
 'desktop - icon': 140}

In [104]:
devices = ["web", "mobile", "desktop"]
ui_types = ["text", "icon"]

weights = {}

for d in devices:
    for t in ui_types:
        test_set = ds[d][t]
        weights[f"{d} - {t}"] = len(test_set)

eval_result = {}

for d in devices:
    for t in ui_types:
        test_set = ds[d][t]
        results = []
        for row in tqdm(test_set):
            res = eval_row(row)
            results.append(res)
        eval_result[f"{d} - {t}"] = len([x for x in results if x]) / len(test_set)

100%|█████████████████████████████████████████████████████████████████████████████████| 230/230 [14:31<00:00,  3.79s/it]
100%|█████████████████████████████████████████████████████████████████████████████████| 206/206 [13:01<00:00,  3.80s/it]
100%|█████████████████████████████████████████████████████████████████████████████████| 273/273 [17:07<00:00,  3.76s/it]
100%|█████████████████████████████████████████████████████████████████████████████████| 229/229 [14:22<00:00,  3.77s/it]
100%|█████████████████████████████████████████████████████████████████████████████████| 194/194 [12:09<00:00,  3.76s/it]
100%|█████████████████████████████████████████████████████████████████████████████████| 140/140 [08:48<00:00,  3.77s/it]


In [105]:
eval_result

{'web - text': 0.7043478260869566,
 'web - icon': 0.49514563106796117,
 'mobile - text': 0.673992673992674,
 'mobile - icon': 0.5502183406113537,
 'desktop - text': 0.7938144329896907,
 'desktop - icon': 0.5357142857142857}

In [106]:
import numpy as np
np.average(list(eval_result.values()), weights=list(weights.values()))

0.6312893081761006