In [1]:
%env CUDA_VISIBLE_DEVICES=1
%env PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
from utils import *
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
from region_traverser import *
from intern_vl_utils import *
import numpy as np

# Hugging Face checkpoint evaluated in this notebook.
path = 'OpenGVLab/InternVL2-4B'

# Keyword arguments for loading the model: bfloat16 weights, flash attention,
# and trust_remote_code (the checkpoint ships its own modelling code).
model_kwargs = dict(
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    use_flash_attn=True,
    trust_remote_code=True,
)
model = AutoModel.from_pretrained(path, **model_kwargs)
# Inference only: switch to eval mode and move the weights to the GPU.
model = model.eval().cuda()

tokenizer = AutoTokenizer.from_pretrained(
    path,
    trust_remote_code=True,
    use_fast=False,
)

# Dataset used below; indexed as ds[device][ui_type] by the evaluation loop.
ds = load_screenspot_ds()

env: CUDA_VISIBLE_DEVICES=1
env: PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python


2024-11-15 03:38:07.555617: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-15 03:38:07.581456: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Phi3ForCausalLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture 

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
def get_coordinate_prediction(image, target, scale=999):
    """Ask the model where `target` is in `image` and return scaled coordinates.

    The model is prompted to answer with a normalized ``(x, y)`` pair; that
    pair is parsed out of the generated text and multiplied by `scale`.

    Args:
        image: Screenshot shown to the model. It is passed to
            ``load_image_from_pil``, so presumably a PIL image.
        target: Natural-language command describing the UI element to locate.
        scale: Factor applied to both parsed coordinates. The default of 999
            matches the side length the callers in this notebook resize their
            prompt images to, so the result is in that pixel frame.

    Returns:
        Tuple ``(x * scale, y * scale)``.
    """
    prompt_text = f"<image>\nIn the attached UI screenshot, calculate the exact position of the element corresponding to the command \"{target}\". Write your answer in the form of (x, y) where each x and y is normalized between 0 and 1. Examples: (0.25, 0.25) is the top-left, (0.75, 0.75) is the bottom-right. Answer in a single sentence."
    pixel_values = load_image_from_pil(image, max_num=12).to(torch.bfloat16).cuda()
    # Greedy decoding keeps the evaluation deterministic.
    generation_config = dict(max_new_tokens=1024, do_sample=False)

    output_text = model.chat(tokenizer, pixel_values, prompt_text, generation_config)

    # NOTE(review): assumes extract_tuple_from_string always yields a pair of
    # numbers; confirm how it behaves when the model output has no tuple.
    x, y = extract_tuple_from_string(output_text)

    return (x * scale, y * scale)

In [5]:
# Side length (pixels) of the square image every prompt is resized to.
dim = 999

def eval_row_baseline(row, verbose=False):
    """Single-shot baseline: predict once on the whole (resized) screenshot.

    Args:
        row: Dataset row; reads ``row['target']`` (command text),
            ``row['image']`` (screenshot) and ``row['bbox']`` (ground truth).
        verbose: When true, print the target, prompt size and predicted point,
            and show the screenshot with a crosshair at the prediction.

    Returns:
        Whatever ``is_in_bbox`` returns for the predicted point mapped back to
        the original screenshot's coordinates.
    """
    target = row['target']
    og_size = row['image'].size
    current_region = row['image'].resize((dim, dim))

    # get_coordinate_prediction already returns coordinates scaled into the
    # 999-pixel prompt frame, so they only need to be normalised by `dim`.
    # (Previously they were multiplied by the image size a second time and the
    # y term was assigned to x, which clobbered the x coordinate.)
    x, y = get_coordinate_prediction(current_region, target)
    porp_x = x / dim
    porp_y = y / dim

    # Map the proportions back onto the original screenshot.
    x = og_size[0] * porp_x
    y = og_size[1] * porp_y

    if verbose:
        print(target)
        print(current_region.size)
        print(x,y)
        render_crosshair(row['image'], x, y).convert("RGB").show()
    return is_in_bbox(row['bbox'], x, y)

def eval_row(row, verbose=False, num_steps=3):
    """Iterative zoom-in evaluation of one dataset row.

    The model is queried `num_steps` times. After every prediction except the
    last, the prediction is fed to a ``RegionTraverser`` and the next query is
    made on the traverser's cropped image. The last prediction is mapped back
    to the original screenshot through the traverser's bounding box.

    Args:
        row: Dataset row; reads ``row['target']``, ``row['image']`` and
            ``row['bbox']``.
        verbose: When true, print the target and each prediction and show the
            intermediate highlighted images plus the final crosshairs.
        num_steps: Number of model queries (the original hard-coded value was
            3). Must be at least 1.

    Returns:
        Whatever ``is_in_bbox`` returns for the final point in original
        screenshot coordinates.

    Raises:
        ValueError: If `num_steps` is smaller than 1.
    """
    if num_steps < 1:
        raise ValueError("num_steps must be at least 1")

    target = row['target']
    current_region = row['image'].copy()

    traverser = RegionTraverser(current_region)
    if verbose: print(target)
    for step in range(num_steps):
        image_prompt = current_region.resize((dim, dim))
        prediction_coord = get_coordinate_prediction(image_prompt, target)
        pred_x, pred_y = prediction_coord

        if verbose: print(prediction_coord)
        # The last prediction is not consumed: it is interpreted inside the
        # region selected by the previous steps.
        if step != num_steps - 1:
            traverser.consume_coordinate(pred_x, pred_y)
            result_image = traverser.get_highlighted_image()
            current_region = traverser.get_cropped_image().resize((dim, dim))
            # Only show the highlight when a new one was produced, so the last
            # step no longer re-displays a stale image.
            if verbose: result_image.convert("RGB").show()

    # Presumably (left, top, right, bottom) of the current region in original
    # image coordinates, judging by the arithmetic below; verify against
    # RegionTraverser.
    final_bbox = traverser.get_bounding_box()

    # Position of the last prediction as a fraction of the prompt image.
    last_porp_x = pred_x / dim
    last_porp_y = pred_y / dim

    delta_x = (final_bbox[2] - final_bbox[0]) * last_porp_x
    delta_y = (final_bbox[3] - final_bbox[1]) * last_porp_y

    x,y = final_bbox[0] + delta_x, final_bbox[1] + delta_y
    if verbose:
        # image_prompt is the dim x dim image the last prediction refers to.
        render_crosshair(image_prompt, pred_x, pred_y).convert("RGB").show()
        render_crosshair(row['image'], x, y).convert("RGB").show()
    return is_in_bbox(row['bbox'], x, y)

#eval_row(test_set[85], verbose=True)

In [7]:
from tqdm import tqdm

# Benchmark splits: every (device, ui_type) pair is evaluated separately.
devices = ["web", "mobile", "desktop"]
ui_types = ["text", "icon"]

weights = {}      # split name -> number of rows in that split
eval_result = {}  # split name -> fraction of rows eval_row judged correct

for device in devices:
    for ui_type in ui_types:
        split_name = f"{device} - {ui_type}"
        test_set = ds[device][ui_type]
        weights[split_name] = len(test_set)

        # One eval_row call per row, with a progress bar per split.
        results = [eval_row(row) for row in tqdm(test_set)]

        hits = sum(1 for outcome in results if outcome)
        eval_result[split_name] = hits / len(test_set)

  0%|                                                                                           | 0/230 [00:00<?, ?it/s]The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.
`get_max_cache()` is deprecated for all Cache classes. Use `get_max_cache_shape()` instead. Calling `get_max_cache()` will raise error from v4.48
100%|█████████████████████████████████████████████████████████████████████████████████| 230/230 [15:44<00:00,  4.11s/it]
100%|█████████████████████████████████████████████████████████████████████████████████| 206/206 [14:10<00:00,  4.13s/it]
100%|█████████████████████████████████████████████████████████████████████████████████| 273/273 [17:53<00:00,  3.93s/it]
100%|█████████████████████████████████████████████████████████████████████████████████| 229/229 [14:59<00:00,  3.93s/it]
100%|█████████████████████████████████████████████████████████████████████████████████| 194/194 [12:50<00:00,  3.97s/it]
100%|██████

In [8]:
# Per-split accuracy: fraction of rows in each "<device> - <ui_type>" split
# for which eval_row returned a truthy result.
eval_result

{'web - text': 0.05217391304347826,
 'web - icon': 0.02912621359223301,
 'mobile - text': 0.14652014652014653,
 'mobile - icon': 0.021834061135371178,
 'desktop - text': 0.08762886597938144,
 'desktop - icon': 0.02142857142857143}

In [9]:
# Overall accuracy: per-split accuracies weighted by split size, with the two
# dicts aligned explicitly by split name.
split_names = list(eval_result)
np.average(
    [eval_result[name] for name in split_names],
    weights=[weights[name] for name in split_names],
)

0.06525157232704402