Download data from Google storage

In [2]:
from google.cloud import storage
from pathlib import Path
import zipfile
from PIL import Image
import json
import random

In [2]:
bucket_name = 'processed_webui'
source_blob_name = 'balanced_7k_processed_filtered.zip'
destination_dir = '../downloads/filtered'
destination_file_name = destination_dir + "/balanced_7k_processed_filtered.zip"

In [4]:
Path(destination_dir).mkdir(parents=True, exist_ok=True)
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)
blob = bucket.blob(source_blob_name)
blob.download_to_filename(destination_file_name)
print(f"Blob {source_blob_name} downloaded to {destination_file_name}.")

Blob balanced_7k_processed_filtered.zip downloaded to ../downloads/filtered/balanced_7k_processed_filtered.zip.


In [5]:
with zipfile.ZipFile(destination_file_name, 'r') as zip_ref:
    zip_ref.extractall(destination_dir)

In [9]:
# remove the zip file
Path(destination_file_name).unlink()

FileNotFoundError: [Errno 2] No such file or directory: '../downloads/filtered/balanced_7k_processed_filtered.zip'

In [7]:
sites = Path(destination_dir)
print(f"{len(list(sites.iterdir()))} sites")
sites_list = list(sites.iterdir())
bb_list = []
for site in sites_list:
    bb_list.extend(list((site / "bounding_boxes").iterdir()))
print(f"{len(bb_list)} bounding boxes")

3986 sites
68103 bounding boxes


In [32]:
# select 100 random bounding boxes
import random
random.seed(42)

random_bb_list = random.sample(bb_list, 100)

In [33]:
# crop the images around the selected bounding boxes


images_output = Path("../downloads/filtered/images")
images_output.mkdir(parents=True, exist_ok=True)

for bb in random_bb_list:
    # read bb json data
    with open(bb, "r") as f:
        data = json.load(f)
    site = bb.parent.parent
    image = site / "images" / "full-screenshot.webp"
    image = Image.open(image)
    cropped_image = image.crop((data["bounding_box"]["x"] - 10, data["bounding_box"]["y"] - 10, data["bounding_box"]["x"] + data["bounding_box"]["width"] + 10, data["bounding_box"]["y"] + data["bounding_box"]["height"] + 10))
    # save
    cropped_image.save(images_output / f"{bb.parent.parent.name}_{bb.stem}.jpeg", format="JPEG")


In [34]:
# get the average image size in images_output

sizes = []
for image in images_output.iterdir():
    sizes.append(Image.open(image).size)

average_size = tuple([int(sum(x) / len(sizes)) for x in zip(*sizes)])
print(f"Average size: {average_size}")

Average size: (155, 48)


Using OpenAI's [tokenizer](https://platform.openai.com/tokenizer) and [pricing info](https://openai.com/api/pricing/), we can make the following estimates:

Assumptions:
- prompt: 
"""
Write a name, a short description, and a tag indicating which type of UI element this is (button, heading, link, label, text, image or iframe). Use the following JSON format:

{
  "name": $NAME,
  "description": $DESCRIPTION,
  "tag": $TAG
}

Return the JSON only. Do not enclose it in a code block.
"""
- model: gpt4o
- Avg img size: (216, 53)

Estimates:
- 78 input tokens, 31 output tokens per request.
- Cost per request:
    - Input: 78*5/1000000 ~ 0.00039 USD
    - Output: 31*15/1000000 ~ 0.000465 USD
- Number of requests: ~100
- Cost per image: 0.001275 USD
- Total cost per request: 0.001275 + 0.00039 + 0.000465 = 0.00213 USD
- Total cost: 0.00213 * 100 = 0.213 USD

In [13]:
openai_annotations = Path("../downloads/filtered/openai_annotations")
openai_annotations.mkdir(parents=True, exist_ok=True)

In [14]:
from mllm import Router, Prompt
from threadmem import RoleThread
from pydantic import BaseModel
from PIL import Image, ImageDraw
from agentdesk import Desktop
from surfninja.img import b64_to_image, image_to_b64, crop_box_around
import os
import getpass
import pathlib

In [16]:
secret = getpass.getpass(prompt='Enter OpenAI API key: ')

In [17]:
os.environ["OPENAI_API_KEY"] = secret

In [18]:
router = Router(
    preference=["gpt-4o"]
)

Intialized router with Routing strategy: simple-shuffle

Routing fallbacks: None

Routing context window fallbacks: None

Router Redis Caching=None


In [19]:
class Annotation(BaseModel):
    name: str
    description: str
    tag: str

In [20]:
schema = Annotation.model_json_schema()
schema

{'properties': {'name': {'title': 'Name', 'type': 'string'},
  'description': {'title': 'Description', 'type': 'string'},
  'tag': {'title': 'Tag', 'type': 'string'}},
 'required': ['name', 'description', 'tag'],
 'title': 'Annotation',
 'type': 'object'}

In [24]:
prompt_template = """Write a name, a short description, and a tag indicating which type of UI element this is (button, heading, link, label, text, image or iframe). Use the following JSON format:

{{
  "name": $NAME,
  "description": $DESCRIPTION,
  "tag": $TAG
}}

Return the JSON only. Do not enclose it in a code block."""
prompt = prompt_template
prompt

'Write a name, a short description, and a tag indicating which type of UI element this is (button, heading, link, label, text, image or iframe). Use the following JSON format:\n\n{{\n  "name": $NAME,\n  "description": $DESCRIPTION,\n  "tag": $TAG\n}}\n\nReturn the JSON only. Do not enclose it in a code block.'

In [46]:
for img in images_output.iterdir():
    image = Image.open(img)
    image_b64 = image_to_b64(image, image_format="JPEG")
    thread = RoleThread()
    thread.post(
        role="user",
        msg=prompt,
        images=[image_b64],
    )
    response = router.chat(thread, expect=Annotation, retries=0)
    with open(openai_annotations / f"{img.stem}.json", "w") as f:
        json.dump(response.parsed.json(), f)

This took ~2.5 mins, so if scaled to 68k it should probably have to be parallelized. The good news is that we wouldn't need to run the GPU.

Let's now visualize the annotations.

In [61]:
for bb in random_bb_list:
    # read bb json data
    with open(bb, "r") as f:
        data = json.load(f)
    site = bb.parent.parent
    image = site / "images" / "full-screenshot.webp"
    image = Image.open(image)
    draw = ImageDraw.Draw(image)
    draw.rectangle((data["bounding_box"]["x"], data["bounding_box"]["y"], data["bounding_box"]["x"] + data["bounding_box"]["width"], data["bounding_box"]["y"] + data["bounding_box"]["height"]), outline="red")
    # get annotation data
    with open(openai_annotations / f"{bb.parent.parent.name}_{bb.stem}.json", "r") as f:
        tag_data = json.load(f)
        tag_data = json.loads(tag_data)
    draw.text((data["bounding_box"]["x"], data["bounding_box"]["y"] - 20), f"{tag_data['name']} - {tag_data['tag']}", fill="red")
    image.show()
    input("Press Enter to continue to the next image...")

KeyboardInterrupt: Interrupted by user

In [64]:
# zip the test images
import zipfile
import os

output_zip_path = Path('../downloads/test_images.zip')
base_output_path = Path("../downloads/filtered/images")

with zipfile.ZipFile(output_zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
    for root, dirs, files in os.walk(base_output_path):
        for file in files:
            file_path = os.path.join(root, file)
            zipf.write(file_path, os.path.relpath(file_path, base_output_path))

In [65]:
# zip the openai annotations
output_zip_path = Path('../downloads/openai_annotations.zip')
base_output_path = Path("../downloads/filtered/openai_annotations")

with zipfile.ZipFile(output_zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
    for root, dirs, files in os.walk(base_output_path):
        for file in files:
            file_path = os.path.join(root, file)
            zipf.write(file_path, os.path.relpath(file_path, base_output_path))

In [66]:
# upload the openai annotations zip file to GCS
from google.cloud import storage

bucket_name = "processed_webui"
source_file_name = "../downloads/openai_annotations.zip"
destination_blob_name = "openai_annotations.zip"
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)
blob = bucket.blob(destination_blob_name)
blob.upload_from_filename(source_file_name)
print(f"File {source_file_name} uploaded to {destination_blob_name}.")

File ../downloads/openai_annotations.zip uploaded to openai_annotations.zip.


In [67]:
# upload the test images zip file to GCS

bucket_name = "processed_webui"
source_file_name = "../downloads/test_images.zip"
destination_blob_name = "test_images.zip"
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)
blob = bucket.blob(destination_blob_name)
blob.upload_from_filename(source_file_name)
print(f"File {source_file_name} uploaded to {destination_blob_name}.")

File ../downloads/test_images.zip uploaded to test_images.zip.


## PaliGemma annotation

In [3]:
# download test images from GCS
bucket_name = 'processed_webui'
source_blob_name = 'test_images.zip'
destination_dir = '../downloads/filtered/images'
destination_file_name = destination_dir + "/test_images.zip"

In [4]:
Path(destination_dir).mkdir(parents=True, exist_ok=True)
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)
blob = bucket.blob(source_blob_name)
blob.download_to_filename(destination_file_name)
print(f"Blob {source_blob_name} downloaded to {destination_file_name}.")

Blob test_images.zip downloaded to ../downloads/filtered/images/test_images.zip.


In [5]:
with zipfile.ZipFile(destination_file_name, 'r') as zip_ref:
    zip_ref.extractall(destination_dir)

In [6]:
Path(destination_file_name).unlink()

In [7]:
len(list(Path(destination_dir).iterdir()))

100

In [8]:
paligemma_annotations = Path("../downloads/filtered/paligemma_annotations")
paligemma_annotations.mkdir(parents=True, exist_ok=True)

In [9]:
!nvidia-smi

Mon Jun 24 01:27:50 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.01             Driver Version: 535.183.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA L4                      Off | 00000000:00:03.0 Off |                    0 |
| N/A   47C    P8              17W /  72W |    119MiB / 23034MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA L4                      Off | 00000000:00:04.0 Off |  

In [10]:
!pip install transformers accelerate bitsandbytes

Collecting transformers
  Downloading transformers-4.41.2-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.8/43.8 kB[0m [31m850.4 kB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hCollecting accelerate
  Downloading accelerate-0.31.0-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl.metadata (2.2 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.4.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Downloading transformers-4.41.2-py3-none-any.whl (9.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m32.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading accelerate-0.31.0-py3-none-any.whl (309 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m51.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes

In [11]:
import torch
import time
from transformers import AutoProcessor, PaliGemmaForConditionalGeneration
from PIL import Image
from concurrent.futures import ProcessPoolExecutor
import multiprocessing

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
def process_images(model_id, device, image_paths, prompt_type):
    start = time.time()
    model = PaliGemmaForConditionalGeneration.from_pretrained(
        model_id,
        device_map=device,
    ).eval()
    processor = AutoProcessor.from_pretrained(model_id)
    
    batch_size = 8
    images = [Image.open(path) for path in image_paths]
    prompts = [prompt_type] * len(images)
    all_predictions = []
    
    for i in range(0, len(images), batch_size):
        batch_images = images[i:i + batch_size]
        batch_prompts = prompts[i:i + batch_size]
        model_inputs = processor(text=batch_prompts, images=batch_images, return_tensors="pt").to(model.device)
        
        with torch.inference_mode():
            generations = model.generate(**model_inputs, max_new_tokens=100, do_sample=False)
            predictions = processor.batch_decode(generations[:,model_inputs["input_ids"].size(1):], skip_special_tokens=True)
        all_predictions.extend(predictions)
    end = time.time()
    print(f"Time taken: {end - start:.2f} seconds")
    return all_predictions

In [14]:
images_output = Path("../downloads/filtered/images")

In [17]:
model_id = "google/paligemma-3b-ft-widgetcap-448"
image_paths = list(images_output.iterdir())

with ProcessPoolExecutor(max_workers=2) as executor:
    future_caption = executor.submit(process_images, model_id, "cuda:0", image_paths, "caption")
    future_ocr = executor.submit(process_images, model_id, "cuda:1", image_paths, "ocr")

    caption_predictions = future_caption.result()
    ocr_predictions = future_ocr.result()

Downloading shards: 100%|██████████| 3/3 [02:09<00:00, 43.08s/it]
Downloading shards: 100%|██████████| 3/3 [02:09<00:00, 43.10s/it]
`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.
`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.
Loading checkpoint shards: 100%|██████████| 3/3 [00:04<00:00,  1.51s/it]
Loading checkpoint shards: 100%|██████████| 3/3 [00:04<00:00,  1.63s/it]


Time taken: 231.79 seconds
Time taken: 242.24 seconds


In [19]:
# store predictions
assert len(caption_predictions) == len(ocr_predictions) == len(image_paths)
for i in range(len(image_paths)):
    with open(paligemma_annotations / f"{image_paths[i].stem}.json", "w") as f:
        json.dump({"name": caption_predictions[i], "ocr": ocr_predictions[i]}, f)

In [20]:
!cat ../downloads/filtered/paligemma_annotations/*.json

{"name": "text box", "ocr": "Creating your first Pod at C"}{"name": "text", "ocr": ":"}{"name": "go to blog", "ocr": "Blog"}{"name": "logo", "ocr": "HATESONG.COM"}{"name": "zoom in", "ocr": "22-Feb-2020 03:04\n4M\n22-Feb-2020 03:04\n4M\nz\n22-Feb-2020 03:04\n4M\nx7"}{"name": "zoom in", "ocr": "Q"}{"name": "pound", "ocr": "\u00a31"}{"name": "open directory", "ocr": "Directory"}{"name": "error page", "ocr": "You don't have permission to access \"http://www.findycompany.com/company/ on this server."}{"name": "click to view image", "ocr": "M"}{"name": "select text", "ocr": "r bh2010@johnthornhillmarketing.com\nthem slatham@domainsrea.com"}{"name": "text box", "ocr": "Node.js v10.x"}{"name": "text in black and white", "ocr": "FRAN\u00c7AIS"}{"name": "select feed", "ocr": "Feeds"}{"name": "select the task", "ocr": "tasks"}{"name": "text box", "ocr": "Everything on this site was made by ME unless stated otherwise."}{"name": "logo", "ocr": "1. Paper Books"}{"name": "zoom in", "ocr": "Q"}{"name

In [22]:
# zip the paligemma annotations
import os
output_zip_path = Path('../downloads/paligemma_annotations.zip')
base_output_path = Path("../downloads/filtered/paligemma_annotations")

with zipfile.ZipFile(output_zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
    for root, dirs, files in os.walk(base_output_path):
        for file in files:
            file_path = os.path.join(root, file)
            zipf.write(file_path, os.path.relpath(file_path, base_output_path))

In [3]:
# upload the paligemma annotations zip file to GCS

bucket_name = "processed_webui"
source_file_name = "../downloads/paligemma_annotations.zip"
destination_blob_name = "paligemma_annotations.zip"
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)
blob = bucket.blob(destination_blob_name)
blob.upload_from_filename(source_file_name)
print(f"File {source_file_name} uploaded to {destination_blob_name}.")

File ../downloads/paligemma_annotations.zip uploaded to paligemma_annotations.zip.


## Visualizing PaliGemma annotations

In [69]:
# download paligemma annotations from GCS
bucket_name = 'processed_webui'
source_blob_name = 'paligemma_annotations.zip'
destination_dir = '../downloads/filtered/paligemma_annotations'
destination_file_name = destination_dir + "/paligemma_annotations.zip"

In [70]:
Path(destination_dir).mkdir(parents=True, exist_ok=True)
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)
blob = bucket.blob(source_blob_name)
blob.download_to_filename(destination_file_name)
print(f"Blob {source_blob_name} downloaded to {destination_file_name}.")

Blob paligemma_annotations.zip downloaded to ../downloads/filtered/paligemma_annotations/paligemma_annotations.zip.


In [71]:
with zipfile.ZipFile(destination_file_name, 'r') as zip_ref:
    zip_ref.extractall(destination_dir)

In [72]:
Path(destination_file_name).unlink()

In [73]:
len(list(Path(destination_dir).iterdir()))

100

In [76]:
paligemma_annotations = Path("../downloads/filtered/paligemma_annotations")


In [78]:
for bb in random_bb_list:
    # read bb json data
    with open(bb, "r") as f:
        data = json.load(f)
    site = bb.parent.parent
    image = site / "images" / "full-screenshot.webp"
    image = Image.open(image)
    draw = ImageDraw.Draw(image)
    draw.rectangle((data["bounding_box"]["x"], data["bounding_box"]["y"], data["bounding_box"]["x"] + data["bounding_box"]["width"], data["bounding_box"]["y"] + data["bounding_box"]["height"]), outline="red")
    # get annotation data
    with open(paligemma_annotations / f"{bb.parent.parent.name}_{bb.stem}.json", "r") as f:
        tag_data = json.load(f)
        # tag_data = json.loads(tag_data)
    draw.text((data["bounding_box"]["x"], data["bounding_box"]["y"] - 20), f"{tag_data['name']} - {tag_data['ocr']}", fill="red")
    image.show()
    input("Press Enter to continue to the next image...")

KeyboardInterrupt: Interrupted by user

In [80]:
# store random_bb_list in ../downloads/ with pickle

import pickle

with open("../downloads/random_bb_list.pkl", "wb") as f:
    pickle.dump(random_bb_list, f)



In [82]:
openai_annotated_images = Path("../downloads/openai_annotated_images")
openai_annotated_images.mkdir(parents=True, exist_ok=True)

for bb in random_bb_list:
    # read bb json data
    with open(bb, "r") as f:
        data = json.load(f)
    site = bb.parent.parent
    image = site / "images" / "full-screenshot.webp"
    image = Image.open(image)
    draw = ImageDraw.Draw(image)
    draw.rectangle((data["bounding_box"]["x"], data["bounding_box"]["y"], data["bounding_box"]["x"] + data["bounding_box"]["width"], data["bounding_box"]["y"] + data["bounding_box"]["height"]), outline="red")
    # get annotation data
    with open(openai_annotations / f"{bb.parent.parent.name}_{bb.stem}.json", "r") as f:
        tag_data = json.load(f)
        tag_data = json.loads(tag_data)
    draw.text((data["bounding_box"]["x"], data["bounding_box"]["y"] - 20), f"{tag_data['name']} - {tag_data['tag']}", fill="red")
    # save image
    image.save(openai_annotated_images / f"{bb.parent.parent.name}_{bb.stem}.jpeg", format="JPEG")

In [85]:

paligemma_annotated_images = Path("../downloads/paligemma_annotated_images")
paligemma_annotated_images.mkdir(parents=True, exist_ok=True)

for bb in random_bb_list:
    # read bb json data
    with open(bb, "r") as f:
        data = json.load(f)
    site = bb.parent.parent
    image = site / "images" / "full-screenshot.webp"
    image = Image.open(image)
    draw = ImageDraw.Draw(image)
    draw.rectangle((data["bounding_box"]["x"], data["bounding_box"]["y"], data["bounding_box"]["x"] + data["bounding_box"]["width"], data["bounding_box"]["y"] + data["bounding_box"]["height"]), outline="red")
    # get annotation data
    with open(paligemma_annotations / f"{bb.parent.parent.name}_{bb.stem}.json", "r") as f:
        tag_data = json.load(f)
    draw.text((data["bounding_box"]["x"], data["bounding_box"]["y"] - 20), f"{tag_data['name']} - {tag_data['ocr']}", fill="red")
    # save image
    image.save(paligemma_annotated_images / f"{bb.parent.parent.name}_{bb.stem}.jpeg", format="JPEG")

The PaliGemma annotation took in total 242 seconds to process the 100 images, using a batch-size of 8, and running the detection and ocr tasks in parallel.

Using these same parameters, we would expect the annotation of the full dataset to take ~ 164560s (242 / 100 * 68k), which is roughly 45 hours. At $2 per compute-hour that's a total cost of $90 USD.

This is lower than the estimated ~ $140 USD for doing it with OpenAI