In [1]:
import json

with open("domvqa_verified_full.json", "r", encoding="utf-8") as f1:
    data1 = json.load(f1)
with open("domvqa_verified_full_v2.json", "r", encoding="utf-8") as f2:
    data2 = json.load(f2)

fused = data1 + data2

print(f"Loaded {len(data1)} from domvqa_verified.json")
print(f"Loaded {len(data2)} from domvqa_verified_v2.json")
print(f"Fused total: {len(fused)} examples")

Loaded 201 from domvqa_verified.json
Loaded 199 from domvqa_verified_v2.json
Fused total: 400 examples


In [2]:
# Remove samples where either dom_difficulty or img_difficulty is 0
filtered = [
    ex for ex in fused
    if ex.get("dom_difficulty", 1) != 0 and ex.get("img_difficulty", 1) != 0
]
print(f"Filtered total: {len(filtered)} examples (removed {len(fused) - len(filtered)})")

Filtered total: 400 examples (removed 0)


In [3]:
# Isolate samples for which both dom_difficulty and img_difficulty are exactly 1
easy_samples = [
    ex for ex in filtered
    if ex.get("dom_difficulty", 0) == 1 and ex.get("img_difficulty", 0) == 1
]
print(f"Number of samples with dom_difficulty=1 and img_difficulty=1: {len(easy_samples)}")


Number of samples with dom_difficulty=1 and img_difficulty=1: 203


In [4]:
with open("domvqa_fused_full_filtered.json", "w", encoding="utf-8") as f:
    json.dump(filtered, f, ensure_ascii=False, indent=2)
print(f"Saved {len(filtered)} filtered examples to domvqa_fused_full_filtered.json")


Saved 400 filtered examples to domvqa_fused_full_filtered.json


In [6]:
import base64
from io import BytesIO

# Function to encode a PIL image to base64
def encode_image(pil_image):
    buffered = BytesIO()
    pil_image.save(buffered, format="PNG")
    img_bytes = buffered.getvalue()
    return base64.b64encode(img_bytes).decode("utf-8")

def get_dom_messages(question, dom):
    return [
        {
            "role": "user",
            "content": [
                {"type": "input_text", "text": "Given the following DOM of a page, answer the question that is asked."},
                {
                    "type": "input_text",
                    "text": "<dom>" + dom + "</dom>",
                },
                {"type": "input_text", "text": f"Question: {question}" + r"""
                Your answer must be a boolean, a word or a number, contained within $\boxed{}$. Now answer the question.
                Answer:"""},
            ],
        }
    ]

def get_screenshot_messages(question, screenshot):
    return [
        {
            "role": "user",
            "content": [
                {"type": "input_text", "text": "Given the following image of a page, answer the question that is asked."},
                {
                    "type": "input_image",
                    "image_url": f"data:image/jpeg;base64,{encode_image(screenshot)}",
                },
                {"type": "input_text", "text": f"Question: {question}" + r"""
                Your answer must be a boolean, a word or a number, contained within $\boxed{}$. Now answer the question.
                Answer:"""},
            ],
        }
    ]

In [7]:
from openai import OpenAI
from dotenv import load_dotenv
import os

load_dotenv()

client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [15]:
from PIL import Image

first_html = easy_samples[0]["cleaned_html"]
first_screenshot_url = easy_samples[0]["screenshot"]
first_screenshot = Image.open(first_screenshot_url)
first_question = easy_samples[0]["question"]
first_answer = easy_samples[0]["answer"]

for k in range(2):
    input_messages = get_dom_messages(first_question, first_html) if k == 0 else get_screenshot_messages(first_question, first_screenshot)
    response = client.responses.create(
                    model="o4-mini",
                    reasoning={"effort": "high", "summary": "detailed"},
                    input=input_messages
    )
    print(response.summary)
    print(response.output_text)

AttributeError: 'Response' object has no attribute 'summary'