In [60]:
import json

# Read the DOM-VQA records from disk; later cells index into this collection by position.
with open("domvqa_results_v2.json", encoding="utf-8") as results_file:
    domvqa_results = json.loads(results_file.read())

In [61]:
from pydantic import BaseModel

# Structured-output schema passed as `text_format` when the evaluation loop queries
# the answering model.
# NOTE(review): documented with `#` comments on purpose — a class docstring would
# presumably be surfaced in the JSON schema sent to the model; confirm before adding one.
class answerExtracted(BaseModel):
    # Answer text returned by the model; the evaluation loop searches it for a \boxed{...} span.
    answer: str
    # Model-provided justification; not read anywhere in the visible code.
    reasoning: str

# Structured-output schema passed as `text_format` when the evaluation loop asks the
# validation model to compare an extracted answer with the ground truth.
# NOTE(review): documented with `#` comments on purpose — a class docstring would
# presumably be surfaced in the JSON schema sent to the model; confirm before adding one.
class answerValidation(BaseModel):
    # Validator verdict; the evaluation loop adds it to the per-modality score.
    is_correct: bool
    # Validator-provided justification; not read anywhere in the visible code.
    reasoning: str

In [62]:
import base64
from io import BytesIO

def encode_image(pil_image):
    """Serialize an image to PNG in memory and return the bytes as base64 text.

    Args:
        pil_image: Object exposing ``save(fp, format=...)``; a PIL image in this notebook.

    Returns:
        str: Base64 encoding of the PNG bytes.
    """
    png_buffer = BytesIO()
    pil_image.save(png_buffer, format="PNG")
    encoded = base64.b64encode(png_buffer.getvalue())
    return str(encoded, "utf-8")

In [63]:
def get_dom_messages(question, dom):
    """Build the single-turn prompt that asks a question about a page's DOM.

    Args:
        question: Question text, interpolated after ``Question: ``.
        dom: DOM markup as a string; wrapped in ``<dom>...</dom>``.

    Returns:
        list: One user message whose content holds three ``input_text`` parts
        (instruction, DOM, question plus answer-format instructions).
    """
    def text_part(text):
        return {"type": "input_text", "text": text}

    # The literal's embedded newlines and indentation are part of the prompt sent to the model.
    answer_format = r"""
                Your answer must be a boolean, a word or a number, contained within $\boxed{}$. Now answer the question.
                Answer:"""

    content = [
        text_part("Given the following DOM of a page, answer the question that is asked."),
        text_part("<dom>" + dom + "</dom>"),
        text_part(f"Question: {question}" + answer_format),
    ]
    return [{"role": "user", "content": content}]

def get_screenshot_messages(question, screenshot):
    """Build the single-turn prompt that asks a question about a page screenshot.

    Args:
        question: Question text, interpolated after ``Question: ``.
        screenshot: Image object accepted by ``encode_image``.

    Returns:
        list: One user message whose content holds an instruction, the image as a
        base64 data URL, and the question plus answer-format instructions.
    """
    return [
        {
            "role": "user",
            "content": [
                {"type": "input_text", "text": "Given the following image of a page, answer the question that is asked."},
                {
                    "type": "input_image",
                    # encode_image always serializes to PNG, so the data URL must declare
                    # image/png; it previously claimed image/jpeg for PNG bytes.
                    "image_url": f"data:image/png;base64,{encode_image(screenshot)}",
                },
                {"type": "input_text", "text": f"Question: {question}" + r"""
                Your answer must be a boolean, a word or a number, contained within $\boxed{}$. Now answer the question.
                Answer:"""},
            ],
        }
    ]

def get_validation_messages(answer, ground_truth):
    """Build the single-turn prompt asking whether an answer matches the ground truth.

    Args:
        answer: Candidate answer; formatted into the prompt after ``Answer: ``.
        ground_truth: Reference answer; formatted after ``Ground truth: ``.

    Returns:
        list: One user message whose content holds three ``input_text`` parts.
    """
    instruction = "Given the following answer and ground truth, determine if the answer is correct."
    texts = (instruction, f"Answer: {answer}", f"Ground truth: {ground_truth}")
    content = [{"type": "input_text", "text": text} for text in texts]
    return [{"role": "user", "content": content}]

In [64]:
# PIL's Image is used by the evaluation loop below to open each record's screenshot file
from PIL import Image



In [65]:
from openai import OpenAI
from dotenv import load_dotenv
import os

# Load environment configuration first so that OPENAI_API_KEY can be read from it below.
load_dotenv()

# The key is read from the environment at call time and never bound to a notebook variable.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

In [None]:
import re

# Number of times each question is asked per modality (DOM and screenshot).
num_trials = 4
# Index of the first record to process.
# NOTE(review): `results` starts empty and the output file is opened in "w" mode, so the
# file only ever contains records from this index onward — confirm that earlier records
# are kept elsewhere before re-running.
start_index = 250
# A checkpoint is written whenever the dataset index is a multiple of this value.
checkpoint_every = 50
output_path = "domvqa_verified_v2.json"
results = []

# Captures the text inside the first \boxed{...}; nested braces are not supported.
boxed_pattern = re.compile(r'\\boxed\{([^}]*)\}')


def _extract_boxed(answer_text):
    r"""Return the text inside the first \boxed{...}, or the stripped raw answer if none is present.

    Falling back to the raw text avoids asking the validator to grade the literal
    string "None" when the model answered without the requested \boxed{} wrapper.
    """
    match = boxed_pattern.search(answer_text)
    return match.group(1) if match else answer_text.strip()


def _count_correct(messages, ground_truth, trials):
    """Query the answering model `trials` times and return how many answers the validator accepts."""
    correct = 0
    for _ in range(trials):
        response = client.responses.parse(
            model="gpt-4.1-mini",
            input=messages,
            text_format=answerExtracted,
        )
        extracted = response.output_parsed
        if extracted is None:
            # No parseable structured output came back (e.g. a refusal): count the trial
            # as incorrect instead of aborting the whole run with an AttributeError.
            continue

        validation = client.responses.parse(
            model="gpt-4.1-nano",
            input=get_validation_messages(_extract_boxed(extracted.answer), ground_truth),
            text_format=answerValidation,
        )
        verdict = validation.output_parsed
        if verdict is not None and verdict.is_correct:
            correct += 1
    return correct


def _save_results(path, records):
    """Overwrite `path` with `records` as indented JSON (same encoding as the input file)."""
    with open(path, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=2)


for i in range(start_index, len(domvqa_results)):
    record = domvqa_results[i]
    question = record["question"]
    ground_truth = record["answer"]
    cleaned_html = record["cleaned_html"]
    screenshot_path = record["screenshot"]

    # Prompts do not change between trials, so build them once per record. The screenshot
    # is base64-encoded inside the `with` block and its file handle is released right after.
    dom_messages = get_dom_messages(question, cleaned_html)
    with Image.open(screenshot_path) as screenshot:
        screenshot_messages = get_screenshot_messages(question, screenshot)

    dom_score = _count_correct(dom_messages, ground_truth, num_trials)
    image_score = _count_correct(screenshot_messages, ground_truth, num_trials)

    results.append({
        "question": question,
        "answer": ground_truth,
        "screenshot": screenshot_path,
        "cleaned_html": cleaned_html,
        # Despite the key names, these are the fraction of trials judged correct
        # (1.0 = every trial correct).
        "dom_difficulty": dom_score / num_trials,
        "img_difficulty": image_score / num_trials,
    })

    if i % checkpoint_every == 0:
        # `i` is the dataset index, not the number of records held in `results`.
        print(f"Saved {i} results")
        _save_results(output_path, results)

# Final write so that records processed after the last checkpoint are not lost.
_save_results(output_path, results)

Saved 250 results
Saved 300 results
Saved 350 results
