In [1]:
from datasets import load_dataset

# Multimodal-Mind2Web from the Hugging Face Hub; later cells read
# `cleaned_html` and `screenshot` from rows of its "train" split.
dataset = load_dataset(path="osunlp/Multimodal-Mind2Web")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from openai import OpenAI
from dotenv import load_dotenv
import os

# Load variables from a local .env file (if any) into the process environment
# so the API key never has to be written into the notebook.
load_dotenv()

# Shared OpenAI client used by the generation loop further down.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

In [9]:
import base64
from io import BytesIO

# Function to encode a PIL image to base64
def encode_image(pil_image):
    """Serialize an image to PNG in memory and return it as base64 text.

    Args:
        pil_image: Object exposing ``save(fp, format=...)`` (a PIL image,
            per the notebook's comment).

    Returns:
        str: The base64 encoding of the PNG bytes written by ``save``.
    """
    png_stream = BytesIO()
    pil_image.save(png_stream, format="PNG")
    return str(base64.b64encode(png_stream.getvalue()), "utf-8")

In [23]:
from pydantic import BaseModel

class QAPair(BaseModel):
    # One generated item; all three fields are required strings.
    # `#` comments are used instead of a class docstring so the schema pydantic
    # derives from this class is left untouched.
    question: str
    answer: str
    # NOTE(review): presumably the model's justification for `answer` — confirm.
    reasoning: str

class domvqaDataset(BaseModel):
    # Top-level structured-output schema: this class is what the generation
    # loop passes as `text_format` to `client.responses.parse`.
    qapairs: list[QAPair]

# NOTE(review): QAPair is already defined above when this class is created, so
# this explicit rebuild looks redundant — confirm before removing.
domvqaDataset.model_rebuild()

In [30]:
def get_content(dom, screenshot):
    """Build the user-message content (prompt text + screenshot) for one page.

    Args:
        dom: HTML/DOM text of the page; interpolated verbatim inside
            ``<dom>...</dom>`` in the prompt.
        screenshot: Image of the same page; passed to ``encode_image`` and
            embedded as a base64 data URL.

    Returns:
        list[dict]: Two content parts, an ``input_text`` part carrying the
        instructions and an ``input_image`` part carrying the screenshot.
    """
    return [
                { "type": "input_text", "text": f"""Given the dom <dom>{dom}</dom> and the screenshot of the page, 
                 generate a list of 10 verifiable question and answer pairs that can be answered using with only the DOM at your disposal or only the screenshot at your disposal.
                 It is important that the answer to the question is obvious with just the DOM or just the screenshot.
                 The question must have thus either a simple numerical answer, a text answer with only one word or a boolean answer.""" },
                {
                    "type": "input_image",
                    # encode_image() always serialises as PNG, so the data URL
                    # must declare image/png (it previously claimed image/jpeg).
                    "image_url": f"data:image/png;base64,{encode_image(screenshot)}",
                },
            ]

In [34]:
import json
import os

# Generate QA pairs for the first NUM_SAMPLES training pages, save each page's
# screenshot to disk, and dump one flat record per QA pair to JSON.
results = []

# Ensure the images directory exists
images_dir = "images"
os.makedirs(images_dir, exist_ok=True)

# Number of training examples to process; clamped below so a shorter split
# does not raise IndexError.
NUM_SAMPLES = 100

# Resolve the split once instead of re-indexing `dataset` on every access.
train_split = dataset["train"]

for i in range(min(NUM_SAMPLES, len(train_split))):
    # Fetch the row a single time; indexing the split twice repeats the row
    # lookup (and presumably the screenshot decoding) for no benefit.
    sample = train_split[i]
    dom = sample["cleaned_html"]
    screenshot = sample["screenshot"]

    response = client.responses.parse(
        model="o3",
        input=[
            {
                "role": "user",
                "content": get_content(dom, screenshot)
            }
        ],
        text_format=domvqaDataset,
    )

    # `output_parsed` is Optional: it is None when the response carries no
    # parsed structured output. Skip such samples instead of crashing with
    # AttributeError and losing everything collected so far.
    parsed = response.output_parsed
    if parsed is None:
        print(f"Skipping sample {i}: the response contained no parsed output")
        continue

    # Save the screenshot as a PNG file in the images directory
    image_filename = f"screenshot_{i}.png"
    image_path = os.path.join(images_dir, image_filename)
    screenshot.save(image_path, format="PNG")

    # One flat record per QA pair; the page's DOM and screenshot path are
    # repeated on each record so every entry is self-contained.
    for qapair in parsed.qapairs:
        results.append({
            "cleaned_html": dom,
            "screenshot": image_path,
            "question": qapair.question,
            "answer": qapair.answer,
            "reasoning": qapair.reasoning
        })

with open("domvqa_results.json", "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)