In [None]:
from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
import torch
import os
from datasets import load_dataset

SLURM_PATH = '/home/yandex/MLWG2025/amitr5'
CACHE_DIR = f'{SLURM_PATH}/tmp/hf_cache'  # Changed to /tmp to avoid quota issues

os.makedirs(CACHE_DIR, exist_ok=True)

if SLURM_PATH in os.getcwd():
    os.environ["PIP_PATH"] = f"{SLURM_PATH}/BaryGNN/anaconda3/envs/conf/bin/pip"
    os.environ["TEMP_DIR"] = CACHE_DIR
    os.environ["HF_HOME"] = CACHE_DIR
    os.environ["TRANSFORMERS_CACHE"] = CACHE_DIR
    os.environ["HF_DATASETS_CACHE"] = CACHE_DIR
    os.environ["HF_HUB_CACHE"] = CACHE_DIR
    os.environ["TMPDIR"] = CACHE_DIR
    # os.environ["TOKENIZERS_PARALLELISM"] = "false"

## Mind2Web
Mind2Web is a large-scale dataset for grounding language instructions to web actions.

In [None]:
!git clone https://github.com/OSU-NLP-Group/Mind2Web.git
%cd Mind2Web
!pip install -r requirements.txt
%cd ..

In [None]:
ds = load_dataset("osunlp/Multimodal-Mind2Web", cache_dir=CACHE_DIR)

# Check available splits
print("Dataset splits:", list(ds.keys()))
# Access the train split
train_ds = ds['train']
print(f"Number of samples in train split: {len(train_ds)}")

In [None]:
import pandas as pd

# Convert the train split to a pandas DataFrame
df = train_ds.to_pandas()
df['action_id'] = range(len(df))  # Add a default integer ID column
df.head()

Dataset splits: ['train', 'test_domain', 'test_task', 'test_website']
Number of samples in train split: 7775


Unnamed: 0,action_uid,raw_html,cleaned_html,operation,pos_candidates,neg_candidates,website,domain,subdomain,annotation_id,confirmed_task,screenshot,action_reprs,target_action_index,target_action_reprs,action_id
0,6c7a7082-2897-41c7-9688-4b0f3d778cdb,"<!DOCTYPE html PUBLIC ""-//W3C//DTD HTML 4.0 Tr...","<html backend_node_id=""208"">\n <body backend_...","{""original_op"": ""CLICK"", ""value"": """", ""op"": ""C...","[{""tag"": ""li"", ""attributes"": ""{\""backend_node_...","[{""tag"": ""div"", ""attributes"": ""{\""backend_node...",united,Travel,Airlines,401c4e6f-6b0b-47b4-8157-92d7ca468bbc,"rent a car in Brooklyn - Central, NY on from A...",{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,"[[heading] CAR -> CLICK, [combobox] Enter pi...",0,[heading] CAR -> CLICK,0
1,b64c2417-c44e-46c4-bb0b-ff1775e7da29,"<!DOCTYPE html PUBLIC ""-//W3C//DTD HTML 4.0 Tr...","<html backend_node_id=""10021"">\n <body backen...","{""original_op"": ""TYPE"", ""value"": ""Brooklyn Cen...","[{""tag"": ""input"", ""attributes"": ""{\""backend_no...","[{""tag"": ""div"", ""attributes"": ""{\""backend_node...",united,Travel,Airlines,401c4e6f-6b0b-47b4-8157-92d7ca468bbc,"rent a car in Brooklyn - Central, NY on from A...",{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,"[[heading] CAR -> CLICK, [combobox] Enter pi...",1,"[combobox] Enter pick up city, airport name, ...",1
2,dad6690b-9b3e-4395-bd06-9aa065bf4027,"<!DOCTYPE html PUBLIC ""-//W3C//DTD HTML 4.0 Tr...","<html backend_node_id=""20041"">\n <body backen...","{""original_op"": ""CLICK"", ""value"": """", ""op"": ""C...","[{""tag"": ""button"", ""attributes"": ""{\""backend_n...","[{""tag"": ""div"", ""attributes"": ""{\""backend_node...",united,Travel,Airlines,401c4e6f-6b0b-47b4-8157-92d7ca468bbc,"rent a car in Brooklyn - Central, NY on from A...",{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,"[[heading] CAR -> CLICK, [combobox] Enter pi...",2,"[div] Brooklyn - Central (New York), US -> CLICK",2
3,e0fd3f28-3f04-455d-8bde-a480f0ec1b0a,"<!DOCTYPE html PUBLIC ""-//W3C//DTD HTML 4.0 Tr...","<html backend_node_id=""30061"">\n <body backen...","{""original_op"": ""CLICK"", ""value"": """", ""op"": ""C...","[{""tag"": ""input"", ""attributes"": ""{\""backend_no...","[{""tag"": ""div"", ""attributes"": ""{\""backend_node...",united,Travel,Airlines,401c4e6f-6b0b-47b4-8157-92d7ca468bbc,"rent a car in Brooklyn - Central, NY on from A...",{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,"[[heading] CAR -> CLICK, [combobox] Enter pi...",3,[textbox] Pickup -> CLICK,3
4,4762d735-9dc2-4717-ae8b-baab0b3446e5,"<!DOCTYPE html PUBLIC ""-//W3C//DTD HTML 4.0 Tr...","<html backend_node_id=""40453"">\n <body backen...","{""original_op"": ""CLICK"", ""value"": """", ""op"": ""C...","[{""tag"": ""td"", ""attributes"": ""{\""backend_node_...","[{""tag"": ""div"", ""attributes"": ""{\""backend_node...",united,Travel,Airlines,401c4e6f-6b0b-47b4-8157-92d7ca468bbc,"rent a car in Brooklyn - Central, NY on from A...",{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,"[[heading] CAR -> CLICK, [combobox] Enter pi...",4,"[button] Sunday, April 9, 2023 -> CLICK",4


In [51]:
n_steps = df.action_reprs.apply(lambda x: len(x))
print(f"max steps: {n_steps.max()}, min steps: {n_steps.min()}, mean: {n_steps.mean():.2f}, median: {n_steps.median()}")

max steps: 37, min steps: 2, mean: 10.82, median: 9.0


In [61]:
# Group by annotation_id (this creates a GroupBy object for fast access)
grouped = df.groupby('annotation_id')

# Retrieve all rows for a specific annotation_id
ann_id = train_ds[0]["annotation_id"]
task_df = grouped.get_group(ann_id).sort_values('target_action_index')

print(f"Task: {task_df.iloc[0]['confirmed_task']}")
# Iterate and display (task_df is a DataFrame)
for _, ex in task_df.iterrows():
    print(
        f"step={int(ex['target_action_index']) + 1}/{len(task_df)} | op={ex['operation']} "
        f"| target_action={ex["target_action_reprs"]} | pos_candidates={len(ex['pos_candidates'])}, action_id={ex['action_id']}"
    )
    # display(train_ds[ex["action_id"]]["screenshot"])

Task: rent a car in Brooklyn - Central, NY on from April 9 to April 15.
step=1/7 | op={"original_op": "CLICK", "value": "", "op": "CLICK"} | target_action=[heading]  CAR -> CLICK | pos_candidates=1, action_id=0
step=2/7 | op={"original_op": "TYPE", "value": "Brooklyn Central", "op": "TYPE"} | target_action=[combobox]  Enter pick up city, airport name, or airport code. -> TYPE: Brooklyn Central | pos_candidates=1, action_id=1
step=3/7 | op={"original_op": "CLICK", "value": "", "op": "CLICK"} | target_action=[div]  Brooklyn - Central (New York), US -> CLICK | pos_candidates=1, action_id=2
step=4/7 | op={"original_op": "CLICK", "value": "", "op": "CLICK"} | target_action=[textbox]  Pickup -> CLICK | pos_candidates=1, action_id=3
step=5/7 | op={"original_op": "CLICK", "value": "", "op": "CLICK"} | target_action=[button]  Sunday, April 9, 2023 -> CLICK | pos_candidates=1, action_id=4
step=6/7 | op={"original_op": "CLICK", "value": "", "op": "CLICK"} | target_action=[button]  Saturday, April

In [62]:
task_df

Unnamed: 0,action_uid,raw_html,cleaned_html,operation,pos_candidates,neg_candidates,website,domain,subdomain,annotation_id,confirmed_task,screenshot,action_reprs,target_action_index,target_action_reprs,action_id
0,6c7a7082-2897-41c7-9688-4b0f3d778cdb,"<!DOCTYPE html PUBLIC ""-//W3C//DTD HTML 4.0 Tr...","<html backend_node_id=""208"">\n <body backend_...","{""original_op"": ""CLICK"", ""value"": """", ""op"": ""C...","[{""tag"": ""li"", ""attributes"": ""{\""backend_node_...","[{""tag"": ""div"", ""attributes"": ""{\""backend_node...",united,Travel,Airlines,401c4e6f-6b0b-47b4-8157-92d7ca468bbc,"rent a car in Brooklyn - Central, NY on from A...",{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,"[[heading] CAR -> CLICK, [combobox] Enter pi...",0,[heading] CAR -> CLICK,0
1,b64c2417-c44e-46c4-bb0b-ff1775e7da29,"<!DOCTYPE html PUBLIC ""-//W3C//DTD HTML 4.0 Tr...","<html backend_node_id=""10021"">\n <body backen...","{""original_op"": ""TYPE"", ""value"": ""Brooklyn Cen...","[{""tag"": ""input"", ""attributes"": ""{\""backend_no...","[{""tag"": ""div"", ""attributes"": ""{\""backend_node...",united,Travel,Airlines,401c4e6f-6b0b-47b4-8157-92d7ca468bbc,"rent a car in Brooklyn - Central, NY on from A...",{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,"[[heading] CAR -> CLICK, [combobox] Enter pi...",1,"[combobox] Enter pick up city, airport name, ...",1
2,dad6690b-9b3e-4395-bd06-9aa065bf4027,"<!DOCTYPE html PUBLIC ""-//W3C//DTD HTML 4.0 Tr...","<html backend_node_id=""20041"">\n <body backen...","{""original_op"": ""CLICK"", ""value"": """", ""op"": ""C...","[{""tag"": ""button"", ""attributes"": ""{\""backend_n...","[{""tag"": ""div"", ""attributes"": ""{\""backend_node...",united,Travel,Airlines,401c4e6f-6b0b-47b4-8157-92d7ca468bbc,"rent a car in Brooklyn - Central, NY on from A...",{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,"[[heading] CAR -> CLICK, [combobox] Enter pi...",2,"[div] Brooklyn - Central (New York), US -> CLICK",2
3,e0fd3f28-3f04-455d-8bde-a480f0ec1b0a,"<!DOCTYPE html PUBLIC ""-//W3C//DTD HTML 4.0 Tr...","<html backend_node_id=""30061"">\n <body backen...","{""original_op"": ""CLICK"", ""value"": """", ""op"": ""C...","[{""tag"": ""input"", ""attributes"": ""{\""backend_no...","[{""tag"": ""div"", ""attributes"": ""{\""backend_node...",united,Travel,Airlines,401c4e6f-6b0b-47b4-8157-92d7ca468bbc,"rent a car in Brooklyn - Central, NY on from A...",{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,"[[heading] CAR -> CLICK, [combobox] Enter pi...",3,[textbox] Pickup -> CLICK,3
4,4762d735-9dc2-4717-ae8b-baab0b3446e5,"<!DOCTYPE html PUBLIC ""-//W3C//DTD HTML 4.0 Tr...","<html backend_node_id=""40453"">\n <body backen...","{""original_op"": ""CLICK"", ""value"": """", ""op"": ""C...","[{""tag"": ""td"", ""attributes"": ""{\""backend_node_...","[{""tag"": ""div"", ""attributes"": ""{\""backend_node...",united,Travel,Airlines,401c4e6f-6b0b-47b4-8157-92d7ca468bbc,"rent a car in Brooklyn - Central, NY on from A...",{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,"[[heading] CAR -> CLICK, [combobox] Enter pi...",4,"[button] Sunday, April 9, 2023 -> CLICK",4
5,660f5a50-1d68-4a30-a58b-25330fcabbe4,"<!DOCTYPE html PUBLIC ""-//W3C//DTD HTML 4.0 Tr...","<html backend_node_id=""51246"">\n <body backen...","{""original_op"": ""CLICK"", ""value"": """", ""op"": ""C...","[{""tag"": ""td"", ""attributes"": ""{\""backend_node_...","[{""tag"": ""div"", ""attributes"": ""{\""backend_node...",united,Travel,Airlines,401c4e6f-6b0b-47b4-8157-92d7ca468bbc,"rent a car in Brooklyn - Central, NY on from A...",{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,"[[heading] CAR -> CLICK, [combobox] Enter pi...",5,"[button] Saturday, April 15, 2023 -> CLICK",5
6,7473d4f5-d147-4c64-912c-620553698746,"<!DOCTYPE html PUBLIC ""-//W3C//DTD HTML 4.0 Tr...","<html backend_node_id=""61640"">\n <body backen...","{""original_op"": ""CLICK"", ""value"": """", ""op"": ""C...","[{""tag"": ""button"", ""attributes"": ""{\""backend_n...","[{""tag"": ""div"", ""attributes"": ""{\""backend_node...",united,Travel,Airlines,401c4e6f-6b0b-47b4-8157-92d7ca468bbc,"rent a car in Brooklyn - Central, NY on from A...",{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,"[[heading] CAR -> CLICK, [combobox] Enter pi...",6,[button] Find cars button. -> CLICK,6


In [63]:
!git clone https://github.com/OSU-NLP-Group/SeeAct.git

Cloning into 'SeeAct'...
remote: Enumerating objects: 1434, done.[K
remote: Counting objects: 100% (360/360), done.[K
remote: Enumerating objects: 1434, done.[K[K
remote: Counting objects: 100% (360/360), done.[K
remote: Compressing objects: 100% (122/122), done.[K
remote: Compressing objects: 100% (122/122), done.[K
remote: Total 1434 (delta 296), reused 238 (delta 238), pack-reused 1074 (from 1)[K
Receiving objects: 100% (1434/1434), 384.05 MiB | 19.57 MiB/s, done.
remote: Total 1434 (delta 296), reused 238 (delta 238), pack-reused 1074 (from 1)[K
Receiving objects: 100% (1434/1434), 384.05 MiB | 19.57 MiB/s, done.
Resolving deltas: 100% (757/757), done.
Resolving deltas: 100% (757/757), done.
