In [1]:
from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
import torch
import os
from datasets import load_dataset

SLURM_PATH = '/home/yandex/MLWG2025/amitr5'
CACHE_DIR = f'{SLURM_PATH}/tmp/hf_cache'  # Changed to /tmp to avoid quota issues

os.makedirs(CACHE_DIR, exist_ok=True)

if SLURM_PATH in os.getcwd():
    os.environ["PIP_PATH"] = f"{SLURM_PATH}/BaryGNN/anaconda3/envs/conf/bin/pip"
    os.environ["TEMP_DIR"] = CACHE_DIR
    os.environ["HF_HOME"] = CACHE_DIR
    os.environ["TRANSFORMERS_CACHE"] = CACHE_DIR
    os.environ["HF_DATASETS_CACHE"] = CACHE_DIR
    os.environ["HF_HUB_CACHE"] = CACHE_DIR
    os.environ["TMPDIR"] = CACHE_DIR
    # os.environ["TOKENIZERS_PARALLELISM"] = "false"

## Mind2Web
Mind2Web is a large-scale dataset for grounding language instructions to web actions.

In [None]:
!git clone https://github.com/OSU-NLP-Group/Mind2Web.git
%cd Mind2Web
!pip install -r requirements.txt
%cd ..

In [2]:
ds = load_dataset("osunlp/Multimodal-Mind2Web", cache_dir=CACHE_DIR)

# Check available splits
print("Dataset splits:", list(ds.keys()))
# Access the train split
train_ds = ds['train']
print(f"Number of samples in train split: {len(train_ds)}")

Resolving data files:   0%|          | 0/27 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/23 [00:00<?, ?it/s]

Dataset splits: ['train', 'test_domain', 'test_task', 'test_website']
Number of samples in train split: 7775


In [3]:
import pandas as pd

# Convert the train split to a pandas DataFrame
df = train_ds.to_pandas()
df['action_id'] = range(len(df))  # Add a default integer ID column
df.head()

Unnamed: 0,action_uid,raw_html,cleaned_html,operation,pos_candidates,neg_candidates,website,domain,subdomain,annotation_id,confirmed_task,screenshot,action_reprs,target_action_index,target_action_reprs,action_id
0,6c7a7082-2897-41c7-9688-4b0f3d778cdb,"<!DOCTYPE html PUBLIC ""-//W3C//DTD HTML 4.0 Tr...","<html backend_node_id=""208"">\n <body backend_...","{""original_op"": ""CLICK"", ""value"": """", ""op"": ""C...","[{""tag"": ""li"", ""attributes"": ""{\""backend_node_...","[{""tag"": ""div"", ""attributes"": ""{\""backend_node...",united,Travel,Airlines,401c4e6f-6b0b-47b4-8157-92d7ca468bbc,"rent a car in Brooklyn - Central, NY on from A...",{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,"[[heading] CAR -> CLICK, [combobox] Enter pi...",0,[heading] CAR -> CLICK,0
1,b64c2417-c44e-46c4-bb0b-ff1775e7da29,"<!DOCTYPE html PUBLIC ""-//W3C//DTD HTML 4.0 Tr...","<html backend_node_id=""10021"">\n <body backen...","{""original_op"": ""TYPE"", ""value"": ""Brooklyn Cen...","[{""tag"": ""input"", ""attributes"": ""{\""backend_no...","[{""tag"": ""div"", ""attributes"": ""{\""backend_node...",united,Travel,Airlines,401c4e6f-6b0b-47b4-8157-92d7ca468bbc,"rent a car in Brooklyn - Central, NY on from A...",{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,"[[heading] CAR -> CLICK, [combobox] Enter pi...",1,"[combobox] Enter pick up city, airport name, ...",1
2,dad6690b-9b3e-4395-bd06-9aa065bf4027,"<!DOCTYPE html PUBLIC ""-//W3C//DTD HTML 4.0 Tr...","<html backend_node_id=""20041"">\n <body backen...","{""original_op"": ""CLICK"", ""value"": """", ""op"": ""C...","[{""tag"": ""button"", ""attributes"": ""{\""backend_n...","[{""tag"": ""div"", ""attributes"": ""{\""backend_node...",united,Travel,Airlines,401c4e6f-6b0b-47b4-8157-92d7ca468bbc,"rent a car in Brooklyn - Central, NY on from A...",{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,"[[heading] CAR -> CLICK, [combobox] Enter pi...",2,"[div] Brooklyn - Central (New York), US -> CLICK",2
3,e0fd3f28-3f04-455d-8bde-a480f0ec1b0a,"<!DOCTYPE html PUBLIC ""-//W3C//DTD HTML 4.0 Tr...","<html backend_node_id=""30061"">\n <body backen...","{""original_op"": ""CLICK"", ""value"": """", ""op"": ""C...","[{""tag"": ""input"", ""attributes"": ""{\""backend_no...","[{""tag"": ""div"", ""attributes"": ""{\""backend_node...",united,Travel,Airlines,401c4e6f-6b0b-47b4-8157-92d7ca468bbc,"rent a car in Brooklyn - Central, NY on from A...",{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,"[[heading] CAR -> CLICK, [combobox] Enter pi...",3,[textbox] Pickup -> CLICK,3
4,4762d735-9dc2-4717-ae8b-baab0b3446e5,"<!DOCTYPE html PUBLIC ""-//W3C//DTD HTML 4.0 Tr...","<html backend_node_id=""40453"">\n <body backen...","{""original_op"": ""CLICK"", ""value"": """", ""op"": ""C...","[{""tag"": ""td"", ""attributes"": ""{\""backend_node_...","[{""tag"": ""div"", ""attributes"": ""{\""backend_node...",united,Travel,Airlines,401c4e6f-6b0b-47b4-8157-92d7ca468bbc,"rent a car in Brooklyn - Central, NY on from A...",{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,"[[heading] CAR -> CLICK, [combobox] Enter pi...",4,"[button] Sunday, April 9, 2023 -> CLICK",4


In [51]:
n_steps = df.action_reprs.apply(lambda x: len(x))
print(f"max steps: {n_steps.max()}, min steps: {n_steps.min()}, mean: {n_steps.mean():.2f}, median: {n_steps.median()}")

max steps: 37, min steps: 2, mean: 10.82, median: 9.0


In [4]:
# Group by annotation_id (this creates a GroupBy object for fast access)
grouped = df.groupby('annotation_id')

# Retrieve all rows for a specific annotation_id
ann_id = train_ds[0]["annotation_id"]
task_df = grouped.get_group(ann_id).sort_values('target_action_index')

print(f"Task: {task_df.iloc[0]['confirmed_task']}")
# Iterate and display (task_df is a DataFrame)
for _, ex in task_df.iterrows():
    print(
        f"step={int(ex['target_action_index']) + 1}/{len(task_df)} | op={ex['operation']} "
        f"| target_action={ex["target_action_reprs"]} | pos_candidates={len(ex['pos_candidates'])}, action_id={ex['action_id']}"
    )
    # display(train_ds[ex["action_id"]]["screenshot"])

Task: rent a car in Brooklyn - Central, NY on from April 9 to April 15.
step=1/7 | op={"original_op": "CLICK", "value": "", "op": "CLICK"} | target_action=[heading]  CAR -> CLICK | pos_candidates=1, action_id=0
step=2/7 | op={"original_op": "TYPE", "value": "Brooklyn Central", "op": "TYPE"} | target_action=[combobox]  Enter pick up city, airport name, or airport code. -> TYPE: Brooklyn Central | pos_candidates=1, action_id=1
step=3/7 | op={"original_op": "CLICK", "value": "", "op": "CLICK"} | target_action=[div]  Brooklyn - Central (New York), US -> CLICK | pos_candidates=1, action_id=2
step=4/7 | op={"original_op": "CLICK", "value": "", "op": "CLICK"} | target_action=[textbox]  Pickup -> CLICK | pos_candidates=1, action_id=3
step=5/7 | op={"original_op": "CLICK", "value": "", "op": "CLICK"} | target_action=[button]  Sunday, April 9, 2023 -> CLICK | pos_candidates=1, action_id=4
step=6/7 | op={"original_op": "CLICK", "value": "", "op": "CLICK"} | target_action=[button]  Saturday, April

In [68]:
task_df.loc[0, "website"]

'united'

In [82]:
!pip install seeact

Collecting seeact
  Downloading seeact-0.2.9.0-py3-none-any.whl.metadata (13 kB)
  Downloading seeact-0.2.9.0-py3-none-any.whl.metadata (13 kB)
Collecting backoff (from seeact)
Collecting backoff (from seeact)
  Downloading backoff-2.2.1-py3-none-any.whl.metadata (14 kB)
  Downloading backoff-2.2.1-py3-none-any.whl.metadata (14 kB)
Collecting playwright (from seeact)
Collecting playwright (from seeact)
  Downloading playwright-1.56.0-py3-none-manylinux1_x86_64.whl.metadata (3.5 kB)
  Downloading playwright-1.56.0-py3-none-manylinux1_x86_64.whl.metadata (3.5 kB)
Collecting toml (from seeact)
Collecting toml (from seeact)
  Downloading toml-0.10.2-py2.py3-none-any.whl.metadata (7.1 kB)
  Downloading toml-0.10.2-py2.py3-none-any.whl.metadata (7.1 kB)
Collecting openai==1.24.0 (from seeact)
Collecting openai==1.24.0 (from seeact)
  Downloading openai-1.24.0-py3-none-any.whl.metadata (21 kB)
  Downloading openai-1.24.0-py3-none-any.whl.metadata (21 kB)
Collecting httpx==0.27.2 (from seeact)

In [24]:
! playwright install

Downloading Chromium 141.0.7390.37 (playwright build v1194)[2m from https://cdn.playwright.dev/dbazure/download/playwright/builds/chromium/1194/chromium-linux.zip[22m
Chromium 141.0.7390.37 (playwright build v1194) downloaded to /a/home/cc/students/math/amitr5/.cache/ms-playwright/chromium-1194
Downloading Chromium Headless Shell 141.0.7390.37 (playwright build v1194)[2m from https://cdn.playwright.dev/dbazure/download/playwright/builds/chromium/1194/chromium-headless-shell-linux.zip[22m
Chromium 141.0.7390.37 (playwright build v1194) downloaded to /a/home/cc/students/math/amitr5/.cache/ms-playwright/chromium-1194
Downloading Chromium Headless Shell 141.0.7390.37 (playwright build v1194)[2m from https://cdn.playwright.dev/dbazure/download/playwright/builds/chromium/1194/chromium-headless-shell-linux.zip[22m
Chromium Headless Shell 141.0.7390.37 (playwright build v1194) downloaded to /a/home/cc/students/math/amitr5/.cache/ms-playwright/chromium_headless_shell-1194
Downloading Fire

In [30]:
import asyncio
import os
import pandas as pd
import json
import sys
from seeact.agent import SeeActAgent
from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
import torch

# Load the Qwen model (adjust device as needed)
model = Qwen3VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen3-VL-8B-Instruct",
    torch_dtype=torch.float16,
    device_map="auto"
)
processor = AutoProcessor.from_pretrained("Qwen/Qwen3-VL-8B-Instruct")

async def custom_predict(agent):
    """
    Custom predict method using the local Qwen model.
    """
    # Assuming agent has methods to get screenshot and build prompt
    screenshot = agent.get_screenshot()  # PIL Image
    text_prompt = agent.build_prompt()  # Text prompt
    
    messages = [
        {"role": "user", "content": [
            {"type": "image", "image": screenshot},
            {"type": "text", "text": text_prompt}
        ]}
    ]
    inputs = processor.apply_chat_template(
        messages, tokenize=True, return_dict=True, return_tensors="pt"
    ).to(model.device)
    
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=512)
    
    prediction_text = processor.decode(outputs[0], skip_special_tokens=True)
    
    # Parse the prediction_text to extract plan, action_type, details
    # This is a placeholder; adjust based on your prompt structure
    prediction = parse_prediction(prediction_text)
    
    return prediction

def parse_prediction(text):
    """
    Parse the model's output into SeeAct's expected format.
    Customize this based on how you structure the prompt.
    """
    # Example parsing (replace with actual logic)
    if "click" in text.lower():
        return {
            'plan': 'Click on an element',
            'action_type': 'click',
            'details': {'element': 'button'}
        }
    elif "type" in text.lower():
        return {
            'plan': 'Type text',
            'action_type': 'type',
            'details': {'text': 'example'}
        }
    else:
        return {
            'plan': 'Unknown action',
            'action_type': 'none',
            'details': {}
        }

async def run_single_task(task_objective, start_url, model_name="gpt-4-turbo"):
    """
    Runs the SeeAct agent for a single, non-interactive task.
    
    Returns a dictionary with the results of the task.
    """
    print(f"\n--- Starting Task ---")
    print(f"Objective: {task_objective}")
    print(f"URL: {start_url}")
    
    task_result = {
        'task': task_objective,
        'url': start_url,
        'status': 'Failed',
        'actions': []
    }
    
    agent = None
    try:
        # Initialize the agent with a supported model (e.g., gpt-4o)
        agent = SeeActAgent(model="gpt-4-turbo", headless=True)  # Use a supported model as base
        
        # Override the predict method with our custom one
        agent.predict = custom_predict.__get__(agent, SeeActAgent)
        
        # Set the task and website
        agent.task = task_objective
        agent.website = start_url
        
        # Now, start() will just open the browser and begin
        await agent.start()
        
        # --- Run the core "See/Act" loop ---
        while not agent.complete_flag:
            print("  - Seeing...")
            prediction = await agent.predict()
            
            # Log the action
            action_info = {
                "plan": prediction.get('plan', 'N/A'),
                "action_type": prediction.get('action_type', 'N/A'),
                "details": prediction.get('details', {})
            }
            task_result['actions'].append(action_info)
            print(f"  - Acting: {action_info['action_type']} (Plan: {action_info['plan']})")

            await agent.execute(prediction)
        
        print(f"--- Task Finished ---")
        task_result['status'] = 'Success'

    except Exception as e:
        print(f"Error during task '{task_objective}': {e}")
        task_result['error'] = str(e)
    
    finally:
        # --- Clean up ---
        # Ensure the agent and its browser are closed
        if agent:
            print("Stopping agent and closing browser.")
            await agent.stop()
            
    return task_result

async def main():
    """
    Main function to load pandas data and run tasks in a batch.
    """
    
    
    data = {
        'task_id': ['task_001'],
        'task_objective': ["rent a car in Brooklyn - Central, NY on from April 9 to April 15."],
        'start_url': ["https://www.united.com/"]
    }
    df = pd.DataFrame(data)
    
    print(f"Loaded {len(df)} tasks to run from DataFrame.")
    
    final_results = []
    
    # --- 3. ITERATE AND RUN ---
    # We run tasks one by one (serially)
    for row in df.itertuples():
        # You must re-create the agent for each task
        result = await run_single_task(
            task_objective=row.task_objective,
            start_url=row.start_url,
            model_name="Qwen/Qwen3-VL-8B-Instruct"
        )
        final_results.append(result)

    # --- 4. SHOW FINAL RESULTS ---
    print("\n\n--- BATCH RUN COMPLETE ---")
    print("Results:")
    print(json.dumps(final_results, indent=2))

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [31]:
# In Jupyter notebooks an event loop is already running.
# Use top-level await instead of asyncio.run to avoid the RuntimeError.
os.environ["OPENAI_API_KEY"] = "YOUR_API_KEY_HERE"
await main()

Loaded 1 tasks to run from DataFrame.

--- Starting Task ---
Objective: rent a car in Brooklyn - Central, NY on from April 9 to April 15.
URL: https://www.united.com/
Initializing model gpt-4-turbo


'NoneType' object has no attribute 'close'
Agent stopped.


Error during task 'rent a car in Brooklyn - Central, NY on from April 9 to April 15.': BrowserType.launch: Target page, context or browser has been closed
Browser logs:

╔════════════════════════════════════════════════════════════════════════════════════════════════╗
║ Looks like you launched a headed browser without having a XServer running.                     ║
║ Set either 'headless: true' or use 'xvfb-run <your-playwright-app>' before running Playwright. ║
║                                                                                                ║
║ <3 Playwright Team                                                                             ║
╚════════════════════════════════════════════════════════════════════════════════════════════════╝
Call log:
  - <launched> pid=67959
  - [pid=67959][err] [67959:67959:1114/194828.075082:ERROR:ui/ozone/platform/x11/ozone_platform_x11.cc:249] Missing X server or $DISPLAY
  - [pid=67959][err] [67959:67959:1114/194828.075129:ERROR:ui/au

In [22]:
# In Jupyter notebooks an event loop is already running.
# Use top-level await instead of asyncio.run to avoid the RuntimeError.
os.environ["OPENAI_API_KEY"] = "YOUR_API_KEY_HERE"
await main()

Loaded 1 tasks to run from DataFrame.

--- Starting Task ---
Objective: rent a car in Brooklyn - Central, NY on from April 9 to April 15.
URL: https://www.united.com/
Initializing model gpt-4o


'NoneType' object has no attribute 'close'
Agent stopped.
Agent stopped.


Error during task 'rent a car in Brooklyn - Central, NY on from April 9 to April 15.': BrowserType.launch: Executable doesn't exist at /a/home/cc/students/math/amitr5/.cache/ms-playwright/chromium-1194/chrome-linux/chrome
╔════════════════════════════════════════════════════════════╗
║ Looks like Playwright was just installed or updated.       ║
║ Please run the following command to download new browsers: ║
║                                                            ║
║     playwright install                                     ║
║                                                            ║
║ <3 Playwright Team                                         ║
╚════════════════════════════════════════════════════════════╝
Stopping agent and closing browser.


--- BATCH RUN COMPLETE ---
Results:
[
  {
    "task": "rent a car in Brooklyn - Central, NY on from April 9 to April 15.",
    "url": "https://www.united.com/",
    "status": "Failed",
    "actions": [],
    "error": "BrowserType.launch