In [1]:
from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
import torch
import os
import pandas as pd
from datasets import load_dataset

SLURM_PATH = '/home/yandex/MLWG2025/amitr5'
CACHE_DIR = f'{SLURM_PATH}/tmp/hf_cache'  # Changed to /tmp to avoid quota issues

os.makedirs(CACHE_DIR, exist_ok=True)

if SLURM_PATH in os.getcwd():
    os.environ["PIP_PATH"] = f"{SLURM_PATH}/BaryGNN/anaconda3/envs/conf/bin/pip"
    os.environ["TEMP_DIR"] = CACHE_DIR
    os.environ["HF_HOME"] = CACHE_DIR
    os.environ["TRANSFORMERS_CACHE"] = CACHE_DIR
    os.environ["HF_DATASETS_CACHE"] = CACHE_DIR
    os.environ["HF_HUB_CACHE"] = CACHE_DIR
    os.environ["TMPDIR"] = CACHE_DIR
    # os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
from typing import Dict, Any
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, T5Tokenizer


import sys
sys.path.append('./Mind2Web/src')

%load_ext autoreload
%autoreload 1
%aimport action_prediction.dataloader, data_utils.dom_utils

from action_prediction.dataloader import MultiChoiceDataset, get_data_split
split_file = "test_task"  # or "test_task" or "test_website"
# candidate_results = pickle.load(open(f"{SLURM_PATH}/results/mind2web_{split_file}_candidates.pkl", "rb"))
candidate_results = pd.read_pickle(f"candidates/scores_{split_file}.pkl")
flattened = get_data_split(
    data_dir="osunlp/Multimodal-Mind2Web",
    split_file=split_file,
    candidate_results=candidate_results,
    cache_dir=CACHE_DIR
)

def tensorize_item(item: Dict[str, Any], device: str):
    """
    Convert the model_input dict returned by MultiChoiceDataset.__getitem__
    (lists of ints) into tensors appropriate for model.generate.
    """
    input_ids = torch.LongTensor(item["input_ids"]).unsqueeze(0).to(device)
    attention_mask = torch.LongTensor(item["attention_mask"]).unsqueeze(0).to(device)
    return {"input_ids": input_ids, "attention_mask": attention_mask}

device = "cuda" if torch.cuda.is_available() else "cpu"
model_name = "osunlp/MindAct_ActionPrediction_flan-t5-xl" #"Qwen/Qwen-3.5-VL-Base"

# Load tokenizer and model
# tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False, cache_dir=CACHE_DIR)
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-xl", cache_dir=CACHE_DIR)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, cache_dir=CACHE_DIR)
model = model.to(device)
model.eval()

# Construct MultiChoiceDataset like evaluate.py does.
dataset = MultiChoiceDataset(
        flattened,
        tokenizer,
        neg_ratio=0,
        num_candidates=5,
        max_context_len=512,
        # mode="generation",  # use generation formatting
        mode="multichoice",  # use multichoice formatting
    )


Resolving data files:   0%|          | 0/27 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [24]:
ds = load_dataset("osunlp/Multimodal-Mind2Web", cache_dir=CACHE_DIR)

print("Dataset splits:", list(ds.keys()))
# Access the test splits
test_domain_ds = ds['test_domain']
test_task_ds = ds['test_task']
test_website_ds = ds['test_website']

print(f"Number of samples in test_domain: {len(test_domain_ds)}")
print(f"Number of samples in test_task: {len(test_task_ds)}")
print(f"Number of samples in test_website: {len(test_website_ds)}")
print("Total number of test samples:", len(test_domain_ds) + len(test_task_ds) + len(test_website_ds))

Resolving data files:   0%|          | 0/27 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/23 [00:00<?, ?it/s]

Dataset splits: ['train', 'test_domain', 'test_task', 'test_website']
Number of samples in test_domain: 4060
Number of samples in test_task: 1339
Number of samples in test_website: 1019
Total number of test samples: 6418


In [85]:
import random
import numpy as np
import torch.nn.functional as F
random.seed(42)
np.random.seed(42)


choices_to_token_ids = dataset.choices_token_ids_mapping()
outputs = []
all_scores = []  # Store scores for each sample

# The dataset length equals len(flattened)*10, and __getitem__ expands each original
# sample into multiple training examples. To pick distinct original examples we
# sample every 10th item (0, 10, 20, ...).
num_samples = 10  # number of original examples to generate for
max_new_tokens = 1#50
max_examples = min(num_samples, len(dataset.data))
for i in range(max_examples):
        idx = i * 10 
        if idx >= len(dataset):
            # fallback: sample last available indices if necessary
            idx = min(idx, len(dataset) - 1)
        item = dataset[idx]  # this is a dict with lists: input_ids, attention_mask, labels
        model_input = tensorize_item(item, device)

        with torch.no_grad():
            out = model.generate(
                **model_input,
                eos_token_id=model.config.eos_token_id,
                max_new_tokens=max_new_tokens,
                return_dict_in_generate=True,
                output_scores=True,
            )

        decoded = tokenizer.batch_decode(out["sequences"], skip_special_tokens=True)[0]
        labels_tokens = item.get("labels")
       
        # Calculate choice probabilities
        logits = out["scores"][0][0]
        all_probs = F.softmax(logits, dim=-1)
        probs = all_probs[list(choices_to_token_ids.values())]
        choices_probs = dict(zip(choices_to_token_ids.keys(), probs.cpu().tolist()))

        outputs.append(
            {
                "index": i,
                "dataset_index": idx,
                "annotation_id": dataset.data[i].get("annotation_id"),
                "action_uid": dataset.data[i].get("action_uid"),
                "generated": decoded,
                # include labels/tokenized labels for reference (if available)
                "labels_tokens": item.get("labels"),
                "choices_probs": choices_probs,
                "score":1 - choices_probs.get(decoded, 0)
            }
        )
        print(f"{'-'*20} Sample {i} {'-'*20}")
        print(f"annotation_id={dataset.data[i].get('annotation_id')}, generated={decoded}")
        
        
print(f"Processed {len(outputs)} samples total")
print("-" * 60)



-------------------- Sample 0 --------------------
annotation_id=91695df8-f256-47c9-8c37-06e8d0fc758f, generated=B
-------------------- Sample 1 --------------------
annotation_id=91695df8-f256-47c9-8c37-06e8d0fc758f, generated=D
-------------------- Sample 1 --------------------
annotation_id=91695df8-f256-47c9-8c37-06e8d0fc758f, generated=D
-------------------- Sample 2 --------------------
annotation_id=91695df8-f256-47c9-8c37-06e8d0fc758f, generated=C
-------------------- Sample 2 --------------------
annotation_id=91695df8-f256-47c9-8c37-06e8d0fc758f, generated=C
-------------------- Sample 3 --------------------
annotation_id=91695df8-f256-47c9-8c37-06e8d0fc758f, generated=C
-------------------- Sample 3 --------------------
annotation_id=91695df8-f256-47c9-8c37-06e8d0fc758f, generated=C
-------------------- Sample 4 --------------------
annotation_id=91695df8-f256-47c9-8c37-06e8d0fc758f, generated=F
-------------------- Sample 4 --------------------
annotation_id=91695df8-f256-4

In [None]:
import numpy as np


options = ["A", "B", "C", "D", "E", "F"]



def temp2():
    option_logits = {opt: logits[token_id].item() for opt, token_id in choices_to_token_ids.items()}

# print("\nRaw logits for each option:")
   
    # for opt, logit in option_logits.items():
        # normalized_prob = all_probs[choices_to_token_ids[opt]]
        # marker = " <-- GENERATED" if opt == generated_token.strip() else ""
        # print(f"  {opt}: logit={logit:.4f}, normalized_prob={normalized_prob:.6f}{marker}")

    # Find the option with highest logit
    best_option = max(choices_probs)
    print(f"Best option: {best_option}")

temp2()


Best option: 0.8456717729568481


tensor([1.5075e-01, 8.4567e-01, 1.4490e-03, 1.0129e-03, 2.1158e-04, 4.7099e-04])

In [57]:
def temp():  
    # Let's check what the model would generate if we manually take argmax
    all_probs = F.softmax(logits, dim=-1)
    top5_probs, top5_ids = torch.topk(all_probs, k=5)
    # print(f"\nTop 10 tokens by probability:")
    return top5_ids, top5_probs
    # for prob, token_id in zip(top5_probs, top5_ids):
        # token = tokenizer.decode([token_id.item()])
        # print(f"  {token!r} (ID: {token_id.item()}): {prob.item():.6f} | logit: {logits[token_id].item():.4f} | {token_id}")

    # Check if generated token is in top predictions
    # if generated_token_id in top5_ids:
    #     rank = (top5_ids == generated_token_id).nonzero(as_tuple=True)[0].item() + 1
    #     print(f"\n✓ Generated token '{generated_token.strip()}' is rank #{rank} in logits")
    # else:
    #     print(f"\n⚠️  Generated token '{generated_token.strip()}' is NOT in top 10!")

%timeit temp()

177 μs ± 17.7 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


{'A': 71, 'B': 272, 'C': 205, 'D': 309, 'E': 262, 'F': 377}

In [56]:
%aimport action_prediction.dataloader, data_utils.dom_utils
options = ["A", "B", "C", "D", "E", "F"]
all_probs = F.softmax(logits, dim=-1)
top5_probs, top5_ids = torch.topk(all_probs, k=5)
top5_ids
# x=dataset[11]
# dataset.data[0]
# sum([len(dataset.data[i]["pos_candidates"]) for i in range(len(dataset.data))])
#TODO:
# Figure out action representation formatting for multichoice vs generation modes
# Understand if getting a closed answer (e.g., "B") is enough for full action representation (e.g., Action:CLICK)
# Decide if output should be with \n or not

tensor([272,  71, 205, 309, 377])

In [13]:
dataset.data[0].keys()
x=dataset.data[14]["operation"]
x
# len(dataset.data[0]["action_reprs"])


'{"original_op": "CLICK", "value": "", "op": "CLICK"}'

In [None]:

import argparse
import json
import pathlib
import sys
from typing import Any, Dict

import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Import the repository's dataset utilities
from action_prediction.dataloader import MultiChoiceDataset, get_data_split  # type: ignore





def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model-name", default="google/flan-t5-base")
    parser.add_argument("--data-dir", required=True, help="data dir or dataset script used by get_data_split")
    parser.add_argument("--split-file", required=True, help="path to split json file or list accepted by get_data_split")
    parser.add_argument("--num-samples", type=int, default=10)
    parser.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu")
    parser.add_argument("--out", default="multichoice_generations.json")
    parser.add_argument("--max-new-tokens", type=int, default=50)
    args = parser.parse_args()

    outputs = generate_with_dataset(
        model_name=args.model_name,
        data_dir=args.data_dir,
        split_file=args.split_file,
        num_samples=args.num_samples,
        device=args.device,
        max_new_tokens=args.max_new_tokens,
    )

    with open(args.out, "w") as f:
        json.dump(outputs, f, indent=2)
    print(f"Wrote {len(outputs)} generations to {args.out}")


if __name__ == "__main__":
    main()

In [22]:
import pandas as pd
scores = pd.read_pickle('scores_all_data.pkl')
s = list(scores["scores"].keys())[0]
s

'490dc61c-873d-47b6-9050-369cd18e1253_f68804d6-48de-445b-b201-c63d35b8683c'

In [25]:
# Extract action_uid from the scores key
action_uid = s.split('_')[0]
print(f"Action UID: {action_uid}")

# Search for this action_uid in test_ds
test_df = test_ds.to_pandas()
matching_rows = test_df[test_df['action_uid'] == action_uid]
print(f"\nFound {len(matching_rows)} matching rows in test_ds:")
print(matching_rows)

Action UID: 490dc61c-873d-47b6-9050-369cd18e1253

Found 0 matching rows in test_ds:
Empty DataFrame
Columns: [action_uid, raw_html, cleaned_html, operation, pos_candidates, neg_candidates, website, domain, subdomain, annotation_id, confirmed_task, screenshot, action_reprs, target_action_index, target_action_reprs]
Index: []

Found 0 matching rows in test_ds:
Empty DataFrame
Columns: [action_uid, raw_html, cleaned_html, operation, pos_candidates, neg_candidates, website, domain, subdomain, annotation_id, confirmed_task, screenshot, action_reprs, target_action_index, target_action_reprs]
Index: []


In [None]:
# Extract action_uid from the scores key
action_uid = s.split('_')[1]
print(f"Action UID: {action_uid}")
matching_rows = test_df[test_df['action_uid'] == action_uid]
print(f"\nFound {len(matching_rows)} matching rows in test_ds:")
print(matching_rows)

In [3]:
import pandas as pd

# Convert the train split to a pandas DataFrame
df = train_ds.to_pandas()
df['action_id'] = range(len(df))  # Add a default integer ID column
df.head()

Unnamed: 0,action_uid,raw_html,cleaned_html,operation,pos_candidates,neg_candidates,website,domain,subdomain,annotation_id,confirmed_task,screenshot,action_reprs,target_action_index,target_action_reprs,action_id
0,6c7a7082-2897-41c7-9688-4b0f3d778cdb,"<!DOCTYPE html PUBLIC ""-//W3C//DTD HTML 4.0 Tr...","<html backend_node_id=""208"">\n <body backend_...","{""original_op"": ""CLICK"", ""value"": """", ""op"": ""C...","[{""tag"": ""li"", ""attributes"": ""{\""backend_node_...","[{""tag"": ""div"", ""attributes"": ""{\""backend_node...",united,Travel,Airlines,401c4e6f-6b0b-47b4-8157-92d7ca468bbc,"rent a car in Brooklyn - Central, NY on from A...",{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,"[[heading] CAR -> CLICK, [combobox] Enter pi...",0,[heading] CAR -> CLICK,0
1,b64c2417-c44e-46c4-bb0b-ff1775e7da29,"<!DOCTYPE html PUBLIC ""-//W3C//DTD HTML 4.0 Tr...","<html backend_node_id=""10021"">\n <body backen...","{""original_op"": ""TYPE"", ""value"": ""Brooklyn Cen...","[{""tag"": ""input"", ""attributes"": ""{\""backend_no...","[{""tag"": ""div"", ""attributes"": ""{\""backend_node...",united,Travel,Airlines,401c4e6f-6b0b-47b4-8157-92d7ca468bbc,"rent a car in Brooklyn - Central, NY on from A...",{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,"[[heading] CAR -> CLICK, [combobox] Enter pi...",1,"[combobox] Enter pick up city, airport name, ...",1
2,dad6690b-9b3e-4395-bd06-9aa065bf4027,"<!DOCTYPE html PUBLIC ""-//W3C//DTD HTML 4.0 Tr...","<html backend_node_id=""20041"">\n <body backen...","{""original_op"": ""CLICK"", ""value"": """", ""op"": ""C...","[{""tag"": ""button"", ""attributes"": ""{\""backend_n...","[{""tag"": ""div"", ""attributes"": ""{\""backend_node...",united,Travel,Airlines,401c4e6f-6b0b-47b4-8157-92d7ca468bbc,"rent a car in Brooklyn - Central, NY on from A...",{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,"[[heading] CAR -> CLICK, [combobox] Enter pi...",2,"[div] Brooklyn - Central (New York), US -> CLICK",2
3,e0fd3f28-3f04-455d-8bde-a480f0ec1b0a,"<!DOCTYPE html PUBLIC ""-//W3C//DTD HTML 4.0 Tr...","<html backend_node_id=""30061"">\n <body backen...","{""original_op"": ""CLICK"", ""value"": """", ""op"": ""C...","[{""tag"": ""input"", ""attributes"": ""{\""backend_no...","[{""tag"": ""div"", ""attributes"": ""{\""backend_node...",united,Travel,Airlines,401c4e6f-6b0b-47b4-8157-92d7ca468bbc,"rent a car in Brooklyn - Central, NY on from A...",{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,"[[heading] CAR -> CLICK, [combobox] Enter pi...",3,[textbox] Pickup -> CLICK,3
4,4762d735-9dc2-4717-ae8b-baab0b3446e5,"<!DOCTYPE html PUBLIC ""-//W3C//DTD HTML 4.0 Tr...","<html backend_node_id=""40453"">\n <body backen...","{""original_op"": ""CLICK"", ""value"": """", ""op"": ""C...","[{""tag"": ""td"", ""attributes"": ""{\""backend_node_...","[{""tag"": ""div"", ""attributes"": ""{\""backend_node...",united,Travel,Airlines,401c4e6f-6b0b-47b4-8157-92d7ca468bbc,"rent a car in Brooklyn - Central, NY on from A...",{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,"[[heading] CAR -> CLICK, [combobox] Enter pi...",4,"[button] Sunday, April 9, 2023 -> CLICK",4


In [4]:
# Group by annotation_id (this creates a GroupBy object for fast access)
grouped = df.groupby('annotation_id')

# Retrieve all rows for a specific annotation_id
ann_id = train_ds[0]["annotation_id"]
task_df = grouped.get_group(ann_id).sort_values('target_action_index')

print(f"Task: {task_df.iloc[0]['confirmed_task']}")
# Iterate and display (task_df is a DataFrame)
for _, ex in task_df.iterrows():
    print(
        f"step={int(ex['target_action_index']) + 1}/{len(task_df)} | op={ex['operation']} "
        f"| target_action={ex["target_action_reprs"]} | pos_candidates={len(ex['pos_candidates'])}, action_id={ex['action_id']}"
    )
    # display(train_ds[ex["action_id"]]["screenshot"])

Task: rent a car in Brooklyn - Central, NY on from April 9 to April 15.
step=1/7 | op={"original_op": "CLICK", "value": "", "op": "CLICK"} | target_action=[heading]  CAR -> CLICK | pos_candidates=1, action_id=0
step=2/7 | op={"original_op": "TYPE", "value": "Brooklyn Central", "op": "TYPE"} | target_action=[combobox]  Enter pick up city, airport name, or airport code. -> TYPE: Brooklyn Central | pos_candidates=1, action_id=1
step=3/7 | op={"original_op": "CLICK", "value": "", "op": "CLICK"} | target_action=[div]  Brooklyn - Central (New York), US -> CLICK | pos_candidates=1, action_id=2
step=4/7 | op={"original_op": "CLICK", "value": "", "op": "CLICK"} | target_action=[textbox]  Pickup -> CLICK | pos_candidates=1, action_id=3
step=5/7 | op={"original_op": "CLICK", "value": "", "op": "CLICK"} | target_action=[button]  Sunday, April 9, 2023 -> CLICK | pos_candidates=1, action_id=4
step=6/7 | op={"original_op": "CLICK", "value": "", "op": "CLICK"} | target_action=[button]  Saturday, April

In [12]:
import torch
import sys
import os

# Add Mind2Web to path so we can import from it
sys.path.append('./Mind2Web/src')

from candidate_generation.dataloader import CandidateRankDataset, get_data_split
from candidate_generation.metric import CERerankingEvaluator
from candidate_generation.model import CrossEncoder
from torch.utils.data import DataLoader

eval_data = train_ds
batch_size = 350
max_seq_length = 512

eval_evaluator = CERerankingEvaluator(
    eval_data,
    k=50,
    max_neg=-1,
    batch_size=batch_size,
    name="train",
)

# Use the model path for the CrossEncoder (like in evaluate.py)
model_path = "osunlp/MindAct_CandidateGeneration_deberta-v3-base"
model = CrossEncoder(
    model_path,
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    num_labels=1,
    max_length=max_seq_length,
)

# Evaluate the model
print("Running evaluation...")
eval_evaluator(model, output_path="./output")
print("Evaluation completed!")
print("Results saved to: ./output")

ModuleNotFoundError: No module named 'dataloader'

In [77]:
scores = pd.read_pickle('scores_all_data.pkl')


In [None]:
uid = task_df.iloc[0]["annotation_id"]
a_uid, a_id = list(scores["scores"].keys())[0].split('_')

In [19]:
list(scores["scores"].keys())[0]

NameError: name 'scores' is not defined

In [116]:
df[df['annotation_id'].str.startswith("15486e7c")]

Unnamed: 0,action_uid,raw_html,cleaned_html,operation,pos_candidates,neg_candidates,website,domain,subdomain,annotation_id,confirmed_task,screenshot,action_reprs,target_action_index,target_action_reprs,action_id


In [82]:
from bs4 import BeautifulSoup
html = task_df.iloc[0]["cleaned_html"]
soup = BeautifulSoup(html, "html.parser")

def get_element_html(node_id: str):
    el = soup.find(attrs={"backend_node_id": node_id})
    return str(el) if el is not None else None

# Example: show HTML for the top-1 candidate
best_id = task_df.iloc[0]["pos_candidates"][0]
print(get_element_html(best_id))

None


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Download the MindAct Candidate Generation model
print("Downloading MindAct Candidate Generation model...")
model_name = "osunlp/MindAct_CandidateGeneration_deberta-v3-base"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=CACHE_DIR)

# Load model
candidate_model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    cache_dir=CACHE_DIR,
    torch_dtype=torch.float16,  # Use float16 for efficiency
    device_map="auto"  # Automatically handle device placement
)

print(f"Model {model_name} downloaded and loaded successfully!")
print(f"Model type: {type(candidate_model)}")
print(f"Number of parameters: {sum(p.numel() for p in candidate_model.parameters()):,}")