In [1]:
import argparse
import datetime
import json
import logging
import os
import sys

from tqdm import tqdm

import lib_run_single
from desktop_env.desktop_env import DesktopEnv
from gui_agents.s3.agents.agent_s import AgentS3
from gui_agents.s3.agents.grounding import OSWorldACI

from types import SimpleNamespace

In [2]:
# Run configuration, grouped by concern and then flattened into one
# attribute-style namespace (`args`) that the later cells read from.

# environment config
_environment_cfg = dict(
    path_to_vm=None,
    provider_name="docker",  # Docker provider, not VMware
    headless=True,
    action_space="pyautogui",
    observation_type="screenshot",
    screen_width=1920,
    screen_height=1080,
    sleep_after_execution=3.0,
    max_steps=5,  # kept small for debugging
)

# agent config
_agent_cfg = dict(
    max_trajectory_length=3,
    test_config_base_dir="evaluation_examples",
)

# main LM config (Ollama llama3.2:3b)
_main_model_cfg = dict(
    model="llama3.2:3b",
    temperature=0.2,
    model_provider="openai",  # Agent-S uses an OpenAI-style client
    model_url="http://localhost:11434/v1",
    model_api_key="ollama",
    model_temperature=0.2,
)

# grounding model config: same endpoint and key as the main LM,
# but a different model (qwen2.5vl:3b)
_grounding_cfg = dict(
    ground_provider="openai",
    ground_url="http://localhost:11434/v1",
    ground_api_key="ollama",
    ground_model="qwen2.5vl:3b",
    grounding_width=1920,
    grounding_height=1080,
)

# example config
_example_cfg = dict(
    domain="all",
    test_all_meta_path="evaluation_examples/test_all.json",
)

# logging / results
_output_cfg = dict(
    result_dir="./results",
)

args = SimpleNamespace(
    **_environment_cfg,
    **_agent_cfg,
    **_main_model_cfg,
    **_grounding_cfg,
    **_example_cfg,
    **_output_cfg,
)

In [3]:
# Engine settings for the main (generation) model.
# Provider and model name are required on `args`; the URL, API key and
# temperature are read with fallbacks in case they are not set.
engine_params = dict(
    engine_type=args.model_provider,
    model=args.model,
    base_url=getattr(args, "model_url", ""),
    api_key=getattr(args, "model_api_key", ""),
    temperature=getattr(args, "model_temperature", None),
)

# Engine settings for the grounding model: same layout as above, with the
# configured grounding width/height in place of a temperature.
engine_params_for_grounding = dict(
    engine_type=args.ground_provider,
    model=args.ground_model,
    base_url=getattr(args, "ground_url", ""),
    api_key=getattr(args, "ground_api_key", ""),
    grounding_width=args.grounding_width,
    grounding_height=args.grounding_height,
)

In [4]:
# OSWorld environment (Docker provider)
screen_size = (args.screen_width, args.screen_height)

# The accessibility tree is requested only for observation types that use it.
needs_a11y_tree = args.observation_type in ("a11y_tree", "screenshot_a11y_tree", "som")

env = DesktopEnv(
    provider_name=args.provider_name,
    path_to_vm=args.path_to_vm,
    action_space=args.action_space,
    screen_size=screen_size,
    headless=args.headless,
    os_type="Ubuntu",
    require_a11y_tree=needs_a11y_tree,
    enable_proxy=False,  # proxy turned off for now
)

# Both Agent-S components are given the same platform string.
agent_platform = "linux"

# Agent-S grounding agent: receives the environment, both engine configs and
# the screen dimensions.
grounding_agent = OSWorldACI(
    env=env,
    platform=agent_platform,
    engine_params_for_generation=engine_params,
    engine_params_for_grounding=engine_params_for_grounding,
    width=args.screen_width,
    height=args.screen_height,
)

# Agent-S3 main agent, built on the generation engine config and the grounding agent
agent = AgentS3(
    engine_params,
    grounding_agent,
    platform=agent_platform,
)

Progress:   6%|6         | 758M/11.4G [00:06<01:28, 130MiB/s] 


KeyboardInterrupt: 

In [10]:
# Load the benchmark index. It is used below as {domain: [example_id, ...]}.
with open(args.test_all_meta_path, "r", encoding="utf-8") as f:
    test_all_meta = json.load(f)

if not test_all_meta:
    raise ValueError(f"No domains found in {args.test_all_meta_path}")

# Honour the configured domain. "all" means no particular domain was requested,
# so for a first run the first domain listed in the index is used.
requested_domain = getattr(args, "domain", "all")
if requested_domain == "all":
    domain = next(iter(test_all_meta))
elif requested_domain in test_all_meta:
    domain = requested_domain
else:
    raise KeyError(
        f"Domain {requested_domain!r} not found in {args.test_all_meta_path}; "
        f"available domains: {sorted(test_all_meta)}"
    )

# Take the first example ID of the chosen domain, failing clearly if there is none.
example_ids = test_all_meta[domain]
if not example_ids:
    raise ValueError(f"Domain {domain!r} has no examples in {args.test_all_meta_path}")
example_id = example_ids[0]

print("Domain:", domain)
print("Example ID:", example_id)

# Per-example config lives at <base>/examples/<domain>/<example_id>.json;
# each component is joined separately so the platform separator is used throughout.
config_file = os.path.join(
    args.test_config_base_dir, "examples", domain, f"{example_id}.json"
)
print("Config file path:", config_file)

with open(config_file, "r", encoding="utf-8") as f:
    example = json.load(f)

print("Instruction:", example["instruction"])

Domain: chrome
Example ID: bb5e4c0d-f964-439c-97b6-bdb9747de3f4
Config file path: evaluation_examples/examples/chrome/bb5e4c0d-f964-439c-97b6-bdb9747de3f4.json
Instruction: Can you make Bing the main search engine when I look stuff up on the internet?


In [11]:
# Collector handed to run_single_example and printed once the run returns.
scores: list[float] = []

# Results are stored under
# <result_dir>/<action_space>/<observation_type>/<model>/<domain>/<example_id>.
# NOTE(review): the model name is used verbatim as a directory name
# (e.g. "llama3.2:3b" contains ':') — confirm that is fine on the target filesystem.
result_path_parts = (
    args.result_dir,
    args.action_space,
    args.observation_type,
    args.model,
    domain,
    example_id,
)
example_result_dir = os.path.join(*result_path_parts)
os.makedirs(example_result_dir, exist_ok=True)

print("Result dir:", example_result_dir)

# Run the selected example for at most `args.max_steps` steps.
instruction = example["instruction"]
lib_run_single.run_single_example(
    agent,
    env,
    example,
    args.max_steps,
    instruction,
    args,
    example_result_dir,
    scores,
)

print("Scores list:", scores)

Result dir: ./results/pyautogui/screenshot/llama3.2:3b/chrome/bb5e4c0d-f964-439c-97b6-bdb9747de3f4
Response success!
Response success!
RAW GROUNDING MODEL RESPONSE: (100, 100)
Response success!
RAW GROUNDING MODEL RESPONSE: (100, 100)
Response success!
Response success!
Response success!
RAW GROUNDING MODEL RESPONSE: ```json
[
	{"point_2d": [917, 158], "label": "The search button at the top right of the window"}
]
```
Response success!
RAW GROUNDING MODEL RESPONSE: ```json
[
	{"point_2d": [917, 158], "label": "The search button at the top right of the window"}
]
```
Response success!
Response success!
Response success!
RAW GROUNDING MODEL RESPONSE: ```json
[
	{"point_2d": [162, 58], "label": "The link that says 'Bing' in the top search results"}
]
```
Response success!
RAW GROUNDING MODEL RESPONSE: ```json
[
	{"point_2d": [162, 58], "label": "The link that says 'Bing' in the top search results"}
]
```
Response success!
Response success!
Response success!
RAW GROUNDING MODEL RESPONSE: `

Failed to stop recording. Status code: 500
Failed to stop recording. Status code: 400
Failed to stop recording. Status code: 400
Failed to stop recording.


Scores list: [0.0]


In [None]:
# Close the DesktopEnv created earlier in the notebook once the run is finished.
# NOTE(review): presumably this tears down the Docker-backed VM — confirm against desktop_env.
env.close()