## OSWorld

In [None]:
from agential.agents.OSWorldBaseline.agent import OSWorldBaselineAgent

In [8]:
"""
To get accessibility_tree, retrieve obs by running these lines
- obs, reward, done, info = env.step("pyautogui.rightClick()")
- obs["accessibility_tree"]
"""
# Load the linearized accessibility tree that was previously dumped to disk.
with open("accessibility_tree.txt", "r", encoding="utf-8") as file:
    accessibility_tree = file.read()

# Natural-language task handed to the agent.
instruction = "Please help me to find the nearest restaurant."

# The screenshot of the screen is to be provided by the user.
# Read it inside a context manager so the file handle is closed right away
# instead of lingering until garbage collection.
with open("output_image.jpeg", "rb") as image_file:
    obs = {"screenshot": image_file.read()}


In [4]:
# Baseline agent backed by gpt-4o that is configured to work from screenshots.
agent = OSWorldBaselineAgent(model="gpt-4o", observation_type="screenshot")

In [5]:
# Ask the agent to act on the instruction given the current observation.
# The call's result is unpacked into three values; `messages` is reused
# further down to build the payload for the Qwen models.
response, actions, messages = agent.generate(instruction, obs)

In [None]:
# Dump the three values returned by the agent, one per line.
print(
    f"Response: {response}",
    f"Actions: {actions}",
    f"Messages: {messages}",
    sep="\n",
)

In [65]:
import logging
import dashscope
from dashscope.api_entities.dashscope_response import GenerationResponse
from http import HTTPStatus

# Named logger used by the Qwen request loop below for progress and errors.
logger = logging.getLogger("desktopenv.agent")

In [67]:
from agential.agents.OSWorldBaseline.functional import (
    encode_image,
    encoded_img_to_pil_img,
    save_to_tmp_img_file,
    linearize_accessibility_tree,
    tag_screenshot,
    parse_actions_from_string,
    parse_code_from_string,
    parse_code_from_som_string,
    trim_accessibility_tree
)

In [68]:
# Convert the chat `messages` returned by the agent into the layout used for
# the Qwen calls: every content part becomes either {"image": "file://..."}
# or {"text": ...}.
flag = 0  # count of failed request attempts; consumed by the retry loop
model = "qwen-vl-plus"
qwen_messages = []

for message in messages:
    qwen_message = {
        "role": message["role"],
        "content": []
    }
    assert len(message["content"]) in [1, 2], "One text, or one text with one image"
    for part in message["content"]:
        if part['type'] == "image_url":
            # presumably save_to_tmp_img_file returns a local file path for
            # the image; it is turned into a file:// reference here.
            qwen_message['content'].append(
                {"image": "file://" + save_to_tmp_img_file(part['image_url']['url'])})
        elif part['type'] == "text":
            qwen_message['content'].append({"text": part['text']})
        # Parts of any other type are skipped.

    qwen_messages.append(qwen_message)

In [69]:


# Send `qwen_messages` to the Qwen model named in `model`, retrying with a
# progressively smaller prompt whenever an attempt fails.
#
# - At most 21 attempts are made (while `flag` is 0..20).
# - After the first failure the conversation is reduced to its first and last
#   messages; after each later failure the last 500 whitespace-separated
#   words are dropped from every text part of the final message.
# - On exit `response` is either the successful reply, the last failed reply,
#   or the status_code=0 placeholder if the call itself raised.

# Resolve the endpoint once, outside the retry loop: an unsupported model name
# is a configuration error that retrying cannot fix, so it must surface
# immediately instead of being swallowed by the retry handler.
if model in ["qwen-vl-plus", "qwen-vl-max"]:
    call_model = dashscope.MultiModalConversation.call
elif model in ["qwen-turbo", "qwen-plus", "qwen-max", "qwen-max-0428", "qwen-max-0403",
               "qwen-max-0107", "qwen-max-longcontext"]:
    # NOTE(review): qwen_messages carries list-style content parts; confirm
    # the text-only Generation endpoint accepts that shape.
    call_model = dashscope.Generation.call
else:
    raise ValueError("Invalid model: " + model)

while flag <= 20:
    logger.info("Generating content with model: %s", model)

    # Placeholder so `response` is always defined, even if the call raises.
    response = GenerationResponse(status_code=0)

    try:
        response = call_model(
            model=model,
            messages=qwen_messages,
            result_format="message",
            max_length=1500,
            top_p=0.9,
            temperature=0.5,
        )

        if response.status_code == HTTPStatus.OK:
            break

        logger.error(
            "Request id: %s, Status code: %s, error code: %s, error message: %s",
            response.request_id, response.status_code,
            response.code, response.message,
        )
        print(response.request_id)
        raise Exception(f"Failed to call LLM: {response.message}")
    except Exception:
        # Catch Exception rather than everything so Ctrl-C / SystemExit still
        # stop the loop, and record the cause instead of hiding it.
        logger.warning(
            "LLM call failed (attempt %d); shrinking the prompt and retrying",
            flag + 1, exc_info=True,
        )
        if flag == 0:
            # Keep only the first and the last message. With two or fewer
            # messages there is nothing to drop, and slicing would duplicate
            # a lone message.
            if len(qwen_messages) > 2:
                qwen_messages = [qwen_messages[0], qwen_messages[-1]]
        else:
            # Drop the trailing 500 words from each text part of the last message.
            for part in qwen_messages[-1]["content"]:
                if "text" in part:
                    part["text"] = ' '.join(part["text"].split()[:-500])
        flag = flag + 1
else:
    # Loop ended without a successful response.
    logger.error("Giving up on model %s after %d failed attempts", model, flag)

In [70]:
# Show the final response object. A status_code of 0 matches the placeholder
# assigned before each attempt, which most likely means no call returned.
print(response)

{"status_code": 0, "request_id": "", "code": "", "message": "", "output": null, "usage": null}
