-
Notifications
You must be signed in to change notification settings - Fork 2.2k
Description
import base64
from openai import OpenAI
# Minimal UI-TARS client: sends one screenshot plus a natural-language
# instruction to an OpenAI-compatible chat endpoint and prints the model's
# predicted GUI action.

# Which backend serves the model: "ollama" (local server) or "hf".
deployment = "ollama"
instruction = "点击去出车按钮"
screenshot_path = "task2.jpeg"

# Validate explicitly instead of using `assert`, which is stripped when Python
# runs with -O and would let an unsupported value fall through to the "hf"
# branch silently.
if deployment not in ("ollama", "hf"):
    raise ValueError(
        f"unsupported deployment {deployment!r}; expected 'ollama' or 'hf'"
    )

if deployment == "ollama":
    client = OpenAI(
        base_url="http://127.0.0.1:11434/v1/",
        api_key="ollama",  # not used
    )
    # the model name created via ollama CLI, you can check it via command: ollama list
    model = "ui-tars:latest"
else:
    # Placeholders: fill in the endpoint URL and API key before using this branch.
    client = OpenAI(base_url="", api_key="")
    model = "tgi"

# NOTE(review): the prompt embeds chat-template markers (<|im_start|> ...) and
# typographic quotes inside a plain user message; the serving backend may apply
# its own chat template on top. Left byte-for-byte as written - confirm against
# the model's reference prompt.
prompt = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nYou are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task. \n\n ## Output Format\n \n Action_Summary: ...\n Action: ...\n \n\n ## Action Space\n click(start_box=‘<|box_start|>(x1,y1)<|box_end|>’)\nlong_press(start_box=‘<|box_start|>(x1,y1)<|box_end|>’, time=‘’)\ntype(content=‘’)\nscroll(direction=‘down or up or right or left’)\nopen_app(app_name=‘’)\nnavigate_back()\nnavigate_home()\nWAIT()\nfinished() # Submit the task regardless of whether it succeeds or fails.\n\n ## Note\n - Use English in Action_Summary part.\n \n\n ## User Instruction\n"

# Read the screenshot and base64-encode it so it can travel inline as a data URL.
with open(screenshot_path, "rb") as image_file:
    encoded_string = base64.b64encode(image_file.read()).decode("utf-8")

response = client.chat.completions.create(
    model=model,
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt + instruction},
                # "image/jpeg" is the registered media type for .jpeg files
                # ("image/jpg" is not a registered type).
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/jpeg;base64,{encoded_string}"},
                },
            ],
        },
    ],
    # Greedy decoding with a fixed seed: without these the backend samples with
    # its default temperature, so the same screenshot and instruction can
    # produce a different action on every run.
    temperature=0.0,
    seed=0,
)
print(response.choices[0].message.content)
每次运行得到的输出都不相同:
python ui_tars.py
信息位于该车辆的左上角,是一个矩形框内部含有黑色文字
Action_Sumary: click(start_box='<|box_start|>(745,265)<|box_end|>')
(agent) mars@marsyaodemini Workspace % python ui_tars.py
_url_0.jpg
Action_Sumary: click(start_box='<|box_start|>(213,465)<|box_end|>')
(agent) mars@marsyaodemini Workspace % python ui_tars.py
(agent) mars@marsyaodemini Workspace % python ui_tars.py
>
Action_Summary: click(start_box='<|box_start|>(246,155)<|box_end|>')
(agent) mars@marsyaodemini Workspace %