# GPT-4V Desktop Explorer

This notebook gives a basic, step-by-step overview of the internals of hooking GPT-4V up to agentdesk.

In [None]:
import json

from examples.gpt4v.instruct import system_prompt, ActionSelection
from examples.gpt4v.oai import chat
from examples.gpt4v.util import visualize_b64_img, clean_llm_json
from agentdesk import Desktop

In [None]:
# Natural-language task for the model; it is interpolated into the user prompt built further below.
task = "Search for types of ducks in France"

In [None]:
# Create a local desktop. Every later cell drives this `desktop` instance.
desktop = Desktop.local()

In [None]:
# Launch the desktop UI
# NOTE(review): background=True presumably keeps the viewer from blocking the notebook kernel — confirm.
desktop.view(background=True)

In [None]:
# Open Google on the desktop in Chrome, giving the search task above a starting page.
desktop.open_url("https://google.com")

In [None]:
# Get the actions a model can take on the desktop as json schema.
# The schema is passed to system_prompt() below; the bare name displays it as the cell output.
actions = desktop.json_schema()
actions

In [None]:
# Running chat history: the system message, user messages and model replies are appended below.
msgs = []

In [None]:
# Desktop metadata; its "screen_size" entry is passed to the system prompt below.
info = desktop.info()
info

In [None]:
# Function calling is not yet supported in GPT-4V, so let's hack it in:
# the available actions and the screen size are handed to system_prompt()
# and the resulting text is sent as the system message.
system_text = system_prompt(actions, info["screen_size"])
msg = {"role": "system", "content": [{"type": "text", "text": system_text}]}
msgs.append(msg)

In [None]:
# Send the conversation so far to the model and record its reply in the history.
# When the cells are run in order, msgs holds only the system message at this point.
response = chat(msgs)
msgs.append(response)
response

In [None]:
# Capture the current screen; the result is embedded in the user message below as a base64 data URL.
b64_img = desktop.take_screenshot()
# Render the screenshot inline so the reader sees what the model will see.
visualize_b64_img(b64_img)

In [None]:
# Current mouse position; both values are interpolated into the user prompt below.
x, y = desktop.mouse_coordinates()
(x, y)

In [None]:
# Build the user turn: a text instruction plus the current screenshot.
prompt_text = f"Current mouse coordinates are ({x}, {y}), and the task to solve is '{task}', please return the appropriate next action"
# The screenshot travels inline as a base64 PNG data URL.
screenshot_url = f"data:image/png;base64,{b64_img}"

msg = {
    "role": "user",
    "content": [
        {"type": "text", "text": prompt_text},
        {"type": "image_url", "image_url": {"url": screenshot_url}},
    ],
}
msgs.append(msg)
msgs

In [None]:
# Ask the model for its next action, given the history including the user message appended above.
response = chat(msgs)
response

In [None]:
# Record the model's reply in the history. Re-running this cell appends the same reply again.
msgs.append(response)

In [None]:
# Parse the chosen action out of the GPT response.
# The reply content is passed through clean_llm_json() and then decoded as JSON.
raw_reply = response["content"]
clean_content = clean_llm_json(raw_reply)
jdict = json.loads(clean_content)
print("response dict: ", jdict)

# Build an ActionSelection from the decoded dict's keys.
selection = ActionSelection(**jdict)
print("action selection: ", selection)

In [None]:
# Presumably the model selects an action named "finished" to signal that the task is complete.
# NOTE(review): this cell only prints; nothing here stops the following cells from running — confirm intended.
if selection.action.name == "finished":
    print("task is finished")

In [None]:
# Resolve the action name chosen by the model to an action on the desktop.
action = desktop.find_action(selection.action.name)
if not action:
    # Check before logging success, so the output never claims that a
    # missing action was "found".
    print("action returned not found: ", selection.action.name)
    raise SystemError("action not found")
print("found action: ", action)

In [None]:
# Execute the selected action on the desktop, unpacking the model-supplied parameters as keyword arguments.
# Note: this rebinds `response`, which previously held the chat reply.
response = desktop.use(action, **selection.action.parameters)

In [None]:
# Take a fresh screenshot to inspect the desktop after the action ran.
b64_img = desktop.take_screenshot()
visualize_b64_img(b64_img)

To see how to do this in an agent loop, check out [agent.py](./agent.py).