# Hindsight Experience Replay

This notebook explores an established RL algorithm, Hindsight Experience Replay (HER), as a way to improve how accurately click agents move the mouse onto a target. The core intuition behind HER is that an agent can learn from every action: the result of each movement, even an unsuccessful one, can be retroactively treated as if it had been the goal.

Algorithm:
* The agent generates a set of targets to click based on the current screenshot.
* For each target, the agent retrieves the current mouse coordinates and generates (Δx, Δy) to move the mouse to the target.
* The agent takes another screenshot and checks whether the cursor is a pointer or an arrow, and whether the cursor is over the target.
  * If the cursor is a pointer but is not over the target, the agent generates a description of the current location and uses it to create a new completed target.


In [None]:
from agentdesk import Desktop

In [None]:
# Desktop instance that every later cell screenshots and drives.
# NOTE(review): presumably `gce()` provisions or connects to a desktop on
# Google Compute Engine — confirm against the agentdesk documentation.
desktop = Desktop.gce()

In [None]:
# Show the desktop's details as this cell's output.
# NOTE(review): the exact contents of `info()` are not visible here — confirm.
desktop.info()

# Describe Location

Describe the current location of the mouse

In [None]:
from pydantic import BaseModel, Field
from mllm import Router, RoleThread
from surfninja.img import b64_to_image, image_to_b64, crop_box_around

# Shared model router used by every `router.chat(...)` call in this notebook.
# NOTE(review): presumably configured from environment variables — confirm
# which ones against the mllm documentation.
router = Router.from_env()

class ClickArea(BaseModel):
    # Structured description of whatever the mouse cursor is currently over;
    # describe_location() asks the model to answer in this shape.

    purpose: str = Field(
        description="Purpose of the the element the cursor is over e.g. to login the user",
    )
    description: str = Field(
        description="Description of what the mouse cursor is over",
    )
    location: str = Field(
        description="General location of the mouse cursor relative to the full screen, e.g. top left, bottom right",
    )


In [None]:
def describe_location(desktop: Desktop) -> ClickArea:
    """Describe what is currently under the mouse cursor.

    Takes a screenshot, crops a box around the current mouse coordinates and
    asks the model to describe that spot as a ``ClickArea``.

    Args:
        desktop: Desktop to screenshot and to read the mouse coordinates from.

    Returns:
        The ``ClickArea`` parsed from the model's reply.

    Raises:
        ValueError: If the router response carries no parsed ``ClickArea``.
    """

    thread = RoleThread()
    b64_img = desktop.take_screenshot()
    img = b64_to_image(b64_img)

    coords = desktop.mouse_coordinates()
    cropped = crop_box_around(img, coords[0], coords[1])

    # The example JSON must use exactly the ClickArea fields (purpose,
    # description, location); an example with other keys steers the model
    # towards replies that fail validation.
    thread.post(
        role="user",
        msg=f"""I'm going to provide you with two images. The first is a picture of a desktop UI, 
    the second is a cropped portion of the first image containing just a 100x100 portion focusing on where the mouse cursor is.
    Please describe what the mouse cursor is over as a JSON object conforming to the schema {ClickArea.model_json_schema()}.
    Please return just raw json. For example if you see the mouse above the chromium icon then 
    you would return {{"purpose": "open the chromium web browser", "description": "A blue chromium icon with the text 'chromium' beneath it", "location": "top-right"}}.
    """,
        images=[image_to_b64(img), image_to_b64(cropped)],
    )

    resp = router.chat(thread, expect=ClickArea)

    if not resp.parsed:
        raise ValueError("No click area found")

    return resp.parsed

# Identifying Targets

Identify all possible clickable targets on a screenshot

In [None]:
from typing import List

class Target(BaseModel):
    """A target which the mouse could be moved to"""

    # All three fields are free-form text written by the model.
    description: str = Field(
        description="A long description of the target e.g. A round blue button with the text 'login'",
    )
    location: str = Field(
        description="A general location of the target e.g. top-right, center, bottom-left",
    )
    purpose: str = Field(
        description="A general purpose of the target e.g. login, logout, register",
    )

class Targets(BaseModel):
    # Wrapper object so the model can return every target in one JSON document.
    targets: List[Target] = Field(
        description="A list of targets",
    )

In [None]:
def get_targets(desktop: Desktop) -> Targets:
    """Ask the model to list every interactable target on the current screen.

    Args:
        desktop: Desktop to take the screenshot from.

    Returns:
        The ``Targets`` object parsed from the model's reply.

    Raises:
        ValueError: If the router response carries no parsed ``Targets``.
    """

    thread = RoleThread()
    b64_img = desktop.take_screenshot()
    img = b64_to_image(b64_img)

    thread.post(
        role="user",
        msg=f"""I've provided you with an image of a desktop UI. Please describe all the possible targets that you can interact with.
    Please return a JSON object that conforms to the schema {Targets.model_json_schema()}.
    Please be exhaustive, describing all possibilities on the screenshot.
    Please return just raw json. For example {{"targets": [{{"description": "A green button resembling a user", "location": "top-left", "purpose": "open user settings"}}]}}
    """,
        images=[image_to_b64(img)]
    )
    resp = router.chat(thread, expect=Targets)

    if not resp.parsed:
        # Name the thing that is actually missing so a failure here is not
        # mistaken for one raised by describe_location().
        raise ValueError("No targets found")

    return resp.parsed

# Navigating to Targets
Navigate to a target description

In [None]:
class MoveDirection(BaseModel):
    # Relative mouse offset in pixels, using screen axes: +x is right, +y is down.
    x: int = Field(
        description="Amount to move in the x direction. Positive values move right, negative values move left. 1 is equal to 1 pixel.",
    )
    y: int = Field(
        description="Amount to move in the y direction. Positive values move down, negative values move up. 1 is equal to 1 pixel.",
    )

In [None]:
def get_move_direction(desktop: Desktop, target: Target) -> MoveDirection:
    """Generate the next direction to move the mouse (Δx, Δy).

    Sends the model the full screenshot plus a crop around the cursor, along
    with the target's description and location, and asks for a relative
    pixel offset. Both images are also written to ``./.run`` for debugging.

    Args:
        desktop: Desktop to screenshot and to read the mouse coordinates from.
        target: The target the mouse should be moved towards.

    Returns:
        The ``MoveDirection`` parsed from the model's reply.

    Raises:
        ValueError: If the router response carries no parsed ``MoveDirection``.
    """
    import os

    thread = RoleThread()
    b64_img = desktop.take_screenshot()
    img = b64_to_image(b64_img)

    coords = desktop.mouse_coordinates()
    cropped = crop_box_around(img, coords[0], coords[1])

    # The worked example must agree with the MoveDirection schema, where
    # positive y moves DOWN: moving 3 pixels up is therefore y = -3.
    thread.post(
        role="user",
        msg=f"""I've provided you with two images: a screenshot of a desktop UI, and a cropped 100x100 image of the current mouse location. 
    Your goal is to navigate to '{target.description}' located in '{target.location}'. The screen size is {img.size} and the current coordinates are {coords}. 
    Please tell me which direction to move the mouse to get there. Please return a JSON object which conforms to the schema {MoveDirection.model_json_schema()}.
    Please return raw json. For example, if I want to move 12 pixels to the left, and 3 pixels up, I would return: {{ "x": -12, "y": -3}}. You must move the mouse, 
    either 'x' or 'y' must be non-zero. The very tip of the cursor must be on your desired target, if unsure, move the mouse slightly.
    """,
        images=[image_to_b64(img), image_to_b64(cropped)]
    )

    # Debug snapshots; create the scratch directory so this function also
    # works when it is called before the agent cell has set it up.
    os.makedirs("./.run", exist_ok=True)
    img.save("./.run/screenshot_move.png")
    cropped.save("./.run/cropped_move.png")
    resp = router.chat(thread, expect=MoveDirection)

    if not resp.parsed:
        raise ValueError("No move direction found")

    return resp.parsed

In [None]:
from typing import Tuple
from PIL import Image

def apply_move(desktop: Desktop, direction: MoveDirection) -> Tuple[Image.Image, Image.Image]:
    """Shift the mouse by ``direction`` and capture the resulting screen.

    The offset is added to the position reported by the desktop, the mouse is
    moved to that absolute position, and a fresh screenshot is taken.

    Args:
        desktop: Desktop whose mouse is moved.
        direction: Relative offset to apply to the current mouse position.

    Returns:
        ``(screenshot, crop)`` where ``crop`` is the box cut out of the
        screenshot around the mouse position reported after the move.
    """
    start = desktop.mouse_coordinates()
    print("current_cords: ", start)

    # `direction` is a relative offset; move_mouse is given absolute x/y.
    target_x, target_y = start[0] + direction.x, start[1] + direction.y
    print("new: ", target_x, target_y)

    desktop.move_mouse(x=target_x, y=target_y)

    screenshot = b64_to_image(desktop.take_screenshot())

    # Re-read the position from the desktop rather than reusing the requested one.
    end = desktop.mouse_coordinates()
    crop = crop_box_around(screenshot, end[0], end[1])
    print("new_coords: ", end)

    return screenshot, crop

In [None]:
class CursorType(BaseModel):
    # Name of the cursor shape the model recognised in the cropped image.
    type: str = Field(
        description="Can be 'default', 'text', or 'pointer'",
    )


def det_cursor_type(desktop: Desktop) -> CursorType:
    """Detect the cursor type.

    Crops a small box around the mouse position and sends it to the model
    together with a reference image of named cursor shapes. The crop is also
    written to ``./.run/cursor.png`` for debugging.

    Args:
        desktop: Desktop to screenshot and to read the mouse coordinates from.

    Returns:
        The ``CursorType`` parsed from the model's reply.

    Raises:
        ValueError: If the router response carries no parsed ``CursorType``.
    """
    import os

    thread = RoleThread()
    b64_img = desktop.take_screenshot()
    img = b64_to_image(b64_img)

    coords = desktop.mouse_coordinates()
    cropped = crop_box_around(img, coords[0], coords[1], padding=30)

    # Create the scratch directory so this also works before the agent cell
    # has set it up.
    os.makedirs("./.run", exist_ok=True)
    cropped.save("./.run/cursor.png")

    # Encode the reference sheet inside the `with` so the file handle opened
    # by Image.open is closed on every call instead of leaking.
    with Image.open("./assets/cursor_composite_image.jpg") as composite:
        composite_b64 = image_to_b64(composite)

    # The example says "arrow" because 'pointer' is itself one of the allowed
    # answers and must not be confused with 'default'.
    thread.post(
        role="user",
        msg=f"""I've provided you with two images; first is an image of a mouse cursor and the second is an image 
        displaying the different types of cursors and their names. Please return what type of cursor you see.
        Please return a json object which conforms to the schema {CursorType.model_json_schema()}.
        Please return just raw json. For example if the cursor looks like a standard arrow return {{"type": "default"}}
    """,
        images=[image_to_b64(cropped), composite_b64]
    )
    resp = router.chat(thread, expect=CursorType)

    if not resp.parsed:
        raise ValueError("No cursor type found")

    return resp.parsed


class CheckGoal(BaseModel):
    # The model's verdict on whether the cursor has reached the target.
    done: bool = Field(
        description="Whether the cursor is over the correct location",
    )


def is_finished(desktop: Desktop, target: Target) -> bool:
    """Check if the target has been reached.

    Sends the model the full screenshot plus a crop around the cursor and
    asks whether the cursor is directly over ``target``.

    Args:
        desktop: Desktop to screenshot and to read the mouse coordinates from.
        target: The target the agent is trying to reach.

    Returns:
        The ``done`` flag from the model's parsed ``CheckGoal`` reply.

    Raises:
        ValueError: If the router response carries no parsed ``CheckGoal``.
    """
    thread = RoleThread()
    b64_img = desktop.take_screenshot()
    img = b64_to_image(b64_img)

    coords = desktop.mouse_coordinates()
    cropped = crop_box_around(img, coords[0], coords[1])

    # Spell out both outcomes so the model has a concrete reply for the
    # not-yet-reached case as well.
    thread.post(
        role="user",
        msg=f"""I've provided you with two images: a screenshot of a desktop UI, and a cropped 100x100 image of the current mouse location. 
    Your goal is to navigate to '{target.description}' located in '{target.location}'. The screen size is {img.size} and the current coordinates are {coords}. 
    Please tell me if we have achieved that goal. Please return your response as a JSON object which conforms to the schema {CheckGoal.model_json_schema()}.
    Please return raw json. If the goal is achieved the cursor should be directly over the target and should be a pointer, then return {{"done": true}},
    otherwise return {{"done": false}}
    """,
        images=[image_to_b64(img), image_to_b64(cropped)]
    )
    resp = router.chat(thread, expect=CheckGoal)

    if not resp.parsed:
        raise ValueError("No goal check result found")

    return resp.parsed.done

# Agent

In [None]:
# Ask the model for every interactable target on the current screen.
# NOTE(review): in cell order this runs before the `open_url` cell further
# down, so the targets come from whatever is on screen at that moment —
# confirm this is the intended order.
targets = get_targets(desktop)

In [None]:
# Use the first identified target as the goal for the agent loop; the bare
# `target` expression shows it as the cell output.
target = targets.targets[0]
target

In [None]:
# Point the desktop at the Google home page.
# NOTE(review): presumably this opens the URL in the desktop's browser —
# confirm against the agentdesk documentation.
desktop.open_url("https://google.com")

In [None]:
import logging
import sys
# Send the router's debug logs to stdout so they appear in the cell output.
logger = logging.getLogger("mllm.router")
logger.setLevel(logging.DEBUG)

# Notebook cells get re-run; without this cleanup every run would stack another
# handler on the same logger and each message would be printed multiple times.
# The handler is tagged with a name so earlier copies can be found and removed.
_HANDLER_NAME = "mllm.router.stdout"
for _existing in [h for h in logger.handlers if h.get_name() == _HANDLER_NAME]:
    logger.removeHandler(_existing)

stdout_handler = logging.StreamHandler(sys.stdout)
stdout_handler.set_name(_HANDLER_NAME)
stdout_handler.setLevel(logging.DEBUG)  # Ensuring the handler captures debug logs

# Optionally add a formatter
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
stdout_handler.setFormatter(formatter)

# Add the handler to the logger
logger.addHandler(stdout_handler)

In [None]:
# Upper bound on how many moves the agent may make towards `target`.
max_steps = 10

import shutil
import os

# Start each run from an empty scratch directory. On the very first run the
# directory does not exist yet, which is fine and must not abort the cell.
try:
    shutil.rmtree("./.run")
except FileNotFoundError:
    pass
os.makedirs("./.run", exist_ok=True)

# Targets that were reached: the real goal (if achieved) plus every
# "hindsight" target, i.e. a clickable spot the cursor happened to land on.
completed_targets: List[Target] = []


for step in range(max_steps):
    print("\n---- checking if task is finished...")
    cursor_type = det_cursor_type(desktop)

    print("cursor type: ", cursor_type.type)
    if cursor_type.type != "default":
        if is_finished(desktop, target):
            print("task is done")
            completed_targets.append(target)
            break

        print("task is not finished but cursor is not default")
        area = describe_location(desktop)
        # Record where the cursor actually ended up as a completed target, but
        # keep `target` untouched: rebinding it here would replace the real goal
        # with the spot the cursor is already on for all remaining steps.
        hindsight_target = Target(description=area.description, location=area.location, purpose=area.purpose)
        completed_targets.append(hindsight_target)
        print("area: ", area.model_dump())


    print("\n---- step: ", step)
    direct = get_move_direction(desktop, target)

    print("\n---- move direction: ", direct.model_dump())

    new_screen, new_cursor = apply_move(desktop, direct)
    # Keep one screenshot per step so the trajectory can be inspected afterwards.
    new_screen.save(f"./.run/step_{step}.png")