In [4]:
import nest_asyncio
nest_asyncio.apply()
# required for running async playwright in a jupyter notebook

In [10]:
import re
import os
import json
import asyncio
import platform
import requests
import numpy as np
import pandas as pd
import datetime as dt

from enum import Enum
from typing import List
from typing import Dict
from typing import Optional
from typing import TypedDict
from operator import itemgetter
from playwright.async_api import Page

from langchain_openai import ChatOpenAI

from langchain.prompts import PromptTemplate
from langchain.prompts import ChatPromptTemplate
from langchain.prompts import MessagesPlaceholder

from langchain_core.messages import BaseMessage
from langchain_core.messages import SystemMessage

from langchain_core.pydantic_v1 import Field
from langchain_core.pydantic_v1 import BaseModel

from langchain_core.runnables import RunnableParallel
from langchain_core.runnables import RunnablePassthrough
from langchain_core.runnables.graph import CurveStyle
from langchain_core.runnables.graph import NodeColors
from langchain_core.runnables.graph import MermaidDrawMethod

from langchain_core.output_parsers import JsonOutputParser


In [9]:
class BoundingBox(TypedDict):
    """Page coordinates of one annotated element.

    The click, type and scroll tools read ``x`` and ``y`` from this mapping
    and pass them straight to the Playwright mouse API.
    """
    # NOTE(review): presumably the point inside the element to aim the mouse at
    # (e.g. its centre), in viewport pixels — confirm against the annotation script.
    x: float
    y: float


class Prediction(TypedDict):
    """The agent's decision for the next step: an action plus its arguments."""
    # Name of the action to perform — presumably one of the tool names; verify
    # against whatever parses the model output.
    action: str
    # Positional string arguments for the action; the tools treat None (or a
    # wrong argument count) as a failed call.
    args: Optional[List[str]]


class AgentState(TypedDict):
    """State passed to every tool function of the browsing agent.

    Each tool receives this mapping, reads what it needs (``page``,
    ``prediction``, ``bboxes``) and returns a string describing the outcome.
    """
    page: Page # The playwright web page lets us interact with the web environment
    input: str # The user's request
    img: str # Base64-encoded screenshot
    bboxes: List[BoundingBox] # Bounding boxes produced by the browser annotation function
    prediction: Prediction # The agent's output (action + args)
    scratchpad: List[BaseMessage] # List of messages to extract additional information from
    observation: str # The most recent response from a tool

## Tools
The agent has six simple tools:
1. Click (at labeled box)
2. Type
3. Scroll
4. Wait
5. Go back
6. Go to search engine (Google)

In [None]:
async def click(state: AgentState) -> str:
    """Click the page at the coordinates of a labelled bounding box.

    Expects ``state['prediction']['args']`` to hold exactly one item: the
    numerical label of the box to click.

    Args:
        state: Agent state providing ``page``, ``prediction`` and ``bboxes``.

    Returns:
        ``'Clicked <label>'`` on success, otherwise a message starting with
        ``'Failed to click'``. Malformed or out-of-range labels are reported
        this way instead of raising, so the agent can read the failure as an
        observation.
    """
    page = state['page']
    click_args = state['prediction']['args']
    failure = f'Failed to click bounding box labeled as number {click_args}'
    if click_args is None or len(click_args) != 1:
        return failure

    try:
        bbox_id = int(click_args[0])
    except (TypeError, ValueError):
        # The model produced something that is not a numerical label.
        return failure

    bboxes = state['bboxes']
    # Labels are treated as 1-indexed (assumes the annotation script numbers
    # boxes from 1 — TODO confirm). Rejecting values below 1 matters: without
    # it, label 0 would become index -1 and silently click the last box.
    if not 1 <= bbox_id <= len(bboxes):
        return failure

    bbox = bboxes[bbox_id - 1]
    await page.mouse.click(bbox['x'], bbox['y'])

    # in the paper they automatically parse any downloaded PDFs
    # we could add something similar here as well and generally improve response format
    return f'Clicked {bbox_id}'

async def type_text(state: AgentState) -> str:
    """Replace the content of a labelled element with text and press Enter.

    Expects ``state['prediction']['args']`` to hold exactly two items: the
    numerical label of the target box and the text to enter. The element is
    clicked to focus it, its current content is selected and deleted, the
    text is typed, and Enter is pressed.

    Args:
        state: Agent state providing ``page``, ``prediction`` and ``bboxes``.

    Returns:
        ``'Typed <text> and submitted'`` on success, otherwise a message
        starting with ``'Failed to type'``. Malformed or out-of-range labels
        are reported this way instead of raising.
    """
    page = state['page']
    type_args = state['prediction']['args']
    failure = f'Failed to type in element from bounding box labeled as number {type_args}'
    if type_args is None or len(type_args) != 2:
        return failure

    label, text_content = type_args
    try:
        bbox_id = int(label)
    except (TypeError, ValueError):
        return failure

    bboxes = state['bboxes']
    # Labels are treated as 1-indexed (assumes the annotation script numbers
    # boxes from 1 — TODO confirm); values below 1 would otherwise wrap around
    # via negative indexing and target the wrong element.
    if not 1 <= bbox_id <= len(bboxes):
        return failure

    bbox = bboxes[bbox_id - 1]
    await page.mouse.click(bbox['x'], bbox['y'])

    # The select-all shortcut uses the Meta (Command) key on macOS.
    select_all = 'Meta+A' if platform.system() == 'Darwin' else 'Control+A'
    await page.keyboard.press(select_all)
    await page.keyboard.press('Backspace')
    # keyboard.press() expects a single key name or shortcut; arbitrary text
    # has to be sent with keyboard.type().
    await page.keyboard.type(text_content)
    await page.keyboard.press('Enter')
    return f'Typed {text_content} and submitted'

async def scroll(state: AgentState) -> str:
    """Scroll the whole window or a labelled element up or down.

    Expects ``state['prediction']['args']`` to hold exactly two items: the
    target (``'WINDOW'``, case-insensitive, or a numerical box label) and the
    direction. A direction of ``'up'`` (case-insensitive) scrolls up; any
    other value scrolls down.

    Args:
        state: Agent state providing ``page``, ``prediction`` and ``bboxes``.

    Returns:
        ``'Scrolled <direction> in window'`` or
        ``'Scrolled <direction> in element'`` on success, otherwise
        ``'Failed to scroll due to incorrect arguments'``. Malformed or
        out-of-range labels are reported this way instead of raising.
    """
    page = state['page']
    scroll_args = state['prediction']['args']
    failure = 'Failed to scroll due to incorrect arguments'
    if scroll_args is None or len(scroll_args) != 2:
        return failure

    target, direction = scroll_args
    scrolling_up = direction.lower() == 'up'

    if target.upper() == 'WINDOW':
        # not sure of the best value for this
        # should have feedback of some sort
        scroll_amount = 500
        scroll_direction = -scroll_amount if scrolling_up else scroll_amount
        await page.evaluate(f'window.scrollBy(0, {scroll_direction})')
        return f'Scrolled {direction} in window'

    # scroll within a specific element
    try:
        target_id = int(target)
    except (TypeError, ValueError):
        return failure

    bboxes = state['bboxes']
    # Labels are treated as 1-indexed (assumes the annotation script numbers
    # boxes from 1 — TODO confirm); values below 1 would otherwise wrap around
    # via negative indexing and scroll the wrong element.
    if not 1 <= target_id <= len(bboxes):
        return failure

    scroll_amount = 200
    scroll_direction = -scroll_amount if scrolling_up else scroll_amount
    # Index with the parsed integer label, not the raw string argument.
    bbox = bboxes[target_id - 1]

    # Hover over the element first so the wheel event is delivered to it.
    await page.mouse.move(bbox['x'], bbox['y'])
    await page.mouse.wheel(0, scroll_direction)
    return f'Scrolled {direction} in element'

async def wait(state: AgentState) -> str:
    """Pause for a fixed five seconds without blocking the event loop.

    Args:
        state: Agent state; not read by this tool.

    Returns:
        A message stating how long the tool waited.
    """
    pause_seconds = 5
    await asyncio.sleep(pause_seconds)
    return 'Waited for {}s'.format(pause_seconds)

async def go_back(state: AgentState) -> str:
    """Navigate the browser page one step back in its history.

    Args:
        state: Agent state providing ``page``.

    Returns:
        A message that includes the page URL read after navigating back.
    """
    browser_page = state['page']
    await browser_page.go_back()
    landed_on = browser_page.url
    return 'Navigated back a page to {}.'.format(landed_on)

async def to_google(state: AgentState) -> str:
    """Send the browser page to the Google home page.

    Args:
        state: Agent state providing ``page``.

    Returns:
        A fixed confirmation message.
    """
    await state['page'].goto('https://www.google.com/')
    return 'Navigated to google.com'