In [17]:
import getpass
import os


def _set_env(var: str):
    if not os.environ.get(var):
        os.environ[var] = getpass.getpass(f"{var}: ")


_set_env("ANTHROPIC_API_KEY")

In [18]:
from typing import Annotated, Union

from langchain_anthropic import ChatAnthropic
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain_core.messages import BaseMessage, SystemMessage, HumanMessage, AnyMessage
from typing_extensions import TypedDict

from langgraph.graph import StateGraph
from langgraph.graph.message import add_messages
from langgraph.prebuilt import ToolNode, tools_condition


class State(TypedDict):
    messages: Annotated[list, add_messages]


graph_builder = StateGraph(State)


tool = TavilySearchResults(max_results=2)
tools = [tool]
llm = ChatAnthropic(model="claude-3-haiku-20240307")
llm_with_tools = llm.bind_tools(tools)


def chatbot(state: State):
    return {"messages": [llm_with_tools.invoke(state["messages"])]}


graph_builder.add_node("chatbot", chatbot)

tool_node = ToolNode(tools=[tool])
graph_builder.add_node("tools", tool_node)

graph_builder.add_conditional_edges(
    "chatbot",
    tools_condition,
)
# Any time a tool is called, we return to the chatbot to decide the next step
graph_builder.add_edge("tools", "chatbot")
graph_builder.set_entry_point("chatbot")

In [37]:
from langgraph.checkpoint.sqlite import SqliteSaver

memory = SqliteSaver.from_conn_string(":memory:")

In [38]:
graph = graph_builder.compile(checkpointer=memory, interrupt_before=["tools"])

In [39]:
config = {"configurable": {"thread_id": "1"}}

In [40]:
user_input = "What is LangGraph?"

# The config is the **second positional argument** to stream() or invoke()!
events = graph.stream(
    {"messages": [("user", user_input)]}, config, stream_mode="values"
)
for event in events:
    event["messages"][-1].pretty_print()


What is LangGraph?

[{'text': 'Let me perform a search to find information about LangGraph:', 'type': 'text'}, {'id': 'toolu_01Dzfqn9hg7rhXC598bsM7DL', 'input': {'query': 'LangGraph'}, 'name': 'tavily_search_results_json', 'type': 'tool_use'}]
Tool Calls:
  tavily_search_results_json (toolu_01Dzfqn9hg7rhXC598bsM7DL)
 Call ID: toolu_01Dzfqn9hg7rhXC598bsM7DL
  Args:
    query: LangGraph


In [41]:
snapshot = graph.get_state(config)
snapshot

StateSnapshot(values={'messages': [HumanMessage(content='What is LangGraph?', id='08b0a807-26ef-4a18-9921-a251a0e02698'), AIMessage(content=[{'text': 'Let me perform a search to find information about LangGraph:', 'type': 'text'}, {'id': 'toolu_01Dzfqn9hg7rhXC598bsM7DL', 'input': {'query': 'LangGraph'}, 'name': 'tavily_search_results_json', 'type': 'tool_use'}], response_metadata={'id': 'msg_016wn5TFQMLbg3vVarHGTEwC', 'model': 'claude-3-haiku-20240307', 'stop_reason': 'tool_use', 'stop_sequence': None, 'usage': {'input_tokens': 373, 'output_tokens': 74}}, id='run-3e5d2b4a-850d-4826-a012-cd7d807fdb2c-0', tool_calls=[{'name': 'tavily_search_results_json', 'args': {'query': 'LangGraph'}, 'id': 'toolu_01Dzfqn9hg7rhXC598bsM7DL'}], usage_metadata={'input_tokens': 373, 'output_tokens': 74, 'total_tokens': 447})]}, next=('tools',), config={'configurable': {'thread_id': '1', 'thread_ts': '1ef26971-e2ec-6a30-8001-305fe3b2ea04'}}, metadata={'source': 'loop', 'step': 1, 'writes': {'chatbot': {'mes

In [42]:
snapshot.next

('tools',)

In [43]:
existing_ai_message = snapshot.values["messages"][-1]
existing_ai_message.pretty_print()


[{'text': 'Let me perform a search to find information about LangGraph:', 'type': 'text'}, {'id': 'toolu_01Dzfqn9hg7rhXC598bsM7DL', 'input': {'query': 'LangGraph'}, 'name': 'tavily_search_results_json', 'type': 'tool_use'}]
Tool Calls:
  tavily_search_results_json (toolu_01Dzfqn9hg7rhXC598bsM7DL)
 Call ID: toolu_01Dzfqn9hg7rhXC598bsM7DL
  Args:
    query: LangGraph


In [48]:
# format a new ai message to replace the search query 
print("The ID of existing_ai_message is: ", existing_ai_message.id)
new_ai_message = existing_ai_message.copy()
print("The ID of new_ai_message (BEFORE REPLACEMENT) is: ", new_ai_message.id)
new_ai_message.tool_calls[0]['args']['query'] = "What's the weather"

graph.update_state(config, {"messages": [new_ai_message]})
print("The ID of new_ai_message (AFTER REPLACEMENT) is: ", new_ai_message.id)

The ID of existing_ai_message is:  run-3e5d2b4a-850d-4826-a012-cd7d807fdb2c-0
The ID of new_ai_message (BEFORE REPLACEMENT) is:  run-3e5d2b4a-850d-4826-a012-cd7d807fdb2c-0
The ID of new_ai_message (AFTER REPLACEMENT) is:  run-3e5d2b4a-850d-4826-a012-cd7d807fdb2c-0


In [49]:
snapshot = graph.get_state(config)
snapshot.next

('tools',)

In [50]:
events = graph.stream(None, config, stream_mode="values")
for event in events:
    if "messages" in event:
        event["messages"][-1].pretty_print()

Name: tavily_search_results_json

[{"url": "https://www.weatherapi.com/", "content": "{'location': {'name': 'Current', 'region': 'Harbour Island', 'country': 'Bahamas', 'lat': 25.43, 'lon': -76.78, 'tz_id': 'America/Nassau', 'localtime_epoch': 1717961974, 'localtime': '2024-06-09 15:39'}, 'current': {'last_updated_epoch': 1717961400, 'last_updated': '2024-06-09 15:30', 'temp_c': 28.0, 'temp_f': 82.4, 'is_day': 1, 'condition': {'text': 'Patchy rain nearby', 'icon': '//cdn.weatherapi.com/weather/64x64/day/176.png', 'code': 1063}, 'wind_mph': 11.4, 'wind_kph': 18.4, 'wind_degree': 199, 'wind_dir': 'SSW', 'pressure_mb': 1013.0, 'pressure_in': 29.92, 'precip_mm': 0.51, 'precip_in': 0.02, 'humidity': 73, 'cloud': 88, 'feelslike_c': 31.1, 'feelslike_f': 88.0, 'windchill_c': 28.0, 'windchill_f': 82.4, 'heatindex_c': 31.1, 'heatindex_f': 88.0, 'dewpoint_c': 22.6, 'dewpoint_f': 72.7, 'vis_km': 9.0, 'vis_miles': 5.0, 'uv': 6.0, 'gust_mph': 15.7, 'gust_kph': 25.2}}"}, {"url": "https://origin-east-

In [33]:
from langchain_core.messages import AIMessage, ToolMessage

answer = (
    "LangGraph is a library for building stateful, multi-actor applications with LLMs."
)
new_messages = [
    # The LLM API expects some ToolMessage to match its tool call. We'll satisfy that here.
    ToolMessage(content=answer, tool_call_id=existing_ai_message.tool_calls[0]["id"]),
    # And then directly "put words in the LLM's mouth" by populating its response.
    AIMessage(content=answer),
]

new_messages[-1].pretty_print()

"""
The update_state function operates as if it were one of the nodes in your graph!
By default, the update operation uses the node that was last executed, but you can manually specify it below.
Basically, update_state adds messages. 
"""

graph.update_state(
    # Which state to update
    config,
    # The updated values to provide. The messages in our `State` are "append-only", meaning this will be appended
    # to the existing state. We will review how to update existing messages in the next section!
    {"messages": new_messages},
)

print("\n\nLast 2 messages;")
print(graph.get_state(config).values["messages"][-2:])


LangGraph is a library for building stateful, multi-actor applications with LLMs.


Last 2 messages;
[ToolMessage(content='LangGraph is a library for building stateful, multi-actor applications with LLMs.', id='cbd2073f-c2ae-4032-9ff7-9f2ad05d97cd', tool_call_id='toolu_01S4YSyq33nUcGJ8YoBrTmGP'), AIMessage(content='LangGraph is a library for building stateful, multi-actor applications with LLMs.', id='2d30cf20-058b-4a25-8dd0-26fb9db78bc9')]


In [35]:
snapshot = graph.get_state(config)
len(snapshot.values["messages"])

4

In [16]:
class UserExitError(Exception):
    def __init__(self, message):
        self.message = message


class AgentState(TypedDict):
    messages: Annotated[list[AnyMessage], add_messages] 


class AgentM:
    def __init__(self, model, tools, system="", tool_node=None):
        self.system = system
        self.model = model.bind_tools(tools)
        self.tools = {t.name: t for t in tools}

        # set up memory for multi-run interaction (in memory)
        # every time we initialize the agent, we start with a clean memory
        memory = SqliteSaver.from_conn_string(":memory:")

        graph = StateGraph(AgentState) # BotState here is a schema for the state of the agent
        graph.add_node("llm", self.call_llm) # 'llm' here is just a node name 
        graph.add_node("tools", tool_node) # 'tools' here is just a node name, could either be a function or a runnable
        graph.add_edge("tools", "llm") # any time a tool is called, we return to the LLM
        graph.set_entry_point("llm")
        graph.add_conditional_edges(
            "llm",
            # self.tool_calls_requested, # even though in the loop, there is no place for the user to input
            # {True: "tools", False: END}
            tools_condition,
        )
        self.graph = graph.compile(checkpointer=memory)
    
    def call_llm(self, state: AgentState):
        messages = state['messages']
        if self.system:
            messages = [SystemMessage(content=self.system)] + messages
        response = self.model.invoke(messages) 
        return {"messages": [response]}
    
    def tool_calls_requested(self, state: AgentState):
        last_message = state['messages'][-1]
        return len(last_message.tool_calls) > 0
    

### Implement Custom Tool


In [10]:
from langchain.pydantic_v1 import BaseModel, Field
from langchain.tools import BaseTool, StructuredTool, tool
from typing import List, Dict, Any, Optional, Annotated, Union
from langchain_core.messages import AnyMessage, AIMessage, ToolMessage
from langgraph.graph.message import add_messages

In [11]:
class AgentState:
    messages: Annotated[list[AnyMessage], add_messages]

In [12]:
@tool
async def click(state: AgentState):
    """Click action in the browser - Click [Numerical_Label]"""
    page = state["page"]
    click_args = state["prediction"]["args"]
    if click_args is None or len(click_args) != 1:
        return f"Failed to click bounding box labeled as number {click_args}"
    bbox_id = click_args[0]
    bbox_id = int(bbox_id)
    try:
        bbox = state["bboxes"][bbox_id]
    except Exception:
        return f"Error: no bbox for : {bbox_id}"
    x, y = bbox["x"], bbox["y"]
    await page.mouse.click(x, y)
    # TODO: In the paper, they automatically parse any downloaded PDFs
    # We could add something similar here as well and generally
    # improve response format.
    return f"Clicked {bbox_id}"

ValueError: Value not declarable with JSON Schema, field: name='state' type=AgentState required=True

In [13]:
@tool
def search(query: str) -> str:
    """Look up things online."""
    return "LangChain"

In [14]:
print(search.name)
print(search.description)
print(search.args)

search
Look up things online.
{'query': {'title': 'Query', 'type': 'string'}}


### Understand tools and functions


In [90]:
from langchain.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain.schema.output_parser import StrOutputParser

In [95]:
prompt = ChatPromptTemplate.from_template("Tell me a joke about {topic}")
model = ChatOpenAI()
output_parser = StrOutputParser()

chain = prompt | model | output_parser

In [98]:
for s in chain.stream({"topic": "dick"}):
    print(s, end="", flush=True)

Why did the dick go to school? To get some more ed-ju-cum-tion!

In [103]:
print("a")
async for s in chain.astream({"topic": "bears"}):
    print(s, end="", flush=True)
print("b")

a
Why don't bears like fast food? 

Because they can't catch it!b


In [None]:
import asyncio

async def main():
    print("a")
    async for s in chain.astream({"topic": "bears"}):
        print(s, end="", flush=True)
    print("b")

# Run the main coroutine
asyncio.run(main())


# co-routine object might not be executed 
# co-routine object needs await to ectually execute it


In [110]:
import asyncio
import nest_asyncio

# This is just required for running async playwright in a Jupyter notebook
nest_asyncio.apply()

import asyncio

async def async_task_1(): # another coroutine function 
    print("Task 1: Start")
    await asyncio.sleep(2)
    print("Task 1: End")

async def async_task_2(): # a coroutine function
    print("Task 2: Start")
    await asyncio.sleep(1)
    print("Task 2: End")

async def main():
    print("Main start")
    task1 = asyncio.create_task(async_task_1())
    task2 = asyncio.create_task(async_task_2())
    await task1
    await task2
    print("Main end")

asyncio.run(main())

Main start
Task 1: Start
Task 2: Start
Task 2: End
Task 1: End
Main end


In [18]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import DocArrayInMemorySearch

In [21]:
vectorstore = DocArrayInMemorySearch.from_texts(
    ['my dick is 5 feet long', 'my activity lasts more than 2 hours'],
    embedding = OpenAIEmbeddings
)

ImportError: Could not import docarray python package. Please install it with `pip install "langchain[docarray]"`.

In [30]:
from langchain_core.utils.function_calling import convert_to_openai_function
from langchain.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain.schema.output_parser import StrOutputParser

# Tagging is like a classification job 
# Extracting is get structured info from existing unstructured data 

In [61]:
# this should be an extraction work

# Tagging
class LLMOutput(BaseModel):
    "The output from the LLM to extract. If not explicitly provided do not guess, extract partial info"
    action: str = Field(description="The action to take in the web browser, including the element id should be operated on.")
    thought: str = Field(description="The reasoning behind making the action")

# Extracting
class StructuredAction(BaseModel):
    "Structured infomation about the action to take"
    action: str = Field(description="the exact action to take and should only from 'Click', 'Type', 'Google', 'Scroll'")
    element_id: Optional[int] = Field(description="the element id")
    content: Optional[str] = Field(description="the content needed for the action such as what to type or what to google")


In [62]:
functions = [convert_to_openai_function(LLMOutput), convert_to_openai_function(StructuredAction)]

In [69]:
llm = ChatOpenAI()
prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a web voyager than can make good planning to complete a task on the webpage."),
    ("user", "{observation}"),
])
llm_with_extraction_function = llm.bind(functions=functions)

# the functions are literally made by pydantic classes

In [72]:
model_with_tagging = llm.bind(functions=[convert_to_openai_function(LLMOutput)], function_call="LLMOutput")
model_with_extraction = llm.bind(functions=[convert_to_openai_function(StructuredAction)], function_call="StructuredAction")

In [73]:
chain = prompt | model_with_tagging | model_with_extraction

In [74]:
chain.invoke({"observation": "there are sign-up and sign-in buttons on the page, I need to log in to my account."})

BadRequestError: Error code: 400 - {'error': {'message': "Invalid value: 'LLMOutput'. Supported values are: 'none' and 'auto'.", 'type': 'invalid_request_error', 'param': 'function_call', 'code': 'invalid_value'}}

In [65]:
from langchain_core.output_parsers.openai_functions import JsonOutputFunctionsParser

In [66]:
chain_json = chain | JsonOutputFunctionsParser()

In [67]:
# I tried to do planning and extraction at the same time 
# seems like we need to have action and thought as the output of the LLM using tagging and then turn them into structured info. 

chain_json.invoke({"observation": "there are sign-up and sign-in buttons on the page, I need to log in to my account."})

{'action': 'click',
 'element': 2,
 'thought': 'To log in to your account, click on the sign-in button on the page.'}

In [None]:
# how to use multiple functions in the same chain?

### To decide what functions to call 
#### The langchain concept of a tool -> and then make the tool an openai function

In [86]:
def double_print(func):
    print("marker 1")
    def wrapper(*args, **kwargs):
        # func(*args, **kwargs)
        # func(*args, **kwargs)
        for arg in kwargs:
            func(arg)
            func(arg)
    return wrapper

def tool1(to_print: str):
    print(to_print)

@double_print
def tool2(to_print: str):
    print(to_print)

marker 1


In [87]:
tool2("hi")

In [3]:
from langchain_core.tools import tool
from run import AgentState


@tool
def multiply(a: int, b: int) -> int:
    """Multiply two numbers."""
    return a * b


# Let's inspect some of the attributes associated with the tool.
print(multiply.name)
print(multiply.description)
print(multiply.args)

multiply
Multiply two numbers.
{'a': {'title': 'A', 'type': 'integer'}, 'b': {'title': 'B', 'type': 'integer'}}


In [61]:
from langchain.pydantic_v1 import BaseModel, Field
from run import BoundingBox, Prediction
from playwright.async_api import Page
from typing import List, Annotated
from langchain_core.messages import AnyMessage, SystemMessage, HumanMessage
from langgraph.graph.message import add_messages

from pydantic import BaseModel as PydanticBaseModel

class BaseModel(PydanticBaseModel):
    class Config:
        arbitrary_types_allowed = True



class StateAsInput(BaseModel):
    messages: Annotated[list[AnyMessage], add_messages] = Field(description="The messages exchanged between the user and the agent.")
    supervisor_mode: bool = Field(description="Whether the agent is in supervisor mode.")
    query_human: bool = Field(description="Whether the agent should query the human.")
    page: Page = Field(description="The page where the agent should perform actions.")
    input: str = Field(description="The input to the agent.")
    img: str = Field(description="The screenshot of the page.")
    bounding_boxes: List[BoundingBox] = Field(description="The bounding boxes on the page.")
    prediction: Prediction = Field(description="The prediction of the agent.")
    observation: Annotated[str, add_messages] = Field(description="The observation of the agent.")


In [50]:
from typing import Optional, Type

from langchain.pydantic_v1 import BaseModel
from langchain_core.callbacks import (
    AsyncCallbackManagerForToolRun,
    CallbackManagerForToolRun,
)
from langchain_core.tools import BaseTool

class CustomClickTool(BaseTool):
    name = "click"
    description = "Click action in the browser - Click [Numerical_Label]"
    args_schema: Type[BaseModel] = StateAsInput
    return_direct: bool = True

    def _run(
        self, state: AgentState, run_manager: Optional[CallbackManagerForToolRun] = None
    ) -> str:
        """Use the tool."""
        page = state["page"]
        click_args = state["prediction"]["args"] 
        if click_args is None or len(click_args) != 1:
            return f"Failed to click bounding box labeled as number {click_args}"
        bbox_id = click_args[0]
        bbox_id = int(bbox_id)
        try:
            bbox = state["bboxes"][bbox_id]
        except Exception:
            return f"Error: no bbox for : {bbox_id}"
        x, y = bbox["x"], bbox["y"]
        page.mouse.click(x, y)
        return f"Clicked {bbox_id}"
        

    # async def _arun(
    #     self,
    #     state: AgentState,
    #     run_manager: Optional[AsyncCallbackManagerForToolRun] = None,
    # ) -> str:
    #     """Use the tool asynchronously."""

    #     return self._run(state, run_manager=run_manager.get_sync())

In [52]:
click = CustomClickTool()

print(click.name)
print(click.description)
# print(click.args)
# print(click.return_direct)


click
Click action in the browser - Click [Numerical_Label]


In [59]:
import base64
from langchain_openai import ChatOpenAI


def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')

In [65]:
path = "./test_screenshots/screenshot_45.png"
encoded_image = encode_image(path)
print(encoded_image)
from prompts import SYSTEM_PROMPT


iVBORw0KGgoAAAANSUhEUgAABQAAAALQCAIAAABAH0oBAAAAAXNSR0IArs4c6QAAIABJREFUeJzsnWd8FMUfxn+71y/JXS6995CEkAQS0ugdEUWsKKgggoogiu1vw4YFRUQQRAUFFBBEEBAQkN7SSEhIQhrpufR2l3J97/9iNpvjkhyXRijz/fAi9+zszOwsW56dmd8Q7oHDwWxK8jYYS5YsGGkJcc0g1xlt8fBfbH7OmNuQ/JmN7T94luzF+6ijX+ur81kLthEWNprlQwwT++63HoAqYjAYDOY2hn91iuHPQYQjF9guhPU61lMl+rpp2rU6oJitytDjA1FHDOYugV8Zb35ipVNMf9YFgxlghPpWAPDx8Q4IGGRvbwcANTW1OTm5BQWFAMDuVd4r3eFNZ/rv9VXwWnGf1BhzO6Jq1n5Lv8fopZlE0EQgWUAZf/XAYDAYDKYrcvVVToR4B2tBsb5uunadofvFYDCYW4CVuy/B5phOo9dqmkrzb1WNMP1FZOTwIUMGMz+dnBydnBxtbW2Tki73wgAHC+BNZ9jfAC8WwneesMQRfq+F5Ja+qTLmdoWw8yYDx+tLU7H7xWAwGEx3+Yf9igC4yfpiD8KmQF8z0NXBYDD3EIHPLJMEDjUnZUN2avbva/q/Rpj+wsfH29D9MgwZMriuro7secb/cwE9wFPXoU4Lc/OBAnjXpVc1xdz+CMTsl/eARqHdPHegq4LBYDCYO4+9VPIBKnUSGfQv+1Ur4A90dTAYzD2Eme63WykxtycBAYNMbOpFD7AvD5p1oNEDAFAAMi148XqeW1/AllgSrJtYer2O0jY036oa3V0IxJw3/gOSpV0zDTSKga4NBoPBYO48Vur+BQAvyu44e9mLrLHf6I4NdI0wmLscFgv0eqDwhAPMvQSa99vVpt7NAb6dsIoO4DqZFYdJXdnYlJDT/zW6u7Cw4bxxHDh87fYlIHIgRA76hnJorh3oamEwGAzmzkAAnDlkzEX9

In [66]:
bbox_observation = '\nValid Bounding Boxes:\n0 (<div/>): ""\n1 (<div/>): "Amazon Jobs home page"\n2 (<input/>): ""\n3 (<input/>): ""\n4 (<span/>): ""\n5 (<a/>): "Your job application"\n6 (<a/>): "Apply now for Software Dev Engineer, AGI"\n7 (<a/>): "Cookie Preference Center"\n8 (<a/>): "Candidate Cookie Notice"\n9 (<a/>): "Candidate Cookie Notice"\n10 (<a/>): "Candidate Privacy Notice"\n11 (<a/>): "California supplement notice"\n12 (<b/>): "Accept all"\n13 (<b/>): "Reject all"\n14 (<b/>): "Customize cookies"'

text_content = (
            "Observation: please analyze the attached screenshot and give the Thought and Action. I've provided the tag name of each element and the text it contains (if text exists). Note that <textarea> or <input> may be textbox, but not exactly. Please focus more on the screenshot and then refer to the textual information.\n"
            f"{bbox_observation}"
        )

def image_summarize(img_base64, prompt):
    chat = ChatOpenAI(model="gpt-4o", max_tokens=1024)
    msg = chat.invoke(
        [   SystemMessage(content=SYSTEM_PROMPT),
            HumanMessage(
                content=[
                    {"type": "text", "text": text_content},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{encoded_image}"},
                    },
                ]
            )
        ]
    )
    return msg.content

In [67]:
prompt = "Summarize the image"
print(image_summarize(encoded_image, prompt))

Thought: The webpage is an Amazon Jobs page for the position of Software Dev Engineer, AGI. There are several interactive elements including cookie preferences and a job application link. The task seems to require interacting with a specific element.

Action: Click [6]


In [22]:
print(click.name)
print(click.description)
print(click.args)
print(click.return_direct)


click
Click action in the browser - Click [Numerical_Label]
{'messages': {'title': 'Messages', 'description': 'The messages exchanged between the user and the agent.', 'type': 'array', 'items': {'anyOf': [{'$ref': '#/definitions/AIMessage'}, {'$ref': '#/definitions/HumanMessage'}, {'$ref': '#/definitions/ChatMessage'}, {'$ref': '#/definitions/SystemMessage'}, {'$ref': '#/definitions/FunctionMessage'}, {'$ref': '#/definitions/ToolMessage'}]}}, 'supervisor_mode': {'title': 'Supervisor Mode', 'description': 'Whether the agent is in supervisor mode.', 'type': 'boolean'}, 'query_human': {'title': 'Query Human', 'description': 'Whether the agent should query the human.', 'type': 'boolean'}, 'input': {'title': 'Input', 'description': 'The input to the agent.', 'type': 'string'}, 'img': {'title': 'Img', 'description': 'The screenshot of the page.', 'type': 'string'}, 'bounding_boxes': {'title': 'Bounding Boxes', 'description': 'The bounding boxes on the page.', 'type': 'array', 'items': {'$ref

In [None]:
@tool("click", args_schema=StateAsInput, return_direct=True)
async def click(state: AgentState)-> str:
    """
    Click action in the browser - Click [Numerical_Label]
    """
    page = state["page"]
    click_args = state["prediction"]["args"] 
    if click_args is None or len(click_args) != 1:
        return f"Failed to click bounding box labeled as number {click_args}"
    bbox_id = click_args[0]
    bbox_id = int(bbox_id)
    try:
        bbox = state["bboxes"][bbox_id]
    except Exception:
        return f"Error: no bbox for : {bbox_id}"
    x, y = bbox["x"], bbox["y"]
    await page.mouse.click(x, y)

    return f"Clicked {bbox_id}"

In [108]:
from langchain_openai import OpenAIEmbeddings
import json
from langchain_community.document_loaders import JSONLoader
from langchain_community.vectorstores import Chroma
from pprint import pprint


def retrieve_info_memory(request, relevance_threshold=0.6):
    loader = JSONLoader(file_path="memory/info.json", jq_schema=".[]", text_content=False)
    documents = loader.load()

    if not documents:
        raise ValueError("No documents found")

    embedding = OpenAIEmbeddings()

    db = Chroma.from_documents(documents, embedding)
    docs = db.similarity_search_with_relevance_scores(request, k=1)
    pprint(docs)



    # Check if there are any results
    if not docs or not docs[0]:
        raise ValueError("No matching documents found")
    
    # print(docs[0][1])

    # Assuming the docs have a 'score' attribute that represents relevance
    if docs[0][1] < relevance_threshold:
        raise ValueError("No sufficiently relevant documents found")
    

    return docs[0]

request = "What is my password?"

doc = retrieve_info_memory(request)
print(doc[0])
print(doc[1])


[(Document(page_content='{"query_id": 1, "query": "Please provide the password you would like to use for creating the account.", "response": "1234567Zzw!", "timestamp": "2024-06-17T21:42:57.597586"}', metadata={'seq_num': 1, 'source': '/Users/wellzhang/Desktop/Mortal/memory/info.json'}),
  0.6879530576279216)]
page_content='{"query_id": 1, "query": "Please provide the password you would like to use for creating the account.", "response": "1234567Zzw!", "timestamp": "2024-06-17T21:42:57.597586"}' metadata={'seq_num': 1, 'source': '/Users/wellzhang/Desktop/Mortal/memory/info.json'}
0.6879530576279216


### Testing on retreieving memory from images

In [None]:

image_message = HumanMessage(
    content=[
        {"type": "text", "text": text_content},
        {
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{b64_img}"},
        },
    ]
)

In [None]:
# test if wen need to add images or just text context summary is enough
# if encoded in base64
# try with context summary first 

# convert agent thought into agent_action?

# Human feedback: "Agent action needs improvement." + real human feedback 
# Human feedback: "Agent action is correct." + None

# Thought seems necessary, which shows motivation



# then each step needs to be updated into the experience, otherwise the 
# sreenshot id format: "experience_{experience_id}_step_{step_id}.png"
# prompt: the newer should come first, don't implement until there is a problem
{
    "experiences": [
        {
            "experience_id": "1",
            "steps": [
                {
                    "step_id": "1",
                    "context_summary": {
                        "bbox_description": "",
                        "visual_summary": ""
                    },
                    "image_id": screenshot_id,
                    "agent_thought": "To sign in, we need to enter the email address and password into the respective fields and then click the 'Sign In' button.",
                    "agent_action": "The agent clicked on the bounding box labeled as number 1.",
                    "human_feedback": "",
                    "timestamp": "2022-03-22 12:00:00"
                },
                {
                    "step_id": "2",
                    "context_summary": {
                        "bbox_description": "",
                        "visual_summary": ""
                    },
                    "image_id": screenshot_id,
                    "agent_action": "The agent clicked on the bounding box labeled as number 1.",
                    "human_feedback": "we should sign in first",
                    "timestamp": "2022-03-22 12:00:00"
                }
            ]
        }
    ]
}



In [45]:
from utils import conclude_info

text = "query: Please provide your email address and password for signing in Zillow. response: email is wellzhang1217@gmail.com and password is 1234567Zzw!"
# text = "My name is Jeff, my hair is black and i am 6 feet tall. Anna has the same color hair as me."

conclude_info(text)

Info(info=[Data(ques="What is the user's email address?", ans='wellzhang1217@gmail.com')])

In [48]:
from typing import List, Optional

from langchain_core.pydantic_v1 import BaseModel, Field

from langchain_core.prompts import ChatPromptTemplate

from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-4o", temperature=0)


class Data(BaseModel):
    """Personal information about the user presented in a Q&A format, consisting of a question and its answer."""
    ques: Optional[str] = Field(default=None, description="The question regarding the user's information, e.g., 'What is the user's name?' or 'What is the user's email address?'")
    ans: Optional[str] = Field(default=None, description='The answer to the question, e.g., "John Doe" or "john.doe@email.com"')


class Info(BaseModel):
    """Information about the user, consisting of a list of data."""
    info: List[Data] = Field(description="A list of personal information about the user.")

prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are an expert extraction algorithm. "
            "Only extract relevant information from the text. "
            "If you do not know the value of an attribute asked to extract, "
            "return null for the attribute's value.",
        ),
        # Please see the how-to about improving performance with
        # reference examples.
        # MessagesPlaceholder('examples'),
        ("human", "{text}"),
    ]
)

runnable = prompt | llm.with_structured_output(schema=Info)
text = "query: Please provide your email address and password for signing in. response: email is wellzhang1217@gmail.com and password is 1234567Zzw!"
runnable.invoke({"text": text})

Info(info=[Data(ques="What is the user's email address?", ans='wellzhang1217@gmail.com'), Data(ques="What is the user's password?", ans='1234567Zzw!')])

In [49]:
TOOL_PROMPT = "Please provide the tag name of each element and the text it contains (if text exists). Note that <textarea> or <input> may be textbox, but not exactly. Please focus more on the screenshot and then refer to the textual information."

text_content = (
    "Observation: please analyze the attached screenshot and give the Thought and Action. I've provided the tag name of each element and the text it contains (if text exists). Note that <textarea> or <input> may be textbox, but not exactly. Please focus more on the screenshot and then refer to the textual information.\n"
    f"{TOOL_PROMPT}"
)

print(text_content)

Observation: please analyze the attached screenshot and give the Thought and Action. I've provided the tag name of each element and the text it contains (if text exists). Note that <textarea> or <input> may be textbox, but not exactly. Please focus more on the screenshot and then refer to the textual information.
Please provide the tag name of each element and the text it contains (if text exists). Note that <textarea> or <input> may be textbox, but not exactly. Please focus more on the screenshot and then refer to the textual information.


In [5]:
import json
from pathlib import Path
from pprint import pprint

file_path='./memory/experiences.json'
data = json.loads(Path(file_path).read_text())

pprint(data)

[{'experience_id': '1', 'steps': [None]},
 {'experience_id': '2', 'steps': []},
 {'experience_id': '3', 'steps': []},
 {'experience_id': '4',
  'steps': [{'agent_action': 'Click',
             'agent_thought': 'To proceed with job application tasks, clicking '
                              'the "Search for Jobs" button is the necessary '
                              'first step.',
             'context_summary': {'bbox_description': '\n'
                                                     'Valid Bounding Boxes:\n'
                                                     '0 (<img/>): ""\n'
                                                     '1 (<button/>): "Search '
                                                     'for Jobs"\n'
                                                     '2 (<input/>): ""\n'
                                                     '3 (<input/>): ""\n'
                                                     '4 (<div/>): "Sign In"\n'
                                 

In [6]:
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import JSONLoader
from langchain_community.vectorstores import Chroma

loader = JSONLoader(file_path="memory/experiences.json", jq_schema=".[].steps[]", text_content=False)

data = loader.load()

pprint(data)

[Document(page_content='', metadata={'source': '/Users/wellzhang/Desktop/Mortal/memory/experiences.json', 'seq_num': 1}),
 Document(page_content='{"context_summary": {"bbox_description": "\\nValid Bounding Boxes:\\n0 (<img/>): \\"\\"\\n1 (<button/>): \\"Search for Jobs\\"\\n2 (<input/>): \\"\\"\\n3 (<input/>): \\"\\"\\n4 (<div/>): \\"Sign In\\"\\n5 (<button/>): \\"Create Account\\"\\n6 (<button/>): \\"Forgot your password?\\"", "visual_summary": "The screenshot is of a webpage related to Zillow. The page features a job search button labeled \\"Search for Jobs\\" at the top right (1). Below a banner image and text, there is a sign-in form in the center with fields for \\"Email Address\\" (2) and \\"Password\\" (3), a \\"Sign In\\" button (4), a \\"Create Account\\" button (5), and a \\"Forgot your password?\\" button (6)."}, "image_id": "test_screenshots/screenshot_274.png", "agent_thought": "To proceed with job application tasks, clicking the \\"Search for Jobs\\" button is the necessa

In [10]:
import json
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import JSONLoader
from langchain_community.vectorstores import Chroma
from retrieve_memory import retrieve_experience_memory

# request = "What is my visa type?"
# thought = "To find the visa type, we need to look at the user's profile information."

bbox = """
    Valid Bounding Boxes:
0 (<img/>): ""
1 (<button/>): "Search for Jobs"
2 (<input/>): ""
3 (<input/>): ""
4 (<div/>): "Sign In"
5 (<button/>): "Create Account"
6 (<button/>): "Forgot your password?"
"""

# from current situation
context_summary = {
    "bbox_description": f"{bbox}",
    "visual_summary": "Both the email address and the password input boxes have been filled with the provided credentials. The next step is to click the 'Sign In' button (label 4) to proceed with the login process."
}

context = {
    "bbox_description": f"{bbox}",
    "visual_summary": "Both the email address and the password input boxes have been filled with the provided credentials. The next step is to click the 'Sign In' button (label 4) to proceed with the login process.",
    "agent_thought": "To sign in, we need to enter the email address and password into the respective fields and then click the 'Sign In' button.",
}

context_summary = json.dumps(context_summary)

embedding = OpenAIEmbeddings()

db = Chroma.from_documents(data, embedding)
# docs = db.similarity_search_with_relevance_scores(context_summary, k=3)
docs = db.similarity_search(context_summary, k=5)



pprint(docs)

agent_thought = "To sign in, we need to enter the email address and password into the respective fields and then click the 'Sign In' button."
visual_summary = "Both the email address and the password input boxes have been filled with the provided credentials. The next step is to click the 'Sign In' button (label 4) to proceed with the login process."


# doc = retrieve_info_memory(request)
# doc = retrieve_experience_memory(context_summary)
# print(doc)

[Document(page_content='{"context_summary": {"bbox_description": "\\nValid Bounding Boxes:\\n0 (<img/>): \\"\\"\\n1 (<button/>): \\"Search for Jobs\\"\\n2 (<input/>): \\"\\"\\n3 (<input/>): \\"\\"\\n4 (<div/>): \\"Sign In\\"\\n5 (<button/>): \\"Create Account\\"\\n6 (<button/>): \\"Forgot your password?\\"", "visual_summary": "Both the email address and the password input boxes have been filled with the provided credentials. The next step is to click the \\"Sign In\\" button (label 4) to proceed with the login process."}, "image_id": "test_screenshots/screenshot_283.png", "agent_thought": "The next action is to click the \\"Sign In\\" button to submit the login credentials.", "agent_action": "Click", "human_feedback": "Agent action is correct. ", "timestamp": "2024-06-24T00:02:49.596792"}', metadata={'seq_num': 11, 'source': '/Users/wellzhang/Desktop/Mortal/memory/experiences.json'}),
 Document(page_content='{"context_summary": {"bbox_description": "\\nValid Bounding Boxes:\\n0 (<img/>

In [12]:
docs = retrieve_experience_memory(bbox, visual_summary, agent_thought)

pprint(docs)
print(type(docs[0]))

<class 'list'>
[Document(page_content='{"context_summary": {"bbox_description": "\\nValid Bounding Boxes:\\n0 (<img/>): \\"\\"\\n1 (<button/>): \\"Search for Jobs\\"\\n2 (<input/>): \\"\\"\\n3 (<input/>): \\"\\"\\n4 (<div/>): \\"Sign In\\"\\n5 (<button/>): \\"Create Account\\"\\n6 (<button/>): \\"Forgot your password?\\"", "visual_summary": "Both the email address and the password input boxes have been filled with the provided credentials. The next step is to click the \\"Sign In\\" button (label 4) to proceed with the login process."}, "image_id": "test_screenshots/screenshot_283.png", "agent_thought": "The next action is to click the \\"Sign In\\" button to submit the login credentials.", "agent_action": "Click", "human_feedback": "Agent action is correct. ", "timestamp": "2024-06-24T00:02:49.596792"}', metadata={'seq_num': 11, 'source': '/Users/wellzhang/Desktop/Mortal/memory/experiences.json'}),
 Document(page_content='{"context_summary": {"bbox_description": "\\nValid Bounding Box