From 2c2580bce63688e6eb78a5e36bb4713cb4abe00f Mon Sep 17 00:00:00 2001
From: Derek Meegan <derek@browserbase.com>
Date: Thu, 23 Oct 2025 13:31:31 -0700
Subject: [PATCH 1/7] fix cua example, remove root model

---
 examples/agent_example.py         |  14 ++--
 examples/agent_example_local.py   | 102 ++++++++++++++++++++++++++++++
 stagehand/agent/agent.py          |   5 +-
 stagehand/agent/google_cua.py     |  19 +++---
 stagehand/agent/openai_cua.py     |   8 +--
 stagehand/handlers/cua_handler.py |   5 +-
 stagehand/types/agent.py          |  28 ++++----
 7 files changed, 136 insertions(+), 45 deletions(-)
 create mode 100644 examples/agent_example_local.py

diff --git a/examples/agent_example.py b/examples/agent_example.py
index 5aca9c21..6cb3cb21 100644
--- a/examples/agent_example.py
+++ b/examples/agent_example.py
@@ -36,11 +36,8 @@ async def main():
     # Build a unified configuration object for Stagehand
     config = StagehandConfig(
         env="BROWSERBASE",
-        # env="LOCAL",
         api_key=os.getenv("BROWSERBASE_API_KEY"),
         project_id=os.getenv("BROWSERBASE_PROJECT_ID"),
-        model_name="gpt-4o",
-        self_heal=True,
         system_prompt="You are a browser automation assistant that helps users navigate websites effectively.",
         model_client_options={"apiKey": os.getenv("MODEL_API_KEY")},
         verbose=2,
@@ -51,12 +48,11 @@ async def main():
 
     # Initialize - this creates a new session automatically.
     console.print("\n🚀 [info]Initializing Stagehand...[/]")
-    await stagehand.init()
-    if stagehand.env == "BROWSERBASE":    
-        console.print(f"\n[yellow]Created new session:[/] {stagehand.session_id}")
-        console.print(
-            f"🌐 [white]View your live browser:[/] [url]https://www.browserbase.com/sessions/{stagehand.session_id}[/]"
-        )
+    await stagehand.init() 
+    console.print(f"\n[yellow]Created new session:[/] {stagehand.session_id}")
+    console.print(
+        f"🌐 [white]View your live browser:[/] [url]https://www.browserbase.com/sessions/{stagehand.session_id}[/]"
+    )
 
     console.print("\n▶️ [highlight] Navigating[/] to Google")
     await stagehand.page.goto("https://google.com/")
diff --git a/examples/agent_example_local.py b/examples/agent_example_local.py
new file mode 100644
index 00000000..7a725e4f
--- /dev/null
+++ b/examples/agent_example_local.py
@@ -0,0 +1,102 @@
+import asyncio
+import logging
+import os
+
+from dotenv import load_dotenv
+from rich.console import Console
+from rich.panel import Panel
+from rich.theme import Theme
+
+from stagehand import Stagehand, StagehandConfig, configure_logging
+
+# Create a custom theme for consistent styling
+custom_theme = Theme(
+    {
+        "info": "cyan",
+        "success": "green",
+        "warning": "yellow",
+        "error": "red bold",
+        "highlight": "magenta",
+        "url": "blue underline",
+    }
+)
+
+# Create a Rich console instance with our theme
+console = Console(theme=custom_theme)
+
+load_dotenv()
+
+# Configure logging with the utility function
+configure_logging(
+    level=logging.INFO,  # Set to INFO for regular logs, DEBUG for detailed
+    quiet_dependencies=True,  # Reduce noise from dependencies
+)
+
+async def main():
+    # Build a unified configuration object for Stagehand
+    config = StagehandConfig(
+        env="LOCAL",
+        system_prompt="You are a browser automation assistant that helps users navigate websites effectively.",
+        model_client_options={"apiKey": os.getenv("MODEL_API_KEY")},
+        verbose=2,
+    )
+
+    # Create a Stagehand client using the configuration object.
+    stagehand = Stagehand(config)
+
+    # Initialize - this creates a new session automatically.
+    console.print("\n🚀 [info]Initializing Stagehand...[/]")
+    await stagehand.init()
+
+    console.print("\n▶️ [highlight] Navigating[/] to Google")
+    await stagehand.page.goto("https://google.com/")
+    console.print("✅ [success]Navigated to Google[/]")
+    
+    console.print("\n▶️ [highlight] Using Agent to perform a task[/]: playing a game of 2048")
+    agent = stagehand.agent(
+        model="gemini-2.5-computer-use-preview-10-2025",
+        instructions="You are a helpful web navigation assistant that helps users find information. You are currently on the following page: google.com. Do not ask follow up questions, the user will trust your judgement.",
+        options={"apiKey": os.getenv("GEMINI_API_KEY")}
+    )
+    agent_result = await agent.execute(
+        instruction="Play a game of 2048",
+        max_steps=20,
+        auto_screenshot=True,
+    )
+
+    console.print(agent_result)
+
+    console.print("📊 [info]Agent execution result:[/]")
+    console.print(f"🎯 Completed: [bold]{'Yes' if agent_result.completed else 'No'}[/]")
+    if agent_result.message:
+        console.print(f"💬 Message: [italic]{agent_result.message}[/]")
+    
+    if agent_result.actions:
+        console.print(f"🔄 Actions performed: [bold]{len(agent_result.actions)}[/]")
+        for i, action in enumerate(agent_result.actions):
+            action_type = action.type
+
+            console.print(f"  Action {i+1}: {action_type if action_type else 'Unknown'}")
+    
+    # For debugging, you can also print the full JSON
+    console.print("[dim]Full response JSON:[/]")
+    console.print_json(f"{agent_result.model_dump_json()}")
+
+    # Close the session
+    console.print("\n⏹️  [warning]Closing session...[/]")
+    await stagehand.close()
+    console.print("✅ [success]Session closed successfully![/]")
+    console.rule("[bold]End of Example[/]")
+
+
+if __name__ == "__main__":
+    # Add a fancy header
+    console.print(
+        "\n",
+        Panel(
+            "[light_gray]Stagehand 🤘 Agent Example[/]",
+            border_style="green",
+            padding=(1, 10),
+        ),
+    )
+    asyncio.run(main()) 
\ No newline at end of file
diff --git a/stagehand/agent/agent.py b/stagehand/agent/agent.py
index bcd506be..78faeb45 100644
--- a/stagehand/agent/agent.py
+++ b/stagehand/agent/agent.py
@@ -170,13 +170,10 @@ async def execute(
                 f"Agent execution finished. Success: {agent_result.completed}. Message: {agent_result.message}",
                 category="agent",
             )
-            # To clean up pydantic model output
-            actions_repr = [action.root for action in agent_result.actions]
             self.logger.debug(
-                f"Agent actions: {actions_repr}",
+                f"Agent actions: {agent_result.actions}",
                 category="agent",
             )
-            agent_result.actions = actions_repr
             return agent_result
         else:
             agent_config_payload = self.config.model_dump(
diff --git a/stagehand/agent/google_cua.py b/stagehand/agent/google_cua.py
index 125fd0fc..4ac4dbc1 100644
--- a/stagehand/agent/google_cua.py
+++ b/stagehand/agent/google_cua.py
@@ -25,6 +25,7 @@
     AgentResult,
 )
 from .client import AgentClient
+from pydantic import TypeAdapter
 
 load_dotenv()
 
@@ -176,7 +177,7 @@ def _process_provider_response(
                 and candidate.safety_ratings
             ):
                 error_message += f" - Safety Ratings: {candidate.safety_ratings}"
-            self.logger.warning(error_message, category="agent")
+            self.logger.error(error_message, category="agent")
             return [], reasoning_text, True, error_message, []
 
         if not function_call_parts:
@@ -260,7 +261,7 @@ def _process_provider_response(
                         "keys": [self.key_to_playwright("PageDown")],
                     }
                 else:
-                    self.logger.warning(
+                    self.logger.error(
                         f"Unsupported scroll direction: {direction}", category="agent"
                     )
                     return (
@@ -282,7 +283,7 @@ def _process_provider_response(
                 elif direction in ("left", "right"):
                     magnitude = self._normalize_coordinates(magnitude, 0)[0]
                 else:
-                    self.logger.warning(
+                    self.logger.error(
                         f"Unsupported scroll direction: {direction}", category="agent"
                     )
                     return (
@@ -352,7 +353,7 @@ def _process_provider_response(
                     "arguments": {"url": "https://www.google.com"},
                 }
             else:
-                self.logger.warning(
+                self.logger.error(
                     f"Unsupported Gemini CUA function: {action_name}", category="agent"
                 )
                 return (
@@ -367,13 +368,11 @@ def _process_provider_response(
                 try:
                     # Directly construct the AgentActionType using the payload.
                     # Pydantic will use the 'type' field in action_payload_dict to discriminate the Union.
-                    action_payload_for_agent_action_type = AgentActionType(
-                        **action_payload_dict
-                    )
+                    action_payload_for_agent_action_type = TypeAdapter(AgentActionType).validate_python(action_payload_dict)
 
                     agent_action = AgentAction(
                         action_type=action_type_str,  # This should match the 'type' in action_payload_dict
-                        action=action_payload_for_agent_action_type,  # No RootModel wrapping if AgentActionType is the RootModel itself
+                        action=action_payload_for_agent_action_type,
                         reasoning=reasoning_text,
                         status="tool_code",
                     )
@@ -598,7 +597,7 @@ async def run_task(
                 )
 
             if not agent_action and not task_completed:
-                self.logger.warning(
+                self.logger.debug(
                     "Model did not request an action and task not marked complete. Ending task.",
                     category="agent",
                 )
@@ -614,7 +613,7 @@ async def run_task(
                     usage=usage_obj,
                 )
 
-        self.logger.warning("Max steps reached for Gemini CUA task.", category="agent")
+        self.logger.debug("Max steps reached for Gemini CUA task.", category="agent")
         usage_obj = {
             "input_tokens": total_input_tokens,
             "output_tokens": total_output_tokens,
diff --git a/stagehand/agent/openai_cua.py b/stagehand/agent/openai_cua.py
index a1b9db89..3ab29747 100644
--- a/stagehand/agent/openai_cua.py
+++ b/stagehand/agent/openai_cua.py
@@ -7,7 +7,7 @@
 from openai import (
     OpenAI as OpenAISDK,  # Renamed to avoid conflict with a potential class name
 )
-from pydantic import BaseModel  # Ensure BaseModel is imported for isinstance check
+from pydantic import BaseModel, TypeAdapter  # Ensure BaseModel is imported for isinstance check
 
 from ..handlers.cua_handler import CUAHandler
 from ..types.agent import (
@@ -175,8 +175,8 @@ def _process_provider_response(
                 )
 
             try:
-                action_payload = AgentActionType(
-                    **computer_call_item.action.model_dump()
+                action_payload = TypeAdapter(AgentActionType).validate_python(
+                    computer_call_item.action.model_dump()
                 )
                 agent_action = AgentAction(
                     action_type=computer_call_item.action.type,
@@ -225,7 +225,7 @@ def _process_provider_response(
                 function_action_payload = FunctionAction(type="function", name=function_call_item.name, arguments=arguments)  # type: ignore
                 agent_action = AgentAction(
                     action_type="function",  # Literal 'function'
-                    action=AgentActionType(root=function_action_payload),
+                    action=function_action_payload,
                     reasoning=reasoning_text,  # Reasoning applies to this action
                     status=(
                         function_call_item.status
diff --git a/stagehand/handlers/cua_handler.py b/stagehand/handlers/cua_handler.py
index 2708aa3d..e95c42ee 100644
--- a/stagehand/handlers/cua_handler.py
+++ b/stagehand/handlers/cua_handler.py
@@ -35,13 +35,12 @@ async def get_screenshot_base64(self) -> str:
 
     async def perform_action(self, action: AgentAction) -> ActionExecutionResult:
         """Execute a single action on the page."""
+        specific_action_model = action.action
         self.logger.info(
-            f"Performing action: {action.action.root if action.action else ''}",
+            f"Performing action: {specific_action_model or ''}",
             category=StagehandFunctionName.AGENT,
         )
         action_type = action.action_type
-        # action.action is the RootModel, action.action.root is the specific action model (e.g., ClickAction)
-        specific_action_model = action.action.root if action.action else None
 
         if not specific_action_model:
             self.logger.error(
diff --git a/stagehand/types/agent.py b/stagehand/types/agent.py
index 8f29d3c2..c5a6a614 100644
--- a/stagehand/types/agent.py
+++ b/stagehand/types/agent.py
@@ -1,6 +1,6 @@
 from typing import Any, Literal, Optional, Union
 
-from pydantic import BaseModel, RootModel
+from pydantic import BaseModel
 
 
 class AgentConfig(BaseModel):
@@ -96,20 +96,18 @@ class KeyAction(BaseModel):  # From Anthropic
     text: str
 
 
-AgentActionType = RootModel[
-    Union[
-        ClickAction,
-        DoubleClickAction,
-        TypeAction,
-        KeyPressAction,
-        ScrollAction,
-        DragAction,
-        MoveAction,
-        WaitAction,
-        ScreenshotAction,
-        FunctionAction,
-        KeyAction,
-    ]
+AgentActionType = Union[
+    ClickAction,
+    DoubleClickAction,
+    TypeAction,
+    KeyPressAction,
+    ScrollAction,
+    DragAction,
+    MoveAction,
+    WaitAction,
+    ScreenshotAction,
+    FunctionAction,
+    KeyAction,
 ]
 
 

From a258f9f07060619e7ade98fc27f40c720eecb7bf Mon Sep 17 00:00:00 2001
From: Derek Meegan <derek@browserbase.com>
Date: Thu, 23 Oct 2025 15:46:16 -0700
Subject: [PATCH 2/7] formatting

---
 stagehand/agent/google_cua.py | 4 +++-
 stagehand/agent/openai_cua.py | 5 ++++-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/stagehand/agent/google_cua.py b/stagehand/agent/google_cua.py
index 4ac4dbc1..e41d4d49 100644
--- a/stagehand/agent/google_cua.py
+++ b/stagehand/agent/google_cua.py
@@ -368,7 +368,9 @@ def _process_provider_response(
                 try:
                     # Directly construct the AgentActionType using the payload.
                     # Pydantic will use the 'type' field in action_payload_dict to discriminate the Union.
-                    action_payload_for_agent_action_type = TypeAdapter(AgentActionType).validate_python(action_payload_dict)
+                    action_payload_for_agent_action_type = TypeAdapter(
+                        AgentActionType
+                    ).validate_python(action_payload_dict)
 
                     agent_action = AgentAction(
                         action_type=action_type_str,  # This should match the 'type' in action_payload_dict
diff --git a/stagehand/agent/openai_cua.py b/stagehand/agent/openai_cua.py
index 3ab29747..16506f5a 100644
--- a/stagehand/agent/openai_cua.py
+++ b/stagehand/agent/openai_cua.py
@@ -7,7 +7,10 @@
 from openai import (
     OpenAI as OpenAISDK,  # Renamed to avoid conflict with a potential class name
 )
-from pydantic import BaseModel, TypeAdapter  # Ensure BaseModel is imported for isinstance check
+from pydantic import (
+    BaseModel,
+    TypeAdapter,
+)  # Ensure BaseModel is imported for isinstance check
 
 from ..handlers.cua_handler import CUAHandler
 from ..types.agent import (

From 2f7a41e3ea150f80792d66edacac9758d6c35e3b Mon Sep 17 00:00:00 2001
From: Derek Meegan <derek@browserbase.com>
Date: Thu, 23 Oct 2025 15:52:41 -0700
Subject: [PATCH 3/7] so formatted

---
 stagehand/agent/google_cua.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/stagehand/agent/google_cua.py b/stagehand/agent/google_cua.py
index e41d4d49..fc46196a 100644
--- a/stagehand/agent/google_cua.py
+++ b/stagehand/agent/google_cua.py
@@ -14,6 +14,7 @@
     GenerateContentConfig,
     Part,
 )
+from pydantic import TypeAdapter
 
 from ..handlers.cua_handler import CUAHandler
 from ..types.agent import (
@@ -25,7 +26,6 @@
     AgentResult,
 )
 from .client import AgentClient
-from pydantic import TypeAdapter
 
 load_dotenv()
 

From 744b78fb88c0f1aa88f3fd2c246259d34f4fd872 Mon Sep 17 00:00:00 2001
From: Derek Meegan <derek@browserbase.com>
Date: Thu, 23 Oct 2025 15:53:34 -0700
Subject: [PATCH 4/7] add changeset

---
 .changeset/arrogant-caiman-of-competence.md | 5 +++++
 1 file changed, 5 insertions(+)
 create mode 100644 .changeset/arrogant-caiman-of-competence.md

diff --git a/.changeset/arrogant-caiman-of-competence.md b/.changeset/arrogant-caiman-of-competence.md
new file mode 100644
index 00000000..d9a26e7f
--- /dev/null
+++ b/.changeset/arrogant-caiman-of-competence.md
@@ -0,0 +1,5 @@
+---
+"stagehand": patch
+---
+
+add local cua example, remove root model from types

From 690a801bd114fd34ae072f20fdc9dede1189a18f Mon Sep 17 00:00:00 2001
From: Derek Meegan <derek@browserbase.com>
Date: Fri, 24 Oct 2025 09:34:06 -0700
Subject: [PATCH 5/7] fix anthropic cua and add self heal back to agent
 examples

---
 examples/agent_example.py        |   1 +
 examples/agent_example_local.py  |   1 +
 stagehand/agent/anthropic_cua.py | 190 ++++++++++++++++---------------
 3 files changed, 99 insertions(+), 93 deletions(-)

diff --git a/examples/agent_example.py b/examples/agent_example.py
index 6cb3cb21..4c8cf877 100644
--- a/examples/agent_example.py
+++ b/examples/agent_example.py
@@ -40,6 +40,7 @@ async def main():
         project_id=os.getenv("BROWSERBASE_PROJECT_ID"),
         system_prompt="You are a browser automation assistant that helps users navigate websites effectively.",
         model_client_options={"apiKey": os.getenv("MODEL_API_KEY")},
+        self_heal=True,
         verbose=2,
     )
 
diff --git a/examples/agent_example_local.py b/examples/agent_example_local.py
index 7a725e4f..5d160f54 100644
--- a/examples/agent_example_local.py
+++ b/examples/agent_example_local.py
@@ -38,6 +38,7 @@ async def main():
         env="LOCAL",
         system_prompt="You are a browser automation assistant that helps users navigate websites effectively.",
         model_client_options={"apiKey": os.getenv("MODEL_API_KEY")},
+        self_heal=True,
         verbose=2,
     )
 
diff --git a/stagehand/agent/anthropic_cua.py b/stagehand/agent/anthropic_cua.py
index f0d8b2a7..85401322 100644
--- a/stagehand/agent/anthropic_cua.py
+++ b/stagehand/agent/anthropic_cua.py
@@ -4,6 +4,7 @@
 
 from anthropic import Anthropic, AnthropicError
 from dotenv import load_dotenv
+from pydantic import TypeAdapter
 
 from ..handlers.cua_handler import CUAHandler, StagehandFunctionName
 from ..types.agent import (
@@ -362,7 +363,7 @@ def _convert_tool_use_to_agent_action(
             )
             return None
 
-        action_model_payload: Optional[AgentActionType] = None
+        action_payload_dict: Optional[dict[str, Any]] = None
         reasoning = tool_input.get("reasoning")
 
         try:
@@ -375,52 +376,53 @@ def _convert_tool_use_to_agent_action(
             )
 
             if action_type_str == "left_click":
-                action_model_payload = AgentActionType(
-                    type="click",
-                    x=x,
-                    y=y,
-                    button="left",
-                )
+                action_payload_dict = {
+                    "type": "click",
+                    "x": x,
+                    "y": y,
+                    "button": "left",
+                }
                 action_type_str = "click"  # Normalize
 
             elif action_type_str == "right_click":
-                action_model_payload = AgentActionType(
-                    type="click",
-                    x=x,
-                    y=y,
-                    button="right",
-                )
+                action_payload_dict = {
+                    "type": "click",
+                    "x": x,
+                    "y": y,
+                    "button": "right",
+                }
                 action_type_str = "click"  # Normalize
 
             elif action_type_str == "middle_click":
-                action_model_payload = AgentActionType(
-                    type="click",
-                    x=x,
-                    y=y,
-                    button="middle",
-                )
+                action_payload_dict = {
+                    "type": "click",
+                    "x": x,
+                    "y": y,
+                    "button": "middle",
+                }
                 action_type_str = "click"  # Normalize
 
             elif action_type_str == "double_click":
-                action_model_payload = AgentActionType(
-                    type="double_click",
-                    x=x,
-                    y=y,
-                )
+                action_payload_dict = {
+                    "type": "double_click",
+                    "x": x,
+                    "y": y,
+                }
 
             elif action_type_str == "triple_click":
                 # Handle as double_click for now since we don't have a dedicated triple click
-                action_model_payload = AgentActionType(
-                    type="double_click",
-                    x=x,
-                    y=y,
-                )
+                action_payload_dict = {
+                    "type": "double_click",
+                    "x": x,
+                    "y": y,
+                }
                 action_type_str = "double_click"  # Normalize
 
             elif action_type_str == "type":
-                action_model_payload = AgentActionType(
-                    type="type", text=tool_input.get("text", "")
-                )
+                action_payload_dict = {
+                    "type": "type",
+                    "text": tool_input.get("text", ""),
+                }
 
             elif action_type_str == "key":
                 key_text = tool_input.get("text", "")
@@ -429,10 +431,10 @@ def _convert_tool_use_to_agent_action(
                     keys = [
                         self.key_to_playwright(k.strip()) for k in key_text.split("+")
                     ]
-                    action_model_payload = AgentActionType(
-                        type="keypress",
-                        keys=keys,
-                    )
+                    action_payload_dict = {
+                        "type": "keypress",
+                        "keys": keys,
+                    }
                     action_type_str = "keypress"  # Normalize
 
             elif action_type_str == "hold_key":
@@ -446,10 +448,10 @@ def _convert_tool_use_to_agent_action(
                         self.key_to_playwright(k.strip()) for k in key_text.split("+")
                     ]
                     # For now, handle as a regular keypress
-                    action_model_payload = AgentActionType(
-                        type="keypress",
-                        keys=keys,
-                    )
+                    action_payload_dict = {
+                        "type": "keypress",
+                        "keys": keys,
+                    }
                     action_type_str = "keypress"  # Normalize
 
             elif action_type_str == "scroll":
@@ -469,20 +471,20 @@ def _convert_tool_use_to_agent_action(
                 elif scroll_direction == "left":
                     scroll_x = -scroll_amount * scroll_multiplier
 
-                action_model_payload = AgentActionType(
-                    type="scroll",
-                    x=x or 0,  # Default to 0 if none
-                    y=y or 0,  # Default to 0 if none
-                    scroll_x=scroll_x,
-                    scroll_y=scroll_y,
-                )
+                action_payload_dict = {
+                    "type": "scroll",
+                    "x": x or 0,  # Default to 0 if none
+                    "y": y or 0,  # Default to 0 if none
+                    "scroll_x": scroll_x,
+                    "scroll_y": scroll_y,
+                }
 
             elif action_type_str == "mouse_move":
-                action_model_payload = AgentActionType(
-                    type="move",
-                    x=x,
-                    y=y,
-                )
+                action_payload_dict = {
+                    "type": "move",
+                    "x": x,
+                    "y": y,
+                }
                 action_type_str = "move"  # Normalize
 
             elif action_type_str == "left_click_drag":
@@ -499,14 +501,13 @@ def _convert_tool_use_to_agent_action(
                     and x is not None
                     and y is not None
                 ):
-                    path_points = [
-                        Point(x=start_x, y=start_y),
-                        Point(x=x, y=y),
-                    ]
-                    action_model_payload = AgentActionType(
-                        type="drag",
-                        path=path_points,
-                    )
+                    action_payload_dict = {
+                        "type": "drag",
+                        "path": [
+                            {"x": start_x, "y": start_y},
+                            {"x": x, "y": y},
+                        ],
+                    }
                     action_type_str = "drag"  # Normalize
                 else:
                     self.logger.error(
@@ -517,54 +518,54 @@ def _convert_tool_use_to_agent_action(
 
             elif action_type_str == "left_mouse_down":
                 # Currently not directly supported - handle as a click for now
-                action_model_payload = AgentActionType(
-                    type="click",
-                    x=x,
-                    y=y,
-                    button="left",
-                )
+                action_payload_dict = {
+                    "type": "click",
+                    "x": x,
+                    "y": y,
+                    "button": "left",
+                }
                 action_type_str = "click"  # Normalize
 
             elif action_type_str == "left_mouse_up":
                 # Currently not directly supported - handle as a click for now
-                action_model_payload = AgentActionType(
-                    type="click",
-                    x=x,
-                    y=y,
-                    button="left",
-                )
+                action_payload_dict = {
+                    "type": "click",
+                    "x": x,
+                    "y": y,
+                    "button": "left",
+                }
                 action_type_str = "click"  # Normalize
 
             elif action_type_str == "wait":
                 duration = tool_input.get("duration", 1)  # Default 1 second
                 # Convert seconds to milliseconds
-                action_model_payload = AgentActionType(
-                    type="wait",
-                    miliseconds=int(duration * 1000),
-                )
+                action_payload_dict = {
+                    "type": "wait",
+                    "miliseconds": int(duration * 1000),
+                }
 
             elif action_type_str == "screenshot":
-                action_model_payload = AgentActionType(
-                    type="screenshot",
-                )
+                action_payload_dict = {
+                    "type": "screenshot",
+                }
 
             elif action_type_str == "cursor_position":
                 # This is a read operation, not directly supported
                 # Return a no-op for now
-                action_model_payload = AgentActionType(
-                    type="screenshot",  # Use screenshot as a way to show cursor position
-                )
+                action_payload_dict = {
+                    "type": "screenshot",  # Use screenshot as a way to show cursor position
+                }
                 action_type_str = "screenshot"  # Normalize
 
             elif action_type_str == "function":
                 if tool_name == "goto":
                     url = tool_input.get("url")
                     if url:
-                        action_model_payload = AgentActionType(
-                            type="function",
-                            name="goto",
-                            arguments=FunctionArguments(url=url),
-                        )
+                        action_payload_dict = {
+                            "type": "function",
+                            "name": "goto",
+                            "arguments": {"url": url},
+                        }
                         action_type_str = "function"
                     else:
                         self.logger.error(
@@ -573,11 +574,11 @@ def _convert_tool_use_to_agent_action(
                         )
                         return None
                 elif tool_name == "navigate_back":
-                    action_model_payload = AgentActionType(
-                        type="function",
-                        name="navigate_back",
-                        arguments=FunctionArguments(),
-                    )
+                    action_payload_dict = {
+                        "type": "function",
+                        "name": "navigate_back",
+                        "arguments": None,
+                    }
                     action_type_str = "function"
             else:
                 self.logger.error(
@@ -586,7 +587,10 @@ def _convert_tool_use_to_agent_action(
                 )
                 return None
 
-            if action_model_payload is not None:
+            if action_payload_dict is not None:
+                action_model_payload = TypeAdapter(AgentActionType).validate_python(
+                    action_payload_dict
+                )
                 return AgentAction(
                     action_type=action_type_str,
                     action=action_model_payload,

From a76eefeadc92c4b81effcc9b8b77fce272dc86e8 Mon Sep 17 00:00:00 2001
From: Derek Meegan <derek@browserbase.com>
Date: Fri, 24 Oct 2025 09:35:31 -0700
Subject: [PATCH 6/7] formatting

---
 stagehand/agent/anthropic_cua.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/stagehand/agent/anthropic_cua.py b/stagehand/agent/anthropic_cua.py
index 85401322..67d6a3a6 100644
--- a/stagehand/agent/anthropic_cua.py
+++ b/stagehand/agent/anthropic_cua.py
@@ -553,7 +553,9 @@ def _convert_tool_use_to_agent_action(
                 # This is a read operation, not directly supported
                 # Return a no-op for now
                 action_payload_dict = {
-                    "type": "screenshot",  # Use screenshot as a way to show cursor position
+                    "type": (
+                        "screenshot"
+                    ),  # Use screenshot as a way to show cursor position
                 }
                 action_type_str = "screenshot"  # Normalize
 

From 3d9d7dd574e41a6bc1c98bd665d4bd8fcfa2ce90 Mon Sep 17 00:00:00 2001
From: Derek Meegan <derek@browserbase.com>
Date: Fri, 24 Oct 2025 09:40:52 -0700
Subject: [PATCH 7/7] add back point and function argument types

---
 stagehand/agent/anthropic_cua.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/stagehand/agent/anthropic_cua.py b/stagehand/agent/anthropic_cua.py
index 67d6a3a6..df6c7a23 100644
--- a/stagehand/agent/anthropic_cua.py
+++ b/stagehand/agent/anthropic_cua.py
@@ -501,12 +501,13 @@ def _convert_tool_use_to_agent_action(
                     and x is not None
                     and y is not None
                 ):
+                    path_points = [
+                        Point(x=start_x, y=start_y),
+                        Point(x=x, y=y),
+                    ]
                     action_payload_dict = {
                         "type": "drag",
-                        "path": [
-                            {"x": start_x, "y": start_y},
-                            {"x": x, "y": y},
-                        ],
+                        "path": path_points,
                     }
                     action_type_str = "drag"  # Normalize
                 else:
@@ -566,7 +567,7 @@ def _convert_tool_use_to_agent_action(
                         action_payload_dict = {
                             "type": "function",
                             "name": "goto",
-                            "arguments": {"url": url},
+                            "arguments": FunctionArguments(url=url),
                         }
                         action_type_str = "function"
                     else:
@@ -579,7 +580,7 @@ def _convert_tool_use_to_agent_action(
                     action_payload_dict = {
                         "type": "function",
                         "name": "navigate_back",
-                        "arguments": None,
+                        "arguments": FunctionArguments(),
                     }
                     action_type_str = "function"
             else: