diff --git a/.changeset/horned-giga-cockatoo.md b/.changeset/horned-giga-cockatoo.md new file mode 100644 index 00000000..ba6f1d7a --- /dev/null +++ b/.changeset/horned-giga-cockatoo.md @@ -0,0 +1,5 @@ +--- +"stagehand": patch +--- + +Added support for Gemini Computer Use models diff --git a/examples/agent_example.py b/examples/agent_example.py index 9d7e9b76..5aca9c21 100644 --- a/examples/agent_example.py +++ b/examples/agent_example.py @@ -43,7 +43,7 @@ async def main(): self_heal=True, system_prompt="You are a browser automation assistant that helps users navigate websites effectively.", model_client_options={"apiKey": os.getenv("MODEL_API_KEY")}, - verbose=1, + verbose=2, ) # Create a Stagehand client using the configuration object. @@ -64,9 +64,9 @@ async def main(): console.print("\n▶️ [highlight] Using Agent to perform a task[/]: playing a game of 2048") agent = stagehand.agent( - model="computer-use-preview", + model="gemini-2.5-computer-use-preview-10-2025", instructions="You are a helpful web navigation assistant that helps users find information. You are currently on the following page: google.com. 
Do not ask follow up questions, the user will trust your judgement.", - options={"apiKey": os.getenv("MODEL_API_KEY")} + options={"apiKey": os.getenv("GEMINI_API_KEY")} ) agent_result = await agent.execute( instruction="Play a game of 2048", diff --git a/pyproject.toml b/pyproject.toml index 03d46e65..1851c49f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,7 +9,7 @@ description = "Python SDK for Stagehand" readme = "README.md" classifiers = [ "Programming Language :: Python :: 3", "License :: OSI Approved :: MIT License", "Operating System :: OS Independent",] requires-python = ">=3.9" -dependencies = [ "httpx>=0.24.0", "python-dotenv>=1.0.0", "pydantic>=1.10.0", "playwright>=1.42.1", "requests>=2.31.0", "browserbase>=1.4.0", "rich>=13.7.0", "openai>=1.99.6", "anthropic>=0.51.0", "litellm>=1.72.0,<1.75.0", "nest-asyncio>=1.6.0",] +dependencies = [ "httpx>=0.24.0", "python-dotenv>=1.0.0", "pydantic>=1.10.0", "playwright>=1.42.1", "requests>=2.31.0", "browserbase>=1.4.0", "rich>=13.7.0", "openai>=1.99.6", "anthropic>=0.51.0", "litellm>=1.72.0,<1.75.0", "nest-asyncio>=1.6.0", "google-genai>=1.40.0"] [[project.authors]] name = "Browserbase, Inc." 
email = "support@browserbase.com" diff --git a/stagehand/agent/agent.py b/stagehand/agent/agent.py index d455c1d5..507aeacb 100644 --- a/stagehand/agent/agent.py +++ b/stagehand/agent/agent.py @@ -13,19 +13,24 @@ ) from .anthropic_cua import AnthropicCUAClient from .client import AgentClient +from .google_cua import GoogleCUAClient from .openai_cua import OpenAICUAClient MODEL_TO_CLIENT_CLASS_MAP: dict[str, type[AgentClient]] = { - "computer-use-preview": OpenAICUAClient, + "computer-use-preview": OpenAICUAClient, "claude-3-5-sonnet-latest": AnthropicCUAClient, "claude-3-7-sonnet-latest": AnthropicCUAClient, "claude-sonnet-4-20250514": AnthropicCUAClient, + "claude-sonnet-4-5-20250929": AnthropicCUAClient, + "gemini-2.5-computer-use-preview-10-2025": GoogleCUAClient, } MODEL_TO_PROVIDER_MAP: dict[str, AgentProvider] = { - "computer-use-preview": AgentProvider.OPENAI, + "computer-use-preview": AgentProvider.OPENAI, "claude-3-5-sonnet-20240620": AgentProvider.ANTHROPIC, "claude-3-7-sonnet-20250219": AgentProvider.ANTHROPIC, "claude-sonnet-4-20250514": AgentProvider.ANTHROPIC, + "claude-sonnet-4-5-20250929": AgentProvider.ANTHROPIC, + "gemini-2.5-computer-use-preview-10-2025": AgentProvider.GOOGLE, # Add more mappings as needed } diff --git a/stagehand/agent/anthropic_cua.py b/stagehand/agent/anthropic_cua.py index 8a896f8a..f0d8b2a7 100644 --- a/stagehand/agent/anthropic_cua.py +++ b/stagehand/agent/anthropic_cua.py @@ -62,7 +62,7 @@ def __init__( ) dimensions = ( - (viewport["width"], viewport["height"]) if viewport else (1024, 768) + (viewport["width"], viewport["height"]) if viewport else (1288, 711) ) # Default dimensions if self.config: if hasattr(self.config, "display_width") and self.config.display_width is not None: # type: ignore diff --git a/stagehand/agent/google_cua.py b/stagehand/agent/google_cua.py new file mode 100644 index 00000000..4410061d --- /dev/null +++ b/stagehand/agent/google_cua.py @@ -0,0 +1,679 @@ +import asyncio +import os 
+from typing import Any, Optional + +from dotenv import load_dotenv +from google import genai +from google.genai import types +from google.genai.types import ( + Candidate, + Content, + FunctionResponse, + FunctionResponseBlob, + FunctionResponsePart, + GenerateContentConfig, + Part, +) + +from ..handlers.cua_handler import CUAHandler +from ..types.agent import ( + ActionExecutionResult, + AgentAction, + AgentActionType, + AgentConfig, + AgentExecuteOptions, + AgentResult, + FunctionAction, +) +from .client import AgentClient + +load_dotenv() + + +class GoogleCUAClient(AgentClient): + def __init__( + self, + model: str = "gemini-2.5-computer-use-preview-10-2025", + instructions: Optional[str] = None, # System prompt + config: Optional[AgentConfig] = None, + logger: Optional[Any] = None, + handler: Optional[CUAHandler] = None, + viewport: Optional[dict[str, int]] = None, + **kwargs, # Allow for other Google specific options if any + ): + super().__init__(model, instructions, config, logger, handler) + + # Match OpenAI pattern for API key handling + api_key = None + if config and hasattr(config, "options") and config.options: + api_key = config.options.get("apiKey") + if not api_key: + api_key = os.getenv("GEMINI_API_KEY") + if not api_key: + raise ValueError( + "GEMINI_API_KEY environment variable not set and not provided in config." 
+ ) + + self.genai_client = genai.Client(api_key=api_key) + + # Match OpenAI pattern for viewport handling + dimensions = ( + (viewport["width"], viewport["height"]) if viewport else (1288, 711) + ) + if ( + self.config + and hasattr(self.config, "display_width") + and hasattr(self.config, "display_height") + ): + dimensions = [self.config.display_width, self.config.display_height] + + self.display_width = dimensions[0] + self.display_height = dimensions[1] + + self._generate_content_config = GenerateContentConfig( + temperature=1, + top_p=0.95, + top_k=40, + max_output_tokens=8192, + tools=[ + types.Tool( + computer_use=types.ComputerUse( + environment=types.Environment.ENVIRONMENT_BROWSER + ) + ) + ], + ) + + self.history: list[Content] = [] + + def format_screenshot(self, screenshot_base64: str) -> Part: + """Formats a screenshot for the Gemini CUA model.""" + return Part( + inline_data=types.Blob(mime_type="image/png", data=screenshot_base64) + ) + + def _format_initial_messages( + self, instruction: str, screenshot_base64: Optional[str] + ) -> list[Content]: + """Formats the initial messages for the Gemini CUA model.""" + parts: list[Part] = [] + + # Add system prompt if provided + if self.instructions: + parts.append(Part(text=self.instructions)) + + parts.append(Part(text=instruction)) + + if screenshot_base64: + parts.append(self.format_screenshot(screenshot_base64)) + + # Initial user message + initial_content = Content(role="user", parts=parts) + self.history = [initial_content] # Start history with the first user message + return self.history + + def _normalize_coordinates(self, x: int, y: int) -> tuple[int, int]: + """Normalizes coordinates from 0-1000 range to actual display dimensions.""" + norm_x = int(x / 1000 * self.display_width) + norm_y = int(y / 1000 * self.display_height) + return norm_x, norm_y + + def _process_provider_response( + self, response: types.GenerateContentResponse + ) -> tuple[ + list[AgentAction], + Optional[str], + bool, + 
Optional[str], + list[tuple[str, dict[str, Any]]], + ]: + if not response.candidates: + self.logger.error("No candidates in Google response", category="agent") + return ( + [], + "Error: No candidates from model", + True, + "Error: No candidates from model", + [], + ) + + candidate = response.candidates[0] + self.history.append(candidate.content) # Add model's response to history + + reasoning_text: Optional[str] = None + function_call_parts: list[types.FunctionCall] = [] + + for part in candidate.content.parts: + if part.text: + if reasoning_text is None: + reasoning_text = part.text + else: + reasoning_text += ( + " " + part.text + ) # Concatenate if multiple text parts + if part.function_call: + function_call_parts.append(part.function_call) + + # Retry the request in case of malformed FCs + if ( + not function_call_parts + and not reasoning_text + and hasattr(candidate, "finish_reason") + and candidate.finish_reason == types.FinishReason.MALFORMED_FUNCTION_CALL + ): + return [], reasoning_text, False, None, [] + + if ( + hasattr(candidate, "finish_reason") + and candidate.finish_reason != types.FinishReason.FINISH_REASON_UNSPECIFIED + and candidate.finish_reason != types.FinishReason.STOP + and candidate.finish_reason != types.FinishReason.TOOL_CODE + ): + error_message = ( + f"Task stopped due to finish reason: {candidate.finish_reason.name}" + ) + if ( + candidate.finish_reason == types.FinishReason.SAFETY + and candidate.safety_ratings + ): + error_message += f" - Safety Ratings: {candidate.safety_ratings}" + self.logger.warning(error_message, category="agent") + return [], reasoning_text, True, error_message, [] + + if not function_call_parts: + # Task complete or no action needed + final_model_message = ( + reasoning_text if reasoning_text else "No further actions from model." + ) + self.logger.info( + f"Gemini CUA: Task appears complete or requires user input. 
Reason: {final_model_message}", + category="agent", + ) + return [], reasoning_text, True, final_model_message, [] + + # Process all function calls + agent_actions: list[AgentAction] = [] + invoked_function_info: list[tuple[str, dict[str, Any]]] = [] + + for function_call_part in function_call_parts: + action_name = function_call_part.name + action_args = function_call_part.args + # Store both name and args for safety acknowledgement + invoked_function_info.append( + (action_name, action_args if action_args else {}) + ) + action_type_str = "" + action_payload_dict = {} + + self.logger.info( + f"Function call part: {function_call_part}", category="agent" + ) + # Map Google's function calls to our AgentActionType + # This requires knowing the Pydantic models in ..types.agent + # ClickAction, TypeAction, KeyPressAction, ScrollAction, GoToAction, WaitAction, MoveAction + if action_name == "open_web_browser": + action_type_str = "function" + # For function actions, the payload for AgentActionType's root will be a FunctionAction model + # The FunctionAction model itself needs 'type', 'name', and 'arguments'. 
+ action_payload_dict = { + "type": "function", + "name": "open_web_browser", + "arguments": None, + } + elif action_name == "click_at": + action_type_str = "click" + x, y = self._normalize_coordinates(action_args["x"], action_args["y"]) + action_payload_dict = { + "type": "click", + "x": x, + "y": y, + "button": action_args.get("button", "left"), + } + elif action_name == "type_text_at": + action_type_str = "type" + x, y = self._normalize_coordinates(action_args["x"], action_args["y"]) + action_payload_dict = { + "type": "type", + "text": action_args["text"], + "x": x, + "y": y, + "press_enter_after": action_args.get("press_enter", False), + } + elif action_name == "key_combination": + action_type_str = "keypress" + keys = [ + self.key_to_playwright(key.strip()) + for key in action_args["keys"].split("+") + ] + action_payload_dict = {"type": "keypress", "keys": keys} + elif action_name == "scroll_document": + direction = action_args["direction"].lower() + if direction == "up": + action_type_str = "keypress" + action_payload_dict = { + "type": "keypress", + "keys": [self.key_to_playwright("PageUp")], + } + elif direction == "down": + action_type_str = "keypress" + action_payload_dict = { + "type": "keypress", + "keys": [self.key_to_playwright("PageDown")], + } + else: + self.logger.warning( + f"Unsupported scroll direction: {direction}", category="agent" + ) + return ( + [], + reasoning_text, + True, + f"Unsupported scroll direction: {direction}", + invoked_function_info, + ) + elif action_name == "scroll_at": + action_type_str = "scroll" + x, y = self._normalize_coordinates(action_args["x"], action_args["y"]) + direction = action_args["direction"].lower() + magnitude = action_args.get("magnitude", 800) + + # Denormalize magnitude based on direction + if direction in ("up", "down"): + magnitude = self._normalize_coordinates(0, magnitude)[1] + elif direction in ("left", "right"): + magnitude = self._normalize_coordinates(magnitude, 0)[0] + else: + 
self.logger.warning( + f"Unsupported scroll direction: {direction}", category="agent" + ) + return ( + [], + reasoning_text, + True, + f"Unsupported scroll direction: {direction}", + invoked_function_info, + ) + + action_payload_dict = { + "type": "scroll", + "x": x, + "y": y, + "direction": direction, + "magnitude": magnitude, + } + elif action_name == "drag_and_drop": + action_type_str = "function" + x, y = self._normalize_coordinates(action_args["x"], action_args["y"]) + dest_x, dest_y = self._normalize_coordinates( + action_args["destination_x"], action_args["destination_y"] + ) + action_payload_dict = { + "type": "function", + "name": "drag_and_drop", + "arguments": { + "x": x, + "y": y, + "destination_x": dest_x, + "destination_y": dest_y, + }, + } + elif action_name == "navigate": + action_type_str = "function" + action_payload_dict = { + "type": "function", + "name": "goto", + "arguments": {"url": action_args["url"]}, + } + elif action_name == "go_back": + action_type_str = "function" + action_payload_dict = { + "type": "function", + "name": "navigate_back", + "arguments": None, + } + elif action_name == "go_forward": + action_type_str = "function" + action_payload_dict = { + "type": "function", + "name": "navigate_forward", + "arguments": None, + } + elif action_name == "wait_5_seconds": + action_type_str = "wait" + action_payload_dict = {"type": "wait", "miliseconds": 5000} + elif action_name == "hover_at": + action_type_str = "move" + x, y = self._normalize_coordinates(action_args["x"], action_args["y"]) + action_payload_dict = {"type": "move", "x": x, "y": y} + elif action_name == "search": + action_type_str = "function" + action_payload_dict = { + "type": "function", + "name": "goto", + "arguments": {"url": "https://www.google.com"}, + } + else: + self.logger.warning( + f"Unsupported Gemini CUA function: {action_name}", category="agent" + ) + return ( + [], + reasoning_text, + True, + f"Unsupported function: {action_name}", + invoked_function_info, + ) 
+ + if action_payload_dict: # Check if a payload was successfully constructed + try: + # Directly construct the AgentActionType using the payload. + # Pydantic will use the 'type' field in action_payload_dict to discriminate the Union. + action_payload_for_agent_action_type = AgentActionType( + **action_payload_dict + ) + + agent_action = AgentAction( + action_type=action_type_str, # This should match the 'type' in action_payload_dict + action=action_payload_for_agent_action_type, # No RootModel wrapping if AgentActionType is the RootModel itself + reasoning=reasoning_text, + status="tool_code", + ) + agent_actions.append(agent_action) + except Exception as e_parse: + self.logger.error( + f"Error parsing Google action '{action_name}' with payload '{action_payload_dict}': {e_parse}", + category="agent", + ) + return ( + [], + reasoning_text, + True, + f"Error parsing action: {e_parse}", + invoked_function_info, + ) + + return ( + agent_actions, + reasoning_text, + False, + None, + invoked_function_info, + ) + + def _format_action_feedback( + self, + function_name_called: str, # Renamed from action_type_performed for clarity with Google's terminology + action_result: ActionExecutionResult, + new_screenshot_base64: str, + current_url: str, + function_call_args: Optional[dict[str, Any]] = None, + ) -> Content: # Returns a single Content object to append to history + + # The response to the model is a FunctionResponse part within a user role content. + # It should contain the result of the function call, which includes the new screenshot and URL. 
+ response_data: dict[str, Any] = { + "url": current_url, + } + + # Acknowledge safety decision for evals (auto-acknowledge like in TS implementation) + if function_call_args and function_call_args.get("safety_decision"): + response_data["safety_acknowledgement"] = "true" + self.logger.info( + "Auto-acknowledging safety decision", + category="agent", + ) + + if not action_result["success"]: + # Include error information if the action failed + response_data["error"] = action_result.get("error", "Unknown error") + self.logger.info( + f"Formatting failed action feedback for Gemini CUA: {response_data['error']}", + category="agent", + ) + + function_response_part = Part( + function_response=FunctionResponse( + name=function_name_called, + response=response_data, + parts=[ + FunctionResponsePart( + inline_data=FunctionResponseBlob( + mime_type="image/png", data=new_screenshot_base64 + ) + ) + ], + ) + ) + + feedback_content = Content(role="user", parts=[function_response_part]) + self.history.append(feedback_content) # Add this feedback to history + return feedback_content + + async def run_task( + self, + instruction: str, + max_steps: int = 20, + options: Optional[AgentExecuteOptions] = None, + ) -> AgentResult: + self.logger.debug( + f"Gemini CUA starting task: '{instruction}' with max_steps: {max_steps}", + category="agent", + ) + + if not self.handler: + self.logger.error( + "CUAHandler not available for GoogleCUAClient.", category="agent" + ) + return AgentResult( + completed=False, + actions=[], + message="Internal error: Handler not set.", + usage={"input_tokens": 0, "output_tokens": 0, "inference_time_ms": 0}, + ) + + await self.handler.inject_cursor() + current_screenshot_b64 = await self.handler.get_screenshot_base64() + current_url = self.handler.page.url + + # _format_initial_messages already initializes self.history + self._format_initial_messages(instruction, current_screenshot_b64) + + actions_taken_detail: list[AgentAction] = ( + [] + ) # To store full 
AgentAction objects with reasoning, etc. + actions_taken_summary: list[AgentActionType] = ( + [] + ) # To store just the action payloads for AgentResult + + total_input_tokens = 0 # Note: Google API does not directly return token counts per call in the same way as OpenAI. + total_output_tokens = ( + 0 # We might need to estimate or rely on aggregated billing data. + ) + total_inference_time_ms = 0 + + for step_count in range(max_steps): + self.logger.info( + f"Gemini CUA - Step {step_count + 1}/{max_steps}", + category="agent", + ) + + start_time = asyncio.get_event_loop().time() + try: + model_response = self.genai_client.models.generate_content( + model=self.model, + contents=self.history, + config=self._generate_content_config, + ) + end_time = asyncio.get_event_loop().time() + total_inference_time_ms += int((end_time - start_time) * 1000) + + # Token count handling (placeholder as Google API differs from OpenAI here) + # For now, we won't get precise token counts from the response object directly. + # If available through another means (e.g. response.usage_metadata), it can be added. 
+ if ( + hasattr(model_response, "usage_metadata") + and model_response.usage_metadata + ): + # Example: total_input_tokens += model_response.usage_metadata.prompt_token_count + # Example: total_output_tokens += model_response.usage_metadata.candidates_token_count + pass # Adjust if actual fields are known + + except Exception as e: + self.logger.error(f"Google API call failed: {e}", category="agent") + usage_obj = { + "input_tokens": total_input_tokens, + "output_tokens": total_output_tokens, + "inference_time_ms": total_inference_time_ms, + } + return AgentResult( + actions=actions_taken_summary, + message=f"Google API error: {e}", + completed=False, # Changed to False as task did not complete successfully + usage=usage_obj, + ) + + ( + agent_actions, + reasoning_text, + task_completed, + final_model_message, + invoked_function_info_list, + ) = self._process_provider_response(model_response) + + if reasoning_text: + self.logger.info(f"Model reasoning: {reasoning_text}", category="agent") + + if agent_actions: + for idx, agent_action in enumerate(agent_actions): + actions_taken_detail.append(agent_action) + if agent_action.action: + actions_taken_summary.append(agent_action.action) + + invoked_function_name, function_call_args = ( + invoked_function_info_list[idx] + ) + + if invoked_function_name == "open_web_browser": + action_result: ActionExecutionResult = { + "success": True, + "error": None, + } + else: + action_result: ActionExecutionResult = ( + await self.handler.perform_action(agent_action) + ) + current_screenshot_b64 = ( + await self.handler.get_screenshot_base64() + ) + current_url = self.handler.page.url + + function_name_called_for_feedback = "" + if agent_action.action_type == "function" and isinstance( + agent_action.action.root, FunctionAction + ): + function_name_called_for_feedback = ( + agent_action.action.root.name + ) + self._format_action_feedback( + function_name_called=function_name_called_for_feedback, + action_result=action_result, + 
+ new_screenshot_base64=current_screenshot_b64, + current_url=current_url, + function_call_args=function_call_args, + ) + else: + if not invoked_function_name: + self.logger.error( + "Original Google function name not found for feedback loop (was None).", + category="agent", + ) + else: + self._format_action_feedback( + function_name_called=invoked_function_name, + action_result=action_result, + new_screenshot_base64=current_screenshot_b64, + current_url=current_url, + function_call_args=function_call_args, + ) + + if task_completed: + self.logger.info( + f"Task marked complete by model. Final message: {final_model_message}", + category="agent", + ) + usage_obj = { + "input_tokens": total_input_tokens, + "output_tokens": total_output_tokens, + "inference_time_ms": total_inference_time_ms, + } + return AgentResult( + actions=actions_taken_summary, + message=final_model_message or "Task completed.", + completed=True, + usage=usage_obj, + ) + + if not agent_actions and not task_completed: + self.logger.warning( + "Model did not request an action and task not marked complete. 
Ending task.", + category="agent", + ) + usage_obj = { + "input_tokens": total_input_tokens, + "output_tokens": total_output_tokens, + "inference_time_ms": total_inference_time_ms, + } + return AgentResult( + actions=actions_taken_summary, + message=final_model_message or "Model provided no further actions.", + completed=False, # Task did not complete as expected + usage=usage_obj, + ) + + self.logger.warning("Max steps reached for Gemini CUA task.", category="agent") + usage_obj = { + "input_tokens": total_input_tokens, + "output_tokens": total_output_tokens, + "inference_time_ms": total_inference_time_ms, + } + return AgentResult( + actions=actions_taken_summary, + message="Max steps reached.", + completed=False, + usage=usage_obj, + ) + + def key_to_playwright(self, key: str) -> str: + """Converts a key name if Gemini CUA uses specific names not covered by CUAHandler.""" + # This largely mirrors CUAHandler._convert_key_name, but can be adapted for Google specifics if any. + # Google's `key_combination` takes strings like "control+c", so this function might be used to map + # individual key names if they differ from Playwright standards *before* CUAHandler gets them. + # However, CUAHandler already has a robust _convert_key_name. So, this client-side one + # might only be needed if Google uses names that _convert_key_name doesn't already handle + # or if we want to pre-process them. + # For now, assume direct pass-through or simple mappings if Google uses very different names. + # The CUAHandler._convert_key_name is quite comprehensive. + # Let's make this a simple pass-through and rely on CUAHandler's conversion. + # If Google sends "ENTER", CUAHandler will map it. If Google sends "Enter", it still works. + # If Google has a special name like "GOOGLE_SEARCH_KEY", it would be mapped here. 
+ custom_map = { + # e.g., "SpecificGoogleKey": "PlaywrightEquivalentKey" + } + return custom_map.get( + key, key + ) # Return mapped key or original if not in custom_map + + def get_text(self, candidate: Candidate) -> Optional[str]: + """Extracts the text from the candidate.""" + text = [] + for part in candidate.content.parts: + if part.text: + text.append(part.text) + return " ".join(text) or None + + def get_function_call(self, candidate: Candidate) -> Optional[types.FunctionCall]: + """Extracts the function call from the candidate.""" + for part in candidate.content.parts: + if part.function_call: + return part.function_call + return None diff --git a/stagehand/agent/openai_cua.py b/stagehand/agent/openai_cua.py index a1f2caaf..a1b9db89 100644 --- a/stagehand/agent/openai_cua.py +++ b/stagehand/agent/openai_cua.py @@ -42,7 +42,7 @@ def __init__( ) dimensions = ( - (viewport["width"], viewport["height"]) if viewport else (1024, 768) + (viewport["width"], viewport["height"]) if viewport else (1288, 711) ) # Default or from self.config if specified if ( self.config diff --git a/stagehand/browser.py b/stagehand/browser.py index d21560d5..e7f985e1 100644 --- a/stagehand/browser.py +++ b/stagehand/browser.py @@ -53,8 +53,8 @@ async def connect_browserbase_browser( project_id=stagehand_instance.browserbase_project_id, browser_settings={ "viewport": { - "width": 1024, - "height": 768, + "width": 1288, + "height": 711, }, }, ) @@ -194,7 +194,7 @@ async def connect_local_browser( ], ), "viewport": local_browser_launch_options.get( - "viewport", {"width": 1024, "height": 768} + "viewport", {"width": 1288, "height": 711} ), "locale": local_browser_launch_options.get("locale", "en-US"), "timezone_id": local_browser_launch_options.get( diff --git a/stagehand/schemas.py b/stagehand/schemas.py index 5ff23fb2..3cb0d54e 100644 --- a/stagehand/schemas.py +++ b/stagehand/schemas.py @@ -234,6 +234,7 @@ class AgentProvider(str, Enum): OPENAI = "openai" ANTHROPIC = "anthropic" + 
GOOGLE = "google" class AgentConfig(StagehandBaseModel):