From 42f3b206897380a49ed39277acbbafbf53259225 Mon Sep 17 00:00:00 2001 From: Kylejeong2 Date: Mon, 10 Nov 2025 15:27:02 -0800 Subject: [PATCH 1/4] feat: adding stagehand agent tool --- src/tools/agent.ts | 91 ++++++++++++++++++++++++++++++++++++++++++++++ src/tools/index.ts | 3 ++ 2 files changed, 94 insertions(+) create mode 100644 src/tools/agent.ts diff --git a/src/tools/agent.ts b/src/tools/agent.ts new file mode 100644 index 0000000..0cbb002 --- /dev/null +++ b/src/tools/agent.ts @@ -0,0 +1,91 @@ +import { z } from "zod"; +import type { Tool, ToolSchema, ToolResult } from "./tool.js"; +import type { Context } from "../context.js"; +import type { ToolActionResult } from "../types/types.js"; + +/** + * Stagehand Agent + * Docs: https://docs.stagehand.dev/basics/agent + * + * This tool uses Gemini Computer Use to autonomously complete web-based tasks. + * The agent will navigate, interact, and complete the task described in the prompt. + */ + +const AgentInputSchema = z.object({ + prompt: z.string().describe( + `The task prompt describing what you want the agent to accomplish. + Be clear and specific about the goal. For example: + 'Go to Hacker News and find the most controversial post from today, then summarize the top 3 comments'. + The agent will autonomously navigate and interact with web pages to complete this task.`, + ), +}); + +type AgentInput = z.infer; + +const agentSchema: ToolSchema = { + name: "browserbase_stagehand_agent", + description: `Execute a task autonomously using Gemini Computer Use agent. The agent will navigate and interact with web pages to complete the given task.`, + inputSchema: AgentInputSchema, +}; + +async function handleAgent( + context: Context, + params: AgentInput, +): Promise { + const action = async (): Promise => { + try { + const stagehand = await context.getStagehand(); + + // You need to provide GOOGLE_GENERATIVE_AI_API_KEY + const agent = stagehand.agent({ + cua: true, + model: { + modelName: "google/gemini-2.5-computer-use-preview-10-2025", + apiKey: + process.env.GOOGLE_GENERATIVE_AI_API_KEY || + process.env.GOOGLE_API_KEY || + process.env.GEMINI_API_KEY, + }, + }); + + // Execute the task + const result = await agent.execute({ + instruction: params.prompt, + maxSteps: 20, + }); + + // Format response with both steps and result + // The result structure may vary, so we handle it flexibly + const resultData = result as unknown as Record; + const response = { + result: resultData.result || result, + steps: resultData.steps || resultData.trace || [], + }; + + return { + content: [ + { + type: "text", + text: `Agent execution completed:\n${JSON.stringify(response, null, 2)}`, + }, + ], + }; + } catch (error) { + const errorMsg = error instanceof Error ? error.message : String(error); + throw new Error(`Failed to execute agent task: ${errorMsg}`); + } + }; + + return { + action, + waitForNetwork: false, + }; +} + +const agentTool: Tool = { + capability: "core", + schema: agentSchema, + handle: handleAgent, +}; + +export default agentTool; diff --git a/src/tools/index.ts b/src/tools/index.ts index 865d9fa..f0a19da 100644 --- a/src/tools/index.ts +++ b/src/tools/index.ts @@ -5,6 +5,7 @@ import observeTool from "./observe.js"; import screenshotTool from "./screenshot.js"; import sessionTools from "./session.js"; import getUrlTool from "./url.js"; +import agentTool from "./agent.js"; // Export individual tools export { default as navigateTool } from "./navigate.js"; @@ -14,6 +15,7 @@ export { default as observeTool } from "./observe.js"; export { default as screenshotTool } from "./screenshot.js"; export { default as sessionTools } from "./session.js"; export { default as getUrlTool } from "./url.js"; +export { default as agentTool } from "./agent.js"; // Export all tools as array export const TOOLS = [ @@ -24,6 +26,7 @@ export const TOOLS = [ observeTool, screenshotTool, getUrlTool, + agentTool, ]; export const sessionManagementTools = sessionTools; From 15fb63f8fa085fa93c7eab19f229d3b801b93ea0 Mon Sep 17 00:00:00 2001 From: Kylejeong2 Date: Mon, 10 Nov 2025 15:27:56 -0800 Subject: [PATCH 2/4] changesets --- CHANGELOG.md | 6 ++++++ package.json | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f71440a..4c958af 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ # @browserbasehq/mcp-server-browserbase +## 2.4.0 + +### Minor Changes + +- feat: adding stagehand agent tool + ## 2.3.0 ### Minor Changes diff --git a/package.json b/package.json index fb43c7c..4539004 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@browserbasehq/mcp-server-browserbase", - "version": "2.3.0", + "version": "2.4.0", "description": "MCP server for AI web browser automation using Browserbase and Stagehand", "mcpName": "io.github.browserbase/mcp-server-browserbase", "license": "Apache-2.0", From 9a962982ca42e52772a292bfdadd41cc10670949 Mon Sep 17 00:00:00 2001 From: Kylejeong2 Date: Mon, 10 Nov 2025 15:31:23 -0800 Subject: [PATCH 3/4] update default session size + change agent tool to only return the result --- src/sessionManager.ts | 4 ++-- src/tools/agent.ts | 10 +--------- 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/src/sessionManager.ts b/src/sessionManager.ts index e6aacc9..602f869 100644 --- a/src/sessionManager.ts +++ b/src/sessionManager.ts @@ -47,8 +47,8 @@ export const createStagehandInstance = async ( keepAlive: config.keepAlive ?? false, browserSettings: { viewport: { - width: config.viewPort?.browserWidth ?? 1024, - height: config.viewPort?.browserHeight ?? 768, + width: config.viewPort?.browserWidth ?? 1288, + height: config.viewPort?.browserHeight ?? 711, }, context: config.context?.contextId ? { diff --git a/src/tools/agent.ts b/src/tools/agent.ts index 0cbb002..9ce0319 100644 --- a/src/tools/agent.ts +++ b/src/tools/agent.ts @@ -54,19 +54,11 @@ async function handleAgent( maxSteps: 20, }); - // Format response with both steps and result - // The result structure may vary, so we handle it flexibly - const resultData = result as unknown as Record; - const response = { - result: resultData.result || result, - steps: resultData.steps || resultData.trace || [], - }; - return { content: [ { type: "text", - text: `Agent execution completed:\n${JSON.stringify(response, null, 2)}`, + text: `Agent execution completed:\n${JSON.stringify(result, null, 2)}`, }, ], }; From 0e302a76464521c550944a0bf47824536fd566e8 Mon Sep 17 00:00:00 2001 From: Kylejeong2 Date: Mon, 10 Nov 2025 15:36:15 -0800 Subject: [PATCH 4/4] have agent only return result.message --- src/tools/agent.ts | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/tools/agent.ts b/src/tools/agent.ts index 9ce0319..e333079 100644 --- a/src/tools/agent.ts +++ b/src/tools/agent.ts @@ -13,7 +13,7 @@ import type { ToolActionResult } from "../types/types.js"; const AgentInputSchema = z.object({ prompt: z.string().describe( - `The task prompt describing what you want the agent to accomplish. + `The task prompt describing what you want the sub-agent to accomplish. Be clear and specific about the goal. For example: 'Go to Hacker News and find the most controversial post from today, then summarize the top 3 comments'. The agent will autonomously navigate and interact with web pages to complete this task.`, @@ -42,9 +42,9 @@ async function handleAgent( model: { modelName: "google/gemini-2.5-computer-use-preview-10-2025", apiKey: - process.env.GOOGLE_GENERATIVE_AI_API_KEY || + process.env.GEMINI_API_KEY || process.env.GOOGLE_API_KEY || - process.env.GEMINI_API_KEY, + process.env.GOOGLE_GENERATIVE_AI_API_KEY, }, }); @@ -58,7 +58,7 @@ async function handleAgent( content: [ { type: "text", - text: `Agent execution completed:\n${JSON.stringify(result, null, 2)}`, + text: `${result.message}`, }, ], };