Merge pull request #130 from browserbase/kylejeong/gro-585-agent-tool-for-mcp-server

miguelg719 · web-flow · commit b4d1d1683cd5 · 2025-11-11T10:53:48.000-08:00
feat: adding agent tool for mcp server
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,11 @@
 # @browserbasehq/mcp-server-browserbase
 
+## 2.4.0
+
+### Minor Changes
+
+- feat: adding stagehand agent tool
+
 ## 2.3.0
 
 ### Minor Changes
diff --git a/package.json b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@browserbasehq/mcp-server-browserbase",
-  "version": "2.3.0",
+  "version": "2.4.0",
   "description": "MCP server for AI web browser automation using Browserbase and Stagehand",
   "mcpName": "io.github.browserbase/mcp-server-browserbase",
   "license": "Apache-2.0",
diff --git a/src/sessionManager.ts b/src/sessionManager.ts
@@ -47,8 +47,8 @@ export const createStagehandInstance = async (
       keepAlive: config.keepAlive ?? false,
       browserSettings: {
         viewport: {
-          width: config.viewPort?.browserWidth ?? 1024,
-          height: config.viewPort?.browserHeight ?? 768,
+          width: config.viewPort?.browserWidth ?? 1288,
+          height: config.viewPort?.browserHeight ?? 711,
         },
         context: config.context?.contextId
           ? {
diff --git a/src/tools/agent.ts b/src/tools/agent.ts
@@ -0,0 +1,83 @@
+import { z } from "zod";
+import type { Tool, ToolSchema, ToolResult } from "./tool.js";
+import type { Context } from "../context.js";
+import type { ToolActionResult } from "../types/types.js";
+
+/**
+ * Stagehand Agent
+ * Docs: https://docs.stagehand.dev/basics/agent
+ *
+ * This tool uses Gemini Computer Use to autonomously complete web-based tasks.
+ * The agent will navigate, interact, and complete the task described in the prompt.
+ */
+
+const AgentInputSchema = z.object({
+  prompt: z.string().describe(
+    `The task prompt describing what you want the sub-agent to accomplish.
+    Be clear and specific about the goal. For example:
+    'Go to Hacker News and find the most controversial post from today, then summarize the top 3 comments'.
+    The agent will autonomously navigate and interact with web pages to complete this task.`,
+  ),
+});
+
+type AgentInput = z.infer<typeof AgentInputSchema>;
+
+const agentSchema: ToolSchema<typeof AgentInputSchema> = {
+  name: "browserbase_stagehand_agent",
+  description: `Execute a task autonomously using Gemini Computer Use agent. The agent will navigate and interact with web pages to complete the given task.`,
+  inputSchema: AgentInputSchema,
+};
+
+async function handleAgent(
+  context: Context,
+  params: AgentInput,
+): Promise<ToolResult> {
+  const action = async (): Promise<ToolActionResult> => {
+    try {
+      const stagehand = await context.getStagehand();
+
+      // API key: first non-empty of GEMINI_API_KEY, GOOGLE_API_KEY, GOOGLE_GENERATIVE_AI_API_KEY
+      const agent = stagehand.agent({
+        cua: true,
+        model: {
+          modelName: "google/gemini-2.5-computer-use-preview-10-2025",
+          apiKey:
+            process.env.GEMINI_API_KEY ||
+            process.env.GOOGLE_API_KEY ||
+            process.env.GOOGLE_GENERATIVE_AI_API_KEY,
+        },
+      });
+
+      // Run the caller's prompt as the agent instruction, with maxSteps set to 20
+      const result = await agent.execute({
+        instruction: params.prompt,
+        maxSteps: 20,
+      });
+
+      return {
+        content: [
+          {
+            type: "text",
+            text: `${result.message}`,
+          },
+        ],
+      };
+    } catch (error) {
+      const errorMsg = error instanceof Error ? error.message : String(error);
+      throw new Error(`Failed to execute agent task: ${errorMsg}`);
+    }
+  };
+
+  return {
+    action,
+    waitForNetwork: false,
+  };
+}
+
+const agentTool: Tool<typeof AgentInputSchema> = {
+  capability: "core",
+  schema: agentSchema,
+  handle: handleAgent,
+};
+
+export default agentTool;
diff --git a/src/tools/index.ts b/src/tools/index.ts
@@ -5,6 +5,7 @@ import observeTool from "./observe.js";
 import screenshotTool from "./screenshot.js";
 import sessionTools from "./session.js";
 import getUrlTool from "./url.js";
+import agentTool from "./agent.js";
 
 // Export individual tools
 export { default as navigateTool } from "./navigate.js";
@@ -14,6 +15,7 @@ export { default as observeTool } from "./observe.js";
 export { default as screenshotTool } from "./screenshot.js";
 export { default as sessionTools } from "./session.js";
 export { default as getUrlTool } from "./url.js";
+export { default as agentTool } from "./agent.js";
 
 // Export all tools as array
 export const TOOLS = [
@@ -24,6 +26,7 @@ export const TOOLS = [
   observeTool,
   screenshotTool,
   getUrlTool,
+  agentTool,
 ];
 
 export const sessionManagementTools = sessionTools;

Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,6 @@`
`1`	`1`	`{`
`2`	`2`	`"name": "@browserbasehq/mcp-server-browserbase",`
`3`		`- "version": "2.3.0",`
	`3`	`+ "version": "2.4.0",`
`4`	`4`	`"description": "MCP server for AI web browser automation using Browserbase and Stagehand",`
`5`	`5`	`"mcpName": "io.github.browserbase/mcp-server-browserbase",`
`6`	`6`	`"license": "Apache-2.0",`