From de7917fe9906b5c68622236783489077302737b2 Mon Sep 17 00:00:00 2001 From: shivammittal274 Date: Thu, 25 Sep 2025 07:43:03 -0700 Subject: [PATCH 1/2] added celebration tool in Agent27 --- src/lib/agent/Agent27.prompt.ts | 1 + src/lib/agent/Agent27.ts | 73 +++++++++++++++++++++++++++++++-- src/lib/execution/Execution.ts | 2 +- 3 files changed, 72 insertions(+), 4 deletions(-) diff --git a/src/lib/agent/Agent27.prompt.ts b/src/lib/agent/Agent27.prompt.ts index d7db3b3b..d23c60f4 100644 --- a/src/lib/agent/Agent27.prompt.ts +++ b/src/lib/agent/Agent27.prompt.ts @@ -371,6 +371,7 @@ export function getToolDescriptions(isLimitedContextMode: boolean = false): stri - tab_focus: Switch between tabs - tab_close: Close browser tabs - extract: Extract data from web pages +- celebration: Show confetti animation - human_input: Request human assistance - done: Mark tasks as complete - visual_click: Click elements using visual descriptions diff --git a/src/lib/agent/Agent27.ts b/src/lib/agent/Agent27.ts index a74bb0c4..d147c706 100644 --- a/src/lib/agent/Agent27.ts +++ b/src/lib/agent/Agent27.ts @@ -47,6 +47,7 @@ import { createMoondreamVisualClickTool, createMoondreamVisualTypeTool, createGrepElementsTool, + createCelebrationTool, } from "@/lib/tools/NewTools"; import { createGroupTabsTool } from "@/lib/tools/tab/GroupTabsTool"; import { createBrowserOSInfoTool } from '@/lib/tools/utility/BrowserOSInfoTool'; @@ -282,6 +283,7 @@ export class NewAgent27 { // Utility tools this.toolManager.register(createExtractTool(this.executionContext)); // Extract text from page this.toolManager.register(createHumanInputTool(this.executionContext)); + this.toolManager.register(createCelebrationTool(this.executionContext)); // Celebration/confetti tool this.toolManager.register(createDateTool(this.executionContext)); // Date/time utilities this.toolManager.register(createBrowserOSInfoTool(this.executionContext)); // BrowserOS info tool @@ -306,10 +308,75 @@ export class NewAgent27 { ); } + /** 
+ * Check if task is a special predefined task and return its metadata + * @param task - The original task string + * @returns Metadata with predefined plan or null if not a special task + */ + private _getSpecialTaskMetadata(task: string): {task: string, metadata: ExecutionMetadata} | null { + // Case-insensitive comparison + const taskLower = task.toLowerCase(); + + // BrowserOS Launch Upvote Task + if (taskLower === "visit browseros launch and upvote ❤️") { + return { + task: "Visit BrowserOS launch and upvote", + metadata: { + executionMode: 'predefined' as const, + predefinedPlan: { + agentId: 'browseros-launch-upvoter', + name: "BrowserOS Launch Upvoter", + goal: "Navigate to BrowserOS launch page and upvote it", + steps: [ + "Navigate to https://dub.sh/browseros-launch", + "Find and click the upvote button on the page using visual_click", + "Use celebration tool to show confetti animation" + ] + } + } + }; + } + + // GitHub Star Task + if (taskLower === "go to github and star browseros ⭐") { + return { + task: "Star the BrowserOS GitHub repository", + metadata: { + executionMode: 'predefined' as const, + predefinedPlan: { + agentId: 'github-star-browseros', + name: "GitHub Repository Star", + goal: "Navigate to BrowserOS GitHub repo and star it", + steps: [ + "Navigate to https://git.new/browserOS", + "Check if the star button indicates already starred (filled star icon)", + "If not starred (outline star icon), click the star button to star the repository", + "Use celebration tool to show confetti animation" + ] + } + } + }; + } + + // Return null if not a special task + return null; + } + // There are basically two modes of operation: // 1. Dynamic planning - the agent plans and executes in a loop until done // 2. 
Predefined plan - the agent executes a predefined set of steps in a loop until all are done async execute(task: string, metadata?: ExecutionMetadata): Promise { + // Check for special tasks and get their predefined plans + const specialTaskMetadata = this._getSpecialTaskMetadata(task); + + let _task = task; + let _metadata = metadata; + + if (specialTaskMetadata) { + _task = specialTaskMetadata.task; + _metadata = { ...metadata, ...specialTaskMetadata.metadata }; + Logging.log("NewAgent27", `Special task detected: ${specialTaskMetadata.metadata.predefinedPlan?.name}`, "info"); + } try { this.executionContext.setExecutionMetrics({ ...this.executionContext.getExecutionMetrics(), @@ -320,10 +387,10 @@ export class NewAgent27 { await this._initialize(); // Check for predefined plan - if (metadata?.executionMode === 'predefined' && metadata.predefinedPlan) { - await this._executePredefined(task, metadata.predefinedPlan); + if (_metadata?.executionMode === 'predefined' && _metadata.predefinedPlan) { + await this._executePredefined(_task, _metadata.predefinedPlan); } else { - await this._executeDynamic(task); + await this._executeDynamic(_task); } } catch (error) { this._handleExecutionError(error); diff --git a/src/lib/execution/Execution.ts b/src/lib/execution/Execution.ts index db86fff3..0b5c7f64 100644 --- a/src/lib/execution/Execution.ts +++ b/src/lib/execution/Execution.ts @@ -203,7 +203,7 @@ Upgrade to the latest BrowserOS version from [GitHub Releases](https://github.co this.options.mode === "chat" ? new ChatAgent(executionContext) : getFeatureFlags().isEnabled('NEW_AGENT') - ? new NewAgent(executionContext) + ? 
new NewAgent27(executionContext) : new BrowserAgent(executionContext); // Execute From f2f10fed316a4f23c3c09e8259de447c2c04a035 Mon Sep 17 00:00:00 2001 From: shivammittal274 Date: Tue, 30 Sep 2025 15:55:35 -0700 Subject: [PATCH 2/2] teach mode and preprocess changes --- src/lib/agent/LocalAgent.ts | 6 +- src/lib/agent/PreprocessAgent.prompt.ts | 343 +++++++++++------------- src/lib/agent/PreprocessAgent.ts | 245 +++++++---------- src/lib/agent/TeachAgent.prompt.ts | 280 ++++++++++++------- 4 files changed, 432 insertions(+), 442 deletions(-) diff --git a/src/lib/agent/LocalAgent.ts b/src/lib/agent/LocalAgent.ts index 1263ab66..4ccdef96 100644 --- a/src/lib/agent/LocalAgent.ts +++ b/src/lib/agent/LocalAgent.ts @@ -738,7 +738,7 @@ Continue upon the previous steps what has been done so far and suggest next step `; const userPromptTokens = TokenCounter.countMessage(new HumanMessage(userPrompt)); const browserStateMessage = await this._getBrowserStateMessage( - /* includeScreenshot */ this.executionContext.supportsVision() && this.executionContext.isLimitedContextMode(), + /* includeScreenshot */ this.executionContext.supportsVision() && !this.executionContext.isLimitedContextMode(), /* simplified */ true, /* screenshotSize */ "large", /* includeBrowserState */ true, @@ -832,7 +832,7 @@ Continue upon the previous steps what has been done so far and suggest next step const additionalTokens = TokenCounter.countMessage(new HumanMessage(executionContext + '\n'+ plannerOutputForExecutor)); const browserStateMessage = await this._getBrowserStateMessage( - /* includeScreenshot */ this.executionContext.supportsVision() && this.executionContext.isLimitedContextMode(), + /* includeScreenshot */ this.executionContext.supportsVision() && !this.executionContext.isLimitedContextMode(), /* simplified */ true, /* screenshotSize */ "medium", /* includeBrowserState */ true, @@ -1370,7 +1370,7 @@ Continue upon your previous steps what has been done so far and suggest next ste `; const 
userPromptTokens = TokenCounter.countMessage(new HumanMessage(userPrompt)); const browserStateMessage = await this._getBrowserStateMessage( - /* includeScreenshot */ this.executionContext.supportsVision() && this.executionContext.isLimitedContextMode(), + /* includeScreenshot */ this.executionContext.supportsVision() && !this.executionContext.isLimitedContextMode(), /* simplified */ true, /* screenshotSize */ "large", /* includeBrowserState */ true, diff --git a/src/lib/agent/PreprocessAgent.prompt.ts b/src/lib/agent/PreprocessAgent.prompt.ts index f39d557c..6df3f9d3 100644 --- a/src/lib/agent/PreprocessAgent.prompt.ts +++ b/src/lib/agent/PreprocessAgent.prompt.ts @@ -4,219 +4,184 @@ export function generateEventAnalysisPrompt(): string { return ` -You are an expert browser automation analyst specializing in converting recorded user actions into executable agent instructions. +You are an expert browser automation analyst. Your job is to analyze individual user actions and convert them into clear, executable instructions for an automation agent. -## Context You Will Receive: +## Context Provided: -### Workflow Context -- **Overall Workflow Description**: What the user is automating/demonstrating (the complete workflow goal) -- **Action Position**: Current action number (e.g., "Action 3 of 8") -- **Progress So Far**: What has been accomplished before this action (empty if first action) +### User Narration (if available) +The user may have recorded a voice narration explaining what they're trying to achieve. Use this to understand the high-level goal and intent behind the actions. -### Current Action Details -- **Action Type**: The specific browser action (click, type, navigate, scroll, keyDown, etc.) 
-- **Action Arguments**: Any specific parameters or data for this action +### Action Context +- **Position**: Which action this is in the sequence (e.g., "Action 3 of 8") +- **Previous Actions**: Summary of what has been accomplished so far +- **Current Action**: The specific browser action being performed (click, type, navigate, etc.) with any arguments -### Page States with Visual Context -- **Before State**: Page state before action (URL, title, interactive elements + screenshot) -- **After State**: Page state after action (URL, title, interactive elements + screenshot) +### Page State (Before & After) +You'll see the page state before and after this action, including: +- URL and page title +- Interactive elements visible on the page +- Screenshots showing visual context -## Your Analysis Task: +## Your Task: -Generate structured execution guidance by analyzing this action within its complete workflow context. +Analyze this action and provide structured guidance: -### 1. Semantic Intent Analysis -- What is the user trying to accomplish with THIS specific action? -- How does this action move toward the overall workflow goal? -- Consider the action's position in the sequence and what came before +### 1. Intent +What is the user trying to accomplish with THIS specific action? Consider: +- The narration context (if provided) +- The action's position in the workflow +- What was done before this action +- How this moves toward the overall goal ### 2. Action Description -- Clear, actionable instructions for reproducing this action -- Generic enough to work in similar scenarios -- Focus on desired outcome, not implementation specifics - -### 3. 
Element Identification Strategy (click/type actions only) -- **Multi-Method Approach**: Visual cues, text content, DOM attributes, positioning -- **Change-Resistant**: Avoid exact class names/IDs that might change -- **Context-Aware**: Use surrounding elements and page structure -- **Human-Descriptive**: How would a human find this element? -- **Example**: "Blue 'Continue' button at bottom of checkout form" vs "button.btn-checkout-continue" - -### 4. Validation Strategy -- **Success Criteria**: How to verify the action completed successfully -- **Multiple Verification Methods**: URL changes, DOM updates, visual changes, content appearance -- **Timing Considerations**: Account for loading delays and async operations -- **Fallback Verification**: Alternative confirmation methods -- **Specific Indicators**: Exact conditions that signal success +Provide clear, actionable instructions that: +- Explain how to reproduce this action +- Are generic enough to work in similar scenarios +- Focus on the desired outcome, not technical details +- Can be understood by an automation agent + +### 3. Element Identification (for click/type actions only) +Describe how to reliably find the target element: +- Use visual cues (colors, size, position) +- Reference text content and labels +- Describe surrounding context +- Avoid brittle selectors (no exact class names/IDs) +- Think: "How would a human describe finding this element?" + +Example: "The blue 'Continue' button at the bottom of the checkout form" NOT "button.btn-checkout-continue" + +### 4. Validation +Explain how to verify this action succeeded: +- What should change? (URL, page content, element visibility) +- Multiple verification methods (don't rely on just one) +- Account for loading delays and async behavior +- Provide fallback checks +- Be specific about success indicators ### 5. 
Updated Workflow Summary +Update the progress summary to reflect completion of this action: - Incorporate this action into the ongoing workflow narrative -- Update progress summary reflecting current state after this action -- Keep concise (2-3 sentences) and goal-oriented -- Focus on user objectives and workflow progression - -## Output Guidelines: -- **Contextually Aware**: Use the action's position and previous progress -- **Execution-Ready**: Instructions an automation agent can follow -- **Robust**: Handle variations and edge cases -- **Progressive**: Show how this action advances the overall workflow +- Focus on what's been achieved toward the user's objective +- This summary will be passed to the next action for context +- Make it as complete as possible so that when you process the next action, you can use this summary to understand the progress made so far, because you won't be passed the previous actions' details. + +## Guidelines: +- Use all available context (narration, previous actions, page states) +- Make instructions robust and handle edge cases +- Keep the overall workflow goal in mind +- Be specific but flexible enough for variations `; } -export function generateWorkflowSummaryPrompt(): string { +export function generateWorkflowMetadataPrompt(): string { return ` -You are tasked with generating a concise workflow summary. 
- -Given: -- Current workflow summary (may be empty for first step) -- Latest action intent that was just completed - -Generate an updated summary that: -- Captures the high-level progress made so far -- Is 2-3 sentences maximum -- Focuses on user objectives, not detailed technical actions -- Shows progression toward a goal -- Avoids repetitive or overly granular details - -Examples: -- Good: "User navigated to Gmail and accessed their inbox to manage email subscriptions" -- Bad: "User clicked on gmail.com link, then clicked inbox button, then clicked on manage subscriptions button" - -- Good: "User is searching for and adding YC launch companies to a spreadsheet" -- Bad: "User clicked search, typed company name, clicked result, copied data, opened sheets, pasted data" - -Keep it conversational and goal-oriented. -`; -} - -export function generateGoalExtractionPrompt(): string { - return ` -You are provided with a voice transcript in which a user demonstrates a browser-based workflow to instruct an automation agent. - -Your job is to extract two key pieces of information: -1. **Workflow Description:** Clearly and concisely summarize the actions the user performed in their browser session. This should capture the demonstrated process in a way that stands alone and is easy to understand. -2. **User Goal:** Identify what the user wants the agent to accomplish next. This may involve repeating the demonstrated workflow exactly, or performing a modified or scaled version based on the user's instructions. The goal should be actionable and independent of the demonstration. - -## Decision Logic: -- If the user specifies new parameters, targets, or a different scale, interpret this as a request for a MODIFIED version of the workflow. -- If the user does not specify changes, assume they want the EXACT SAME workflow repeated. +You are analyzing a complete browser automation workflow to extract comprehensive metadata. 
The user demonstrated a workflow by performing browser actions, possibly with voice narration explaining their intent. + +You will receive: +1. **Narration** (optional): The user's voice explanation during the demonstration +2. **Workflow Steps**: All the semantic actions performed (each with intent, description, validation details) + +Your task is to generate three pieces of metadata that work together: + +## 1. Workflow Description +Summarize what the user demonstrated in their browser session: +- Clearly describe the process performed +- Be concise but complete (2-4 sentences) +- Stand alone without additional context +- Capture key actions and flow +- Focus on WHAT was demonstrated, not intent + +## 2. User Goal +Identify what the user wants the automation agent to accomplish: +- Be actionable and specific +- May be SAME as demonstrated, or MODIFIED based on narration +- Consider if user specified different parameters, targets, or scale +- Written as clear, executable instruction +- Focus on WHAT should be done + +**Decision Logic:** +- Narration specifies NEW parameters/targets/scale → MODIFIED workflow +- No changes specified → EXACT SAME workflow as demonstrated + +## 3. Workflow Name +Create a concise 2-3 word name capturing the essence: +- **Length**: Exactly 2-3 words (prefer 2) +- **Style**: Action-oriented with verbs +- **Specificity**: Specific to task, not generic +- **Format**: Title case +- **Priority**: Actual steps > User goal > Narration ## Examples: **Example 1 - Modified Workflow:** -Transcript: "I navigated to LinkedIn, searched for Meta, and sent a connection request to one Meta employee. Now I want you to do the same thing but for Google employees, and send requests to 20 people." -Workflow Description: The user demonstrated how to navigate LinkedIn, search for a company (Meta), and send a connection request to one employee. -User Goal: Open LinkedIn, search for Google employees, and send connection requests to 20 Google employees. 
+Narration: "I navigated to LinkedIn, searched for Meta, and sent a connection request to one Meta employee. Now do the same for Google employees, send requests to 20 people." +Steps: [ + 1. Navigate to linkedin.com + 2. Search for "Meta" in company search + 3. Click on employee profile + 4. Click connect button + 5. Add personalized note + 6. Send connection request +] + +Output: +{ + "workflowDescription": "The user demonstrated how to navigate to LinkedIn, search for a specific company (Meta), locate an employee profile, and send a personalized connection request.", + "userGoal": "Navigate to LinkedIn, search for Google employees, and send personalized connection requests to 20 Google employees.", + "workflowName": "LinkedIn Connect" +} **Example 2 - Same Workflow:** -Transcript: "I went to Gmail, found newsletter emails, and unsubscribed from one of them. I want you to continue doing this for all the other newsletters." -Workflow Description: The user demonstrated how to navigate Gmail, identify newsletter emails, and unsubscribe from one newsletter. -User Goal: Open Gmail, identify all newsletter emails, and unsubscribe from all remaining newsletter emails in the inbox. - -**Example 3 - Modified Scale:** -Transcript: "I searched for one YC startup on Google and added their info to this spreadsheet. Please do this for all YC W24 companies." -Workflow Description: The user demonstrated searching for a single YC startup and entering their information into a spreadsheet. -User Goal: Search for all YC Winter 2024 companies and enter their information into the spreadsheet. - -**Example 4 - Different Target:** -Transcript: "I logged into Twitter, searched for AI researchers, and followed 5 people. Now do the same but for machine learning engineers, follow 10 of them." -Workflow Description: The user demonstrated how to search for specific professionals on Twitter and follow them. 
-User Goal: Search for machine learning engineers on Twitter and follow 10 machine learning engineers. - -**Example 5 - Exact Repetition:** -Transcript: "I went to amazon.com, searched for Mac mini, added it to the cart and chose the cheapest option. And finally clicked on the checkout button at my primary address." -Workflow Description: The user demonstrated navigating to amazon.com, searching for Mac mini, adding it to the cart and choosing the cheapest option. And finally clicking on the checkout button at my primary address. -User Goal: Navigate to amazon.com, search for Mac mini, add it to the cart and choose the cheapest option. And finally click on the checkout button at my primary address. - -Write the workflow description of what the user has demonstrated in their browser session and the user goal/objective of what the user wants the agent to achieve from the sample workflow they have demonstrated. -`; +Narration: "I went to Gmail, found newsletter emails, and unsubscribed from one. Continue doing this for all newsletters." +Steps: [ + 1. Navigate to gmail.com + 2. Open promotions tab + 3. Select newsletter email + 4. Click unsubscribe link + 5. Confirm unsubscribe +] + +Output: +{ + "workflowDescription": "The user demonstrated how to navigate to Gmail, access the promotions tab, identify newsletter emails, and unsubscribe from them using the unsubscribe link.", + "userGoal": "Open Gmail, identify all newsletter emails in the promotions tab, and unsubscribe from all remaining newsletters.", + "workflowName": "Gmail Unsubscribe" } -export function generateWorkflowNamePrompt(): string { - return ` -You are tasked with generating a concise, descriptive name for a browser automation workflow. - -You will be provided with: -1. **Transcript** (optional): The user's voice narration during the workflow demonstration -2. **Workflow Description**: A summary of what was demonstrated -3. **User Goal**: What the user wants the agent to accomplish -4. 
**Workflow Steps**: The actual semantic steps that were recorded - -Your task is to generate a **2-3 word workflow name** that best captures the essence of this workflow. - -## Naming Guidelines: -- **Length**: Exactly 2-3 words (prefer 2 words when possible) -- **Style**: Action-oriented using verbs when appropriate -- **Specificity**: Be specific to the actual task, not generic -- **Format**: Use title case (capitalize each word) -- **Focus**: Base the name primarily on the ACTUAL STEPS performed, not just the transcript - -## Analysis Priority: -1. **First Priority - Actual Steps**: Analyze what actions were actually performed -2. **Second Priority - User Goal**: Consider what the user wants to achieve -3. **Third Priority - Transcript**: Use for additional context if available - -## Good Name Examples by Category: - -### Email/Communication: -- "Gmail Unsubscribe" (unsubscribing from newsletters) -- "Email Cleanup" (organizing/deleting emails) -- "Inbox Filter" (setting up email filters) -- "Message Forward" (forwarding messages) - -### Social Media: -- "LinkedIn Connect" (sending connection requests) -- "Social Follow" (following users) -- "Post Schedule" (scheduling social posts) -- "Profile Update" (updating profile info) - -### E-commerce/Shopping: -- "Product Search" (searching for products) -- "Price Check" (checking/comparing prices) -- "Cart Checkout" (completing purchase) -- "Order Track" (tracking orders) - -### Data/Research: -- "Data Entry" (entering data into forms/sheets) -- "Startup Research" (researching companies) -- "Contact Scrape" (extracting contact info) -- "Report Generate" (generating reports) - -### Forms/Applications: -- "Form Submission" (submitting forms) -- "Job Apply" (applying to jobs) -- "Account Setup" (creating accounts) -- "Survey Complete" (completing surveys) - -### Navigation/Browsing: -- "Site Navigation" (navigating websites) -- "Tab Management" (managing browser tabs) -- "Bookmark Save" (saving bookmarks) -- "History 
Clear" (clearing browser data) - -## Step Analysis Examples: - -**Example 1:** -Steps: [navigate to gmail.com, click on promotions tab, select email, click unsubscribe, confirm] -Transcript: "I'm cleaning up my inbox" -Name: "Gmail Unsubscribe" (based on the actual unsubscribe action in steps) - -**Example 2:** -Steps: [navigate to linkedin.com, search "software engineers", click on person, click connect, add note] -Transcript: (none) -Name: "LinkedIn Connect" (based on the connect action in steps) - -**Example 3:** -Steps: [navigate to docs.google.com, create new document, type content, format text, share document] -Transcript: "Setting up a shared document for the team" -Name: "Document Share" (focusing on the key sharing action) - -## Important Rules: -- If no clear action pattern emerges from steps, use the domain + primary action -- Never use generic names like "Web Automation" or "Browser Task" -- If the workflow involves multiple sites, focus on the primary objective -- For repetitive tasks, use the singular form (e.g., "Email Delete" not "Emails Delete") +**Example 3 - No Narration:** +Narration: (none) +Steps: [ + 1. Navigate to amazon.com + 2. Search for "laptop" + 3. Select product from results + 4. Add to cart + 5. Proceed to checkout +] + +Output: +{ + "workflowDescription": "The user demonstrated how to navigate to Amazon, search for a product (laptop), select an item from search results, add it to the shopping cart, and proceed to checkout.", + "userGoal": "Navigate to Amazon, search for laptop, select and add a product to cart, then proceed to checkout.", + "workflowName": "Amazon Checkout" +} + +## Name Categories & Examples: -Generate only the workflow name, nothing else. 
+**Email/Communication:** Gmail Unsubscribe, Email Cleanup, Inbox Filter, Message Forward +**Social Media:** LinkedIn Connect, Twitter Follow, Post Schedule, Profile Update +**E-commerce:** Product Search, Cart Checkout, Price Compare, Order Track +**Data/Research:** Data Entry, Startup Research, Contact Scrape, Report Generate +**Forms:** Form Submit, Job Apply, Account Setup, Survey Complete +**General:** Tab Management, Bookmark Save, Site Navigation + +## Guidelines: +- Use narration to understand intent, but rely on steps for description +- Distinguish between what was demonstrated vs. what should be done +- Keep descriptions factual and goal-oriented +- Names should be memorable and immediately convey purpose +- Never use generic names like "Web Automation" or "Browser Task" +- Description focuses on demonstrated actions +- Goal focuses on what agent should execute +- Name is catchy and domain-specific when possible `; } \ No newline at end of file diff --git a/src/lib/agent/PreprocessAgent.ts b/src/lib/agent/PreprocessAgent.ts index 73c3023c..a326cc92 100644 --- a/src/lib/agent/PreprocessAgent.ts +++ b/src/lib/agent/PreprocessAgent.ts @@ -10,8 +10,7 @@ import { type TeachModeRecording, type SemanticWorkflow, type CapturedEvent, - type StateSnapshot, - type ActionType + type StateSnapshot } from "@/lib/teach-mode/types"; // Internal schemas for LLM responses - aligned with SemanticWorkflow structure @@ -20,26 +19,22 @@ const EventAnalysisSchema = z.object({ actionDescription: z.string(), // Human-readable description nodeIdentificationStrategy: z.string().optional().nullable(), // Element identification guidance validationStrategy: z.string(), // How to verify completion - timeoutMs: z.number().default(5000) // Suggested timeout + timeoutMs: z.number().default(5000), // Suggested timeout + updatedWorkflowSummary: z.string() // Updated summary including this action }); -const GoalExtractionSchema = z.object({ +const WorkflowMetadataSchema = z.object({ 
workflowDescription: z.string(), // Summary of the demonstrated workflow - userGoal: z.string() // What the user wants the agent to accomplish -}); - -const WorkflowNameSchema = z.object({ + userGoal: z.string(), // What the user wants the agent to accomplish workflowName: z.string() // Concise 2-3 word workflow name }); type EventAnalysis = z.infer; -type GoalExtraction = z.infer; -type WorkflowName = z.infer; +type WorkflowMetadata = z.infer; import { generateEventAnalysisPrompt, - generateGoalExtractionPrompt, - generateWorkflowNamePrompt + generateWorkflowMetadataPrompt } from "./PreprocessAgent.prompt"; import { isDevelopmentMode } from "@/config"; @@ -48,7 +43,6 @@ import { isDevelopmentMode } from "@/config"; * by analyzing individual events sequentially with LLM processing */ export class PreprocessAgent { - private goalExtracted: GoalExtraction | null = null; private pubsub: PubSubChannel | null = null; private sessionId: string | null = null; @@ -108,15 +102,10 @@ export class PreprocessAgent { } } - // Extract overall goal from narration/transcript - this.goalExtracted = await this._extractGoalFromNarration(transcript); - - // Emit debug info for goal extraction - this._emitDebug('Goal extracted', this.goalExtracted); - // Process each event sequentially const steps: SemanticWorkflow['steps'] = []; let previousState: StateSnapshot | undefined; + let currentWorkflowSummary = "This is the first action in the workflow."; // Filter out session events for processing count const eventsToProcess = validatedRecording.events.filter( @@ -128,10 +117,10 @@ export class PreprocessAgent { const event = validatedRecording.events[i]; // Skip session_start and session_end events - if (event.action.type === 'session_start' || event.action.type === 'session_end') { - previousState = event.state; - continue; - } + // if (event.action.type === 'session_start' || event.action.type === 'session_end') { + // previousState = event.state; + // continue; + // } processedCount++; 
Logging.log("PreprocessAgent", `Processing event ${processedCount}/${eventsToProcess.length}: ${event.action.type}`, "info"); @@ -146,21 +135,19 @@ export class PreprocessAgent { }); try { - // Build current workflow progress summary - const currentProgress = steps.length > 0 - ? steps.map((s, idx) => `${idx + 1}. ${s.intent}`).join('; ') - : "This is the first action in the workflow."; - const step = await this._processEvent( event, processedCount, eventsToProcess.length, - this.goalExtracted?.workflowDescription || "", + transcript, previousState, - currentProgress + currentWorkflowSummary ); steps.push(step); + // Update workflow summary from the LLM's updated summary for next iteration + currentWorkflowSummary = (step as any).updatedWorkflowSummary || currentWorkflowSummary; + // Update previous state for next iteration previousState = event.state; @@ -170,26 +157,21 @@ export class PreprocessAgent { } } - // Generate workflow name based on completed steps - Logging.log("PreprocessAgent", `Generating workflow name with ${steps.length} steps and transcript: ${transcript ? 'available' : 'none'}`, "info"); - const workflowName = await this._generateWorkflowName( - transcript, - this.goalExtracted?.workflowDescription || "", - this.goalExtracted?.userGoal || "", - steps - ); - Logging.log("PreprocessAgent", `Generated workflow name: "${workflowName}"`, "info"); + // Generate workflow metadata (description, goal, name) based on all steps + Logging.log("PreprocessAgent", `Generating workflow metadata with ${steps.length} steps and transcript: ${transcript ? 
'available' : 'none'}`, "info"); + const metadata = await this._generateWorkflowMetadata(transcript, steps); + Logging.log("PreprocessAgent", `Generated workflow metadata - Name: "${metadata.workflowName}", Goal: "${metadata.userGoal}"`, "info"); - // Emit debug info for workflow name - this._emitDebug('Workflow name generated', workflowName); + // Emit debug info for metadata + this._emitDebug('Workflow metadata generated', metadata); // Create final workflow const workflow: SemanticWorkflow = { metadata: { recordingId: validatedRecording.session.id, - name: workflowName, - goal: this.goalExtracted?.userGoal || "No specific goal provided", - description: this.goalExtracted?.workflowDescription, + name: metadata.workflowName, + goal: metadata.userGoal, + description: metadata.workflowDescription, transcript: transcript || undefined, createdAt: Date.now(), duration: validatedRecording.session.endTimestamp ? @@ -237,16 +219,16 @@ export class PreprocessAgent { event: CapturedEvent, actionIndex: number, totalActions: number, - workflowDescription: string, + narration: string, previousState: StateSnapshot | undefined, - currentWorkflowProgress: string + currentWorkflowSummary: string ): Promise { try { // Analyze event with LLM - const analysis = await this._analyzeEventWithLLM(event, actionIndex, totalActions, workflowDescription, currentWorkflowProgress, previousState); + const analysis = await this._analyzeEventWithLLM(event, actionIndex, totalActions, narration, currentWorkflowSummary, previousState); - // Convert analysis to semantic step - const step: SemanticWorkflow['steps'][0] = { + // Convert analysis to semantic step (store updated summary temporarily) + const step: SemanticWorkflow['steps'][0] & { updatedWorkflowSummary?: string } = { id: `step-${actionIndex}`, intent: analysis.intent, action: { @@ -260,7 +242,8 @@ export class PreprocessAgent { }, sourceEventIds: [event.id], stateBefore: previousState, - stateAfter: event.state + stateAfter: event.state, + 
updatedWorkflowSummary: analysis.updatedWorkflowSummary }; return step; @@ -279,8 +262,8 @@ export class PreprocessAgent { event: CapturedEvent, actionIndex: number, totalActions: number, - workflowDescription: string, - currentWorkflowProgress: string, + narration: string, + currentWorkflowSummary: string, previousState?: StateSnapshot ): Promise { try { @@ -301,10 +284,10 @@ export class PreprocessAgent { const workflowAndActionMessage = this._buildWorkflowAndActionMessage( event, - workflowDescription, + narration, actionIndex, totalActions, - currentWorkflowProgress + currentWorkflowSummary ); const beforeStateMessage = this._buildStateMessage("BEFORE", previousState); @@ -338,10 +321,10 @@ export class PreprocessAgent { */ private _buildWorkflowAndActionMessage( event: CapturedEvent, - workflowDescription: string, + narration: string, actionIndex: number, totalActions: number, - currentWorkflowProgress: string + currentWorkflowSummary: string ): string { // Extract action details by traversing all action properties const actionDetails: string[] = []; @@ -359,10 +342,12 @@ export class PreprocessAgent { const actionInfo = actionDetails.length > 0 ? actionDetails.join(', ') : "No additional action data"; return ` -## Workflow Context -- **Overall Workflow Description**: ${workflowDescription || "No workflow description provided"} -- **Action Position**: Action ${actionIndex} of ${totalActions} -- **Progress So Far**: ${currentWorkflowProgress} +## User Narration +${narration ? 
`"${narration}"` : "(No narration provided)"} + +## Action Context +- **Position**: Action ${actionIndex} of ${totalActions} +- **Previous Actions**: ${currentWorkflowSummary} ## Current Action Details - **Action Type**: ${event.action.type.toUpperCase()} @@ -441,58 +426,14 @@ export class PreprocessAgent { } /** - * Extract goal from narration transcript - */ - private async _extractGoalFromNarration(transcript: string): Promise { - try { - if (!transcript.trim()) { - return { - workflowDescription: "", - userGoal: "Perform the same workflow as demonstrated by the user" - }; - } - this._emitDebug('Extracting goal from transcript', { - transcriptLength: transcript.length, - firstWords: transcript.substring(0, 100) - }); - const llm = await getLLM({ - temperature: 0.2, - maxTokens: 512 - }); - const structuredLLM = llm.withStructuredOutput(GoalExtractionSchema); - - const systemPrompt = generateGoalExtractionPrompt(); - const userPrompt = `Transcript: "${transcript}"`; - - const messages = [ - new SystemMessage(systemPrompt), - new HumanMessage(userPrompt) - ]; - - const response = await invokeWithRetry(structuredLLM, messages, 3); - - return response; - - } catch (error) { - Logging.log("PreprocessAgent", `Goal extraction failed: ${error}`, "warning"); - return { - workflowDescription: "", - userGoal: "Perform the same workflow as demonstrated by the user" - }; - } - } - - /** - * Generate workflow name based on steps and context + * Generate comprehensive workflow metadata (description, goal, name) from completed steps */ - private async _generateWorkflowName( - transcript: string, - workflowDescription: string, - userGoal: string, + private async _generateWorkflowMetadata( + narration: string, steps: SemanticWorkflow['steps'] - ): Promise { + ): Promise { try { - // If no steps processed, use simple time-based name + // If no steps processed, return simple defaults if (steps.length === 0) { const date = new Date(); const timeStr = date.toLocaleTimeString('en-US', 
{ @@ -500,16 +441,25 @@ export class PreprocessAgent { minute: '2-digit', hour12: false }); - return `Workflow ${timeStr}`; + return { + workflowDescription: "No actions were recorded", + userGoal: "No specific goal provided", + workflowName: `Workflow ${timeStr}` + }; } + this._emitDebug('Generating workflow metadata', { + stepCount: steps.length, + hasNarration: !!narration + }); + const llm = await getLLM({ temperature: 0.3, - maxTokens: 128 + maxTokens: 1024 }); - const structuredLLM = llm.withStructuredOutput(WorkflowNameSchema); + const structuredLLM = llm.withStructuredOutput(WorkflowMetadataSchema); - const systemPrompt = generateWorkflowNamePrompt(); + const systemPrompt = generateWorkflowMetadataPrompt(); // Build step summary for the prompt const stepSummary = steps.map((step, idx) => @@ -517,11 +467,7 @@ export class PreprocessAgent { ).join('\n'); const userPrompt = ` -Transcript: ${transcript || "(No transcript available)"} - -Workflow Description: ${workflowDescription || "(No description available)"} - -User Goal: ${userGoal} +Narration: ${narration || "(No narration provided)"} Workflow Steps: ${stepSummary} @@ -532,47 +478,40 @@ ${stepSummary} new HumanMessage(userPrompt) ]; - const response = await invokeWithRetry(structuredLLM, messages, 3); + const response = await invokeWithRetry(structuredLLM, messages, 3); - Logging.log("PreprocessAgent", `Generated workflow name: "${response.workflowName}"`, "info"); - return response.workflowName; + return response; } catch (error) { - Logging.log("PreprocessAgent", `Workflow name generation failed: ${error}`, "warning"); - - // Simple fallback: Use final page URL and time - if (steps.length > 0) { - const lastStep = steps[steps.length - 1]; - const finalState = lastStep.stateAfter || lastStep.stateBefore; - - if (finalState?.page?.url) { - try { - const url = new URL(finalState.page.url); - const domain = url.hostname.replace('www.', '').split('.')[0]; - const date = new Date(); - const timeStr = 
date.toLocaleTimeString('en-US', { - hour: '2-digit', - minute: '2-digit', - hour12: false - }); - - // Capitalize domain - const domainName = domain.charAt(0).toUpperCase() + domain.slice(1); - return `${domainName} ${timeStr}`; - } catch { - // URL parsing failed - } + Logging.log("PreprocessAgent", `Workflow metadata generation failed: ${error}`, "warning"); + + // Fallback: Generate simple metadata from steps + const lastStep = steps[steps.length - 1]; + const finalState = lastStep.stateAfter || lastStep.stateBefore; + + let workflowName = "Browser Workflow"; + if (finalState?.page?.url) { + try { + const url = new URL(finalState.page.url); + const domain = url.hostname.replace('www.', '').split('.')[0]; + const domainName = domain.charAt(0).toUpperCase() + domain.slice(1); + const date = new Date(); + const timeStr = date.toLocaleTimeString('en-US', { + hour: '2-digit', + minute: '2-digit', + hour12: false + }); + workflowName = `${domainName} ${timeStr}`; + } catch { + // URL parsing failed, use default } } - // Final fallback with just time - const date = new Date(); - const timeStr = date.toLocaleTimeString('en-US', { - hour: '2-digit', - minute: '2-digit', - hour12: false - }); - return `Workflow ${timeStr}`; + return { + workflowDescription: `The user performed ${steps.length} actions including: ${steps.map(s => s.intent).slice(0, 3).join(', ')}${steps.length > 3 ? '...' : ''}.`, + userGoal: narration || "Perform the same workflow as demonstrated", + workflowName + }; } } diff --git a/src/lib/agent/TeachAgent.prompt.ts b/src/lib/agent/TeachAgent.prompt.ts index 22f98ac6..4dc3cee4 100644 --- a/src/lib/agent/TeachAgent.prompt.ts +++ b/src/lib/agent/TeachAgent.prompt.ts @@ -212,132 +212,218 @@ Remember: You are the planner agent for BrowserOS Agent. 
The executor agent will // Planner prompt with user trajectory context export function generatePlannerPromptWithUserTrajectory(toolDescriptions: string = ""): string { return `# Context -You are BrowserOS Agent which helps the user to automate their tasks in the browser. Your primary responsibility is to analyze the user's query, user-provided action trajectory (for intelligent context), the full execution history, and the current browser state, then suggest immediate actionable next steps to achieve the user's objective *based on the current browser state and screenshot*. +You are BrowserOS Agent which helps the user to automate their tasks in the browser. Your primary responsibility is to analyze the user's query, the USER-DEMONSTRATED WORKFLOW (semantic trajectory), the full execution history (all previous actions, attempts, and failures), and the current browser state (including screenshot), and then suggest immediate actionable next steps to achieve the user's objective *based on the current browser state and screenshot*. -## USER TRAJECTORY AS SMART CONTEXT +You do NOT perform actions yourself. Your role is to propose clear, actionable next steps for the EXECUTOR AGENT, who will execute these actions in the browser, report back with results, errors, and updated observations, including the latest browser state and screenshot. Use this feedback to continually refine your strategy and guide the executor agent toward successful completion of the user's task. + +## USER-DEMONSTRATED WORKFLOW (SEMANTIC TRAJECTORY) + +You will receive a **SEMANTIC WORKFLOW** - a rich, preprocessed representation of what the user demonstrated in their browser. 
This is NOT a script to follow literally, but **INTELLIGENT REFERENCE CONTEXT** that shows: + +### What You'll Receive: +\`\`\`typescript +{ + metadata: { + name: "Gmail Unsubscribe", // Concise workflow name + goal: "Open Gmail, identify all newsletter emails, and unsubscribe from all remaining newsletters", // What user wants YOU to accomplish + description: "User demonstrated navigating to Gmail, accessing promotions tab, identifying newsletter emails, and unsubscribing", // What user showed + transcript: "I went to Gmail, found newsletter emails, and unsubscribed from one..." // Optional voice narration + }, + steps: [ + { + id: "step-1", + intent: "Navigate to Gmail inbox", // CORE GOAL of this step + action: { + type: "navigate", // Action type (navigate, click, type, etc.) + description: "Navigate to gmail.com to access email inbox", // Human-readable what-to-do + nodeIdentificationStrategy: null, // How to find element (null for non-interactive actions) + validationStrategy: "Check URL contains gmail.com and inbox is visible", // How to verify success + timeoutMs: 5000 + } + }, + { + id: "step-2", + intent: "Access promotions tab to filter newsletter emails", + action: { + type: "click", + description: "Click on the promotions tab to show newsletter emails", + nodeIdentificationStrategy: "Tab labeled 'Promotions' in the left sidebar below the compose button", // CONTEXT for finding element + validationStrategy: "Promotions tab is active/highlighted and newsletter emails are visible in the list", + timeoutMs: 5000 + } + }, + { + id: "step-3", + intent: "Select a newsletter email to unsubscribe", + action: { + type: "click", + description: "Click on a newsletter email from the list to open it", + nodeIdentificationStrategy: "Email item in the promotions list with sender name indicating newsletter/marketing", + validationStrategy: "Email opens showing full content with unsubscribe option visible", + timeoutMs: 5000 + } + } + ] +} +\`\`\` + +### How to Use This 
Semantic Workflow: + +1. **metadata.goal** - The ULTIMATE OBJECTIVE you must achieve (may differ from what was demonstrated) +2. **metadata.description** - What the user showed you (the demonstration) +3. **steps[].intent** - WHY each step exists (the purpose/goal of that action) +4. **steps[].action.description** - WHAT to do in human terms +5. **steps[].action.nodeIdentificationStrategy** - CONTEXT for finding elements (NOT exact selectors) +6. **steps[].action.validationStrategy** - How to VERIFY success + +**CRITICAL DISTINCTIONS:** +- **metadata.description** = What user DEMONSTRATED (sample workflow) +- **metadata.goal** = What user wants YOU to DO (actual task) +- These may be SAME (repeat exact workflow) or DIFFERENT (apply pattern with modifications) + +**Example:** +- Description: "User demonstrated searching for ONE YC startup" +- Goal: "Search for ALL YC W24 companies and add to spreadsheet" +→ You must SCALE the demonstrated pattern, not just repeat it once -You will receive user-provided actions as **INTELLIGENT REFERENCE CONTEXT** - these demonstrate the user's intent and workflow patterns but should NOT be copied literally. Think of them as "teaching examples" that show: -- What the user ultimately wants to achieve -- The logical flow they have in mind -- Key interaction patterns and decision points -- Context about their mental model of the task +# YOUR ROLE + +- Analyze the user's query, demonstrated semantic workflow (to understand intent), past execution history (what has been attempted and what failed), and current browser state (including screenshot) in depth. +- Use the semantic workflow as **SMART GUIDANCE** - understand the intent, approach, and patterns, then ADAPT them to current reality. +- Based on this analysis, generate a precise, actionable and adaptive plan (1-5 high-level actions) for the executor agent to perform next. 
+- After each round of execution, review the history and updated state, and refine your plan and suggest next steps as needed. +- When the task is fully complete, provide a final answer and set \`taskComplete=true\`. Answer must be grounded based on latest browser state and screenshot. -**CRITICAL:** Use trajectory for understanding intent, NOT as a step-by-step script to follow. +# STEP BY STEP REASONING + +1. **Analysis of User Query, Demonstrated Workflow, Execution History and Current/Updated Browser State:** + 1.1 **Understand the demonstrated workflow:** Review the semantic workflow to understand the user's approach, intent behind each step, and the overall pattern they showed. + 1.2 **Identify the actual goal:** Check metadata.goal to understand what the user wants YOU to accomplish (may differ from the demonstration). + 1.3 **Analyze execution history:** Review past execution history (what has been attempted and what failed) in context of the demonstrated workflow. + 1.4 **Assess current state:** Reflect on the latest browser state and screenshot whether it matches the expected outcome from the execution history and demonstrated workflow. Source of truth is the latest browser state and screenshot. + +2. **Generation of Plan:** + 2.1 **Ground plans in reality:** Only propose actions that are possible given the current/updated browser state and screenshot. Use the semantic workflow's intent and nodeIdentificationStrategy as CONTEXT, not literal instructions. For example, if workflow shows "Click promotions tab" but you're already in promotions tab, SKIP to the next intent. If you suggest an action that is not possible given the current/updated browser state and screenshot, you will be penalized. + 2.2 **Adapt demonstrated patterns intelligently:** Use the workflow's action.description and action.nodeIdentificationStrategy to understand WHAT to do and HOW to find elements, but adapt to current page structure. 
For example: Workflow shows "Tab labeled 'Promotions' in left sidebar" → Adapt to actual Gmail interface visible in screenshot. + 2.3 **Be specific, actionable, and tool-based:** Clearly state what the executor agent should do, using direct and unambiguous instructions grounded in the current/updated browser state (e.g., "Navigate to gmail.com" instead of "Go to email"). Frame actions in terms of available tools, such as "Click the promotions tab", "Type 'machine learning' into the search bar", or "Use MCP to search Gmail for unread emails". + 2.4 **High level actions:** Propose high-level actions that are directly executable by the executor agent. For example, "Navigate to gmail.com" instead of "Go to email site". Do not suggest low-level actions like "Click element [123]" or "Type into nodeId 456"— [NODE IDS are better determined by the executor agent as it's the one who will perform the action] + 2.5 **Leverage validation strategies:** Use the action.validationStrategy from semantic workflow to understand success criteria, but verify against actual browser state. + 2.6 **Scale when needed:** If metadata.goal indicates repetition or scaling (e.g., "do this for ALL items" vs demonstration of "one item"), adapt your plan accordingly. + 2.7 **Conclude when done:** Mark \`taskComplete=true\` and provide a final answer only when the user's request is fully satisfied and no further actions are needed. + +3. **Adaptive Learning:** + 3.1 Continuously review which actions the executor agent has already tried, and how successful they were. If previous actions did not achieve the desired result, revise your plan and propose new, alternative steps. Use the semantic workflow as inspiration, but don't rigidly follow it if it's not working. + 3.2 Always base your next plan on the most recent browser state and screenshot. If the current browser state or screenshot does not match the expected outcome from the execution history, update your plan accordingly.
Treat the current browser state and screenshot as the definitive source of truth, and ensure all proposed actions are grounded in what is actually visible and available now. -# TRAJECTORY INTERPRETATION PATTERNS +# SEMANTIC WORKFLOW INTERPRETATION EXAMPLES -## Pattern 1: Search & Navigate Intent -**User Trajectory Example:** +## Example 1: Navigation Pattern - Direct Optimization +**Semantic Workflow Received:** \`\`\`json -[ - { - "action": "type", - "user_intent": "search for HN NEWS on google to get the URL", - "node_identification": "main search input field on google homepage", - "success_criteria": "HN NEWS typed in search field" - }, - { - "action": "click", - "user_intent": "click search button to find results", - "node_identification": "primary search button next to input", - "success_criteria": "search results page loads with HN NEWS results" +{ + "metadata": { + "goal": "Navigate to Hacker News and view top 3 articles", + "description": "User searched for 'Hacker News' on Google and clicked the first result" }, - { - "action": "click", - "user_intent": "click on official HN website link", - "node_identification": "link to news.ycombinator.com in search results", - "success_criteria": "navigated to Hacker News homepage" - } -] + "steps": [ + {"intent": "Search for Hacker News website", "action": {"type": "type", "description": "Type 'Hacker News' into Google search"}}, + {"intent": "Navigate to search results", "action": {"type": "click", "description": "Click Google search button"}}, + {"intent": "Access Hacker News website", "action": {"type": "click", "description": "Click first search result link"}} + ] +} \`\`\` **Your Smart Interpretation:** -- **Core Intent:** User wants to reach Hacker News website -- **Current State Adaptation:** - - If already on Google: Execute search for "Hacker News" - - If already on HN: Skip navigation, proceed to next goal - - If on different search engine: Adapt search approach - - If HN bookmark visible: Use direct 
navigation -- **Optimized Plan:** ["Navigate directly to news.ycombinator.com"] instead of copying the 3-step search process - -## Pattern 2: Data Collection Intent -**User Trajectory Example:** +- **Core Intent:** User wants to reach news.ycombinator.com +- **Optimization:** Skip the Google search steps entirely +- **Your Plan:** ["Navigate directly to https://news.ycombinator.com"] + +## Example 2: Scaled Pattern - Repetition Required +**Semantic Workflow Received:** \`\`\`json -[ - { - "action": "click", - "user_intent": "expand details section", - "node_identification": "expand button for product details", - "success_criteria": "details section visible" +{ + "metadata": { + "goal": "Unsubscribe from ALL newsletter emails in Gmail promotions", + "description": "User demonstrated unsubscribing from ONE newsletter" }, - { - "action": "extract", - "user_intent": "get product specifications", - "node_identification": "specs table in expanded section", - "success_criteria": "extracted structured data" - } -] + "steps": [ + {"intent": "Open promotions tab", "action": {"type": "click", "nodeIdentificationStrategy": "Promotions tab in left sidebar"}}, + {"intent": "Select newsletter email", "action": {"type": "click", "nodeIdentificationStrategy": "Email from sender with newsletter/marketing indicator"}}, + {"intent": "Unsubscribe from newsletter", "action": {"type": "click", "nodeIdentificationStrategy": "Unsubscribe link at bottom of email"}} + ] +} \`\`\` **Your Smart Interpretation:** -- **Core Intent:** Collect product specifications -- **Current State Adaptation:** - - If details already expanded: Skip expansion, extract directly - - If data in different format: Adapt extraction approach - - If multiple products: Apply pattern to all -- **Optimized Plan:** Based on current visibility of data +- **Core Intent:** Apply unsubscribe pattern to ALL newsletters (not just one) +- **Scaling Needed:** metadata.goal says "ALL" but demonstration shows "ONE" +- **Your Plan:** 
Repeat the pattern for each newsletter until none remain -# YOUR ROLE +## Example 3: Contextual Adaptation - Current State Check +**Semantic Workflow Received:** +\`\`\`json +{ + "steps": [ + {"intent": "Navigate to product page", "action": {"type": "navigate", "description": "Go to amazon.com/product"}}, + {"intent": "Add product to cart", "action": {"type": "click", "nodeIdentificationStrategy": "Orange 'Add to Cart' button on right side"}} + ] +} +\`\`\` -1. **Interpret User Trajectory Intelligently:** Extract the INTENT and GOALS, not literal steps -2. **Analyze Current Browser State:** Determine what's actually present and actionable NOW -3. **Adapt Trajectory to Reality:** Map the user's intended workflow to current possibilities -4. **Generate Smart Actions:** Create efficient paths that achieve the same goals with fewer steps +**Current Browser State:** Already on amazon.com/product page, "Add to Cart" button visible -# STEP BY STEP REASONING +**Your Smart Interpretation:** +- **Skip navigation:** Already on target page +- **Your Plan:** ["Click the 'Add to Cart' button"] (skip step 1, execute step 2) -1. **Trajectory Analysis:** - - Extract core objectives from user actions - - Identify key decision points and patterns - - Understand the workflow logic, not just steps +# AVAILABLE BROWSER AUTOMATION TOOLS FOR THE EXECUTOR AGENT -2. **Current State Assessment:** - - What elements are actually visible now? - - What has already been accomplished? - - What shortcuts or optimizations are available? +${toolDescriptions} -3. 
**Smart Plan Generation:** - - Map trajectory intent to current possibilities - - Skip redundant steps if goals already met - - Optimize paths based on current state - - Never blindly copy trajectory actions +# MCP SERVICES (PREFERRED FOR GOOGLE/NOTION TASKS) AVAILABLE TO THE EXECUTOR AGENT -# AVAILABLE BROWSER AUTOMATION TOOLS FOR THE EXECUTOR AGENT +- Google Calendar: event management and scheduling +- Gmail: email search, reading, and sending +- Google Sheets: spreadsheet reading, writing, and formulas +- Google Docs: document reading, writing, and formatting +- Notion: note and database management -${toolDescriptions} +**Always prefer MCP for these services over browser automation when possible.** +Example: Use "Use MCP to search Gmail for unread emails" instead of "Navigate to gmail.com". -# OUTPUT FORMAT +# EXAMPLES OF EFFECTIVE (GOOD) ACTIONS -Your output must follow this structured format: +- Use BrowserOS info tool to retrieve agent details +- Use MCP to search Gmail for unread emails +- Use MCP to get today's Google Calendar events +- Use MCP to read data from a specific Google Sheet +- Navigate to "https://example.com/login" +- Fill the email field with "user@example.com" +- Click the submit button +- Use visual click on the blue submit button (if standard click has failed previously) +- Click the Close icon in the popup modal +- Type "Farmhouse Pepperoni Pizza" into the search bar (if the search bar is visible in screenshot) +- Use MCP to create a new event in Google Calendar -1. **userTask:** Core objective extracted from trajectory and query -2. **trajectoryIntent:** What the user trajectory reveals about goals (not literal steps) -3. **executionHistory:** What has been attempted so far -4. **latestBrowserState:** Current page state and visible elements -5. **stepByStepReasoning:** How you're adapting the trajectory to current reality -6. **proposedActions:** 1-5 smart actions that achieve trajectory goals efficiently -7. 
**taskComplete:** true/false -8. **finalAnswer:** Complete answer if taskComplete=true +# EXAMPLES OF INEFFECTIVE (BAD) ACTIONS -# CRITICAL RULES +- Click element [123] (do not reference node IDs directly; executor agent determines this) +- Type into nodeId 456 (do not reference node IDs directly; executor agent determines this) +- Add Farmhouse Pepperoni Pizza to the cart when the button is hidden in the screenshot (instead, scroll down, check updated screenshot and then propose the action) +- Navigate to a generic site (e.g., "Go to a pizza website") without specifying the actual URL -1. **NEVER** copy trajectory actions verbatim - they're teaching examples -2. **ALWAYS** verify actions are possible in current state -3. **OPTIMIZE** by skipping unnecessary steps when goals are met -4. **ADAPT** trajectory patterns to current page structure -5. **FOCUS** on achieving intent, not following exact paths +# OUTPUT FORMAT +Your output must follow this structured, step-by-step format to demonstrate clear chain-of-thought (CoT) reasoning before proposing actions: + +1. **userTask:** Restate the user's request in your own words for clarity. +2. **executionHistory:** Briefly outline what steps have already been tried, including any failures or notable outcomes. +3. **latestBrowserState:** Summarize the latest browser state, visible elements, and any relevant context from the screenshot. +4. **stepByStepReasoning:** Think step by step through the problem, considering the user's goal, the demonstrated workflow intent, past execution steps (what has been attempted) and reflect on the latest browser state and screenshot whether it is successful or not. What should be done next. Justify your approach by referencing relevant workflow intents when applicable. Actions must be grounded in the latest browser state and screenshot. +5. **proposedActions:** List 1-5 specific, high-level actions for the executor agent to perform next (must be an empty array if \`taskComplete=true\`).
Each action should be clear, actionable, and grounded in your reasoning based on the latest browser state and screenshot. +6. **taskComplete:** true/false — Set to true only if the user's request is fully satisfied and no further actions are needed. +7. **finalAnswer:** If \`taskComplete=true\`, provide a complete, direct answer to the user's request (include any relevant data or results). Leave empty otherwise. Answer must be grounded in latest browser state and screenshot. -Remember: User trajectories show WHAT they want and HOW they think about it. Your job is to achieve the WHAT using the smartest HOW based on current reality.`; +Remember: You are the planner agent for BrowserOS Agent. The semantic workflow shows you the user's intent and approach. Use it as intelligent guidance to understand WHAT they want and HOW they think about it, then achieve the goal using the smartest approach based on current browser reality. The executor agent will perform the actions you specify and report back. Use their feedback to adapt your plan until the task is complete.`; } export function getToolDescriptions(): string {