diff --git a/examples/integrations/cartesia/.env.example b/examples/integrations/cartesia/.env.example new file mode 100644 index 0000000..6ce6cdb --- /dev/null +++ b/examples/integrations/cartesia/.env.example @@ -0,0 +1,10 @@ +# Gemini API Key for language model (default) +GEMINI_API_KEY=your_gemini_api_key_here + +# Browserbase API key and Project ID +BROWSERBASE_API_KEY=your_browserbase_api_key_here +BROWSERBASE_PROJECT_ID=your_browserbase_project_id_here + +# Optional: Model configuration +# MODEL_NAME=google/gemini-2.0-flash-exp +# MODEL_API_KEY=your_model_api_key_here \ No newline at end of file diff --git a/examples/integrations/cartesia/.gitignore b/examples/integrations/cartesia/.gitignore new file mode 100644 index 0000000..7d9a412 --- /dev/null +++ b/examples/integrations/cartesia/.gitignore @@ -0,0 +1,41 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*.pyd +.Python + +# Virtual environments +.env +.venv/ +venv/ +env/ + +virtualenv/ + +# Conda environments +conda-env/ +envs/ +.conda/ +conda-meta/ + +# uv environments (in addition to uv.lock at top) +uv.lock +.python-version + +# Python package managers +poetry.lock +Pipfile.lock +pip-log.txt + +# pyenv +.pyenv/ + +# Distribution / packaging +*.egg-info/ +dist/ +build/ + +# Editor / OS files +.DS_Store + +.cartesia/ \ No newline at end of file diff --git a/examples/integrations/cartesia/README.md b/examples/integrations/cartesia/README.md new file mode 100644 index 0000000..5c34e98 --- /dev/null +++ b/examples/integrations/cartesia/README.md @@ -0,0 +1,143 @@ +# Voice Agent with Real-time Web Form Filling + +This project demonstrates an advanced voice agent that conducts phone questionnaires while automatically filling out web forms in real-time using Stagehand browser automation. 
+ +Here's what the system architecture looks like: + +![Workflow](workflow_diagram.png) + +## Features + +- **Voice Conversations**: Natural voice interactions using Cartesia Line +- **Real-time Form Filling**: Automatically fills web forms as answers are collected +- **Browser Automation**: Uses Stagehand AI to interact with any web form +- **Intelligent Mapping**: AI-powered mapping of voice answers to form fields +- **Async Processing**: Non-blocking form filling maintains conversation flow - form fields are filled in background tasks without delaying voice responses +- **Auto-submission**: Submits forms automatically when complete + +## Architecture + +``` +Voice Call (Cartesia) → Form Filling Node → Records Answer + ↓ + Stagehand Browser API + ↓ + Fills Web Form Field + ↓ + Continues Conversation + ↓ + Submits Form on Completion +``` + +## Getting Started + +First things first, here is what you will need: +- A [Cartesia](https://play.cartesia.ai/agents) account and API key +- A [Gemini API Key](https://aistudio.google.com/apikey) +- A [Browserbase API Key and Project ID](https://www.browserbase.com/overview) + +Make sure to add the API keys in your `.env` file or to the API keys section in your Cartesia account. + +- Required packages: + ```bash + cartesia-line + stagehand>=0.5.4 + google-genai>=1.26.0 + python-dotenv>=1.0.0 + PyYAML>=6.0.0 + loguru>=0.7.0 + aiohttp>=3.12.0 + pydantic>=2.0.0 + ``` + +## Setup + +1. Install dependencies: +```bash +pip install -r requirements.txt +``` + +2. Set up environment variables - create a `.env` file: +```bash +GEMINI_API_KEY=your_gemini_api_key_here +BROWSERBASE_API_KEY=your_browserbase_api_key_here +BROWSERBASE_PROJECT_ID=your_browserbase_project_id_here +``` + +3. Run the agent: +```bash +python main.py +``` + +## Project Structure + +### `main.py` +Entry point for the voice agent. Handles call initialization with `VoiceAgentApp` class and orchestrates the conversation flow with form filling integration. 
+ +### `form_filling_node.py` +ReasoningNode subclass customized for voice-optimized form filling. Integrates Stagehand browser automation and manages async form filling during conversation without blocking the voice flow. Provides status updates and error handling. + +### `stagehand_form_filler.py` +Browser automation manager that handles all web interactions. Opens and controls web forms, maps conversation data to form fields using AI, transforms voice answers to form-compatible formats, and handles form submission. Supports different field types (text, select, checkbox, etc.). + +### `config.py` +System configuration file including system prompts, model IDs, and temperature + +### `config.toml` +Your Cartesia Line agent id. + +## Configuration + +The system can be configured through multiple files: + +- **`config.py`**: System prompts, model IDs (Gemini model selection), hyperparameters, and boolean flags for features +- **`config.toml`** / **YAML files**: Questionnaire structure and questions flow +- **`cartesia.toml`**: Deployment configuration for Cartesia platform (installs dependencies and runs the script) +- **Variables**: + - `FORM_URL`: Target web form to fill + +## Example Flow + +1. User calls the voice agent +2. Agent asks: "What type of voice agent are you building?" +3. User responds: "A customer service agent" +4. System: + - Records the answer + - Opens browser to form (if not already open) + - Fills "Customer Service" in the role selection field + - Takes screenshot for debugging +5. Agent asks next question +6. Process continues until all questions answered +7. 
Form is automatically submitted + +## Advanced Features + +- **Background Processing**: Form filling happens asynchronously using background tasks - conversation remains smooth and responsive +- **Error Recovery**: Continues conversation even if form filling fails +- **Progress Tracking**: Monitor form completion status +- **Screenshot Debugging**: Captures screenshots after each field +- **Flexible Mapping**: AI interprets answers for different field types + +## Deploying the Agent + +The `cartesia.toml` file defines how your agent will be installed and run when deployed on the Cartesia platform. This file tells the platform to install dependencies from `requirements.txt` and execute `main.py`. + +You can clone this repository and add it to your [agents dashboard](https://play.cartesia.ai/agents) along with your API Keys (set them in the Cartesia Platform's API keys section). + +For detailed deployment instructions, see [how to deploy an agent from the Cartesia Docs](https://docs.cartesia.ai/line/start-building/talk-to-your-first-agent). 
"""Configuration settings for the voice agent.

This module contains system prompts, model configurations, and
hyperparameters for the Cartesia voice agent with form filling.
"""

import os

# Gemini model used for the conversation; override with the MODEL_ID
# environment variable.
DEFAULT_MODEL_ID = os.getenv("MODEL_ID", "gemini-2.5-flash")

# Sampling temperature passed to the generation config.
DEFAULT_TEMPERATURE = 0.7

# Base system prompt. The tool named here must match the tool that is
# actually registered with the model (`record_form_field`); naming a tool
# that does not exist invites calls the node cannot handle.
SYSTEM_PROMPT = """
### You and your role
You are a friendly assistant conducting a questionnaire.
Be professional but conversational. Confirm answers when appropriate.
If a user's answer is unclear, ask for clarification.
For sensitive information, be especially tactful and professional.

IMPORTANT: When you receive a clear answer from the user, use the
record_form_field tool to record their response.

### Your tone
When having a conversation, you should:
- Always polite and respectful, even when users are challenging
- Concise and brief but never curt. Keep your responses to 1-2
  sentences and less than 35 words
- When asking a question, be sure to ask in a short and concise manner
- Only ask one question at a time

If the user is rude, or curses, respond with exceptional politeness
and genuine curiosity. You should always be polite.

Remember, you're on the phone, so do not use emojis or abbreviations.
Spell out units and dates.
"""
class RecordFormFieldTool:
    """Static description of the ``record_form_field`` tool.

    Every member is a static method, so the class serves as a namespace
    that is accessed through the class object itself.
    """

    @staticmethod
    def name() -> str:
        """Return the identifier the model uses to call the tool."""
        tool_name = "record_form_field"
        return tool_name

    @staticmethod
    def description() -> str:
        """Return the human-readable summary shown to the model."""
        summary = "Record a value for a form field that needs to be filled"
        return summary

    @staticmethod
    def parameters() -> dict:
        """Return the JSON schema generated from ``RecordFormFieldArgs``."""
        schema = RecordFormFieldArgs.model_json_schema()
        return schema

    @staticmethod
    def to_gemini_tool():
        """Wrap the tool in the Gemini tool format.

        Returns:
            A Gemini Tool object holding a single function declaration.
        """
        declaration = gemini_types.FunctionDeclaration(
            name=RecordFormFieldTool.name(),
            description=RecordFormFieldTool.description(),
            parameters=RecordFormFieldTool.parameters(),
        )
        return gemini_types.Tool(function_declarations=[declaration])
+ + This class uses Stagehand to read and fill web forms dynamically, + maintains conversation flow while automating browser actions, and + intelligently extracts form structure and asks relevant questions. + """ + + def __init__( + self, + system_prompt: str, + gemini_client, + form_url: str, + model_id: str = DEFAULT_MODEL_ID, + temperature: float = DEFAULT_TEMPERATURE, + max_context_length: int = 15, + max_output_tokens: int = 1000, + ): + """Initialize the Form Filling node with Stagehand integration. + + Args: + system_prompt: System prompt for the LLM. + gemini_client: Google Gemini client instance. + form_url: URL of the web form to fill. + model_id: Gemini model ID. + temperature: Temperature for generation. + max_context_length: Maximum conversation context length. + max_output_tokens: Maximum tokens for generation. + """ + super().__init__(system_prompt=system_prompt, max_context_length=max_context_length) + + self.client = gemini_client + self.model_id = model_id + self.temperature = temperature + + # Browser automation + self.form_url = form_url + self.stagehand_filler: Optional[StagehandFormFiller] = None + + # Form state + self.collected_data: Dict[str, str] = {} + # Pre-initialize questions so conversation can start immediately + self.questions: List[FormQuestion] = self._create_questions() + self.current_question_index = 0 + + # Browser initialization + self.browser_init_task = None + self.browser_initializing = False + + # Enhanced prompt for form filling + enhanced_prompt = ( + system_prompt + + """ + + You are conducting a voice conversation to help fill out a web + form. As you collect information, it's being entered into an + actual online form in real-time. Ask natural questions to gather + the required information. Use the record_form_field tool to save + each piece of information. Keep the conversation friendly and + natural. 
async def cleanup_and_submit(self) -> None:
    """Submit any collected answers and release the browser, at most once.

    This method is invoked both when the questionnaire completes and again
    when the call shuts down, so it must be safe to call repeatedly. The
    filler is detached after the first cleanup; later calls find nothing to
    do and return immediately.

    Returns:
        None.
    """
    # Let a still-running browser start-up settle first. Closing while
    # initialization is in flight could leave a session open that nobody
    # closes afterwards.
    init_task = self.browser_init_task
    if init_task is not None and not init_task.done():
        try:
            await init_task
        except Exception as e:
            logger.error(f"Browser initialization failed before cleanup: {e}")

    filler = self.stagehand_filler
    if filler is None:
        # Browser never started, or a previous call already cleaned up.
        return

    # Submit form if we have any data and haven't submitted yet
    if not self.form_submitted and self.collected_data:
        logger.info("Call ending - auto-submitting form with collected data")
        try:
            if await self._submit_form():
                self.form_submitted = True
        except Exception as e:
            logger.error(f"Error during cleanup submission: {e}")

    # Clean up browser; drop the reference even if closing fails so the
    # next invocation does not try to close it again.
    try:
        await filler.cleanup()
    finally:
        self.stagehand_filler = None
+ """ + # Define questions for the form fields we know about + # This matches form at https://forms.fillout.com/t/34ccsqafUFus + form_questions = [ + FormQuestion( + field_name="full_name", question="What is your full name?", field_type="text", required=True + ), + FormQuestion( + field_name="email", question="What is your email address?", field_type="email", required=True + ), + FormQuestion( + field_name="phone", question="What is your phone number?", field_type="phone", required=False + ), + FormQuestion( + field_name="work_eligibility", + question="Are you legally eligible to work in this country?", + field_type="radio", + required=True, + ), + FormQuestion( + field_name="availability_type", + question=("What's your availability - temporary, part-time, or full-time?"), + field_type="radio", + required=True, + ), + FormQuestion( + field_name="role_selection", + question=( + "Which role are you applying for? We have openings " + "for Sales Manager, IT Support, Recruiting, " + "Software Engineer, or Marketing Specialist." + ), + field_type="checkbox", + required=True, + ), + FormQuestion( + field_name="previous_experience", + question="Have you worked in a similar role before?", + field_type="radio", + required=True, + ), + FormQuestion( + field_name="skills_experience", + question=( + "What relevant skills and experience do you have " + "that make you a strong candidate for this position?" + ), + field_type="textarea", + required=True, + ), + FormQuestion( + field_name="additional_info", + question=("Is there anything else you'd like to tell us about yourself?"), + field_type="textarea", + required=False, + ), + ] + + return form_questions + + async def _fill_form_field_async(self, field_name: str, value: str) -> None: + """Fill a form field asynchronously in background (non-blocking). + + Args: + field_name: The name of the form field to fill. + value: The value to enter in the field. 
+ """ + try: + # Wait for browser initialization if needed + if self.browser_init_task: + logger.info(f"Waiting for browser to initialize before filling {field_name}") + await self.browser_init_task + + logger.info(f"Filling field '{field_name}' with: {value} in background") + # Use StagehandFormFiller's fill_field method which + # handles the mapping + success = await self.stagehand_filler.fill_field(field_name, value) + + if success: + logger.info(f"Successfully filled field: {field_name} in browser") + else: + logger.warning(f"Failed to fill field: {field_name}") + + except Exception as e: + logger.error(f"Error filling field {field_name}: {e}") + raise # Re-raise so background task can catch it + + async def _submit_form(self) -> bool: + """Submit the completed form. + + Returns: + True if submission succeeded, False otherwise. + """ + # Wait for browser initialization if needed + if self.browser_init_task and not self.stagehand_filler: + logger.info("Waiting for browser to initialize before submitting form") + await self.browser_init_task + + if not self.stagehand_filler: + return False + + try: + logger.info("Submitting web form with collected data") + logger.info(f"Data collected: {self.collected_data}") + + # Ensure StagehandFormFiller has all collected data + # (it should already have it from fill_field calls, + # but ensure consistency) + self.stagehand_filler.collected_data.update(self.collected_data) + + # Use StagehandFormFiller's submit_form method which now + # uses collected_data + success = await self.stagehand_filler.submit_form() + + if success: + logger.info("Form submitted successfully!") + return True + else: + logger.warning("Form submission may have failed") + return False + + except Exception as e: + logger.error(f"Error submitting form: {e}") + return False + + def get_current_question(self) -> Optional[FormQuestion]: + """Get the current question to ask. + + Returns: + The current FormQuestion or None if all questions answered. 
async def process_context(
    self, context: ConversationContext
) -> AsyncGenerator[Union[AgentResponse, EndCall], None]:
    """Process conversation context with real-time form filling.

    Each invocation handles one turn: greet on an empty context, ask the
    first question after the user's first reply, submit the form and end
    the call once every question is answered, and otherwise stream the
    model's reply while recording any ``record_form_field`` tool calls.

    Field filling runs in background tasks so speech is never delayed.
    Those tasks are tracked so that they are not garbage-collected
    mid-flight and so that submission can wait for them to finish.

    Args:
        context: The conversation context with events.

    Yields:
        AgentResponse: Text responses to the user.
        ToolResult: Confirmation of each recorded field.
        EndCall: Call termination when form is complete.
    """
    # Initialize browser on first call (non-blocking)
    if not self.browser_init_task and not self.stagehand_filler:
        self.browser_init_task = asyncio.create_task(self._initialize_browser())
        logger.info("Browser initialization started in background")

    # The event loop only keeps weak references to tasks, so hold strong
    # ones here. Created lazily because this attribute is owned by this
    # method alone.
    fill_tasks = getattr(self, "_fill_tasks", None)
    if fill_tasks is None:
        fill_tasks = self._fill_tasks = set()

    def _on_fill_done(task) -> None:
        # Forget the finished task and mark its exception as retrieved;
        # the fill coroutine has already logged the failure itself.
        fill_tasks.discard(task)
        if not task.cancelled():
            task.exception()

    # Get current question after initialization
    current_question = self.get_current_question()
    question_name = current_question.field_name if current_question else "None"
    logger.info(f"Current question: {question_name}")
    logger.info(f"Question index: {self.current_question_index}/{len(self.questions)}")
    logger.info(f"Events count: {len(context.events)}")

    # Check latest event to determine what to do
    latest_event = context.events[-1] if context.events else None
    is_agent_response = isinstance(latest_event, AgentResponse) if latest_event else False

    # Handle initial greeting - speak first when conversation starts
    if not context.events:
        logger.info("Starting conversation - Agent speaks first")
        initial_greeting = (
            "Hello! I'm here to help you fill out an application "
            "form today. I'll ask you a series of questions and "
            "fill in the form as we go. Ready to get started?"
        )
        yield AgentResponse(content=initial_greeting)
        return

    # If last event was our greeting, and user responded, ask
    # first question
    if len(context.events) == 2 and not is_agent_response and self.current_question_index == 0:
        user_message = context.get_latest_user_transcript_message()
        if user_message and current_question:
            logger.info(f"User ready to start: '{user_message}'")
            logger.info(f"Asking first question: {current_question.field_name}")
            yield AgentResponse(content=f"Great! Let's begin. {current_question.question}")
            return

    # Check if all questions have been answered
    # Only submit if we've actually collected data
    if not current_question and self.current_question_index > 0 and len(self.collected_data) > 0:
        logger.info(f"All {self.current_question_index} questions answered")
        logger.info(f"Collected data for {len(self.collected_data)} fields")

        # Submitting while fields are still being typed would send an
        # incomplete form, so drain the background fills first. Failures
        # are tolerated here; they were logged by the fill coroutine.
        if fill_tasks:
            logger.info(f"Waiting for {len(fill_tasks)} pending field fill(s) before submitting")
            await asyncio.gather(*fill_tasks, return_exceptions=True)

        submission_success = await self._submit_form()
        self.form_submitted = True

        if submission_success:
            goodbye = "Perfect! I've submitted your application. Thank you!"
        else:
            goodbye = "Thank you for providing all the information. Your responses have been recorded."

        # Clean up
        await self.cleanup_and_submit()

        # End call
        args = EndCallArgs(goodbye_message=goodbye)
        async for item in end_call(args):
            yield item
        return

    # Guard against having no question to ask; everything below
    # dereferences current_question.
    if not current_question:
        logger.warning("No questions available or not properly initialized")
        return

    # Process user response
    messages = convert_messages_to_gemini(context.events, text_events_only=True)

    # Add context about current question
    question_context = f"""

        Current form field: {current_question.field_name}
        Question: {current_question.question}

        Listen to the user's response and use the record_form_field
        tool to save it. Then acknowledge their answer naturally.
        """

    enhanced_config = gemini_types.GenerateContentConfig(
        system_instruction=(self.generation_config.system_instruction + question_context),
        temperature=self.temperature,
        tools=[RecordFormFieldTool.to_gemini_tool()],
        max_output_tokens=self.generation_config.max_output_tokens,
        thinking_config=gemini_types.ThinkingConfig(thinking_budget=0),
    )

    # Get user's latest message
    user_message = context.get_latest_user_transcript_message()
    if user_message:
        logger.info(f'User response: "{user_message}"')

    # Stream Gemini response
    full_response = ""
    stream = await self.client.aio.models.generate_content_stream(
        model=self.model_id,
        contents=messages,
        config=enhanced_config,
    )

    async for msg in stream:
        if msg.text:
            full_response += msg.text
            yield AgentResponse(content=msg.text)

        for function_call in msg.function_calls or []:
            if function_call.name != RecordFormFieldTool.name():
                continue

            # A tool call may arrive without arguments; fall back to the
            # question being asked and an empty value.
            call_args = function_call.args or {}
            field_name = call_args.get("field_name", current_question.field_name)
            value = call_args.get("value", "")

            logger.info(f"Recording: {field_name} = {value}")

            # Store data first
            self.collected_data[field_name] = value

            # Fill the form field in the background (non-blocking),
            # keeping a reference until the task completes.
            fill_task = asyncio.create_task(self._fill_form_field_async(field_name, value))
            fill_tasks.add(fill_task)
            fill_task.add_done_callback(_on_fill_done)

            logger.info(f"Collected: {field_name}={value}")

            # Move to next question immediately
            # (don't wait for form filling)
            self.current_question_index += 1

            # Clear context
            self.clear_context()

            # Get next question
            next_question = self.get_current_question()
            if next_question:
                yield AgentResponse(content=f"Great! {next_question.question}")

            # Yield tool result immediately
            yield ToolResult(
                tool_name="record_form_field",
                tool_args={"field_name": field_name, "value": value},
                result=f"Recorded: {field_name}={value}",
            )

    if full_response:
        logger.info(f'Agent response: "{full_response}"')
+ """ + + # Create form filling node with browser automation + form_node = FormFillingNode( + system_prompt=SYSTEM_PROMPT, gemini_client=gemini_client, form_url=FORM_URL + ) + + # Set up bridge for event handling + form_bridge = Bridge(form_node) + system.with_speaking_node(form_node, bridge=form_bridge) + + # Connect transcription events + form_bridge.on(UserTranscriptionReceived).map(form_node.add_event) + + # Handle interruptions and streaming + ( + form_bridge.on(UserStoppedSpeaking) + .interrupt_on(UserStartedSpeaking, handler=form_node.on_interrupt_generate) + .stream(form_node.generate) + .broadcast() + ) + + # Start the system + await system.start() + + # Wait for call to end + await system.wait_for_shutdown() + + # Ensure form is submitted when call ends + await form_node.cleanup_and_submit() + + +# Create the voice agent application +app = VoiceAgentApp(handle_new_call) + +if __name__ == "__main__": + print("Starting Voice Agent with Web Form Automation") + print(f"Will fill form at: {FORM_URL}") + print("Ready to receive calls...") + print("Form filling happens invisibly while processing voice calls.\n") + app.run() diff --git a/examples/integrations/cartesia/requirements.txt b/examples/integrations/cartesia/requirements.txt new file mode 100644 index 0000000..25b3aa7 --- /dev/null +++ b/examples/integrations/cartesia/requirements.txt @@ -0,0 +1,8 @@ +cartesia-line +aiohttp>=3.12.0 +google-genai>=1.26.0; python_version>='3.9' +loguru>=0.7.0 +python-dotenv>=1.0.0 +PyYAML>=6.0.0 +stagehand>=0.5.4 +pydantic>=2.0.0 diff --git a/examples/integrations/cartesia/stagehand_form_filler.py b/examples/integrations/cartesia/stagehand_form_filler.py new file mode 100644 index 0000000..2b3f20c --- /dev/null +++ b/examples/integrations/cartesia/stagehand_form_filler.py @@ -0,0 +1,270 @@ +"""Browser automation for filling web forms during voice conversations. 
class FormFieldMapping:
    """Lookup table from conversation question IDs to web-form fields."""

    def __init__(self):
        # Each entry pairs a question ID with the metadata of the form
        # field it fills. Option lists are created per field so no two
        # fields share the same list object.
        entries = [
            (
                "full_name",
                FormField(
                    field_id="full_name",
                    field_type=FieldType.TEXT,
                    label="What is your full name?",
                    required=True,
                ),
            ),
            (
                "email",
                FormField(
                    field_id="email",
                    field_type=FieldType.EMAIL,
                    label="What is your email address?",
                    required=True,
                ),
            ),
            (
                "phone",
                FormField(
                    field_id="phone",
                    field_type=FieldType.PHONE,
                    label="What is your phone number?",
                    required=False,
                ),
            ),
            (
                "work_eligibility",
                FormField(
                    field_id="work_eligibility",
                    field_type=FieldType.RADIO,
                    label="Are you legally eligible to work in this country?",
                    options=["Yes", "No"],
                    required=True,
                ),
            ),
            (
                "availability_type",
                FormField(
                    field_id="availability",
                    field_type=FieldType.RADIO,
                    label="What's your availability?",
                    options=["Temporary", "Part-time", "Full-time"],
                    required=True,
                ),
            ),
            (
                "additional_info",
                FormField(
                    field_id="additional_info",
                    field_type=FieldType.TEXTAREA,
                    label="Anything else you'd like to let us know about you?",
                    required=False,
                ),
            ),
            (
                "role_selection",
                FormField(
                    field_id="role_selection",
                    field_type=FieldType.CHECKBOX,
                    label="Which of these roles are you applying for?",
                    options=[
                        "Sales manager",
                        "IT Support",
                        "Recruiting",
                        "Software engineer",
                        "Marketing specialist",
                    ],
                    required=True,
                ),
            ),
            (
                "previous_experience",
                FormField(
                    field_id="previous_experience",
                    field_type=FieldType.RADIO,
                    label=("Have you worked in a role similar to this one in the past?"),
                    options=["Yes", "No"],
                    required=True,
                ),
            ),
            (
                "skills_experience",
                FormField(
                    field_id="skills_experience",
                    field_type=FieldType.TEXTAREA,
                    label=(
                        "What relevant skills and experience do you have "
                        "that make you a strong candidate for this position?"
                    ),
                    required=True,
                ),
            ),
        ]
        self.field_mappings = dict(entries)

    def get_form_field(self, question_id: str) -> Optional[FormField]:
        """Look up the form field registered for a question.

        Args:
            question_id: The question identifier.

        Returns:
            The matching FormField, or None when the ID is unknown.
        """
        try:
            return self.field_mappings[question_id]
        except KeyError:
            return None
async def fill_field(self, question_id: str, answer: str) -> bool:
    """Fill a specific form field based on the question ID and answer.

    The browser is initialized on demand if that has not happened yet.
    The stripped answer is stored in ``collected_data`` before the browser
    action is attempted, so it is kept even when the action fails.

    Args:
        question_id: The question identifier.
        answer: The answer value to fill.

    Returns:
        True if field was filled successfully, False otherwise (unknown
        question, unsupported field type, or any browser error, including
        a failed initialization).
    """
    try:
        # Lazy start-up lives inside the try block so that a browser
        # failure is reported as False, as documented, instead of
        # escaping to the caller as an exception.
        if not self.is_initialized:
            await self.initialize()

        # Get field mapping
        field = self.field_mapper.get_form_field(question_id)
        if not field:
            logger.warning(f"No field mapping found for question: {question_id}")
            return False

        # Store and use the answer directly
        answer = answer.strip()
        self.collected_data[question_id] = answer

        logger.info(f"Async filling field '{field.label}' with: {answer}")

        # Pick the natural-language instruction for this kind of field.
        kind = field.field_type
        if kind in (FieldType.TEXT, FieldType.EMAIL, FieldType.PHONE):
            instruction = f"Fill in the '{field.label}' field with: {answer}"
        elif kind == FieldType.TEXTAREA:
            instruction = f"Type in the '{field.label}' text area: {answer}"
        elif kind in (FieldType.SELECT, FieldType.RADIO):
            instruction = f"Select '{answer}' for the '{field.label}' field"
        elif kind == FieldType.CHECKBOX:
            if question_id == "role_selection":
                # For role selection, the answer names the box to tick.
                instruction = f"Check the '{answer}' checkbox"
            elif answer.lower() in ("yes", "true"):
                instruction = f"Check the '{field.label}' checkbox"
            else:
                instruction = f"Uncheck the '{field.label}' checkbox"
        else:
            # Nothing was sent to the browser, so do not report success.
            logger.warning(f"Unsupported field type for question: {question_id}")
            return False

        await self.page.act(instruction)
        return True

    except Exception as e:
        logger.error(f"Error filling field {question_id}: {e}")
        return False
async def cleanup(self) -> None:
    """Clean up browser resources.

    Safe to call more than once: the Stagehand and page references are
    dropped up front, so a repeated call finds nothing to close. The
    ``is_initialized`` flag is left untouched, which means ``initialize()``
    will not reopen a browser on an instance that has been cleaned up.

    Returns:
        None.
    """
    stagehand = self.stagehand

    # Release the handles before closing so that neither a second call nor
    # a failing close() can act on a stale session.
    self.stagehand = None
    self.page = None

    if stagehand is None:
        # Nothing was ever opened (or it was already closed).
        return

    try:
        await stagehand.close()
        logger.info("Browser closed")
    except Exception as e:
        logger.error(f"Error closing browser: {e}")