diff --git a/README.md b/README.md
index f824e5da..457e101b 100644
--- a/README.md
+++ b/README.md
@@ -34,47 +34,57 @@
-
-
-
NOTE: This is a Python SDK for Stagehand. Original implementation is in TypeScript and is available
here.
-
+
+
NOTE: This is a Python SDK for Stagehand. The original implementation is in TypeScript and is available
here.
+
---
-A Python SDK for [Stagehand](https://stagehand.dev), enabling automated browser control and data extraction.
-
-Stagehand is the easiest way to build browser automations. It is fully compatible with Playwright, offering three simple AI APIs (act, extract, and observe) on top of the base Playwright Page class that provide the building blocks for web automation via natural language.
-
-You can write all of your Playwright commands as you normally would, while offloading the AI-powered `act/extract/observe` operations to Stagehand hosted on our Stagehand API.
-
-
-Here's a sample of what you can do with Stagehand:
-
-```python
-import asyncio
-
-async def main():
- # Keep your existing Playwright code unchanged
- await page.goto("https://docs.stagehand.dev");
-
- # Stagehand AI: Act on the page via Stagehand API
- await page.act("click on the 'Quickstart'");
-
- # Stagehand AI: Extract data from the page
- from pydantic import BaseModel
-
- class DescriptionSchema(BaseModel):
- description: str
-
- data = await page.extract(
- instruction="extract the description of the page",
- schema=DescriptionSchema
- )
- description = data.description
-
-if __name__ == "__main__":
- asyncio.run(main())
-```
+Stagehand is the easiest way to build browser automations with AI-powered interactions. It extends the Playwright API with three powerful AI primitives:
+
+- **act** — Instruct the AI to perform actions (e.g. click a button or scroll).
+- **extract** — Extract and validate data from a page using a JSON schema (generated either manually or via a Pydantic model).
+- **observe** — Get natural-language suggestions about the page, for example to identify selectors or candidate elements in the DOM.
+## Pydantic Schemas
+
+Stagehand uses Pydantic models to define the options for AI commands:
+
+- **ActOptions**
+ The `ActOptions` model takes an `action` field that tells the AI what to do on the page, plus optional fields such as `useVision` and `variables`:
+ ```python
+ from stagehand.schemas import ActOptions
+
+ # Example:
+ await page.act(ActOptions(action="click on the 'Quickstart' button"))
+ ```
+
+- **ObserveOptions**
+ The `ObserveOptions` model lets you find elements on the page using natural language. The `onlyVisible` option helps limit the results:
+ ```python
+ from stagehand.schemas import ObserveOptions
+
+ # Example:
+ await page.observe(ObserveOptions(instruction="find the button labeled 'News'", onlyVisible=True))
+ ```
+
+- **ExtractOptions**
+ The `ExtractOptions` model extracts structured data from the page. Pass your instructions and a schema defining your expected data format. **Note:** If you are using a Pydantic model for the schema, call its `.model_json_schema()` method to ensure JSON serializability.
+ ```python
+ from stagehand.schemas import ExtractOptions
+ from pydantic import BaseModel
+
+ class DescriptionSchema(BaseModel):
+ description: str
+
+ # Example:
+ data = await page.extract(
+ ExtractOptions(
+ instruction="extract the description of the page",
+ schemaDefinition=DescriptionSchema.model_json_schema()
+ )
+ )
+ description = data.get("description") if isinstance(data, dict) else data.description
+ ```
## Why?
**Stagehand adds determinism to otherwise unpredictable agents.**
@@ -87,56 +97,67 @@ While there's no limit to what you could instruct Stagehand to do, our primitive
## Installation
+Install the Python package via pip:
+
```bash
pip install stagehand-py
```
-## Quickstart
+## Environment Variables
-Before running your script, make sure you have exported the necessary environment variables:
+Before running your script, set the following environment variables:
```bash
export BROWSERBASE_API_KEY="your-api-key"
export BROWSERBASE_PROJECT_ID="your-project-id"
-export OPENAI_API_KEY="your-openai-api-key" # or other model
-export STAGEHAND_SERVER_URL="url-of-stagehand-server"
+export OPENAI_API_KEY="your-openai-api-key" # or your preferred model's API key
+export STAGEHAND_SERVER_URL="url-of-stagehand-server"
```
-## Usage
+## Quickstart
-Here is a minimal example to get started:
+Below is a minimal example to get started with Stagehand using the new schema-based options:
```python
import asyncio
import os
from stagehand.client import Stagehand
+from stagehand.schemas import ActOptions, ExtractOptions
+from pydantic import BaseModel
from dotenv import load_dotenv
load_dotenv()
+class DescriptionSchema(BaseModel):
+ description: str
+
async def main():
- # Create a Stagehand client - it will create a new session automatically
+ # Create a Stagehand client - it will automatically create a new session if needed
stagehand = Stagehand(
- model_name="gpt-4o", # optional - defaults to server's default
+ model_name="gpt-4o", # Optional: defaults are available from the server
)
- # Initialize - this will create a new session
- await stagehand.page.init()
+ # Initialize Stagehand and create a new session
+ await stagehand.init()
print(f"Created new session: {stagehand.session_id}")
- # Example: navigate to google.com - from Playwright in Python
- await stagehand.page.goto("https://www.google.com")
+ # Navigate to a webpage using local Playwright controls
+ await stagehand.page.goto("https://www.example.com")
print("Navigation complete.")
- # Example: ACT to do something like 'search for openai'
- # executes remote on a Typescript server and logs are streamed back
- await stagehand.page.act("search for openai")
+ # Perform an action using the AI (e.g. simulate a button click)
+ await stagehand.page.act(ActOptions(action="click on the 'Quickstart' button"))
- # Pure client side Playwright - after searching for OpenAI, click on the News tab
- await stagehand.page.get_by_role("link", name="News", exact=True).first.click()
- print("Clicked on News tab")
+ # Extract data from the page with schema validation
+ data = await stagehand.page.extract(
+ ExtractOptions(
+ instruction="extract the description of the page",
+ schemaDefinition=DescriptionSchema.model_json_schema()
+ )
+ )
+ description = data.get("description") if isinstance(data, dict) else data.description
+ print("Extracted description:", description)
- # Close the session (if needed)
await stagehand.close()
if __name__ == "__main__":
@@ -144,32 +165,63 @@ if __name__ == "__main__":
```
-## More Examples
+## Running Evaluations
-For further examples, you can check out the scripts in the “examples/” directory:
+To test all evaluations, run the following command in your terminal:
-1. “examples/example.py”: Demonstrates combined server-side/page navigation and AI-based actions.
-2. “examples/extract-example.py”: Shows how to use the “extract” functionality with JSON schema or a pydantic model.
-3. “examples/observe-example.py”: Demonstrates the “observe” functionality to get natural-language readings of the page.
+```bash
+python evals/run_all_evals.py
+```
+
+This script dynamically discovers every evaluation module under the `act`, `extract`, and `observe` subdirectories of `evals`, runs each one, and prints its result.
+
+
+## More Examples
+
+For further examples, check out the scripts in the `examples/` directory:
+
+1. **examples/example.py**: Demonstrates combined server-side/page navigation with AI-based actions.
+2. **examples/extract-example.py**: Shows how to use the extract functionality with a JSON schema or a Pydantic model.
+3. **examples/observe-example.py**: Demonstrates the observe functionality to get natural-language readings of the page.
## Configuration
-- `stagehand_server_url`: The Stagehand API server URL
-- `browserbase_api_key`: Your BrowserBase API key (can also be set via BROWSERBASE_API_KEY environment variable)
-- `browserbase_project_id`: Your BrowserBase project ID (can also be set via BROWSERBASE_PROJECT_ID environment variable)
-- `model_api_key`: Your model API key (e.g. OpenAI, Anthropic, etc) (can also be set via MODEL_API_KEY environment variable)
-- `verbose`: Verbosity level (default: 1)
-- `model_name`: (optional) Model name to use for the conversation
-- `dom_settle_timeout_ms`: (optional) Additional time for the DOM to settle
-- `debug_dom`: (optional) Whether or not to enable DOM debug mode
+Stagehand can be configured via environment variables or through a `StagehandConfig` object. Available configuration options include:
+
+- `stagehand_server_url`: URL of the Stagehand API server.
+- `browserbase_api_key`: Your Browserbase API key (`BROWSERBASE_API_KEY`).
+- `browserbase_project_id`: Your Browserbase project ID (`BROWSERBASE_PROJECT_ID`).
+- `model_api_key`: Your model API key (e.g. OpenAI, Anthropic, etc.) (`MODEL_API_KEY`).
+- `verbose`: Verbosity level (default: 1).
+- `model_name`: Optional model name for the AI.
+- `dom_settle_timeout_ms`: Additional time (in ms) to wait for the DOM to settle.
+- `debug_dom`: Enable debug mode for DOM operations.
+
+Example using a unified configuration:
+
+```python
+from stagehand.config import StagehandConfig
+import os
+
+config = StagehandConfig(
+ env="BROWSERBASE" if os.getenv("BROWSERBASE_API_KEY") and os.getenv("BROWSERBASE_PROJECT_ID") else "LOCAL",
+ api_key=os.getenv("BROWSERBASE_API_KEY"),
+ project_id=os.getenv("BROWSERBASE_PROJECT_ID"),
+ debug_dom=True,
+ headless=False,
+ dom_settle_timeout_ms=3000,
+ model_name="gpt-4o-mini",
+ model_client_options={"apiKey": os.getenv("MODEL_API_KEY")}
+)
+```
## Features
-- Automated browser control with natural language commands
-- Data extraction with schema validation (either pydantic or JSON schema)
-- Async/await support
-- Extension of Playwright - run playwright commands normally, with act/extract/observe offloaded to an API
+- **AI-powered Browser Control**: Execute natural language instructions over a running browser.
+- **Validated Data Extraction**: Use JSON schemas (or Pydantic models) to extract and validate information from pages.
+- **Async/Await Support**: Built using Python's asyncio, making it easy to build scalable web automation workflows.
+- **Extensible**: Seamlessly extend Playwright functionality with AI enrichments.
## Requirements
@@ -177,9 +229,8 @@ For further examples, you can check out the scripts in the “examples/” direc
- httpx
- asyncio
- pydantic
-- python-dotenv (optional if using a .env file)
+- python-dotenv (optional, for .env support)
## License
-MIT License (c) Browserbase, Inc.
-
+MIT License (c) 2025 Browserbase, Inc.
diff --git a/evals/act/google_jobs.py b/evals/act/google_jobs.py
new file mode 100644
index 00000000..eda1be00
--- /dev/null
+++ b/evals/act/google_jobs.py
@@ -0,0 +1,155 @@
+import asyncio
+import traceback
+from typing import Optional, Any, Dict
+from pydantic import BaseModel
+from evals.init_stagehand import init_stagehand
+from stagehand.schemas import ActOptions, ExtractOptions
+
+
+class Qualifications(BaseModel):
+    degree: Optional[str] = None  # degree named in the posting; None when not extracted
+    yearsOfExperience: Optional[float] = None  # years of experience as a number
+
+
+class JobDetails(BaseModel):
+    applicationDeadline: Optional[str] = None  # deadline as extracted text; optional
+    minimumQualifications: Qualifications  # required nested object
+    preferredQualifications: Qualifications  # required nested object
+
+
+def is_job_details_valid(details: Dict[str, Any]) -> bool:
+    """
+    Return True only when every extracted field is populated with a string or number.
+    Top-level values must be a str/number, or a dict whose values are all str/number.
+    Anything that is not a non-empty dict (None, a list, a model object) is invalid.
+    """
+    scalar_types = (str, int, float)
+    if not isinstance(details, dict) or not details:
+        return False
+    for value in details.values():
+        if isinstance(value, dict):
+            # Nested objects (e.g. the qualifications) may hold scalars only; None fails too.
+            if not all(isinstance(v, scalar_types) for v in value.values()):
+                return False
+        elif not isinstance(value, scalar_types):
+            return False
+    return True
+
+
+async def google_jobs(model_name: str, logger, use_text_extract: bool) -> dict:
+    """
+    Evaluates a Google jobs flow by:
+    1. Initializing Stagehand with the given model name and logger.
+    2. Navigating to "https://www.google.com/".
+    3. Performing a series of act commands representing UI interactions:
+       - Clicking on the about page
+       - Clicking on the careers page
+       - Inputting "data scientist" into the role field
+       - Inputting "new york city" into the location field
+       - Clicking on the search button
+       - Clicking on the first job link
+    4. Extracting job posting details using an AI-driven extraction schema.
+
+    The extraction schema requires:
+      - applicationDeadline: The date until which applications are accepted.
+      - minimumQualifications: An object with degree and yearsOfExperience.
+      - preferredQualifications: An object with degree and yearsOfExperience.
+
+    Returns a dictionary containing:
+      - _success (bool): Whether valid job details were extracted.
+      - jobDetails (dict): The extracted job details (omitted when an error occurred).
+      - debugUrl (str): The debug URL from Stagehand initialization.
+      - sessionUrl (str): The session URL from Stagehand initialization.
+      - logs (list): Logs collected from the provided logger.
+      - error (dict, optional): Error details if an exception was raised.
+    """
+    stagehand, init_response = await init_stagehand(model_name, logger)
+    debug_url = (
+        init_response.get("debugUrl", {}).get("value")
+        if isinstance(init_response.get("debugUrl"), dict)
+        else init_response.get("debugUrl")
+    )
+    session_url = (
+        init_response.get("sessionUrl", {}).get("value")
+        if isinstance(init_response.get("sessionUrl"), dict)
+        else init_response.get("sessionUrl")
+    )
+
+    try:
+        await stagehand.page.navigate("https://www.google.com/")
+        await asyncio.sleep(3)
+        await stagehand.page.act(ActOptions(action="click on the about page"))
+        await stagehand.page.act(ActOptions(action="click on the careers page"))
+        await stagehand.page.act(ActOptions(action="input data scientist into role"))
+        await stagehand.page.act(ActOptions(action="input new york city into location"))
+        await stagehand.page.act(ActOptions(action="click on the search button"))
+        await stagehand.page.act(ActOptions(action="click on the first job link"))
+
+        job_details = await stagehand.page.extract(ExtractOptions(
+            instruction=(
+                "Extract the following details from the job posting: application deadline, "
+                "minimum qualifications (degree and years of experience), and preferred qualifications "
+                "(degree and years of experience)"
+            ),
+            schemaDefinition=JobDetails.model_json_schema(),
+            useTextExtract=use_text_extract
+        ))
+
+        # Success is decided by the shape of the extracted data, not by the act calls.
+        valid = is_job_details_valid(job_details)
+
+        return {
+            "_success": valid,
+            "jobDetails": job_details,
+            "debugUrl": debug_url,
+            "sessionUrl": session_url,
+            "logs": logger.get_logs() if hasattr(logger, "get_logs") else []
+        }
+    except Exception as e:
+        err_message = str(e)
+        err_trace = traceback.format_exc()
+        logger.error({
+            "message": "error in google_jobs function",
+            "level": 0,
+            "auxiliary": {
+                "error": {"value": err_message, "type": "string"},
+                "trace": {"value": err_trace, "type": "string"}
+            }
+        })
+
+        return {
+            "_success": False,
+            "debugUrl": debug_url,
+            "sessionUrl": session_url,
+            "error": {"message": err_message, "trace": err_trace},
+            "logs": logger.get_logs() if hasattr(logger, "get_logs") else []
+        }
+    finally:
+        # Close exactly once on every path, including when an act/extract call raised.
+        await stagehand.close()
+
+# For quick local testing
+if __name__ == "__main__":
+    import os  # NOTE(review): not referenced anywhere in this block
+    import asyncio  # re-import; asyncio is already imported at the top of this module
+    import logging
+    logging.basicConfig(level=logging.INFO)
+
+    class SimpleLogger:  # in-memory stand-in offering the error()/get_logs() calls the eval makes
+        def __init__(self):
+            self._logs = []  # messages in the order they were logged
+        def info(self, message):
+            self._logs.append(message)
+            print("INFO:", message)
+        def error(self, message):
+            self._logs.append(message)
+            print("ERROR:", message)
+        def get_logs(self):
+            return self._logs  # the live list, not a copy
+
+    async def main():
+        logger = SimpleLogger()
+        result = await google_jobs("gpt-4o-mini", logger, use_text_extract=False) # TODO - use text extract
+        print("Result:", result)
+
+    asyncio.run(main())
\ No newline at end of file
diff --git a/evals/extract/extract_press_releases.py b/evals/extract/extract_press_releases.py
new file mode 100644
index 00000000..7c504052
--- /dev/null
+++ b/evals/extract/extract_press_releases.py
@@ -0,0 +1,152 @@
+import asyncio
+from pydantic import BaseModel
+from stagehand.schemas import ExtractOptions
+from evals.init_stagehand import init_stagehand
+from evals.utils import compare_strings
+
+# Define Pydantic models for validating press release data
+class PressRelease(BaseModel):
+    title: str  # headline text of one press release
+    publish_date: str  # date kept as displayed text, e.g. "Dec 4, 2024"
+
+class PressReleases(BaseModel):
+    items: list[PressRelease]  # all press releases extracted from the page
+
+async def extract_press_releases(model_name: str, logger, use_text_extract: bool):
+    """
+    Extract press releases from the dummy press releases page using the Stagehand client.
+
+    Args:
+        model_name (str): Name of the AI model to use.
+        logger: A custom logger that provides .error() and .get_logs() methods.
+        use_text_extract (bool): Flag to control text extraction behavior.
+
+    Returns:
+        dict: A result object containing:
+            - _success (bool): Whether the eval was successful.
+            - error (Optional[str]): Error message (if any).
+            - logs (list): Collected logs from the logger.
+            - debugUrl (str): Debug URL (None if initialization failed).
+            - sessionUrl (str): Session URL (None if initialization failed).
+    """
+    stagehand = None
+    debug_url = None
+    session_url = None
+    try:
+        # Initialize Stagehand; init_stagehand wraps each URL as {"value": url}, so unwrap to plain strings.
+        stagehand, init_response = await init_stagehand(model_name, logger, dom_settle_timeout_ms=3000)
+        debug_url = init_response["debugUrl"]["value"]
+        session_url = init_response["sessionUrl"]["value"]
+
+        # Navigate to the dummy press releases page # TODO - choose a different page
+        await stagehand.page.navigate("https://dummy-press-releases.surge.sh/news", wait_until="networkidle")
+        # Wait for 5 seconds to ensure content has loaded
+        await asyncio.sleep(5)
+
+        # Extract data using Stagehand's extract method.
+        # TODO - FAILING - extract is likely timing out
+        raw_result = await stagehand.page.extract(
+            ExtractOptions(
+                instruction="extract the title and corresponding publish date of EACH AND EVERY press releases on this page. DO NOT MISS ANY PRESS RELEASES.",
+                schemaDefinition=PressReleases.model_json_schema(),
+                useTextExtract=use_text_extract
+            )
+        )
+        print("Raw result:", raw_result)
+        # Check that the extraction returned a valid dictionary
+        if not raw_result or not isinstance(raw_result, dict):
+            error_message = "Extraction did not return a valid dictionary."
+            logger.error({"message": error_message, "raw_result": raw_result})
+            return {
+                "_success": False,
+                "error": error_message,
+                "logs": logger.get_logs() if hasattr(logger, "get_logs") else [],
+                "debugUrl": debug_url,
+                "sessionUrl": session_url,
+            }
+
+        # Parse the raw result using the defined schema (Pydantic v2 API, like model_json_schema above).
+        parsed = PressReleases.model_validate(raw_result)
+        items = parsed.items
+
+        # Expected results (from the TS eval)
+        expected_length = 28
+        expected_first = PressRelease(
+            title="UAW Region 9A Endorses Brad Lander for Mayor",
+            publish_date="Dec 4, 2024"
+        )
+        expected_last = PressRelease(
+            title="Fox Sued by New York City Pension Funds Over Election Falsehoods",
+            publish_date="Nov 12, 2023"
+        )
+
+        if len(items) <= expected_length:
+            logger.error({
+                "message": "Not enough items extracted",
+                "expected": f"> {expected_length}",
+                "actual": len(items)
+            })
+            return {
+                "_success": False,
+                "error": "Not enough items extracted",
+                "logs": logger.get_logs() if hasattr(logger, "get_logs") else [],
+                "debugUrl": debug_url,
+                "sessionUrl": session_url,
+            }
+
+        def is_item_match(item: PressRelease, expected: PressRelease) -> bool:
+            title_similarity = compare_strings(item.title, expected.title)
+            date_similarity = compare_strings(item.publish_date, expected.publish_date)
+            return title_similarity >= 0.9 and date_similarity >= 0.9
+
+        found_first = any(is_item_match(item, expected_first) for item in items)
+        found_last = any(is_item_match(item, expected_last) for item in items)
+
+        result = {
+            "_success": found_first and found_last,
+            "logs": logger.get_logs() if hasattr(logger, "get_logs") else [],
+            "debugUrl": debug_url,
+            "sessionUrl": session_url,
+        }
+        # No close() here: the finally block below closes the client on every path.
+        return result
+    except Exception as e:
+        logger.error({
+            "message": "Error in extract_press_releases function",
+            "error": str(e)
+        })
+        return {
+            "_success": False,
+            "error": str(e),
+            "logs": logger.get_logs() if hasattr(logger, "get_logs") else [],
+            "debugUrl": debug_url,
+            "sessionUrl": session_url,
+        }
+    finally:
+        # Single close point; stagehand is still None when init_stagehand itself failed.
+        if stagehand is not None:
+            await stagehand.close()
+
+# For quick local testing.
+if __name__ == "__main__":
+    import logging
+    logging.basicConfig(level=logging.INFO)
+
+    class SimpleLogger:  # in-memory stand-in offering the error()/get_logs() calls the eval makes
+        def __init__(self):
+            self._logs = []  # messages in the order they were logged
+        def info(self, message):
+            self._logs.append(message)
+            print("INFO:", message)
+        def error(self, message):
+            self._logs.append(message)
+            print("ERROR:", message)
+        def get_logs(self):
+            return self._logs  # the live list, not a copy
+
+    async def main():
+        logger = SimpleLogger()
+        result = await extract_press_releases("gpt-4o", logger, use_text_extract=False) # TODO - use text extract
+        print("Result:", result)
+
+    asyncio.run(main())
\ No newline at end of file
diff --git a/evals/init_stagehand.py b/evals/init_stagehand.py
new file mode 100644
index 00000000..8ee3bb40
--- /dev/null
+++ b/evals/init_stagehand.py
@@ -0,0 +1,46 @@
+import os
+import asyncio
+from stagehand import Stagehand
+from stagehand.config import StagehandConfig
+
+async def init_stagehand(model_name: str, logger, dom_settle_timeout_ms: int = 3000):
+    """
+    Initialize a Stagehand client for an eval and return it with its connection URLs.
+
+    The configuration is built from environment variables. Both returned URLs are the same
+    Browserbase connect URL, which embeds BROWSERBASE_API_KEY: treat the result as a secret.
+
+    Args:
+        model_name (str): The name of the AI model to use.
+        logger: Currently unused by this function.
+        dom_settle_timeout_ms (int): Passed to StagehandConfig as dom_settle_timeout_ms.
+
+    Returns:
+        tuple: (stagehand, init_response) where init_response is a dict containing:
+            - "debugUrl": A dict with a "value" key for the debug URL.
+            - "sessionUrl": A dict with a "value" key for the session URL.
+    """
+    # Build a Stagehand configuration object using environment variables
+    config = StagehandConfig(
+        env="BROWSERBASE" if os.getenv("BROWSERBASE_API_KEY") and os.getenv("BROWSERBASE_PROJECT_ID") else "LOCAL",
+        api_key=os.getenv("BROWSERBASE_API_KEY"),
+        project_id=os.getenv("BROWSERBASE_PROJECT_ID"),
+        debug_dom=True,
+        headless=True,
+        dom_settle_timeout_ms=dom_settle_timeout_ms,
+        model_name=model_name,
+        model_client_options={"apiKey": os.getenv("MODEL_API_KEY")},
+    )
+
+    # Create a Stagehand client with the configuration; server_url is taken from environment variables.
+    stagehand = Stagehand(config=config, server_url=os.getenv("STAGEHAND_SERVER_URL"), verbose=2)
+    await stagehand.init()
+
+    # Construct the URL from the session id using the new format.
+    # For example (placeholders only; never paste a real key into source):
+    # "wss://connect.browserbase.com?apiKey=<BROWSERBASE_API_KEY>&sessionId=<session id>"
+    api_key = os.getenv("BROWSERBASE_API_KEY")
+    url = f"wss://connect.browserbase.com?apiKey={api_key}&sessionId={stagehand.session_id}"
+
+    # Return both URLs as dictionaries with the "value" key.
+    return stagehand, {"debugUrl": {"value": url}, "sessionUrl": {"value": url}}
\ No newline at end of file
diff --git a/evals/observe/observe_yc_startup.py b/evals/observe/observe_yc_startup.py
new file mode 100644
index 00000000..88e41cb6
--- /dev/null
+++ b/evals/observe/observe_yc_startup.py
@@ -0,0 +1,132 @@
+import asyncio
+from evals.init_stagehand import init_stagehand
+from stagehand.schemas import ObserveOptions
+
+async def observe_yc_startup(model_name: str, logger) -> dict:
+    """
+    Evaluates the YC startups page (the Stagehand client is closed even if a step raises) by:
+
+    1. Initializing Stagehand with the provided model name and logger.
+    2. Navigating to "https://www.ycombinator.com/companies" and waiting for the page to reach network idle.
+    3. Invoking the observe command to locate the container element housing startup information.
+    4. Checking against candidate locators to determine if a matching element is found.
+
+    Returns a dictionary containing:
+      - _success (bool): True if a matching container element is found.
+      - matchedLocator (Optional[str]): The matching candidate locator (key omitted when observe returns nothing).
+      - observations (list): The raw observations returned from the observe command.
+      - debugUrl (str): Debug URL from the Stagehand initialization.
+      - sessionUrl (str): Session URL from the Stagehand initialization.
+      - logs (list): Logs collected via the provided logger.
+    """
+    # Initialize Stagehand and extract URLs from the initialization response
+    stagehand, init_response = await init_stagehand(model_name, logger)
+    debug_url = (
+        init_response.get("debugUrl", {}).get("value")
+        if isinstance(init_response.get("debugUrl"), dict)
+        else init_response.get("debugUrl")
+    )
+    session_url = (
+        init_response.get("sessionUrl", {}).get("value")
+        if isinstance(init_response.get("sessionUrl"), dict)
+        else init_response.get("sessionUrl")
+    )
+
+    try:
+        # Navigate to the YC companies page and wait until network idle
+        await stagehand.page.goto("https://www.ycombinator.com/companies")
+        await stagehand.page.wait_for_load_state("networkidle")
+
+        # Use the observe command with the appropriate instruction
+        observations = await stagehand.page.observe(ObserveOptions(
+            instruction="Find the container element that holds links to each of the startup companies. The companies each have a name, a description, and a link to their website."
+        ))
+
+        # If no observations were returned, mark eval as unsuccessful and return early.
+        if not observations:
+            return {
+                "_success": False,
+                "observations": observations,
+                "debugUrl": debug_url,
+                "sessionUrl": session_url,
+                "logs": logger.get_logs() if hasattr(logger, "get_logs") else []
+            }
+
+        # Define candidate locators for the container element.
+        possible_locators = [
+            "div._section_1pgsr_163._results_1pgsr_343",
+            "div._rightCol_1pgsr_592",
+        ]
+
+        possible_handles = []
+        for locator_str in possible_locators:
+            locator = stagehand.page.locator(locator_str)
+            handle = await locator.element_handle()
+            if handle:
+                possible_handles.append((locator_str, handle))
+
+        # Iterate over each observation to determine if it matches any of the candidate locators.
+        found_match = False
+        matched_locator = None
+        for observation in observations:
+            try:
+                # Get the first element matching the observation's selector
+                observation_locator = stagehand.page.locator(observation["selector"]).first
+                observation_handle = await observation_locator.element_handle()
+                if not observation_handle:
+                    continue
+
+                # Compare this observation's element with candidate handles.
+                for locator_str, candidate_handle in possible_handles:
+                    is_same_node = await observation_handle.evaluate(
+                        "(node, otherNode) => node === otherNode", candidate_handle
+                    )
+                    if is_same_node:
+                        found_match = True
+                        matched_locator = locator_str
+                        break
+
+                if found_match:
+                    break
+            except Exception as e:
+                print(f"Warning: Failed to check observation with selector {observation.get('selector')}: {str(e)}")
+                continue
+
+        # Return the evaluation results.
+        return {
+            "_success": found_match,
+            "matchedLocator": matched_locator,
+            "observations": observations,
+            "debugUrl": debug_url,
+            "sessionUrl": session_url,
+            "logs": logger.get_logs() if hasattr(logger, "get_logs") else []
+        }
+    finally:
+        # Single cleanup point: runs on both returns above and when navigation/observe raises.
+        await stagehand.close()
+
+# For quick local testing
+if __name__ == "__main__":
+    import os  # NOTE(review): not referenced anywhere in this block
+    import asyncio  # re-import; asyncio is already imported at the top of this module
+    import logging
+    logging.basicConfig(level=logging.INFO)
+
+    class SimpleLogger:  # in-memory stand-in offering the get_logs() call the eval makes
+        def __init__(self):
+            self._logs = []  # messages in the order they were logged
+        def info(self, message):
+            self._logs.append(message)
+            print("INFO:", message)
+        def error(self, message):
+            self._logs.append(message)
+            print("ERROR:", message)
+        def get_logs(self):
+            return self._logs  # the live list, not a copy
+
+    async def main():
+        logger = SimpleLogger()
+        result = await observe_yc_startup("gpt-4o-mini", logger)
+        print("Result:", result)
+
+    asyncio.run(main())
\ No newline at end of file
diff --git a/evals/run_all_evals.py b/evals/run_all_evals.py
new file mode 100644
index 00000000..05a671b1
--- /dev/null
+++ b/evals/run_all_evals.py
@@ -0,0 +1,82 @@
+import asyncio
+import os
+import importlib
+import inspect
+
+# A simple logger to collect logs for the evals
+class SimpleLogger:  # echoes each message to stdout and keeps it in memory for get_logs()
+    def __init__(self):
+        self._logs = []  # messages in the order they were logged
+    def info(self, message):
+        self._logs.append(message)
+        print("INFO:", message)
+    def error(self, message):
+        self._logs.append(message)  # stored in the same list as info messages
+        print("ERROR:", message)
+    def get_logs(self):
+        return self._logs  # the live list, not a copy
+
+async def run_all_evals():
+    """
+    Discover and run every eval under the act/, extract/ and observe/ subdirectories.
+
+    An eval is an async function named after its module file. Each eval gets its own
+    SimpleLogger, and an eval that raises is recorded as a failure instead of aborting the run.
+
+    Returns:
+        dict: Maps each module path (e.g. "evals.act.google_jobs") to that eval's result dict.
+    """
+    eval_functions = {}
+    # The base path is the directory in which this file resides (i.e. the evals folder)
+    base_path = os.path.dirname(__file__)
+    # Only process evals from these subdirectories
+    allowed_dirs = {"act", "extract", "observe"}
+
+    for root, _, files in os.walk(base_path):
+        rel_path = os.path.relpath(root, base_path)
+        # Skips the base folder itself (".") as well as any non-eval subdirectory.
+        if rel_path.split(os.sep)[0] not in allowed_dirs:
+            continue
+
+        package = f"evals.{rel_path.replace(os.sep, '.')}"
+        for file in files:
+            if not file.endswith(".py") or file in ("__init__.py", "run_all_evals.py"):
+                continue
+            # The convention is that the main eval function has the same name as the file
+            func_name = file[:-3]
+            module_path = f"{package}.{func_name}"
+            try:
+                module = importlib.import_module(module_path)
+            except Exception as e:
+                print(f"Skipping module {module_path} due to import error: {e}")
+                continue
+            func = getattr(module, func_name, None)
+            if inspect.iscoroutinefunction(func):
+                eval_functions[module_path] = func
+
+    print("Collected eval functions:")
+    for name in eval_functions:
+        print(" -", name)
+
+    results = {}
+    model_name = "gpt-4o"  # default model name to pass
+
+    for module_path, func in eval_functions.items():
+        # A fresh logger per eval keeps one eval's logs out of another eval's result.
+        logger = SimpleLogger()
+        # Evals that accept "use_text_extract" get an explicit default of False.
+        extra = {"use_text_extract": False} if "use_text_extract" in inspect.signature(func).parameters else {}
+        try:
+            results[module_path] = await func(model_name, logger, **extra)
+        except Exception as e:
+            # Keep going: one broken eval must not discard the results already collected.
+            print(f"Eval {module_path} raised: {e}")
+            results[module_path] = {"_success": False, "error": str(e), "logs": logger.get_logs()}
+
+    return results
+
+if __name__ == "__main__":
+    final_results = asyncio.run(run_all_evals())  # drive the async runner to completion
+    print("Evaluation Results:")
+    for module, res in final_results.items():
+        print(f"{module}: {res}")  # one line per eval module path
\ No newline at end of file
diff --git a/evals/utils.py b/evals/utils.py
new file mode 100644
index 00000000..a2854fb1
--- /dev/null
+++ b/evals/utils.py
@@ -0,0 +1,8 @@
+import difflib
+
+def compare_strings(a: str, b: str) -> float:
+    """
+    Return how similar two strings are, as the ratio computed by difflib.SequenceMatcher.
+    No custom junk function is supplied; identical strings score 1.0 (including two empty strings).
+    """
+    return difflib.SequenceMatcher(a=a, b=b).ratio()
\ No newline at end of file
diff --git a/stagehand/client.py b/stagehand/client.py
index a0168463..218171de 100644
--- a/stagehand/client.py
+++ b/stagehand/client.py
@@ -89,10 +89,10 @@ def __init__(
self.verbose = verbose
self.httpx_client = httpx_client
self.timeout_settings = timeout_settings or httpx.Timeout(
- connect=90.0,
- read=90.0,
- write=90.0,
- pool=90.0,
+ connect=180.0,
+ read=180.0,
+ write=180.0,
+ pool=180.0,
)
self.streamed_response = True # Default to True for streamed responses
@@ -315,7 +315,7 @@ async def _execute(self, method: str, payload: Dict[str, Any]) -> Any:
headers["x-model-api-key"] = self.model_api_key
client = self.httpx_client or httpx.AsyncClient(timeout=self.timeout_settings)
-
+        print(f"Executing {method} with payload: {payload}")  # headers carry API keys (x-model-api-key); never print them
async with client:
async with client.stream(
"POST",