diff --git a/evals/index.eval.ts b/evals/index.eval.ts index 3fc03fa05..69e25c5c6 100644 --- a/evals/index.eval.ts +++ b/evals/index.eval.ts @@ -23,7 +23,13 @@ import { generateExperimentName } from "./utils"; import { exactMatch, errorMatch } from "./scoring"; import { tasksByName, tasksConfig, getModelList } from "./taskConfig"; import { Eval, wrapAISDKModel, wrapOpenAI } from "braintrust"; -import { SummaryResult, Testcase, EvalInput } from "@/types/evals"; +import { + SummaryResult, + Testcase, + EvalInput, + ErrorType, + EvalOutput, +} from "@/types/evals"; import { EvalLogger } from "./logger"; import { AvailableModel, LLMClient } from "@browserbasehq/stagehand"; import { env } from "./env"; @@ -46,6 +52,14 @@ import { buildOnlineMind2WebTestcases } from "./suites/onlineMind2Web"; dotenv.config(); +process.on("uncaughtException", (err) => { + console.error("[eval-runner] Uncaught exception:", err); +}); + +process.on("unhandledRejection", (reason) => { + console.error("[eval-runner] Unhandled rejection:", reason); +}); + /** * Read max concurrency and trial count from environment variables set in args.ts. * Fallback to defaults (20 and 5) if they're not provided. @@ -107,20 +121,6 @@ const generateFilteredTestcases = (): Testcase[] => { ); } - // Check for dataset filter from environment - const datasetFilter = process.env.EVAL_DATASET; - - // If using external benchmarks via dataset filter, override category to use agent models - if ( - datasetFilter && - ["gaia", "webvoyager", "webbench", "osworld"].includes(datasetFilter) - ) { - effectiveCategory = "external_agent_benchmarks"; - console.log( - `Using dataset filter "${datasetFilter}", switching to external_agent_benchmarks category.`, - ); - } - // Dynamically determine the MODELS based on the effective category const currentModels = getModelList(effectiveCategory); @@ -130,18 +130,15 @@ const generateFilteredTestcases = (): Testcase[] => { ); // Special handling: fan out GAIA dataset for agent/gaia - const isGAIATaskIncluded = - taskNamesToRun.includes("agent/gaia") || datasetFilter === "gaia"; + const isGAIATaskIncluded = taskNamesToRun.includes("agent/gaia"); // Special handling: fan out WebVoyager dataset for agent/webvoyager - const isWebVoyagerTaskIncluded = - taskNamesToRun.includes("agent/webvoyager") || - datasetFilter === "webvoyager"; + const isWebVoyagerTaskIncluded = taskNamesToRun.includes("agent/webvoyager"); // Special handling: fan out WebBench dataset for agent/webbench - const isWebBenchTaskIncluded = - taskNamesToRun.includes("agent/webbench") || datasetFilter === "webbench"; + const isWebBenchTaskIncluded = taskNamesToRun.includes("agent/webbench"); + // Special handling: fan out OSWorld dataset for agent/osworld - const isOSWorldTaskIncluded = - taskNamesToRun.includes("agent/osworld") || datasetFilter === "osworld"; + const isOSWorldTaskIncluded = taskNamesToRun.includes("agent/osworld"); + // Special handling: fan out Mind2Web dataset for agent/onlineMind2Web const isMind2WebTaskIncluded = taskNamesToRun.includes( "agent/onlineMind2Web", @@ -150,100 +147,57 @@ const generateFilteredTestcases = (): Testcase[] => { let allTestcases: Testcase[] = []; // Only include GAIA if no dataset filter or if gaia is selected - if (isGAIATaskIncluded && (!datasetFilter || datasetFilter === "gaia")) { + if (isGAIATaskIncluded) { taskNamesToRun = taskNamesToRun.filter((t) => t !== "agent/gaia"); allTestcases.push(...buildGAIATestcases(currentModels)); - } else if ( - taskNamesToRun.includes("agent/gaia") && - datasetFilter 
&& - datasetFilter !== "gaia" - ) { - // Remove GAIA from tasks to run if dataset filter excludes it - taskNamesToRun = taskNamesToRun.filter((t) => t !== "agent/gaia"); } // Only include WebVoyager if no dataset filter or if webvoyager is selected - if ( - isWebVoyagerTaskIncluded && - (!datasetFilter || datasetFilter === "webvoyager") - ) { + if (isWebVoyagerTaskIncluded) { taskNamesToRun = taskNamesToRun.filter((t) => t !== "agent/webvoyager"); allTestcases.push(...buildWebVoyagerTestcases(currentModels)); - } else if ( - taskNamesToRun.includes("agent/webvoyager") && - datasetFilter && - datasetFilter !== "webvoyager" - ) { - // Remove WebVoyager from tasks to run if dataset filter excludes it - taskNamesToRun = taskNamesToRun.filter((t) => t !== "agent/webvoyager"); } // Only include WebBench if no dataset filter or if webbench is selected - if ( - isWebBenchTaskIncluded && - (!datasetFilter || datasetFilter === "webbench") - ) { + if (isWebBenchTaskIncluded) { taskNamesToRun = taskNamesToRun.filter((t) => t !== "agent/webbench"); allTestcases.push(...buildWebBenchTestcases(currentModels)); - } else if ( - taskNamesToRun.includes("agent/webbench") && - datasetFilter && - datasetFilter !== "webbench" - ) { - // Remove WebBench from tasks to run if dataset filter excludes it - taskNamesToRun = taskNamesToRun.filter((t) => t !== "agent/webbench"); } // Only include OSWorld if no dataset filter or if osworld is selected - if ( - isOSWorldTaskIncluded && - (!datasetFilter || datasetFilter === "osworld") - ) { + if (isOSWorldTaskIncluded) { taskNamesToRun = taskNamesToRun.filter((t) => t !== "agent/osworld"); allTestcases.push(...buildOSWorldTestcases(currentModels)); - } else if ( - taskNamesToRun.includes("agent/osworld") && - datasetFilter && - datasetFilter !== "osworld" - ) { - // Remove OSWorld from tasks to run if dataset filter excludes it - taskNamesToRun = taskNamesToRun.filter((t) => t !== "agent/osworld"); } // Only include Mind2Web if no dataset filter or if onlineMind2Web is selected - if ( - isMind2WebTaskIncluded && - (!datasetFilter || datasetFilter === "onlineMind2Web") - ) { + if (isMind2WebTaskIncluded) { taskNamesToRun = taskNamesToRun.filter((t) => t !== "agent/onlineMind2Web"); allTestcases.push(...buildOnlineMind2WebTestcases(currentModels)); - } else if ( - isMind2WebTaskIncluded && - datasetFilter && - datasetFilter !== "onlineMind2Web" - ) { - // Remove Mind2Web from tasks to run if dataset filter excludes it - taskNamesToRun = taskNamesToRun.filter((t) => t !== "agent/onlineMind2Web"); } // Create a list of all remaining testcases using the determined task names and models const regularTestcases = currentModels.flatMap((model) => - taskNamesToRun.map((testName) => ({ - input: { name: testName, modelName: model as AvailableModel }, - name: testName, - tags: [ - model, - testName, - ...(tasksConfig.find((t) => t.name === testName)?.categories || []).map( - (x) => `category/${x}`, - ), - ], - metadata: { - model: model as AvailableModel, - test: testName, - }, - expected: true, - })), + taskNamesToRun.map((testName) => { + const taskCategories = + tasksConfig.find((t) => t.name === testName)?.categories || []; + return { + input: { name: testName, modelName: model as AvailableModel }, + name: testName, + tags: [ + model, + // Only include primary category as tag + taskCategories.length > 0 ? 
taskCategories[0] : "uncategorized", + ], + metadata: { + model: model as AvailableModel, + test: testName, + category: taskCategories[0], + categories: taskCategories, // Keep all categories in metadata for filtering + }, + expected: true, + }; + }), ); allTestcases = [...allTestcases, ...regularTestcases]; @@ -312,42 +266,27 @@ const generateFilteredTestcases = (): Testcase[] => { const logger = new EvalLogger(); try { // Dynamically import the task based on its name - const taskModulePath = path.join( - __dirname, - "tasks", - `${input.name}.ts`, - ); + const basePath = path.join(__dirname, "tasks", `${input.name}`); + const candidatePaths = [`${basePath}.js`, `${basePath}.ts`]; - // Check if file exists at direct path let taskModule; - try { - // First try to import directly (for backward compatibility) - taskModule = await import(taskModulePath); - } catch (error) { - if (input.name.includes("/")) { - // If the name includes a path separator, try to import from subdirectory - const subDirPath = path.join( - __dirname, - "tasks", - `${input.name}.ts`, - ); - try { - taskModule = await import(subDirPath); - } catch (subError) { - throw new StagehandEvalError( - `Failed to import task module for ${input.name}. Tried paths:\n` + - `- ${taskModulePath}\n` + - `- ${subDirPath}\n` + - `Error: ${subError.message}`, - ); - } - } else { - throw new StagehandEvalError( - `Failed to import task module for ${input.name} at path ${taskModulePath}: ${error.message}`, - ); + let lastError: unknown; + for (const candidate of candidatePaths) { + try { + taskModule = await import(candidate); + break; + } catch (err) { + lastError = err; } } + if (!taskModule) { + const tried = candidatePaths.join("\n- "); + throw new StagehandEvalError( + `Failed to import task module for ${input.name}. Tried paths:\n- ${tried}\nError: ${(lastError as Error)?.message}`, + ); + } + // Extract the task function const taskName = input.name.includes("/") ? input.name.split("/").pop() // Get the last part of the path for nested tasks @@ -362,9 +301,6 @@ const generateFilteredTestcases = (): Testcase[] => { } // Execute the task - console.log( - `🏃 Running eval: ${input.name} with model: ${input.modelName}`, - ); let taskInput: Awaited>; if (USE_API) { @@ -426,6 +362,7 @@ const generateFilteredTestcases = (): Testcase[] => { } // Pass full EvalInput to the task (data-driven params available via input.params) let result; + let isStagehandClosed = false; try { result = await taskFunction({ ...taskInput, input }); // Log result to console @@ -435,31 +372,80 @@ const generateFilteredTestcases = (): Testcase[] => { console.log(`❌ ${input.name}: Failed`); } } finally { - await taskInput.stagehand.close(); + // Only close if not already closed + if (taskInput.stagehand && !isStagehandClosed) { + try { + await taskInput.stagehand.close(); + isStagehandClosed = true; + } catch (closeError) { + console.warn("Error closing stagehand:", closeError); + } + } } return result; } catch (error) { + // Categorize the error + let errorType = ErrorType.UNKNOWN; + const errorMessage = + error instanceof Error ? 
error.message : String(error); + + if (error instanceof Error) { + if ( + error.message.includes("timeout") || + error.message.includes("Timeout") + ) { + errorType = ErrorType.TIMEOUT; + } else if ( + error.message.includes("network") || + error.message.includes("fetch") + ) { + errorType = ErrorType.NETWORK; + } else if ( + error.message.includes("parse") || + error.message.includes("JSON") + ) { + errorType = ErrorType.PARSING_ERROR; + } else if ( + error.message.includes("init") || + error.message.includes("setup") + ) { + errorType = ErrorType.SETUP_ERROR; + } + } + // Log any errors that occur during task execution - console.error(`❌ ${input.name}: Error - ${error}`); + console.error(`❌ ${input.name}: ${errorType} - ${errorMessage}`); logger.error({ message: `Error in task ${input.name}`, level: 0, auxiliary: { error: { - value: error.message, + value: errorMessage, + type: "string", + }, + error_type: { + value: errorType, type: "string", }, trace: { - value: error.stack, + value: error instanceof Error ? error.stack : "", type: "string", }, }, }); - return { + + const output: EvalOutput = { _success: false, error: JSON.parse(JSON.stringify(error, null, 2)), + error_type: errorType, + error_message: errorMessage, + error_stack: error instanceof Error ? error.stack : undefined, logs: logger.getLogs(), + debugUrl: "", + sessionUrl: "", }; + + return output; } }, // Use the scoring functions defined above @@ -475,6 +461,10 @@ const generateFilteredTestcases = (): Testcase[] => { ? { _success: result.output } : result.output; + // The full output object (including error_type, error_message, etc.) + // is already captured in result.output and will be visible in Braintrust + // We don't need to duplicate it in metadata + return { input: result.input, output, diff --git a/evals/tasks/agent/webbench.ts b/evals/tasks/agent/webbench.ts index 6e8feb434..3c3970055 100644 --- a/evals/tasks/agent/webbench.ts +++ b/evals/tasks/agent/webbench.ts @@ -126,27 +126,8 @@ export const webbench: EvalFunction = async ({ logs: logger.getLogs(), }; } catch (error) { - logger.error({ - category: "webbench", - level: 0, - message: `Unhandled error in WebBench task`, - auxiliary: { - error: { - value: error instanceof Error ? error.message : String(error), - type: "string", - }, - trace: { - value: error instanceof Error && error.stack ? 
error.stack : "",
-          type: "string",
-        },
-      },
-    });
-    return {
-      _success: false,
-      error,
-      debugUrl,
-      sessionUrl,
-      logs: logger.getLogs(),
-    };
+    // Let the error propagate - the parent runner will handle cleanup
+    console.error(error);
+    throw error;
   }
 };
diff --git a/evals/tasks/agent/webvoyager.ts b/evals/tasks/agent/webvoyager.ts
index 1a9041839..c34c0310f 100644
--- a/evals/tasks/agent/webvoyager.ts
+++ b/evals/tasks/agent/webvoyager.ts
@@ -208,12 +208,8 @@ Today's date is ${new Date().toLocaleDateString()}`;
       logs: logger.getLogs(),
     };
   } catch (error) {
-    return {
-      _success: false,
-      error,
-      debugUrl,
-      sessionUrl,
-      logs: logger.getLogs(),
-    };
+    // Let the error propagate - the parent runner will handle cleanup
+    console.error(error);
+    throw error;
   }
 };
diff --git a/types/evals.ts b/types/evals.ts
index 08474dd89..f3b8cbc0c 100644
--- a/types/evals.ts
+++ b/types/evals.ts
@@ -17,15 +17,37 @@ export type StagehandInitResult = {
   agent: AgentInstance;
 };
 
-export type EvalFunction = (
-  taskInput: StagehandInitResult & { input: EvalInput },
-) => Promise<{
+export enum ErrorType {
+  TIMEOUT = "timeout",
+  NETWORK = "network",
+  AGENT_FAILURE = "agent_failure",
+  EVALUATION_ERROR = "evaluation_error",
+  SETUP_ERROR = "setup_error",
+  PARSING_ERROR = "parsing_error",
+  ANTIBOT = "bot_detected",
+  UNKNOWN = "unknown",
+}
+
+export interface EvalOutput {
   _success: boolean;
   logs: LogLine[];
   debugUrl: string;
   sessionUrl: string;
   error?: unknown;
-}>;
+  error_type?: ErrorType;
+  error_message?: string;
+  error_stack?: string;
+  execution_time?: number;
+  agent_steps?: number;
+  final_answer?: string;
+  reasoning?: string;
+  observations?: string | unknown; // Allow both string and arrays for backward compatibility
+  [key: string]: unknown; // Allow additional fields for flexibility
+}
+
+export type EvalFunction = (
+  taskInput: StagehandInitResult & { input: EvalInput },
+) => Promise<EvalOutput>;
 
 export const EvalCategorySchema = z.enum([
   "observe",
@@ -49,16 +71,23 @@ export interface EvalInput {
   params?: Record;
 }
 
+export interface TestcaseMetadata {
+  model: AvailableModel;
+  test: string;
+  category?: string;
+  dataset?: string;
+  dataset_id?: string;
+  dataset_level?: string | number;
+  dataset_category?: string;
+  [key: string]: unknown;
+}
+
 export interface Testcase
-  extends EvalCase<
-    EvalInput,
-    unknown,
-    { model: AvailableModel; test: string; categories?: string[] }
-  > {
+  extends EvalCase<EvalInput, unknown, TestcaseMetadata> {
   input: EvalInput;
   name: string;
   tags: string[];
-  metadata: { model: AvailableModel; test: string; categories?: string[] };
+  metadata: TestcaseMetadata;
   expected: unknown;
 }
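
// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the diff above): a minimal eval task written
// against the new EvalFunction / EvalOutput contract. On success it returns a
// structured EvalOutput; on failure it simply rethrows and lets the parent
// runner in index.eval.ts close Stagehand and attach error_type / error_message
// / error_stack, mirroring the webbench and webvoyager changes. The task name
// "exampleTask", the target URL, and the destructured fields (stagehand, logger,
// debugUrl, sessionUrl) are assumptions based on how existing tasks are written,
// not identifiers introduced by this PR.
import { EvalFunction } from "@/types/evals";

export const exampleTask: EvalFunction = async ({
  stagehand,
  logger,
  debugUrl,
  sessionUrl,
}) => {
  try {
    // Hypothetical target page for the sketch.
    await stagehand.page.goto("https://example.com");

    return {
      _success: true,
      logs: logger.getLogs(),
      debugUrl,
      sessionUrl,
    };
  } catch (error) {
    // Let the error propagate - the parent runner handles cleanup and
    // error categorization (ErrorType) before reporting to Braintrust.
    console.error(error);
    throw error;
  }
};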
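// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the diff above): the substring-based error
// categorization that index.eval.ts now performs inline, factored into a small
// helper so it could be unit-tested in isolation. The helper name
// "categorizeError" is hypothetical, and this is a lower-cased variant of the
// matching rules added in the runner (timeout, network/fetch, parse/JSON,
// init/setup), with everything else falling back to ErrorType.UNKNOWN.
import { ErrorType } from "@/types/evals";

export function categorizeError(error: unknown): ErrorType {
  if (!(error instanceof Error)) {
    return ErrorType.UNKNOWN;
  }
  const message = error.message.toLowerCase();
  if (message.includes("timeout")) return ErrorType.TIMEOUT;
  if (message.includes("network") || message.includes("fetch"))
    return ErrorType.NETWORK;
  if (message.includes("parse") || message.includes("json"))
    return ErrorType.PARSING_ERROR;
  if (message.includes("init") || message.includes("setup"))
    return ErrorType.SETUP_ERROR;
  return ErrorType.UNKNOWN;
}

// Example: categorizeError(new Error("Navigation timeout of 30000 ms exceeded"))
// returns ErrorType.TIMEOUT.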
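// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the diff above): the shape of a Testcase
// produced by generateFilteredTestcases under the new tagging scheme - the tags
// array carries the model plus only the primary category (or "uncategorized"),
// while the full category list is kept in metadata for filtering. The task name
// "extract/example", the model "gpt-4o", and the category values are
// hypothetical placeholders.
import { Testcase } from "@/types/evals";
import { AvailableModel } from "@browserbasehq/stagehand";

const model = "gpt-4o" as AvailableModel;

const exampleTestcase: Testcase = {
  input: { name: "extract/example", modelName: model },
  name: "extract/example",
  tags: [model, "extract"], // model + primary category only
  metadata: {
    model,
    test: "extract/example",
    category: "extract",
    categories: ["extract", "regression"], // all categories kept for filtering
  },
  expected: true,
};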