evals/index.eval.ts: 252 changes (121 additions, 131 deletions)
@@ -23,7 +23,13 @@ import { generateExperimentName } from "./utils";
import { exactMatch, errorMatch } from "./scoring";
import { tasksByName, tasksConfig, getModelList } from "./taskConfig";
import { Eval, wrapAISDKModel, wrapOpenAI } from "braintrust";
import { SummaryResult, Testcase, EvalInput } from "@/types/evals";
import {
SummaryResult,
Testcase,
EvalInput,
ErrorType,
EvalOutput,
} from "@/types/evals";
import { EvalLogger } from "./logger";
import { AvailableModel, LLMClient } from "@browserbasehq/stagehand";
import { env } from "./env";
@@ -46,6 +52,14 @@ import { buildOnlineMind2WebTestcases } from "./suites/onlineMind2Web";

dotenv.config();

process.on("uncaughtException", (err) => {
console.error("[eval-runner] Uncaught exception:", err);
});

process.on("unhandledRejection", (reason) => {
console.error("[eval-runner] Unhandled rejection:", reason);
});

/**
* Read max concurrency and trial count from environment variables set in args.ts.
* Fall back to defaults (20 and 5) if they're not provided.
@@ -107,20 +121,6 @@ const generateFilteredTestcases = (): Testcase[] => {
);
}

// Check for dataset filter from environment
const datasetFilter = process.env.EVAL_DATASET;

// If using external benchmarks via dataset filter, override category to use agent models
if (
datasetFilter &&
["gaia", "webvoyager", "webbench", "osworld"].includes(datasetFilter)
) {
effectiveCategory = "external_agent_benchmarks";
console.log(
`Using dataset filter "${datasetFilter}", switching to external_agent_benchmarks category.`,
);
}

// Dynamically determine the MODELS based on the effective category
const currentModels = getModelList(effectiveCategory);

@@ -130,18 +130,15 @@ const generateFilteredTestcases = (): Testcase[] => {
);

// Special handling: fan out GAIA dataset for agent/gaia
const isGAIATaskIncluded =
taskNamesToRun.includes("agent/gaia") || datasetFilter === "gaia";
const isGAIATaskIncluded = taskNamesToRun.includes("agent/gaia");
// Special handling: fan out WebVoyager dataset for agent/webvoyager
const isWebVoyagerTaskIncluded =
taskNamesToRun.includes("agent/webvoyager") ||
datasetFilter === "webvoyager";
const isWebVoyagerTaskIncluded = taskNamesToRun.includes("agent/webvoyager");
// Special handling: fan out WebBench dataset for agent/webbench
const isWebBenchTaskIncluded =
taskNamesToRun.includes("agent/webbench") || datasetFilter === "webbench";
const isWebBenchTaskIncluded = taskNamesToRun.includes("agent/webbench");

// Special handling: fan out OSWorld dataset for agent/osworld
const isOSWorldTaskIncluded =
taskNamesToRun.includes("agent/osworld") || datasetFilter === "osworld";
const isOSWorldTaskIncluded = taskNamesToRun.includes("agent/osworld");

// Special handling: fan out Mind2Web dataset for agent/onlineMind2Web
const isMind2WebTaskIncluded = taskNamesToRun.includes(
"agent/onlineMind2Web",
@@ -150,100 +147,57 @@ const generateFilteredTestcases = (): Testcase[] => {
let allTestcases: Testcase[] = [];

// Only include GAIA if no dataset filter or if gaia is selected
if (isGAIATaskIncluded && (!datasetFilter || datasetFilter === "gaia")) {
if (isGAIATaskIncluded) {
taskNamesToRun = taskNamesToRun.filter((t) => t !== "agent/gaia");
allTestcases.push(...buildGAIATestcases(currentModels));
} else if (
taskNamesToRun.includes("agent/gaia") &&
datasetFilter &&
datasetFilter !== "gaia"
) {
// Remove GAIA from tasks to run if dataset filter excludes it
taskNamesToRun = taskNamesToRun.filter((t) => t !== "agent/gaia");
}

// Only include WebVoyager if no dataset filter or if webvoyager is selected
if (
isWebVoyagerTaskIncluded &&
(!datasetFilter || datasetFilter === "webvoyager")
) {
if (isWebVoyagerTaskIncluded) {
taskNamesToRun = taskNamesToRun.filter((t) => t !== "agent/webvoyager");
allTestcases.push(...buildWebVoyagerTestcases(currentModels));
} else if (
taskNamesToRun.includes("agent/webvoyager") &&
datasetFilter &&
datasetFilter !== "webvoyager"
) {
// Remove WebVoyager from tasks to run if dataset filter excludes it
taskNamesToRun = taskNamesToRun.filter((t) => t !== "agent/webvoyager");
}

// Only include WebBench if no dataset filter or if webbench is selected
if (
isWebBenchTaskIncluded &&
(!datasetFilter || datasetFilter === "webbench")
) {
if (isWebBenchTaskIncluded) {
taskNamesToRun = taskNamesToRun.filter((t) => t !== "agent/webbench");
allTestcases.push(...buildWebBenchTestcases(currentModels));
} else if (
taskNamesToRun.includes("agent/webbench") &&
datasetFilter &&
datasetFilter !== "webbench"
) {
// Remove WebBench from tasks to run if dataset filter excludes it
taskNamesToRun = taskNamesToRun.filter((t) => t !== "agent/webbench");
}

// Only include OSWorld if no dataset filter or if osworld is selected
if (
isOSWorldTaskIncluded &&
(!datasetFilter || datasetFilter === "osworld")
) {
if (isOSWorldTaskIncluded) {
taskNamesToRun = taskNamesToRun.filter((t) => t !== "agent/osworld");
allTestcases.push(...buildOSWorldTestcases(currentModels));
} else if (
taskNamesToRun.includes("agent/osworld") &&
datasetFilter &&
datasetFilter !== "osworld"
) {
// Remove OSWorld from tasks to run if dataset filter excludes it
taskNamesToRun = taskNamesToRun.filter((t) => t !== "agent/osworld");
}

// Only include Mind2Web if no dataset filter or if onlineMind2Web is selected
if (
isMind2WebTaskIncluded &&
(!datasetFilter || datasetFilter === "onlineMind2Web")
) {
if (isMind2WebTaskIncluded) {
taskNamesToRun = taskNamesToRun.filter((t) => t !== "agent/onlineMind2Web");
allTestcases.push(...buildOnlineMind2WebTestcases(currentModels));
} else if (
isMind2WebTaskIncluded &&
datasetFilter &&
datasetFilter !== "onlineMind2Web"
) {
// Remove Mind2Web from tasks to run if dataset filter excludes it
taskNamesToRun = taskNamesToRun.filter((t) => t !== "agent/onlineMind2Web");
}

// Create a list of all remaining testcases using the determined task names and models
const regularTestcases = currentModels.flatMap((model) =>
taskNamesToRun.map((testName) => ({
input: { name: testName, modelName: model as AvailableModel },
name: testName,
tags: [
model,
testName,
...(tasksConfig.find((t) => t.name === testName)?.categories || []).map(
(x) => `category/${x}`,
),
],
metadata: {
model: model as AvailableModel,
test: testName,
},
expected: true,
})),
taskNamesToRun.map((testName) => {
const taskCategories =
tasksConfig.find((t) => t.name === testName)?.categories || [];
return {
input: { name: testName, modelName: model as AvailableModel },
name: testName,
tags: [
model,
// Only include primary category as tag
taskCategories.length > 0 ? taskCategories[0] : "uncategorized",
],
metadata: {
model: model as AvailableModel,
test: testName,
category: taskCategories[0],
categories: taskCategories, // Keep all categories in metadata for filtering
},
expected: true,
};
}),
);

allTestcases = [...allTestcases, ...regularTestcases];
Expand Down Expand Up @@ -312,42 +266,27 @@ const generateFilteredTestcases = (): Testcase[] => {
const logger = new EvalLogger();
try {
// Dynamically import the task based on its name
const taskModulePath = path.join(
__dirname,
"tasks",
`${input.name}.ts`,
);
const basePath = path.join(__dirname, "tasks", `${input.name}`);
const candidatePaths = [`${basePath}.js`, `${basePath}.ts`];

// Check if file exists at direct path
let taskModule;
try {
// First try to import directly (for backward compatibility)
taskModule = await import(taskModulePath);
} catch (error) {
if (input.name.includes("/")) {
// If the name includes a path separator, try to import from subdirectory
const subDirPath = path.join(
__dirname,
"tasks",
`${input.name}.ts`,
);
try {
taskModule = await import(subDirPath);
} catch (subError) {
throw new StagehandEvalError(
`Failed to import task module for ${input.name}. Tried paths:\n` +
`- ${taskModulePath}\n` +
`- ${subDirPath}\n` +
`Error: ${subError.message}`,
);
}
} else {
throw new StagehandEvalError(
`Failed to import task module for ${input.name} at path ${taskModulePath}: ${error.message}`,
);
let lastError: unknown;
for (const candidate of candidatePaths) {
try {
taskModule = await import(candidate);
break;
} catch (err) {
lastError = err;
}
}

if (!taskModule) {
const tried = candidatePaths.join("\n- ");
throw new StagehandEvalError(
`Failed to import task module for ${input.name}. Tried paths:\n- ${tried}\nError: ${(lastError as Error)?.message}`,
);
}

// Extract the task function
const taskName = input.name.includes("/")
? input.name.split("/").pop() // Get the last part of the path for nested tasks
@@ -362,9 +301,6 @@ const generateFilteredTestcases = (): Testcase[] => {
}

// Execute the task
console.log(
`🏃 Running eval: ${input.name} with model: ${input.modelName}`,
);
let taskInput: Awaited<ReturnType<typeof initStagehand>>;

if (USE_API) {
@@ -426,6 +362,7 @@ const generateFilteredTestcases = (): Testcase[] => {
}
// Pass full EvalInput to the task (data-driven params available via input.params)
let result;
let isStagehandClosed = false;
try {
result = await taskFunction({ ...taskInput, input });
// Log result to console
@@ -435,31 +372,80 @@ const generateFilteredTestcases = (): Testcase[] => {
console.log(`❌ ${input.name}: Failed`);
}
} finally {
await taskInput.stagehand.close();
// Only close if not already closed
if (taskInput.stagehand && !isStagehandClosed) {
try {
await taskInput.stagehand.close();
isStagehandClosed = true;
} catch (closeError) {
console.warn("Error closing stagehand:", closeError);
}
}
}
return result;
} catch (error) {
// Categorize the error
let errorType = ErrorType.UNKNOWN;
const errorMessage =
error instanceof Error ? error.message : String(error);

if (error instanceof Error) {
if (
error.message.includes("timeout") ||
error.message.includes("Timeout")
) {
errorType = ErrorType.TIMEOUT;
} else if (
error.message.includes("network") ||
error.message.includes("fetch")
) {
errorType = ErrorType.NETWORK;
} else if (
error.message.includes("parse") ||
error.message.includes("JSON")
) {
errorType = ErrorType.PARSING_ERROR;
} else if (
error.message.includes("init") ||
error.message.includes("setup")
) {
errorType = ErrorType.SETUP_ERROR;
}
}

// Log any errors that occur during task execution
console.error(`❌ ${input.name}: Error - ${error}`);
console.error(`❌ ${input.name}: ${errorType} - ${errorMessage}`);
logger.error({
message: `Error in task ${input.name}`,
level: 0,
auxiliary: {
error: {
value: error.message,
value: errorMessage,
type: "string",
},
error_type: {
value: errorType,
type: "string",
},
trace: {
value: error.stack,
value: error instanceof Error ? error.stack : "",
type: "string",
},
},
});
return {

const output: EvalOutput = {
_success: false,
error: JSON.parse(JSON.stringify(error, null, 2)),
error_type: errorType,
error_message: errorMessage,
error_stack: error instanceof Error ? error.stack : undefined,
logs: logger.getLogs(),
debugUrl: "",
sessionUrl: "",
};

return output;
}
},
// Use the scoring functions defined above
@@ -475,6 +461,10 @@ const generateFilteredTestcases = (): Testcase[] => {
? { _success: result.output }
: result.output;

// The full output object (including error_type, error_message, etc.)
// is already captured in result.output and will be visible in Braintrust
// We don't need to duplicate it in metadata

return {
input: result.input,
output,
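
Note: the diff imports ErrorType and EvalOutput from "@/types/evals", whose definitions are not part of the changed file. The sketch below is only a guess at minimal shapes consistent with how they are used in the catch block above; the member names and string values are assumptions, not copied from the repository.

// Hypothetical error categories matching the checks in the catch block above.
export enum ErrorType {
  TIMEOUT = "timeout",
  NETWORK = "network",
  PARSING_ERROR = "parsing_error",
  SETUP_ERROR = "setup_error",
  UNKNOWN = "unknown",
}

// Hypothetical shape of the object returned from a failed eval run. The index
// signature allows individual tasks to attach their own result fields.
export interface EvalOutput {
  _success: boolean;
  error_type?: ErrorType;
  error_message?: string;
  error_stack?: string;
  logs?: unknown[];
  debugUrl?: string;
  sessionUrl?: string;
  [key: string]: unknown;
}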