-
Notifications
You must be signed in to change notification settings - Fork 1.5k
feat(verifier): add verifier evaluator shell and types #2157
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
ca4d3f0
2fb8bb3
d8b6dc3
017c70e
3f4b770
c44a3c7
2f81239
9b71108
f31fd98
1aa1578
99cf719
4825e92
36e9342
84042d4
3d8c324
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,5 @@ | ||
| --- | ||
| "@browserbasehq/stagehand": patch | ||
| --- | ||
|
|
||
| Add verifier trajectory, rubric, and evaluation-result types with normalized public naming. |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,28 @@ | ||
| /** | ||
| * Public re-exports for the verifier subsystem. | ||
| */ | ||
| export type { | ||
| AgentEvidence, | ||
| AgentEvidenceModality, | ||
| CriterionScore, | ||
| EvaluationResult, | ||
| FirstPointOfFailure, | ||
| ProbeEvidence, | ||
| Rubric, | ||
| RubricCriterion, | ||
| TaskSpec, | ||
| TaskValidity, | ||
| ToolOutput, | ||
| Trajectory, | ||
| TrajectoryStatus, | ||
| TrajectoryStep, | ||
| TrajectoryUsage, | ||
| Verifier, | ||
| VerifierFinding, | ||
| VerifierRawSteps, | ||
| } from "./types.js"; | ||
| export { | ||
| loadTrajectoryFromDisk, | ||
| nextResultFilename, | ||
| normalizeRubric, | ||
| } from "./trajectory.js"; |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,189 @@ | ||
| import type { | ||
| AgentEvidenceModality, | ||
| ProbeEvidence, | ||
| Rubric, | ||
| Trajectory, | ||
| TrajectoryStep, | ||
| } from "./types.js"; | ||
|
|
||
/**
 * A single rubric criterion exactly as it arrives from dataset or generated
 * JSON, before any validation. Every field is `unknown` until checked, and the
 * point cap may be supplied under either the camelCase (`maxPoints`) or the
 * snake_case (`max_points`) key.
 */
interface RawRubricCriterion {
  criterion: unknown;
  description: unknown;
  condition?: unknown;
  maxPoints?: unknown;
  max_points?: unknown;
}
|
|
||
/**
 * Unvalidated rubric container. `items` stays `unknown` so that callers must
 * confirm it is an array at runtime before iterating.
 */
interface RawRubric {
  items?: unknown;
}
|
|
||
/**
 * Convert dataset or generated rubric JSON into the public Stagehand shape.
 * Snake-case dataset fields are accepted here so serialized quirks do not leak
 * into the canonical rubric type.
 *
 * @param rubric raw rubric value; `null`/`undefined` means "no rubric".
 * @returns the normalized rubric, or `undefined` when `rubric` is nullish.
 * @throws TypeError when `rubric` is not an object, has no `items` array, or
 *   any item is not an object, lacks a non-empty `criterion`/`description`
 *   string, or lacks a finite numeric `maxPoints`/`max_points`.
 */
export function normalizeRubric(rubric: unknown): Rubric | undefined {
  if (rubric == null) return undefined;
  if (typeof rubric !== "object") {
    throw new TypeError("Rubric must be an object");
  }

  const rawRubric = rubric as RawRubric;
  if (!Array.isArray(rawRubric.items)) {
    throw new TypeError("Rubric is missing an items array");
  }

  // Array.isArray narrows to any[]; go back to unknown[] so every element has
  // to be checked before its fields are read.
  const rawItems: unknown[] = rawRubric.items;

  return {
    items: rawItems.map((rawItem, index) => {
      // Without this guard a null entry fails with a native "Cannot read
      // properties of null" and a primitive entry is reported as a missing
      // criterion, neither of which points at the offending item.
      if (rawItem === null || typeof rawItem !== "object") {
        throw new TypeError(`Rubric item at index ${index} must be an object`);
      }
      const item = rawItem as RawRubricCriterion;

      const criterion = normalizeRequiredString(item.criterion, "criterion");
      const description = normalizeRequiredString(
        item.description,
        "description",
      );
      const maxPoints = normalizeMaxPoints(item);

      // Rejects missing values, non-numbers, NaN and +/-Infinity.
      if (typeof maxPoints !== "number" || !Number.isFinite(maxPoints)) {
        throw new TypeError(
          `Rubric criterion "${criterion}" is missing a numeric maxPoints value`,
        );
      }

      return {
        criterion,
        description,
        maxPoints,
        // `condition` is optional: copy it only when it is a string so the
        // key is absent (not undefined) otherwise.
        ...(typeof item.condition === "string"
          ? { condition: item.condition }
          : {}),
      };
    }),
  };
}
|
|
||
/**
 * Pass a required rubric field through unchanged when it is a non-empty
 * string; reject everything else.
 *
 * @param value candidate field value taken from a raw rubric criterion.
 * @param fieldName field label interpolated into the error message.
 * @returns `value`, untouched (no trimming).
 * @throws TypeError when `value` is not a string or is the empty string.
 */
function normalizeRequiredString(value: unknown, fieldName: string): string {
  if (typeof value !== "string" || value.length === 0) {
    throw new TypeError(`Rubric criterion is missing a ${fieldName} value`);
  }

  return value;
}
|
|
||
/**
 * Read the raw point cap of a criterion, preferring the camelCase `maxPoints`
 * key and falling back to the snake_case `max_points` key when the former is
 * `null` or `undefined`. The value is returned as-is; the caller validates it.
 */
function normalizeMaxPoints(item: RawRubricCriterion): unknown {
  const camelCaseValue = item.maxPoints;
  if (camelCaseValue === null || camelCaseValue === undefined) {
    return item.max_points;
  }
  return camelCaseValue;
}
|
|
||
/**
 * Turn an optional caller-supplied label into a filename-safe token.
 *
 * When no label is given, or the label is the empty string, a
 * `rescore-<ISO timestamp>` label is generated instead; an empty label would
 * otherwise produce the same name on every call. Every character outside
 * `[A-Za-z0-9._-]` is then replaced with `_` (including the `:` characters of
 * the ISO timestamp).
 *
 * @param label optional human-chosen label.
 * @returns a non-empty token containing only `[A-Za-z0-9._-]`.
 */
function normalizeResultLabel(label?: string): string {
  const effectiveLabel =
    label == null || label === ""
      ? `rescore-${new Date().toISOString()}`
      : label;

  return effectiveLabel.replace(/[^A-Za-z0-9._-]/g, "_");
}
|
|
||
| // ───────────────────────────────────────────────────────────────────────────── | ||
| // On-disk loader | ||
| // ───────────────────────────────────────────────────────────────────────────── | ||
|
|
||
/**
 * Hydrate a Trajectory from the on-disk directory layout written by
 * TrajectoryRecorder.persist(). Used by the offline re-scoring CLI (`bench
 * verify`) and by any consumer that wants to feed a saved trajectory back
 * into V3Evaluator.verify() without running an agent.
 *
 * Reverses the recorder's serialization tweaks:
 *   - `probeEvidence.screenshotPath` → read file into `probeEvidence.screenshot`.
 *   - Image modalities in `agentEvidence.modalities` carry `bytesBase64` on
 *     disk (human-readable JSON) instead of raw Buffer; we decode back.
 *
 * @param dir absolute or cwd-relative path to a `<run-id>/<task-id>/` directory.
 * @returns the trajectory parsed from `<dir>/trajectory.json`, with screenshots
 *   and image bytes rehydrated in place.
 * @throws TypeError when `trajectory.json` does not hold an object with a
 *   `steps` array.
 * @throws Error when a step's `screenshotPath` resolves outside `dir`.
 *   Read and JSON syntax errors for `trajectory.json` itself propagate unchanged.
 */
export async function loadTrajectoryFromDisk(dir: string): Promise<Trajectory> {
  const fs = await import("node:fs/promises");
  const path = await import("node:path");
  const trajectoryDir = path.resolve(dir);

  const trajectoryPath = path.join(trajectoryDir, "trajectory.json");
  const raw = await fs.readFile(trajectoryPath, "utf8");

  // Parse to `unknown` first and check the one structural fact the loop below
  // relies on, so a malformed file fails with a message naming the file rather
  // than "parsed.steps is not iterable".
  // NOTE(review): a reviewer suggested validating the whole document with a
  // zod schema at this parsing boundary (and deriving the type via z.infer);
  // the author plans to do that in a follow-up PR.
  const json: unknown = JSON.parse(raw);
  if (
    json === null ||
    typeof json !== "object" ||
    !Array.isArray((json as { steps?: unknown }).steps)
  ) {
    throw new TypeError(
      `Trajectory file is missing a steps array: ${trajectoryPath}`,
    );
  }

  const parsed = json as Trajectory & {
    steps: Array<
      TrajectoryStep & {
        agentEvidence: {
          modalities: Array<
            | { type: "text"; content: string }
            | {
                type: "image";
                mediaType: string;
                // On-disk form (recorder writes base64); accept either to
                // tolerate hand-edited fixtures.
                bytes?: unknown;
                bytesBase64?: string;
              }
            | { type: "json"; content: unknown }
          >;
        };
        probeEvidence: ProbeEvidence;
      }
    >;
  };

  // Resolve a path taken from the JSON against the trajectory directory and
  // refuse anything that lands outside it (".." segments or absolute paths).
  const resolveWithinTrajectoryDir = (candidate: string): string => {
    const resolved = path.resolve(trajectoryDir, candidate);
    const relative = path.relative(trajectoryDir, resolved);
    const outside =
      relative === ".." ||
      relative.startsWith(`..${path.sep}`) ||
      path.isAbsolute(relative);

    if (outside) {
      throw new Error(
        `Trajectory screenshotPath escapes trajectory directory: ${candidate}`,
      );
    }

    return resolved;
  };

  for (const step of parsed.steps) {
    // Rehydrate tier-2 probe screenshot from its on-disk file reference.
    const probe = step.probeEvidence;
    if (probe?.screenshotPath && !probe.screenshot) {
      // The containment check runs outside the try block on purpose: an
      // escaping path must fail loudly, only the read itself is best-effort.
      const resolved = resolveWithinTrajectoryDir(probe.screenshotPath);
      try {
        probe.screenshot = await fs.readFile(resolved);
      } catch {
        // Missing screenshot file: leave probe.screenshot unset. The verifier's
        // evidence_insufficient path will handle it.
      }
    }

    // Decode image modalities from base64 back to Buffer.
    if (step.agentEvidence?.modalities) {
      step.agentEvidence.modalities = step.agentEvidence.modalities.map((m) => {
        // The on-disk shape carries bytesBase64 instead of bytes, so we look
        // through `unknown` here rather than rely on the typed union.
        const onDisk = m as unknown as { bytesBase64?: string };
        if (m.type === "image" && typeof onDisk.bytesBase64 === "string") {
          return {
            type: "image" as const,
            bytes: Buffer.from(onDisk.bytesBase64, "base64"),
            mediaType: m.mediaType,
          };
        }
        return m as AgentEvidenceModality;
      });
    }
  }

  return parsed;
}
|
|
||
/**
 * Produce the `result*.json` filename under which evaluator output is saved.
 *
 * Convention: the live run writes `result.json`, while offline re-score
 * attempts get a label-based name (e.g., `result_rescore-2026-05-11.json`),
 * so they coexist without collisions and remain easy to diff.
 *
 * @param label optional label; it is passed through normalizeResultLabel
 *   before being embedded in the filename.
 * @returns a filename of the form `result_<label>.json`.
 */
export function nextResultFilename(label?: string): string {
  const safeLabel = normalizeResultLabel(label);
  return "result_" + safeLabel + ".json";
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Also here: a rubric could just be a zod object plus a `z.infer` type, and this normalization logic could then be built into the schema itself (via `superRefine` or otherwise), which might be a bit simpler. Optional nit!