diff --git a/CHANGELOG.md b/CHANGELOG.md index f827bc6..6c6fd60 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,26 @@ documented with migration notes. - Additional docs for suite-specific adoption. - Better compatibility tests for Promptfoo variable contracts. +## [0.1.8] - 2026-07-04 + +### Added + +- Added generic pointwise summary helpers for advisory verdict counts and calibration notes. +- Documented the shared pointwise report summary pattern for curated manual evidence. + +### Fixed + +- Hardened pointwise judge result handling so provider, prompt version, rubric version, and run + manifest metadata must match the configured run before the result bundle is written. +- Added regression tests for malformed or missing pointwise run metadata. + +### Notes + +- Deterministic `run-case` and manual `report` compatibility are preserved. +- Consumer repos still own judge semantics, prompts, fixtures, and calibration policy. +- No npm package is published. +- Consumers may pin `github:agentic-workflow-kit/eval-kit#v0.1.8`. + ## [0.1.7] - 2026-07-04 ### Fixed @@ -135,7 +155,8 @@ documented with migration notes. - Suite-specific presets remain deferred. - Consumer repos own their own semantics, prompts, cases, and pass/fail policies. -[Unreleased]: https://github.com/agentic-workflow-kit/eval-kit/compare/v0.1.7...main +[Unreleased]: https://github.com/agentic-workflow-kit/eval-kit/compare/v0.1.8...main +[0.1.8]: https://github.com/agentic-workflow-kit/eval-kit/compare/v0.1.7...v0.1.8 [0.1.7]: https://github.com/agentic-workflow-kit/eval-kit/compare/v0.1.6...v0.1.7 [0.1.6]: https://github.com/agentic-workflow-kit/eval-kit/compare/v0.1.5...v0.1.6 [0.1.5]: https://github.com/agentic-workflow-kit/eval-kit/compare/v0.1.4...v0.1.5 diff --git a/README.md b/README.md index b24c8c8..41b0410 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ Shared evaluation infrastructure for `agentic-workflow-kit` repositories. ```json { "devDependencies": { - "@agentic-workflow-kit/eval-kit": "github:agentic-workflow-kit/eval-kit#v0.1.7" + "@agentic-workflow-kit/eval-kit": "github:agentic-workflow-kit/eval-kit#v0.1.8" } } ``` @@ -70,7 +70,7 @@ Install from a Git tag in a consumer repo: ```json { "devDependencies": { - "@agentic-workflow-kit/eval-kit": "github:agentic-workflow-kit/eval-kit#v0.1.7" + "@agentic-workflow-kit/eval-kit": "github:agentic-workflow-kit/eval-kit#v0.1.8" }, "scripts": { "eval:doctor": "eval-kit doctor --config evals/eval-kit.config.json", @@ -196,6 +196,7 @@ v0.1.4 v0.1.5 v0.1.6 v0.1.7 +v0.1.8 v0.2.0 ``` diff --git a/docs/design/consumer-integration.md b/docs/design/consumer-integration.md index 4916ee8..dee0719 100644 --- a/docs/design/consumer-integration.md +++ b/docs/design/consumer-integration.md @@ -9,7 +9,7 @@ Consumer repos should adopt eval-kit through a pinned Git tag and keep their eva ```json { "devDependencies": { - "@agentic-workflow-kit/eval-kit": "github:agentic-workflow-kit/eval-kit#v0.1.7" + "@agentic-workflow-kit/eval-kit": "github:agentic-workflow-kit/eval-kit#v0.1.8" } } ``` diff --git a/docs/guides/consumer-integration.md b/docs/guides/consumer-integration.md index 5ad79c6..c1e653a 100644 --- a/docs/guides/consumer-integration.md +++ b/docs/guides/consumer-integration.md @@ -18,7 +18,7 @@ If you cannot state the eval goal, do not bootstrap a suite yet. Empty harnesses ```json { "devDependencies": { - "@agentic-workflow-kit/eval-kit": "github:agentic-workflow-kit/eval-kit#v0.1.7" + "@agentic-workflow-kit/eval-kit": "github:agentic-workflow-kit/eval-kit#v0.1.8" } } ``` diff --git a/docs/guides/model-judge-calibration-reporting.md b/docs/guides/model-judge-calibration-reporting.md index 4c5cf80..a2358ed 100644 --- a/docs/guides/model-judge-calibration-reporting.md +++ b/docs/guides/model-judge-calibration-reporting.md @@ -43,5 +43,15 @@ Manual reports should be written for reviewer handoff, not CI: risks; - state that model-judge evidence cannot upgrade deterministic red or yellow results. +Eval-kit exposes `countPointwiseVerdicts` and `formatPointwiseCalibrationSummary` as a shared +summary pattern. Consumers may use these helpers when writing curated notes or report hooks, but the +consumer still owns expected-good/expected-bad labels, critical-item policy, and false-pass or +false-fail interpretation. + +For pointwise result bundles, eval-kit fails closed when required run metadata is absent or +mismatched. A valid pointwise run records run id, one case id, model, provider, reasoning effort when +present, prompt version, rubric version, runner version, and the artifact/output paths for the +pointwise result bundle. + Keep raw provider bundles under ignored `evals/results/` paths unless a human curates and commits a summary. diff --git a/docs/guides/quickstart.md b/docs/guides/quickstart.md index 1578331..f6b9463 100644 --- a/docs/guides/quickstart.md +++ b/docs/guides/quickstart.md @@ -7,7 +7,7 @@ This guide adds a generic deterministic eval suite to a consumer repo. ```json { "devDependencies": { - "@agentic-workflow-kit/eval-kit": "github:agentic-workflow-kit/eval-kit#v0.1.7" + "@agentic-workflow-kit/eval-kit": "github:agentic-workflow-kit/eval-kit#v0.1.8" } } ``` diff --git a/docs/reference/adapter-contract.md b/docs/reference/adapter-contract.md index 2033284..10537b0 100644 --- a/docs/reference/adapter-contract.md +++ b/docs/reference/adapter-contract.md @@ -136,6 +136,19 @@ export const canonicalizeExpectedItemMetadata = (actualItems, expectedItems) => })); ``` +Eval-kit exports generic pointwise helpers for consumers that curate summaries: + +```js +import { + countPointwiseVerdicts, + formatPointwiseCalibrationSummary, +} from "@agentic-workflow-kit/eval-kit"; +``` + +Use these helpers to report advisory counts for `covered`, `partial`, `missing`, `contradicted`, and +`unknown`, plus expected-good/expected-bad calibration labels and false-pass/false-fail notes. The +helpers do not define consumer semantics. + ## Pairwise judge hook Required for `judge-pairwise`: diff --git a/docs/reference/release-process.md b/docs/reference/release-process.md index 80a6693..01fc9b6 100644 --- a/docs/reference/release-process.md +++ b/docs/reference/release-process.md @@ -34,7 +34,7 @@ Consumers depend on tags like: Title: ```text -chore(release): v0.1.7 +chore(release): v0.1.8 ``` Required changes: @@ -63,18 +63,18 @@ git checkout main git pull --ff-only git rev-parse HEAD -git tag -a v0.1.7 -m "v0.1.7" -git push origin v0.1.7 +git tag -a v0.1.8 -m "v0.1.8" +git push origin v0.1.8 ``` Verify: ```bash -git rev-parse v0.1.7^{} -git show --no-patch --decorate v0.1.7 +git rev-parse v0.1.8^{} +git show --no-patch --decorate v0.1.8 ``` -`v0.1.7^{}` must point to the release commit. With an annotated tag, `git rev-parse v0.1.7` +`v0.1.8^{}` must point to the release commit. With an annotated tag, `git rev-parse v0.1.8` returns the tag object; `^{}` dereferences to the commit. ## GitHub Release @@ -93,7 +93,7 @@ For each consumer repo: ```json { - "@agentic-workflow-kit/eval-kit": "github:agentic-workflow-kit/eval-kit#v0.1.7" + "@agentic-workflow-kit/eval-kit": "github:agentic-workflow-kit/eval-kit#v0.1.8" } ``` @@ -108,7 +108,7 @@ pnpm check 3. Run consumer smoke commands, for example in `technical-design`: ```bash -pnpm eval:case -- --case case-tiny-laundry-pickup-v1 --candidate evals/cases/case-tiny-laundry-pickup-v1/reference-design.md --run-id verify-eval-kit-v0.1.7 +pnpm eval:case -- --case case-tiny-laundry-pickup-v1 --candidate evals/cases/case-tiny-laundry-pickup-v1/reference-design.md --run-id verify-eval-kit-v0.1.8 ``` 4. Open a PR with dependency, lockfile, and any compatibility fixes. @@ -120,7 +120,7 @@ Do not move the tag. Create a new patch release: ```text -v0.1.7 -> v0.1.8 +v0.1.8 -> v0.1.9 ``` Then open consumer bump PRs. diff --git a/docs/reference/results.md b/docs/reference/results.md index 1675541..a017f8e 100644 --- a/docs/reference/results.md +++ b/docs/reference/results.md @@ -39,7 +39,7 @@ Current schema: "run_type": "deterministic", "runner": { "id": "generic-eval-case", - "version": "0.1.7" + "version": "0.1.8" }, "case_ids": ["case-example-v1"], "started_at": "2026-07-03T00:00:00.000Z", @@ -113,3 +113,9 @@ CLI candidate labels. Its `randomization.original_order` field records the origi candidate keys were displayed as Candidate A/B for the model judge. Treat these as potentially sensitive. + +Pointwise `judge-coverage` manifests fail closed if required run metadata is missing or mismatched. +Required pointwise metadata includes the run id, exactly one case id, model, provider, reasoning +effort when supplied, prompt version, rubric version, runner version, and artifact/output paths for +the pointwise report, structured pointwise result, Promptfoo config, raw Promptfoo results, and HTML +report. diff --git a/docs/schemas.md b/docs/schemas.md index b754470..b33eca7 100644 --- a/docs/schemas.md +++ b/docs/schemas.md @@ -117,6 +117,20 @@ Optional model-run fields: - `randomization` - `provenance.parent_run_ids` +For `judge-coverage` pointwise runs, eval-kit additionally validates the run metadata before +writing the manifest. Required pointwise metadata is: + +- `run_id`; +- exactly one `case_ids` entry matching the judged case; +- `model`; +- `provider`; +- `reasoning_effort` when supplied by the run command; +- `prompt_version`; +- `rubric_version`; +- `runner.version`; +- artifact and output paths for the pointwise report, structured pointwise result, Promptfoo config, + raw Promptfoo results, and Promptfoo HTML report. + ### `finding.schema.json` Generic minimal finding shape: diff --git a/package.json b/package.json index 4ce0780..1547e9c 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@agentic-workflow-kit/eval-kit", - "version": "0.1.7", + "version": "0.1.8", "description": "Portable eval runner primitives for local eval suites.", "private": true, "type": "module", diff --git a/skills/bootstrap-eval-suite/SKILL.md b/skills/bootstrap-eval-suite/SKILL.md index cc0db07..07b44f6 100644 --- a/skills/bootstrap-eval-suite/SKILL.md +++ b/skills/bootstrap-eval-suite/SKILL.md @@ -29,6 +29,8 @@ standard two-config pattern: - Document the local calibration policy before treating pointwise results as more than raw advisory evidence. The policy should define expected-good and expected-bad fixture labels, `partial` and `unknown` handling, and where curated summaries live. +- For curated summaries, use the shared count shape for `covered`, `partial`, `missing`, + `contradicted`, and `unknown`, then add consumer-owned false-pass and false-fail notes. ## Boundaries diff --git a/skills/review-eval-suite/SKILL.md b/skills/review-eval-suite/SKILL.md index b72d69e..bfe0966 100644 --- a/skills/review-eval-suite/SKILL.md +++ b/skills/review-eval-suite/SKILL.md @@ -29,6 +29,9 @@ Use this skill when auditing or reviewing an eval-kit suite. passes. - Treat `partial` as non-covered unless the consumer explicitly documents why a non-critical partial is acceptable. Repeated `unknown` verdicts are calibration or prompt-quality risks. +- Verify pointwise run metadata before trusting manual judge evidence: run id, one case id, model, + provider, reasoning effort when present, prompt version, rubric version, runner version, and + artifact/output paths must be present and coherent. - Treat run-producing semantic portfolios as local on-demand evidence before significant changes, not default CI. - Do not claim suite readiness without command evidence. diff --git a/skills/run-eval-suite/SKILL.md b/skills/run-eval-suite/SKILL.md index 56da791..1f0f472 100644 --- a/skills/run-eval-suite/SKILL.md +++ b/skills/run-eval-suite/SKILL.md @@ -29,6 +29,9 @@ Use this skill when executing a local eval-kit suite. scripts before any manual `eval:judge:coverage` run. - For pointwise model-judge summaries, treat `partial`, `missing`, `contradicted`, and `unknown` as non-covered unless the consumer policy explicitly accepts the item. +- Prefer the eval-kit pointwise summary helpers for curated report counts, and record + expected-good/expected-bad labels plus false-pass/false-fail notes when summarizing manual judge + evidence. - Expected-bad fixtures should remain adverse on their intended defect. Do not describe an adverse bad-fixture result as a failed eval when it matches the calibration label. - Preserve raw outputs according to the consumer repo's artifact policy. @@ -38,4 +41,6 @@ Use this skill when executing a local eval-kit suite. Report the config path, cases run, result directories, verdicts, report paths, and any skipped or advisory-only checks. For model-assisted runs, state that provider calls were explicitly requested. Report deterministic evidence first, then model-judge counts for `covered`, `partial`, `missing`, -`contradicted`, and `unknown`. +`contradicted`, and `unknown`. If a pointwise result manifest is missing run id, case id, model, +provider, prompt version, rubric version, runner version, or artifact paths, treat that run as +invalid evidence. diff --git a/src/index.mjs b/src/index.mjs index 44c828d..3c14479 100644 --- a/src/index.mjs +++ b/src/index.mjs @@ -20,6 +20,13 @@ export { runPromptfooRaw, } from "./promptfoo.mjs"; export { aggregateVerdict, criticalBlockerCount } from "./verdict.mjs"; +export { + POINTWISE_VERDICTS, + countPointwiseVerdicts, + formatPointwiseCalibrationSummary, + formatPointwiseVerdictCounts, + validatePointwiseRunMetadata, +} from "./pointwise.mjs"; export { loadConfig } from "./config.mjs"; export { diff --git a/src/pointwise.mjs b/src/pointwise.mjs new file mode 100644 index 0000000..c400614 --- /dev/null +++ b/src/pointwise.mjs @@ -0,0 +1,206 @@ +export const POINTWISE_VERDICTS = [ + "covered", + "partial", + "missing", + "contradicted", + "unknown", +]; + +const pointwiseVerdictSet = new Set(POINTWISE_VERDICTS); + +const requiredString = (value, label) => { + if (typeof value !== "string" || value.trim().length === 0) { + throw new Error(`pointwise run metadata missing ${label}`); + } + return value; +}; + +const requireEqual = (actual, expected, label) => { + requiredString(actual, label); + if (expected !== undefined && actual !== expected) { + throw new Error( + `pointwise run metadata ${label} mismatch: expected ${expected}, got ${actual}`, + ); + } +}; + +export const countPointwiseVerdicts = (items) => { + if (!Array.isArray(items)) { + throw new Error("pointwise items must be an array"); + } + const counts = Object.fromEntries( + POINTWISE_VERDICTS.map((verdict) => [verdict, 0]), + ); + for (const item of items) { + if (!pointwiseVerdictSet.has(item?.verdict)) { + throw new Error(`unknown pointwise verdict: ${item?.verdict}`); + } + counts[item.verdict] += 1; + } + return counts; +}; + +export const formatPointwiseVerdictCounts = (counts) => + POINTWISE_VERDICTS.map( + (verdict) => `- ${verdict}: ${counts?.[verdict] ?? 0}`, + ); + +export const formatPointwiseCalibrationSummary = ({ + title = "Advisory Pointwise Model-Judge Summary", + counts, + fixtureLabel = "not recorded", + expectedOutcome = "not recorded", + falsePass = "not reviewed", + falseFail = "not reviewed", + notes = "not reviewed", +} = {}) => + [ + `## ${title}`, + "", + "Model-judge evidence is manual and advisory. It cannot upgrade deterministic red or yellow results.", + "", + "### Verdict Counts", + "", + ...formatPointwiseVerdictCounts(counts), + "", + "### Calibration Record", + "", + `- fixture label: ${fixtureLabel}`, + `- expected outcome: ${expectedOutcome}`, + `- false pass: ${falsePass}`, + `- false fail: ${falseFail}`, + `- notes: ${notes}`, + ].join("\n"); + +const validateRelativeArtifactPath = (relativePath, label) => { + requiredString(relativePath, label); + if (relativePath.trim() !== relativePath) { + throw new Error( + `pointwise run metadata ${label} must not contain surrounding whitespace`, + ); + } + if (relativePath.includes("\\")) { + throw new Error( + `pointwise run metadata ${label} must use POSIX separators`, + ); + } + if (relativePath.startsWith("/") || relativePath.startsWith("../")) { + throw new Error( + `pointwise run metadata ${label} must be a relative contained path`, + ); + } + if ( + relativePath + .split("/") + .some((segment) => segment === "." || segment === "..") + ) { + throw new Error( + `pointwise run metadata ${label} must not contain . or .. path segments`, + ); + } + if ( + relativePath === "." || + relativePath === ".." || + relativePath.includes("/../") || + relativePath.includes("//") + ) { + throw new Error(`pointwise run metadata ${label} must be normalized`); + } + if (relativePath.startsWith("./")) { + throw new Error(`pointwise run metadata ${label} must not start with ./`); + } + return relativePath; +}; + +const requireArtifactPaths = (manifest, requiredRoles) => { + if (!Array.isArray(manifest.artifacts) || manifest.artifacts.length === 0) { + throw new Error("pointwise run metadata missing artifacts"); + } + const artifactPaths = new Set(); + const roles = new Set(); + for (const artifact of manifest.artifacts) { + requiredString(artifact?.role, "artifact role"); + validateRelativeArtifactPath( + artifact?.path, + `artifact path for ${artifact.role}`, + ); + roles.add(artifact.role); + artifactPaths.add(artifact.path); + } + for (const role of requiredRoles) { + if (!roles.has(role)) { + throw new Error(`pointwise run metadata missing artifact role ${role}`); + } + } + if (!Array.isArray(manifest.output_files)) { + throw new Error("pointwise run metadata missing output_files"); + } + const outputFiles = new Set( + manifest.output_files.map((outputPath, index) => + validateRelativeArtifactPath(outputPath, `output_files[${index}]`), + ), + ); + if (!outputFiles.has("manifest.json")) { + throw new Error("pointwise run metadata missing manifest.json output file"); + } + for (const artifactPath of artifactPaths) { + if (!outputFiles.has(artifactPath)) { + throw new Error( + `pointwise run metadata output_files missing artifact path ${artifactPath}`, + ); + } + } +}; + +export const validatePointwiseRunMetadata = ({ + manifest, + expected = {}, +} = {}) => { + if (!manifest || typeof manifest !== "object") { + throw new Error("pointwise run metadata manifest is required"); + } + requireEqual(manifest.run_id, expected.runId, "run_id"); + requireEqual(manifest.run_type, "judge-coverage", "run_type"); + if (!Array.isArray(manifest.case_ids) || manifest.case_ids.length !== 1) { + throw new Error("pointwise run metadata must contain exactly one case id"); + } + requireEqual(manifest.case_ids[0], expected.caseId, "case_id"); + requireEqual(manifest.model, expected.model, "model"); + requireEqual(manifest.provider, expected.provider, "provider"); + if ( + expected.effort !== undefined || + manifest.reasoning_effort !== undefined + ) { + requireEqual( + manifest.reasoning_effort, + expected.effort, + "reasoning_effort", + ); + } + requireEqual( + manifest.prompt_version, + expected.promptVersion, + "prompt_version", + ); + requireEqual( + manifest.rubric_version, + expected.rubricVersion, + "rubric_version", + ); + requireEqual( + manifest.runner?.version, + expected.runnerVersion, + "runner.version", + ); + requireArtifactPaths( + manifest, + expected.requiredArtifactRoles ?? [ + "report", + "pointwise_result", + "promptfoo_config", + "raw_promptfoo_results", + "promptfoo_html_report", + ], + ); + return manifest; +}; diff --git a/src/sdk.mjs b/src/sdk.mjs index 6e83549..b842c63 100644 --- a/src/sdk.mjs +++ b/src/sdk.mjs @@ -11,11 +11,17 @@ import { } from "./promptfoo.mjs"; import { aggregateVerdict, criticalBlockerCount } from "./verdict.mjs"; import { assertContainedPath, assertSafeId, toPosixPath } from "./paths.mjs"; +import { + countPointwiseVerdicts, + formatPointwiseCalibrationSummary, + formatPointwiseVerdictCounts, + validatePointwiseRunMetadata, +} from "./pointwise.mjs"; const DEFAULT_SANDBOX_MODE = "read-only"; const DEFAULT_APPROVAL_POLICY = "never"; const RANDOMIZATION_METHOD = "sha256-seed-parity-v1"; -const EVAL_KIT_VERSION = "0.1.7"; +const EVAL_KIT_VERSION = "0.1.8"; const escapeRegExp = (value) => value.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); @@ -762,6 +768,14 @@ export const judgeCoverage = async ({ // Validate properties match run config if (result.case_id !== caseId) throw new Error("case_id mismatch in result"); if (result.model !== model) throw new Error("model mismatch in result"); + if (result.provider !== provider) + throw new Error("provider mismatch in result"); + if (result.prompt_version !== promptVersion) { + throw new Error("prompt_version mismatch in result"); + } + if (result.rubric_version !== rubricVersion) { + throw new Error("rubric_version mismatch in result"); + } // Post-process pointwise items if the adapter provides custom canonicalization let finalResult = result; @@ -781,16 +795,7 @@ export const judgeCoverage = async ({ JSON.stringify(finalResult, null, 2) + "\n", ); - const counts = { - covered: 0, - partial: 0, - missing: 0, - contradicted: 0, - unknown: 0, - }; - for (const item of finalResult.items) { - counts[item.verdict] = (counts[item.verdict] ?? 0) + 1; - } + const counts = countPointwiseVerdicts(finalResult.items); fs.writeFileSync( path.join(resultDir, "report.md"), @@ -806,11 +811,9 @@ export const judgeCoverage = async ({ "", "## Coverage Summary", "", - `- covered: ${counts.covered}`, - `- partial: ${counts.partial}`, - `- missing: ${counts.missing}`, - `- contradicted: ${counts.contradicted}`, - `- unknown: ${counts.unknown}`, + ...formatPointwiseVerdictCounts(counts), + "", + formatPointwiseCalibrationSummary({ counts }), "", "## Item Results", "", @@ -831,67 +834,73 @@ export const judgeCoverage = async ({ redactionStatus: role.startsWith("raw_") ? "raw-local" : "public-safe", }); - writeManifest({ - runDir: resultDir, - schemaRegistry: config.schemaRegistry, - manifest: { - schema_version: "eval-kit.result-manifest.v2", - run_id: runId, - run_type: "judge-coverage", - runner: { - id: `${config.raw.suite_id}-pointwise-judge`, - version: EVAL_KIT_VERSION, - }, - case_ids: [caseId], - started_at: startedAt.toISOString(), - ended_at: endedAt.toISOString(), - duration_ms: endedAt.getTime() - startedAt.getTime(), - status: "completed", - git: { commit: getGitCommit(config) }, - command: process.argv.join(" "), - tool_versions: getToolVersions(config), - model_provider: codexProviderId({ provider, model }), - model, - provider, - reasoning_effort: effort, - sandbox_mode: DEFAULT_SANDBOX_MODE, - approval_policy: DEFAULT_APPROVAL_POLICY, - codex_auth_mode: authMode, - prompt_version: promptVersion, - rubric_version: rubricVersion, - artifacts: [ - artRecord("report", "report.md", "text/markdown"), - artRecord( - "pointwise_result", - "pointwise-result.json", - "application/json", - ), - artRecord( - "promptfoo_config", - "promptfooconfig.json", - "application/json", - ), - artRecord( - "raw_promptfoo_results", - "promptfoo-results.json", - "application/json", - ), - artRecord( - "promptfoo_html_report", - "promptfoo-report.html", - "text/html", - ), - ], - output_files: [ - "manifest.json", - "report.md", + const manifest = { + schema_version: "eval-kit.result-manifest.v2", + run_id: runId, + run_type: "judge-coverage", + runner: { + id: `${config.raw.suite_id}-pointwise-judge`, + version: EVAL_KIT_VERSION, + }, + case_ids: [caseId], + started_at: startedAt.toISOString(), + ended_at: endedAt.toISOString(), + duration_ms: endedAt.getTime() - startedAt.getTime(), + status: "completed", + git: { commit: getGitCommit(config) }, + command: process.argv.join(" "), + tool_versions: getToolVersions(config), + model_provider: codexProviderId({ provider, model }), + model, + provider, + reasoning_effort: effort, + sandbox_mode: DEFAULT_SANDBOX_MODE, + approval_policy: DEFAULT_APPROVAL_POLICY, + codex_auth_mode: authMode, + prompt_version: promptVersion, + rubric_version: rubricVersion, + artifacts: [ + artRecord("report", "report.md", "text/markdown"), + artRecord( + "pointwise_result", "pointwise-result.json", - "promptfooconfig.json", + "application/json", + ), + artRecord("promptfoo_config", "promptfooconfig.json", "application/json"), + artRecord( + "raw_promptfoo_results", "promptfoo-results.json", - "promptfoo-report.html", - ], + "application/json", + ), + artRecord("promptfoo_html_report", "promptfoo-report.html", "text/html"), + ], + output_files: [ + "manifest.json", + "report.md", + "pointwise-result.json", + "promptfooconfig.json", + "promptfoo-results.json", + "promptfoo-report.html", + ], + }; + validatePointwiseRunMetadata({ + manifest, + expected: { + runId, + caseId, + model, + provider, + effort, + promptVersion, + rubricVersion, + runnerVersion: EVAL_KIT_VERSION, }, }); + writeManifest({ + runDir: resultDir, + schemaRegistry: config.schemaRegistry, + manifest, + }); return { resultDir, finalResult }; }; diff --git a/tests/pointwise.test.mjs b/tests/pointwise.test.mjs new file mode 100644 index 0000000..0e52400 --- /dev/null +++ b/tests/pointwise.test.mjs @@ -0,0 +1,217 @@ +import { describe, expect, it } from "vitest"; + +import { + countPointwiseVerdicts, + formatPointwiseCalibrationSummary, + validatePointwiseRunMetadata, +} from "../src/index.mjs"; + +const pointwiseManifest = (overrides = {}) => ({ + schema_version: "eval-kit.result-manifest.v2", + run_id: "provider-20260704-case-good", + run_type: "judge-coverage", + runner: { + id: "suite-pointwise-judge", + version: "0.1.8", + }, + case_ids: ["case-alpha"], + started_at: "2026-07-04T00:00:00.000Z", + ended_at: "2026-07-04T00:00:01.000Z", + duration_ms: 1000, + status: "completed", + git: { + commit: "abc123", + }, + command: "pnpm eval:judge:coverage", + tool_versions: { + node: "v26.4.0", + }, + model: "gpt-5.4", + provider: "openai:codex-app-server", + model_provider: "openai:codex-app-server:gpt-5.4", + reasoning_effort: "medium", + prompt_version: "pointwise-v1", + rubric_version: "rubric-v1", + artifacts: [ + { role: "report", path: "report.md" }, + { role: "pointwise_result", path: "pointwise-result.json" }, + { role: "promptfoo_config", path: "promptfooconfig.json" }, + { role: "raw_promptfoo_results", path: "promptfoo-results.json" }, + { role: "promptfoo_html_report", path: "promptfoo-report.html" }, + ], + output_files: [ + "manifest.json", + "report.md", + "pointwise-result.json", + "promptfooconfig.json", + "promptfoo-results.json", + "promptfoo-report.html", + ], + ...overrides, +}); + +const expectedMetadata = { + runId: "provider-20260704-case-good", + caseId: "case-alpha", + model: "gpt-5.4", + provider: "openai:codex-app-server", + effort: "medium", + promptVersion: "pointwise-v1", + rubricVersion: "rubric-v1", + runnerVersion: "0.1.8", +}; + +describe("pointwise model-judge helpers", () => { + it("counts verdicts and formats a calibration summary with adverse categories", () => { + const counts = countPointwiseVerdicts([ + { verdict: "covered" }, + { verdict: "partial" }, + { verdict: "missing" }, + { verdict: "contradicted" }, + { verdict: "unknown" }, + { verdict: "unknown" }, + ]); + + expect(counts).toEqual({ + covered: 1, + partial: 1, + missing: 1, + contradicted: 1, + unknown: 2, + }); + + const summary = formatPointwiseCalibrationSummary({ + counts, + fixtureLabel: "expected-bad", + expectedOutcome: "adverse on targeted defect", + falsePass: "not observed", + falseFail: "not applicable", + notes: "bad fixture remained adverse", + }); + + expect(summary).toContain("Model-judge evidence is manual and advisory"); + expect(summary).toContain("- covered: 1"); + expect(summary).toContain("- unknown: 2"); + expect(summary).toContain("- fixture label: expected-bad"); + expect(summary).toContain("- false pass: not observed"); + }); + + it("fails closed when pointwise run metadata is missing or mismatched", () => { + expect(() => + validatePointwiseRunMetadata({ + manifest: pointwiseManifest(), + expected: expectedMetadata, + }), + ).not.toThrow(); + + expect(() => + validatePointwiseRunMetadata({ + manifest: pointwiseManifest({ run_id: "" }), + expected: expectedMetadata, + }), + ).toThrow(/missing run_id/); + + expect(() => + validatePointwiseRunMetadata({ + manifest: pointwiseManifest({ model: "gpt-5.5" }), + expected: expectedMetadata, + }), + ).toThrow(/model mismatch/); + + expect(() => + validatePointwiseRunMetadata({ + manifest: pointwiseManifest({ + artifacts: [{ role: "report", path: "report.md" }], + }), + expected: expectedMetadata, + }), + ).toThrow(/missing artifact role pointwise_result/); + + expect(() => + validatePointwiseRunMetadata({ + manifest: pointwiseManifest({ + output_files: ["manifest.json", "report.md"], + }), + expected: expectedMetadata, + }), + ).toThrow(/output_files missing artifact path pointwise-result.json/); + + expect(() => + validatePointwiseRunMetadata({ + manifest: pointwiseManifest({ + artifacts: [ + { role: "report", path: "../report.md" }, + { role: "pointwise_result", path: "/tmp/pointwise-result.json" }, + { role: "promptfoo_config", path: "promptfooconfig.json" }, + { role: "raw_promptfoo_results", path: "promptfoo-results.json" }, + { role: "promptfoo_html_report", path: "promptfoo-report.html" }, + ], + output_files: [ + "manifest.json", + "../report.md", + "/tmp/pointwise-result.json", + "promptfooconfig.json", + "promptfoo-results.json", + "promptfoo-report.html", + ], + }), + expected: expectedMetadata, + }), + ).toThrow(/relative contained path/); + + expect(() => + validatePointwiseRunMetadata({ + manifest: pointwiseManifest({ + output_files: [ + "manifest.json", + "report.md", + "pointwise-result.json", + "./promptfooconfig.json", + "promptfoo-results.json", + "promptfoo-report.html", + ], + }), + expected: expectedMetadata, + }), + ).toThrow(/must not contain \. or \.\. path segments/); + + expect(() => + validatePointwiseRunMetadata({ + manifest: pointwiseManifest({ + artifacts: [ + { role: "report", path: "reports/.." }, + { role: "pointwise_result", path: "pointwise-result.json" }, + { role: "promptfoo_config", path: "promptfooconfig.json" }, + { role: "raw_promptfoo_results", path: "promptfoo-results.json" }, + { role: "promptfoo_html_report", path: "promptfoo-report.html" }, + ], + output_files: [ + "manifest.json", + "reports/..", + "pointwise-result.json", + "promptfooconfig.json", + "promptfoo-results.json", + "promptfoo-report.html", + ], + }), + expected: expectedMetadata, + }), + ).toThrow(/must not contain \. or \.\. path segments/); + + expect(() => + validatePointwiseRunMetadata({ + manifest: pointwiseManifest({ + output_files: [ + "manifest.json", + "report.md", + "pointwise-result.json", + "promptfoo/./config.json", + "promptfoo-results.json", + "promptfoo-report.html", + ], + }), + expected: expectedMetadata, + }), + ).toThrow(/must not contain \. or \.\. path segments/); + }); +}); diff --git a/tests/schema.test.mjs b/tests/schema.test.mjs index 6237cd8..639adf4 100644 --- a/tests/schema.test.mjs +++ b/tests/schema.test.mjs @@ -137,6 +137,40 @@ describe("eval-kit schema registry", () => { ).not.toThrow(); }); + it("accepts current pointwise model-run metadata fields", () => { + const registry = createSchemaRegistry({ + schemaRoots: [path.resolve(import.meta.dirname, "../schemas")], + }); + expect(() => + registry.validateWithSchema( + "result-manifest.v2.schema.json", + { + schema_version: "eval-kit.result-manifest.v2", + run_id: "provider-20260704-case-good", + run_type: "judge-coverage", + runner: { id: "suite-pointwise-judge", version: "0.1.8" }, + case_ids: ["case-a"], + started_at: "2026-07-04T00:00:00.000Z", + ended_at: "2026-07-04T00:00:01.000Z", + duration_ms: 1000, + status: "completed", + git: { commit: "abc123" }, + command: "pnpm eval:judge:coverage", + tool_versions: { node: "v26.4.0" }, + artifacts: [], + output_files: ["manifest.json"], + model: "gpt-5.4", + provider: "openai:codex-app-server", + model_provider: "openai:codex-app-server:gpt-5.4", + reasoning_effort: "medium", + prompt_version: "pointwise-v1", + rubric_version: "rubric-v1", + }, + "manifest", + ), + ).not.toThrow(); + }); + it("resolves bundled prompt and schema fallbacks for consumer configs", () => { const config = loadConfig( path.resolve(