diff --git a/CHANGELOG.md b/CHANGELOG.md index 7cb661d..77b75e6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,19 @@ documented with migration notes. - Additional docs for suite-specific adoption. - Better compatibility tests for Promptfoo variable contracts. +## [0.1.2] - 2026-07-03 + +### Fixed + +- Enforced disabled method flags across run-producing CLI commands. +- Added regression coverage for disabled generation, pointwise judging, deterministic runs, and + manual reports. + +### Notes + +- No npm package is published. +- Consumers may pin `github:agentic-workflow-kit/eval-kit#v0.1.2`. + ## [0.1.1] - 2026-07-03 ### Changed @@ -50,6 +63,7 @@ documented with migration notes. - Suite-specific presets remain deferred. - Consumer repos own their own semantics, prompts, cases, and pass/fail policies. -[Unreleased]: https://github.com/agentic-workflow-kit/eval-kit/compare/v0.1.1...main +[Unreleased]: https://github.com/agentic-workflow-kit/eval-kit/compare/v0.1.2...main +[0.1.2]: https://github.com/agentic-workflow-kit/eval-kit/compare/v0.1.1...v0.1.2 [0.1.1]: https://github.com/agentic-workflow-kit/eval-kit/compare/v0.1.0...v0.1.1 [0.1.0]: https://github.com/agentic-workflow-kit/eval-kit/releases/tag/v0.1.0 diff --git a/README.md b/README.md index 1229d7c..b31729e 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ Shared evaluation infrastructure for `agentic-workflow-kit` repositories. ```json { "devDependencies": { - "@agentic-workflow-kit/eval-kit": "github:agentic-workflow-kit/eval-kit#v0.1.1" + "@agentic-workflow-kit/eval-kit": "github:agentic-workflow-kit/eval-kit#v0.1.2" } } ``` @@ -70,7 +70,7 @@ Install from a Git tag in a consumer repo: ```json { "devDependencies": { - "@agentic-workflow-kit/eval-kit": "github:agentic-workflow-kit/eval-kit#v0.1.1" + "@agentic-workflow-kit/eval-kit": "github:agentic-workflow-kit/eval-kit#v0.1.2" }, "scripts": { "eval:doctor": "eval-kit doctor --config evals/eval-kit.config.json", @@ -188,6 +188,7 @@ For now, releases are Git tags that consumers pin in `package.json`: ```text v0.1.0 v0.1.1 +v0.1.2 v0.2.0 ``` diff --git a/docs/design/consumer-integration.md b/docs/design/consumer-integration.md index 990b177..cb5d8b9 100644 --- a/docs/design/consumer-integration.md +++ b/docs/design/consumer-integration.md @@ -9,7 +9,7 @@ Consumer repos should adopt eval-kit through a pinned Git tag and keep their eva ```json { "devDependencies": { - "@agentic-workflow-kit/eval-kit": "github:agentic-workflow-kit/eval-kit#v0.1.1" + "@agentic-workflow-kit/eval-kit": "github:agentic-workflow-kit/eval-kit#v0.1.2" } } ``` diff --git a/docs/guides/consumer-integration.md b/docs/guides/consumer-integration.md index ce2baaf..be71845 100644 --- a/docs/guides/consumer-integration.md +++ b/docs/guides/consumer-integration.md @@ -18,7 +18,7 @@ If you cannot state the eval goal, do not bootstrap a suite yet. Empty harnesses ```json { "devDependencies": { - "@agentic-workflow-kit/eval-kit": "github:agentic-workflow-kit/eval-kit#v0.1.1" + "@agentic-workflow-kit/eval-kit": "github:agentic-workflow-kit/eval-kit#v0.1.2" } } ``` diff --git a/docs/guides/quickstart.md b/docs/guides/quickstart.md index 7dc40aa..8ba878a 100644 --- a/docs/guides/quickstart.md +++ b/docs/guides/quickstart.md @@ -7,7 +7,7 @@ This guide adds a generic deterministic eval suite to a consumer repo. ```json { "devDependencies": { - "@agentic-workflow-kit/eval-kit": "github:agentic-workflow-kit/eval-kit#v0.1.1" + "@agentic-workflow-kit/eval-kit": "github:agentic-workflow-kit/eval-kit#v0.1.2" } } ``` diff --git a/docs/reference/cli.md b/docs/reference/cli.md index 6cfa5e6..2971c04 100644 --- a/docs/reference/cli.md +++ b/docs/reference/cli.md @@ -9,6 +9,19 @@ pnpm exec eval-kit [options] Package scripts may use plain `eval-kit` because npm/pnpm add local binaries to script `PATH`. Interactive consumer shell examples use `pnpm exec eval-kit`. +Run-producing commands fail closed when their configured method is explicitly disabled: + +| Command | Disabled by | +| ---------------- | -------------------------------------- | +| `run-case` | `methods.deterministic.enabled=false` | +| `generate` | `methods.generate.enabled=false` | +| `judge-coverage` | `methods.judge_coverage.enabled=false` | +| `judge-pairwise` | `methods.judge_pairwise.enabled=false` | +| `report` | `methods.report.enabled=false` | + +Disabled commands exit before Codex auth checks, Promptfoo execution, model provider calls, adapter +hooks, or result artifact writes. + ## `init` Create a deterministic generic eval skeleton. @@ -72,6 +85,8 @@ pnpm exec eval-kit run-case \ Exits non-zero when the deterministic verdict is `red`. +Fails closed when `methods.deterministic.enabled` is explicitly `false`. + ## `validate-fixtures` Validate case manifests and call the consumer `validateFixtures` hook when present. @@ -95,6 +110,7 @@ pnpm exec eval-kit generate \ ``` Requires Promptfoo and local Codex auth. This is manual/advisory evidence, not a default CI gate. +Fails closed before those checks when `methods.generate.enabled` is explicitly `false`. ## `judge-coverage` @@ -111,6 +127,8 @@ pnpm exec eval-kit judge-coverage \ [--config ] ``` +Fails closed when `methods.judge_coverage.enabled` is explicitly `false`. + ## `judge-pairwise` Run Promptfoo-backed pairwise comparison. @@ -149,3 +167,5 @@ pnpm exec eval-kit report \ [--outcome ] \ [--config ] ``` + +Fails closed when `methods.report.enabled` is explicitly `false`. diff --git a/docs/reference/config.md b/docs/reference/config.md index ae852d5..147b6bb 100644 --- a/docs/reference/config.md +++ b/docs/reference/config.md @@ -71,6 +71,9 @@ lanes: - `generate`, `judge_coverage`, and `judge_pairwise` are manual/advisory model-assisted methods. They require explicit local setup and must not require auth, network, Promptfoo provider calls, Codex/OpenAI calls, LLM judging, or manual calibration in `pnpm check`. +- Run-producing commands fail closed when their method's `enabled` flag is explicitly `false`, + before Codex auth checks, Promptfoo execution, model provider calls, adapter hooks, or result + artifact writes. ## Path rules diff --git a/docs/reference/release-process.md b/docs/reference/release-process.md index e09ef02..f581cc2 100644 --- a/docs/reference/release-process.md +++ b/docs/reference/release-process.md @@ -34,7 +34,7 @@ Consumers depend on tags like: Title: ```text -chore(release): v0.1.1 +chore(release): v0.1.2 ``` Required changes: @@ -63,18 +63,18 @@ git checkout main git pull --ff-only git rev-parse HEAD -git tag -a v0.1.1 -m "v0.1.1" -git push origin v0.1.1 +git tag -a v0.1.2 -m "v0.1.2" +git push origin v0.1.2 ``` Verify: ```bash -git rev-parse v0.1.1^{} -git show --no-patch --decorate v0.1.1 +git rev-parse v0.1.2^{} +git show --no-patch --decorate v0.1.2 ``` -`v0.1.1^{}` must point to the release commit. With an annotated tag, `git rev-parse v0.1.1` +`v0.1.2^{}` must point to the release commit. With an annotated tag, `git rev-parse v0.1.2` returns the tag object; `^{}` dereferences to the commit. ## GitHub Release @@ -93,7 +93,7 @@ For each consumer repo: ```json { - "@agentic-workflow-kit/eval-kit": "github:agentic-workflow-kit/eval-kit#v0.1.1" + "@agentic-workflow-kit/eval-kit": "github:agentic-workflow-kit/eval-kit#v0.1.2" } ``` @@ -108,7 +108,7 @@ pnpm check 3. Run consumer smoke commands, for example in `technical-design`: ```bash -pnpm eval:case -- --case case-tiny-laundry-pickup-v1 --candidate evals/cases/case-tiny-laundry-pickup-v1/reference-design.md --run-id verify-eval-kit-v0.1.1 +pnpm eval:case -- --case case-tiny-laundry-pickup-v1 --candidate evals/cases/case-tiny-laundry-pickup-v1/reference-design.md --run-id verify-eval-kit-v0.1.2 ``` 4. Open a PR with dependency, lockfile, and any compatibility fixes. @@ -120,7 +120,7 @@ Do not move the tag. Create a new patch release: ```text -v0.1.1 -> v0.1.2 +v0.1.2 -> v0.1.3 ``` Then open consumer bump PRs. diff --git a/docs/reference/results.md b/docs/reference/results.md index 331202c..d94d99d 100644 --- a/docs/reference/results.md +++ b/docs/reference/results.md @@ -14,6 +14,9 @@ Run-producing commands are local on-demand or manual/advisory evidence. They are gates. Keep `pnpm check` limited to fast, offline, structural validation unless a consumer documents a narrow deterministic subset that does not call external providers. +If a run-producing method is explicitly disabled in config, the CLI exits before writing a result +bundle. + Setup and inspection commands do not create result bundles. `init` and `scaffold-case` write suite or case files, while `doctor`, `list-cases`, and `validate-fixtures` validate or report current state without writing `//manifest.json`. @@ -36,7 +39,7 @@ Current schema: "run_type": "deterministic", "runner": { "id": "generic-eval-case", - "version": "0.1.1" + "version": "0.1.2" }, "case_ids": ["case-example-v1"], "started_at": "2026-07-03T00:00:00.000Z", diff --git a/docs/reference/versioning-policy.md b/docs/reference/versioning-policy.md index 01545d3..b126036 100644 --- a/docs/reference/versioning-policy.md +++ b/docs/reference/versioning-policy.md @@ -9,6 +9,7 @@ Use SemVer-style versions: ```text 0.1.0 0.1.1 +0.1.2 0.2.0 ``` @@ -26,7 +27,7 @@ Use patch for: Example: ```text -0.1.0 -> 0.1.1 +0.1.1 -> 0.1.2 ``` ## Minor version diff --git a/package.json b/package.json index 1a226b6..671003a 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@agentic-workflow-kit/eval-kit", - "version": "0.1.1", + "version": "0.1.2", "description": "Portable eval runner primitives for local eval suites.", "private": true, "type": "module", diff --git a/src/cli.mjs b/src/cli.mjs index e8bf7d1..2cf8300 100644 --- a/src/cli.mjs +++ b/src/cli.mjs @@ -25,6 +25,21 @@ const requireEnabledMethod = (config, methodKey, commandName) => { } }; +const runProducingMethodByCommand = { + "run-case": "deterministic", + generate: "generate", + "judge-coverage": "judge_coverage", + "judge-pairwise": "judge_pairwise", + report: "report", +}; + +const requireEnabledCommandMethod = (config, commandName) => { + const methodKey = runProducingMethodByCommand[commandName]; + if (methodKey) { + requireEnabledMethod(config, methodKey, commandName); + } +}; + const printHelp = () => { console.log(` Usage: eval-kit [options] @@ -163,6 +178,8 @@ export const main = async () => { } try { + requireEnabledCommandMethod(config, subcommand); + switch (subcommand) { case "run-case": { const caseId = requireArg(parsed, "case"); @@ -221,7 +238,6 @@ export const main = async () => { } case "judge-pairwise": { - requireEnabledMethod(config, "judge_pairwise", "judge-pairwise"); const caseId = requireArg(parsed, "case"); const candidateA = requireArg(parsed, "candidate-a"); const candidateB = requireArg(parsed, "candidate-b"); diff --git a/src/sdk.mjs b/src/sdk.mjs index 325ae97..a625776 100644 --- a/src/sdk.mjs +++ b/src/sdk.mjs @@ -15,7 +15,7 @@ import { assertContainedPath, assertSafeId, toPosixPath } from "./paths.mjs"; const DEFAULT_SANDBOX_MODE = "read-only"; const DEFAULT_APPROVAL_POLICY = "never"; const RANDOMIZATION_METHOD = "sha256-seed-parity-v1"; -const EVAL_KIT_VERSION = "0.1.1"; +const EVAL_KIT_VERSION = "0.1.2"; const escapeRegExp = (value) => value.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); diff --git a/tests/cli.test.mjs b/tests/cli.test.mjs index e2fac02..793104e 100644 --- a/tests/cli.test.mjs +++ b/tests/cli.test.mjs @@ -1,8 +1,10 @@ import { spawnSync } from "node:child_process"; +import fs from "node:fs"; +import os from "node:os"; import path from "node:path"; import { fileURLToPath } from "node:url"; -import { describe, expect, it } from "vitest"; +import { afterEach, describe, expect, it } from "vitest"; const __filename = fileURLToPath(import.meta.url); const packageRoot = path.resolve(path.dirname(__filename), ".."); @@ -14,20 +16,63 @@ const configPath = path.join( "eval-kit.config.json", ); const cliPath = path.resolve(packageRoot, "bin/eval-kit.mjs"); +const tempDirs = []; + +const writeConfigWithDisabledMethod = (methodKey) => { + const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), "eval-kit-cli-")); + tempDirs.push(tempDir); + + const config = JSON.parse(fs.readFileSync(configPath, "utf8")); + config.methods = { + ...config.methods, + [methodKey]: { + ...(config.methods?.[methodKey] ?? {}), + enabled: false, + }, + }; + + const tempConfigPath = path.join(tempDir, "eval-kit.config.json"); + fs.writeFileSync(tempConfigPath, `${JSON.stringify(config, null, 2)}\n`); + return tempConfigPath; +}; + +const runCli = (args) => + spawnSync(process.execPath, [cliPath, ...args], { + cwd: packageRoot, + encoding: "utf8", + stdio: ["ignore", "pipe", "pipe"], + }); describe("eval-kit CLI", () => { + afterEach(() => { + for (const tempDir of tempDirs.splice(0)) { + fs.rmSync(tempDir, { recursive: true, force: true }); + } + }); + it("fails closed when pairwise judging is disabled by config", () => { - const result = spawnSync( - process.execPath, - [cliPath, "judge-pairwise", "--config", configPath], - { - cwd: packageRoot, - encoding: "utf8", - stdio: ["ignore", "pipe", "pipe"], - }, - ); + const result = runCli(["judge-pairwise", "--config", configPath]); expect(result.status).toBe(1); expect(result.stderr).toContain("judge-pairwise is disabled"); }); + + it.each([ + ["run-case", "deterministic"], + ["generate", "generate"], + ["judge-coverage", "judge_coverage"], + ["report", "report"], + ])( + "fails closed when %s is disabled before requiring run arguments", + (commandName, methodKey) => { + const disabledConfigPath = writeConfigWithDisabledMethod(methodKey); + + const result = runCli([commandName, "--config", disabledConfigPath]); + + expect(result.status).toBe(1); + expect(result.stderr).toContain(`${commandName} is disabled`); + expect(result.stderr).toContain(`methods.${methodKey}.enabled=false`); + expect(result.stderr).not.toContain("missing required argument"); + }, + ); });