agentic-workflow-kit · aryeko · Jul 3, 2026 · Jul 3, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -13,6 +13,19 @@ documented with migration notes.
 - Additional docs for suite-specific adoption.
 - Better compatibility tests for Promptfoo variable contracts.
 
+## [0.1.2] - 2026-07-03
+
+### Fixed
+
+- Enforced `methods.<method>.enabled=false` flags across run-producing CLI commands.
+- Added regression coverage for disabled generation, pointwise judging, deterministic runs, and
+  manual reports.
+
+### Notes
+
+- No npm package is published.
+- Consumers may pin `github:agentic-workflow-kit/eval-kit#v0.1.2`.
+
 ## [0.1.1] - 2026-07-03
 
 ### Changed
@@ -50,6 +63,7 @@ documented with migration notes.
 - Suite-specific presets remain deferred.
 - Consumer repos own their own semantics, prompts, cases, and pass/fail policies.
 
-[Unreleased]: https://github.com/agentic-workflow-kit/eval-kit/compare/v0.1.1...main
+[Unreleased]: https://github.com/agentic-workflow-kit/eval-kit/compare/v0.1.2...main
+[0.1.2]: https://github.com/agentic-workflow-kit/eval-kit/compare/v0.1.1...v0.1.2
 [0.1.1]: https://github.com/agentic-workflow-kit/eval-kit/compare/v0.1.0...v0.1.1
 [0.1.0]: https://github.com/agentic-workflow-kit/eval-kit/releases/tag/v0.1.0
diff --git a/README.md b/README.md
@@ -11,7 +11,7 @@ Shared evaluation infrastructure for `agentic-workflow-kit` repositories.
 ```json
 {
   "devDependencies": {
-    "@agentic-workflow-kit/eval-kit": "github:agentic-workflow-kit/eval-kit#v0.1.1"
+    "@agentic-workflow-kit/eval-kit": "github:agentic-workflow-kit/eval-kit#v0.1.2"
   }
 }
 ```
@@ -70,7 +70,7 @@ Install from a Git tag in a consumer repo:
 ```json
 {
   "devDependencies": {
-    "@agentic-workflow-kit/eval-kit": "github:agentic-workflow-kit/eval-kit#v0.1.1"
+    "@agentic-workflow-kit/eval-kit": "github:agentic-workflow-kit/eval-kit#v0.1.2"
   },
   "scripts": {
     "eval:doctor": "eval-kit doctor --config evals/eval-kit.config.json",
@@ -188,6 +188,7 @@ For now, releases are Git tags that consumers pin in `package.json`:
 ```text
 v0.1.0
 v0.1.1
+v0.1.2
 v0.2.0
 ```
 

diff --git a/docs/design/consumer-integration.md b/docs/design/consumer-integration.md
@@ -9,7 +9,7 @@ Consumer repos should adopt eval-kit through a pinned Git tag and keep their eva
 ```json
 {
   "devDependencies": {
-    "@agentic-workflow-kit/eval-kit": "github:agentic-workflow-kit/eval-kit#v0.1.1"
+    "@agentic-workflow-kit/eval-kit": "github:agentic-workflow-kit/eval-kit#v0.1.2"
   }
 }
 ```

diff --git a/docs/guides/consumer-integration.md b/docs/guides/consumer-integration.md
@@ -18,7 +18,7 @@ If you cannot state the eval goal, do not bootstrap a suite yet. Empty harnesses
 ```json
 {
   "devDependencies": {
-    "@agentic-workflow-kit/eval-kit": "github:agentic-workflow-kit/eval-kit#v0.1.1"
+    "@agentic-workflow-kit/eval-kit": "github:agentic-workflow-kit/eval-kit#v0.1.2"
   }
 }
 ```

diff --git a/docs/guides/quickstart.md b/docs/guides/quickstart.md
@@ -7,7 +7,7 @@ This guide adds a generic deterministic eval suite to a consumer repo.
 ```json
 {
   "devDependencies": {
-    "@agentic-workflow-kit/eval-kit": "github:agentic-workflow-kit/eval-kit#v0.1.1"
+    "@agentic-workflow-kit/eval-kit": "github:agentic-workflow-kit/eval-kit#v0.1.2"
   }
 }
 ```

diff --git a/docs/reference/cli.md b/docs/reference/cli.md
@@ -9,6 +9,19 @@ pnpm exec eval-kit <command> [options]
 Package scripts may use plain `eval-kit` because npm/pnpm add local binaries to script `PATH`.
 Interactive consumer shell examples use `pnpm exec eval-kit`.
 
+Run-producing commands fail closed when their configured method is explicitly disabled:
+
+| Command          | Disabled by                            |
+| ---------------- | -------------------------------------- |
+| `run-case`       | `methods.deterministic.enabled=false`  |
+| `generate`       | `methods.generate.enabled=false`       |
+| `judge-coverage` | `methods.judge_coverage.enabled=false` |
+| `judge-pairwise` | `methods.judge_pairwise.enabled=false` |
+| `report`         | `methods.report.enabled=false`         |
+
+Disabled commands exit with status 1 before required-argument checks, Codex auth checks, Promptfoo
+execution, model provider calls, adapter hooks, or result artifact writes.
+
 ## `init`
 
 Create a deterministic generic eval skeleton.
@@ -72,6 +85,8 @@ pnpm exec eval-kit run-case \
 
 Exits non-zero when the deterministic verdict is `red`.
 
+Fails closed when `methods.deterministic.enabled` is explicitly `false`.
+
 ## `validate-fixtures`
 
 Validate case manifests and call the consumer `validateFixtures` hook when present.
@@ -95,6 +110,7 @@ pnpm exec eval-kit generate \
 ```
 
 Requires Promptfoo and local Codex auth. This is manual/advisory evidence, not a default CI gate.
+Fails closed before those checks when `methods.generate.enabled` is explicitly `false`.
 
 ## `judge-coverage`
 
@@ -111,6 +127,8 @@ pnpm exec eval-kit judge-coverage \
   [--config <path>]
 ```
 
+Fails closed when `methods.judge_coverage.enabled` is explicitly `false`.
+
 ## `judge-pairwise`
 
 Run Promptfoo-backed pairwise comparison.
@@ -149,3 +167,5 @@ pnpm exec eval-kit report \
   [--outcome <id>] \
   [--config <path>]
 ```
+
+Fails closed when `methods.report.enabled` is explicitly `false`.
diff --git a/docs/reference/config.md b/docs/reference/config.md
@@ -71,6 +71,9 @@ lanes:
 - `generate`, `judge_coverage`, and `judge_pairwise` are manual/advisory model-assisted methods.
   They require explicit local setup and must not require auth, network, Promptfoo provider calls,
   Codex/OpenAI calls, LLM judging, or manual calibration in `pnpm check`.
+- Run-producing commands fail closed when their method's `enabled` flag is explicitly `false`,
+  before Codex auth checks, Promptfoo execution, model provider calls, adapter hooks, or result
+  artifact writes.
 
 ## Path rules
 

diff --git a/docs/reference/release-process.md b/docs/reference/release-process.md
@@ -34,7 +34,7 @@ Consumers depend on tags like:
 Title:
 
 ```text
-chore(release): v0.1.1
+chore(release): v0.1.2
 ```
 
 Required changes:
@@ -63,18 +63,18 @@ git checkout main
 git pull --ff-only
 git rev-parse HEAD
 
-git tag -a v0.1.1 -m "v0.1.1"
-git push origin v0.1.1
+git tag -a v0.1.2 -m "v0.1.2"
+git push origin v0.1.2
 ```
 
 Verify:
 
 ```bash
-git rev-parse v0.1.1^{}
-git show --no-patch --decorate v0.1.1
+git rev-parse v0.1.2^{}
+git show --no-patch --decorate v0.1.2
 ```
 
-`v0.1.1^{}` must point to the release commit. With an annotated tag, `git rev-parse v0.1.1`
+`v0.1.2^{}` must point to the release commit. With an annotated tag, `git rev-parse v0.1.2`
 returns the tag object; `^{}` dereferences to the commit.
 
 ## GitHub Release
@@ -93,7 +93,7 @@ For each consumer repo:
 
 ```json
 {
-  "@agentic-workflow-kit/eval-kit": "github:agentic-workflow-kit/eval-kit#v0.1.1"
+  "@agentic-workflow-kit/eval-kit": "github:agentic-workflow-kit/eval-kit#v0.1.2"
 }
 ```
 
@@ -108,7 +108,7 @@ pnpm check
 3. Run consumer smoke commands, for example in `technical-design`:
 
 ```bash
-pnpm eval:case -- --case case-tiny-laundry-pickup-v1 --candidate evals/cases/case-tiny-laundry-pickup-v1/reference-design.md --run-id verify-eval-kit-v0.1.1
+pnpm eval:case -- --case case-tiny-laundry-pickup-v1 --candidate evals/cases/case-tiny-laundry-pickup-v1/reference-design.md --run-id verify-eval-kit-v0.1.2
 ```
 
 4. Open a PR with dependency, lockfile, and any compatibility fixes.
@@ -120,7 +120,7 @@ Do not move the tag.
 Create a new patch release:
 
 ```text
-v0.1.1 -> v0.1.2
+v0.1.2 -> v0.1.3
 ```
 
 Then open consumer bump PRs.

diff --git a/docs/reference/results.md b/docs/reference/results.md
@@ -14,6 +14,9 @@ Run-producing commands are local on-demand or manual/advisory evidence. They are
 gates. Keep `pnpm check` limited to fast, offline, structural validation unless a consumer
 documents a narrow deterministic subset that does not call external providers.
 
+If a run-producing method is explicitly disabled in config, the CLI exits with a non-zero status
+before writing a result bundle.
+
 Setup and inspection commands do not create result bundles. `init` and `scaffold-case` write suite
 or case files, while `doctor`, `list-cases`, and `validate-fixtures` validate or report current
 state without writing `<results_root>/<run-id>/manifest.json`.
@@ -36,7 +39,7 @@ Current schema:
   "run_type": "deterministic",
   "runner": {
     "id": "generic-eval-case",
-    "version": "0.1.1"
+    "version": "0.1.2"
   },
   "case_ids": ["case-example-v1"],
   "started_at": "2026-07-03T00:00:00.000Z",

diff --git a/docs/reference/versioning-policy.md b/docs/reference/versioning-policy.md
@@ -9,6 +9,7 @@ Use SemVer-style versions:
 ```text
 0.1.0
 0.1.1
+0.1.2
 0.2.0
 ```
 
@@ -26,7 +27,7 @@ Use patch for:
 Example:
 
 ```text
-0.1.0 -> 0.1.1
+0.1.1 -> 0.1.2
 ```
 
 ## Minor version

diff --git a/package.json b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@agentic-workflow-kit/eval-kit",
-  "version": "0.1.1",
+  "version": "0.1.2",
   "description": "Portable eval runner primitives for local eval suites.",
   "private": true,
   "type": "module",

diff --git a/src/cli.mjs b/src/cli.mjs
@@ -25,6 +25,21 @@ const requireEnabledMethod = (config, methodKey, commandName) => {
   }
 };
 
+const runProducingMethodByCommand = {
+  "run-case": "deterministic",
+  generate: "generate",
+  "judge-coverage": "judge_coverage",
+  "judge-pairwise": "judge_pairwise",
+  report: "report",
+};
+
+const requireEnabledCommandMethod = (config, commandName) => {
+  const methodKey = runProducingMethodByCommand[commandName];
+  if (methodKey) {
+    requireEnabledMethod(config, methodKey, commandName);
+  }
+};
+
 const printHelp = () => {
   console.log(`
 Usage: eval-kit <command> [options]
@@ -163,6 +178,8 @@ export const main = async () => {
   }
 
   try {
+    requireEnabledCommandMethod(config, subcommand);
+
     switch (subcommand) {
       case "run-case": {
         const caseId = requireArg(parsed, "case");
@@ -221,7 +238,6 @@ export const main = async () => {
       }
 
       case "judge-pairwise": {
-        requireEnabledMethod(config, "judge_pairwise", "judge-pairwise");
         const caseId = requireArg(parsed, "case");
         const candidateA = requireArg(parsed, "candidate-a");
         const candidateB = requireArg(parsed, "candidate-b");

diff --git a/src/sdk.mjs b/src/sdk.mjs
@@ -15,7 +15,7 @@ import { assertContainedPath, assertSafeId, toPosixPath } from "./paths.mjs";
 const DEFAULT_SANDBOX_MODE = "read-only";
 const DEFAULT_APPROVAL_POLICY = "never";
 const RANDOMIZATION_METHOD = "sha256-seed-parity-v1";
-const EVAL_KIT_VERSION = "0.1.1";
+const EVAL_KIT_VERSION = "0.1.2";
 
 const escapeRegExp = (value) => value.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
 

diff --git a/tests/cli.test.mjs b/tests/cli.test.mjs
@@ -1,8 +1,10 @@
 import { spawnSync } from "node:child_process";
+import fs from "node:fs";
+import os from "node:os";
 import path from "node:path";
 import { fileURLToPath } from "node:url";
 
-import { describe, expect, it } from "vitest";
+import { afterEach, describe, expect, it } from "vitest";
 
 const __filename = fileURLToPath(import.meta.url);
 const packageRoot = path.resolve(path.dirname(__filename), "..");
@@ -14,20 +16,63 @@ const configPath = path.join(
   "eval-kit.config.json",
 );
 const cliPath = path.resolve(packageRoot, "bin/eval-kit.mjs");
+const tempDirs = [];
+
+const writeConfigWithDisabledMethod = (methodKey) => {
+  const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), "eval-kit-cli-"));
+  tempDirs.push(tempDir);
+
+  const config = JSON.parse(fs.readFileSync(configPath, "utf8"));
+  config.methods = {
+    ...config.methods,
+    [methodKey]: {
+      ...(config.methods?.[methodKey] ?? {}),
+      enabled: false,
+    },
+  };
+
+  const tempConfigPath = path.join(tempDir, "eval-kit.config.json");
+  fs.writeFileSync(tempConfigPath, `${JSON.stringify(config, null, 2)}\n`);
+  return tempConfigPath;
+};
+
+const runCli = (args) =>
+  spawnSync(process.execPath, [cliPath, ...args], {
+    cwd: packageRoot,
+    encoding: "utf8",
+    stdio: ["ignore", "pipe", "pipe"],
+  });
 
 describe("eval-kit CLI", () => {
+  afterEach(() => {
+    for (const tempDir of tempDirs.splice(0)) {
+      fs.rmSync(tempDir, { recursive: true, force: true });
+    }
+  });
+
   it("fails closed when pairwise judging is disabled by config", () => {
-    const result = spawnSync(
-      process.execPath,
-      [cliPath, "judge-pairwise", "--config", configPath],
-      {
-        cwd: packageRoot,
-        encoding: "utf8",
-        stdio: ["ignore", "pipe", "pipe"],
-      },
-    );
+    const result = runCli(["judge-pairwise", "--config", configPath]);
 
     expect(result.status).toBe(1);
     expect(result.stderr).toContain("judge-pairwise is disabled");
   });
+
+  it.each([
+    ["run-case", "deterministic"],
+    ["generate", "generate"],
+    ["judge-coverage", "judge_coverage"],
+    ["report", "report"],
+  ])(
+    "fails closed when %s is disabled before requiring run arguments",
+    (commandName, methodKey) => {
+      const disabledConfigPath = writeConfigWithDisabledMethod(methodKey);
+
+      const result = runCli([commandName, "--config", disabledConfigPath]);
+
+      expect(result.status).toBe(1);
+      expect(result.stderr).toContain(`${commandName} is disabled`);
+      expect(result.stderr).toContain(`methods.${methodKey}.enabled=false`);
+      expect(result.stderr).not.toContain("missing required argument");
+    },
+  );
 });
-Original file line number
+Diff line change
@@ Expand Up @@
     ```json
     {
       "devDependencies": {
-        "@agentic-workflow-kit/eval-kit": "github:agentic-workflow-kit/eval-kit#v0.1.1"
+        "@agentic-workflow-kit/eval-kit": "github:agentic-workflow-kit/eval-kit#v0.1.2"
       }
     }
     ```
@@ Expand Down @@