agentic-workflow-kit · aryeko · Jul 3, 2026 · Jul 3, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -13,6 +13,19 @@ documented with migration notes.
 - Additional docs for suite-specific adoption.
 - Better compatibility tests for Promptfoo variable contracts.
 
+## [0.1.1] - 2026-07-03
+
+### Changed
+
+- Clarified generic CLI help wording for candidate artifacts and expected items.
+- Documented pairwise adapter display-slot semantics.
+- Fixed the malformed input object in the adapter contract reporter example.
+
+### Notes
+
+- No npm package is published.
+- Consumers may pin `github:agentic-workflow-kit/eval-kit#v0.1.1`.
+
 ## [0.1.0] - 2026-07-02
 
 ### Added
@@ -37,5 +50,6 @@ documented with migration notes.
 - Suite-specific presets remain deferred.
 - Consumer repos own their own semantics, prompts, cases, and pass/fail policies.
 
-[Unreleased]: https://github.com/agentic-workflow-kit/eval-kit/compare/v0.1.0...main
+[Unreleased]: https://github.com/agentic-workflow-kit/eval-kit/compare/v0.1.1...main
+[0.1.1]: https://github.com/agentic-workflow-kit/eval-kit/compare/v0.1.0...v0.1.1
 [0.1.0]: https://github.com/agentic-workflow-kit/eval-kit/releases/tag/v0.1.0
diff --git a/README.md b/README.md
@@ -11,7 +11,7 @@ Shared evaluation infrastructure for `agentic-workflow-kit` repositories.
 ```json
 {
   "devDependencies": {
-    "@agentic-workflow-kit/eval-kit": "github:agentic-workflow-kit/eval-kit#v0.1.0"
+    "@agentic-workflow-kit/eval-kit": "github:agentic-workflow-kit/eval-kit#v0.1.1"
   }
 }
 ```
@@ -54,7 +54,7 @@ Install from a Git tag in a consumer repo:
 ```json
 {
   "devDependencies": {
-    "@agentic-workflow-kit/eval-kit": "github:agentic-workflow-kit/eval-kit#v0.1.0"
+    "@agentic-workflow-kit/eval-kit": "github:agentic-workflow-kit/eval-kit#v0.1.1"
   },
   "scripts": {
     "eval:doctor": "eval-kit doctor --config evals/eval-kit.config.json",

diff --git a/docs/design/consumer-integration.md b/docs/design/consumer-integration.md
@@ -9,7 +9,7 @@ Consumer repos should adopt eval-kit through a pinned Git tag and keep their eva
 ```json
 {
   "devDependencies": {
-    "@agentic-workflow-kit/eval-kit": "github:agentic-workflow-kit/eval-kit#v0.1.0"
+    "@agentic-workflow-kit/eval-kit": "github:agentic-workflow-kit/eval-kit#v0.1.1"
   }
 }
 ```

diff --git a/docs/guides/consumer-integration.md b/docs/guides/consumer-integration.md
@@ -18,7 +18,7 @@ If you cannot state the eval goal, do not bootstrap a suite yet. Empty harnesses
 ```json
 {
   "devDependencies": {
-    "@agentic-workflow-kit/eval-kit": "github:agentic-workflow-kit/eval-kit#v0.1.0"
+    "@agentic-workflow-kit/eval-kit": "github:agentic-workflow-kit/eval-kit#v0.1.1"
   }
 }
 ```

diff --git a/docs/guides/model-assisted-evals.md b/docs/guides/model-assisted-evals.md
@@ -98,6 +98,12 @@ Pairwise:
 }
 ```
 
+For pairwise judging, eval-kit randomizes display slots before calling `resolvePairwiseVars`.
+`candidate_a` and `candidate_b` should contain the displayed Candidate A/B content supplied to the
+adapter. `original_order` records the original CLI labels, and `candidate_order` records which
+original candidate keys were displayed in Candidate A/B order. Do not randomize again inside the
+adapter.
+
 ## Bias controls
 
 Model judge prompts should:

diff --git a/docs/guides/quickstart.md b/docs/guides/quickstart.md
@@ -7,7 +7,7 @@ This guide adds a generic deterministic eval suite to a consumer repo.
 ```json
 {
   "devDependencies": {
-    "@agentic-workflow-kit/eval-kit": "github:agentic-workflow-kit/eval-kit#v0.1.0"
+    "@agentic-workflow-kit/eval-kit": "github:agentic-workflow-kit/eval-kit#v0.1.1"
   }
 }
 ```

diff --git a/docs/reference/adapter-contract.md b/docs/reference/adapter-contract.md
@@ -65,7 +65,12 @@ Input:
 
 ```js
 {
-  (caseId, grades, findings, caseDir, candidatePath, resolver);
+  caseId,
+  grades,
+  findings,
+  caseDir,
+  candidatePath,
+  resolver,
 }
 ```
 
@@ -168,6 +173,22 @@ export const resolvePairwiseVars = async ({
 });
 ```
 
+`judgePairwise` randomizes the displayed candidate order before it calls
+`resolvePairwiseVars`. The `candidateAContent`, `candidateBContent`, `candidateAPath`, and
+`candidateBPath` inputs are displayed Candidate A/B slots, not necessarily the original
+`--candidate-a` and `--candidate-b` CLI inputs.
+
+Use `randomizedOrder` to preserve the mapping:
+
+- `randomizedOrder.original_order` records the original labels: `candidate_a`, then
+  `candidate_b`.
+- `randomizedOrder.candidate_order` records which original candidate keys were shown in displayed
+  Candidate A/B order.
+
+Adapters must not randomize candidate order again. They should pass the displayed Candidate A/B
+values through to the prompt variables and include the provided randomization metadata in model
+judge inputs and outputs.
+
 ## Manual report hook
 
 Required for `report`:

diff --git a/docs/reference/cli.md b/docs/reference/cli.md
@@ -112,6 +112,11 @@ eval-kit judge-coverage \
 
 Run Promptfoo-backed pairwise comparison.
 
+Eval-kit randomizes which original CLI candidate is displayed as Candidate A or Candidate B before
+calling the consumer adapter. The adapter receives displayed Candidate A/B content and paths plus
+`randomizedOrder`, which records the original-to-displayed mapping. Adapters must not apply a
+second randomization pass of their own.
+
 ```bash
 eval-kit judge-pairwise \
   --case <id> \

diff --git a/docs/reference/results.md b/docs/reference/results.md
@@ -32,7 +32,7 @@ Current schema:
   "run_type": "deterministic",
   "runner": {
     "id": "generic-eval-case",
-    "version": "0.1.0"
+    "version": "0.1.1"
   },
   "case_ids": ["case-example-v1"],
   "started_at": "2026-07-03T00:00:00.000Z",
@@ -100,4 +100,9 @@ pointwise-result.json
 pairwise-result.json
 ```
 
+For `judge-pairwise`, `pairwise-result.json` stores the final winner normalized back to the original
+CLI candidate labels. Its `randomization.original_order` field records the original
+`candidate_a`/`candidate_b` labels, and `randomization.candidate_order` records which original
+candidate keys were displayed as Candidate A/B for the model judge.
+
 Treat these as potentially sensitive.
diff --git a/package.json b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@agentic-workflow-kit/eval-kit",
-  "version": "0.1.0",
+  "version": "0.1.1",
   "description": "Portable eval runner primitives for local eval suites.",
   "private": true,
   "type": "module",

diff --git a/src/cli.mjs b/src/cli.mjs
@@ -53,15 +53,15 @@ Commands:
     [--run-id <id>]
     [--config <path>]
 
-  generate          Generate a candidate design using Promptfoo
+  generate          Generate a candidate artifact using Promptfoo
     --case <id>
     --model <name>
     --provider <name>
     --effort <low|medium|high>
     --run-id <id>
     [--config <path>]
 
-  judge-coverage    Pointwise judge expected facts and boundaries coverage
+  judge-coverage    Pointwise judge expected item coverage
     --case <id>
     --candidate <path>
     --model <name>

diff --git a/src/sdk.mjs b/src/sdk.mjs
@@ -15,7 +15,7 @@ import { assertContainedPath, assertSafeId, toPosixPath } from "./paths.mjs";
 const DEFAULT_SANDBOX_MODE = "read-only";
 const DEFAULT_APPROVAL_POLICY = "never";
 const RANDOMIZATION_METHOD = "sha256-seed-parity-v1";
-const EVAL_KIT_VERSION = "0.1.0";
+const EVAL_KIT_VERSION = "0.1.1";
 
 const escapeRegExp = (value) => value.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
-Original file line number
+Diff line change
@@ Expand Up @@
     ```json
     {
       "devDependencies": {
-        "@agentic-workflow-kit/eval-kit": "github:agentic-workflow-kit/eval-kit#v0.1.0"
+        "@agentic-workflow-kit/eval-kit": "github:agentic-workflow-kit/eval-kit#v0.1.1"
       }
     }
     ```
@@ Expand Down @@