anomalyco · Aslemammad · Oct 23, 2025 · Oct 20, 2025 · Oct 20, 2025 · Oct 21, 2025
diff --git a/.github/workflows/judge-consistency.yml b/.github/workflows/judge-consistency.yml
@@ -0,0 +1,81 @@
+name: Judge Consistency Tests
+
+on:
+  push:
+    branches:
+      - "**"
+  pull_request:
+  workflow_dispatch:
+
+permissions:
+  contents: read
+
+jobs:
+  # Static type check: installs the locked dependencies with Bun 1.2.21 and runs
+  # the `check` package script. No secrets or environment are configured here.
+  typecheck:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Setup Bun
+        uses: oven-sh/setup-bun@v1
+        with:
+          bun-version: 1.2.21
+
+      # --frozen-lockfile: fail instead of updating the lockfile during install.
+      - name: Install dependencies
+        run: bun install --frozen-lockfile
+
+      - name: Type check
+        run: bun run check
+
+  # Runs the compatibility tests and then the basic consistency tests. There is
+  # no `if:` filter, so this job runs for every trigger listed under `on:`.
+  # NOTE(review): both test steps read OPENCODE_API_KEY from secrets, and the job
+  # also runs on pull_request — confirm the secret is available for PRs from forks.
+  test-basic:
+    runs-on: ubuntu-latest
+    # Presumably the environment that holds OPENCODE_API_KEY — confirm in repo settings.
+    environment: production
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Setup Bun
+        uses: oven-sh/setup-bun@v1
+        with:
+          bun-version: 1.2.21
+
+      - name: Install dependencies
+        run: bun install --frozen-lockfile
+
+      # Runs the `test:compatibility` package script.
+      - name: Run judge compatibility tests
+        env:
+          OPENCODE_API_KEY: ${{ secrets.OPENCODE_API_KEY }}
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: bun test:compatibility
+
+      # Runs the `test:consistency` package script.
+      - name: Run basic consistency tests
+        env:
+          OPENCODE_API_KEY: ${{ secrets.OPENCODE_API_KEY }}
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: bun test:consistency
+
+  # Runs the full consistency suite, but only when the workflow ref is the main branch.
+  # NOTE(review): the `test:consistency:full` script globs tests/judgeConsistency*.test.ts,
+  # which also matches the file `test-basic` runs — confirm the duplicate run on main is intended.
+  test-full:
+    runs-on: ubuntu-latest
+    if: github.ref == 'refs/heads/main'
+    environment: production
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Setup Bun
+        uses: oven-sh/setup-bun@v1
+        with:
+          bun-version: 1.2.21
+
+      - name: Install dependencies
+        run: bun install --frozen-lockfile
+
+      # Runs the `test:consistency:full` package script.
+      - name: Run full consistency test suite
+        env:
+          OPENCODE_API_KEY: ${{ secrets.OPENCODE_API_KEY }}
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: bun test:consistency:full
diff --git a/dataset.yaml b/dataset.yaml
@@ -55,4 +55,12 @@
     integration-points:
       weight: 0.22
     test-coverage:
-      weight: 0.22
+      weight: 0.2
+    # `checks` score entry, weighted 0.1, configured with a `setup` command list
+    # and a `commands` list.
+    # NOTE(review): presumably `setup` runs before `commands` — confirm against the checks scorer.
+    # NOTE(review): this adds 0.1 while test-coverage only dropped by 0.02; the other
+    # weights are outside this hunk — confirm whether the weights must sum to 1.
+    checks:
+      weight: 0.1
+      args:
+        setup:
+          - npm ci
+        commands:
+          - npm run lint
+          - npm run build
diff --git a/package.json b/package.json
@@ -12,7 +12,14 @@
   "scripts": {
     "build": "bun build cli.ts --outfile dist/cli.js --target node --format esm --external node:*",
     "check": "tsc --noEmit",
-    "dev": "bun run cli.ts"
+    "dev": "bun run cli.ts",
+    "test": "bun test",
+    "test:consistency": "bun test tests/judgeConsistency.test.ts",
+    "test:consistency:medium": "bun test tests/judgeConsistencyMedium.test.ts",
+    "test:consistency:complex": "bun test tests/judgeConsistencyComplex.test.ts",
+    "test:consistency:full": "bun test tests/judgeConsistency*.test.ts",
+    "test:compatibility": "bun test tests/judgeCompatibility.test.ts",
+    "show-test-outputs": "bun run scripts/show-test-outputs.ts"
   },
   "packageManager": "bun@1.2.21",
   "dependencies": {

diff --git a/scores/api-signature.ts b/scores/api-signature.ts
@@ -7,7 +7,7 @@ import { createScore, scoreResultSchema } from "~/lib/createScore.js";
 import { fetchComparisonDiff } from "~/lib/github.js";
 import { finalizeAgentChanges } from "~/lib/finalizeAgentChanges.js";
 
-const systemPrompt = `You are evaluating whether an autonomous agent reproduced the exact API signatures from a reference git commit.
+export const systemPrompt = `You are evaluating whether an autonomous agent reproduced the exact API signatures from a reference git commit.
 
 **YOUR ROLE**: Check if function/method/class signatures match EXACTLY.
 
@@ -139,6 +139,13 @@ This evaluation is STRICT. API signatures must match EXACTLY because:
 
 Return JSON with 'score' (0 or 1) and detailed rationale listing all signature mismatches found.`;
 
+/**
+ * Builds the user prompt sent to the judge for the API-signature score.
+ *
+ * @param reference - Reference diff text, inserted verbatim.
+ * @param candidateDiff - Candidate diff text, inserted verbatim.
+ * @returns The two labelled diffs followed by the comparison instruction,
+ *   each section separated by a blank line.
+ */
+export function createUserPrompt(
+  reference: string,
+  candidateDiff: string,
+): string {
+  const instruction =
+    "Compare ONLY the API signatures (function names, parameter order, parameter names). Ignore implementation details. Respond with JSON.";
+  const sections = [
+    `Reference diff:\n${reference}`,
+    `Candidate diff:\n${candidateDiff}`,
+    instruction,
+  ];
+  return sections.join("\n\n");
+}
+
 export default createScore({
   prepare: async ({ evaluation }) => {
     try {
@@ -191,7 +198,7 @@ export default createScore({
         schema: scoreResultSchema,
         system: systemPrompt,
         temperature: 0,
-        prompt: `Reference diff:\n${reference}\n\nCandidate diff:\n${candidateDiff}\n\nCompare ONLY the API signatures (function names, parameter order, parameter names). Ignore implementation details. Respond with JSON.`,
+        prompt: createUserPrompt(reference, candidateDiff),
       });
 
       return object;

diff --git a/scores/integration-points.ts b/scores/integration-points.ts
@@ -7,7 +7,7 @@ import { createScore, scoreResultSchema } from "~/lib/createScore.js";
 import { fetchComparisonDiff } from "~/lib/github.js";
 import { finalizeAgentChanges } from "~/lib/finalizeAgentChanges.js";
 
-const systemPrompt = `You are evaluating whether an autonomous agent integrated functions in the same places as a reference git commit.
+export const systemPrompt = `You are evaluating whether an autonomous agent integrated functions in the same places as a reference git commit.
 
 **YOUR ROLE**: Check if functions are called from the same locations and in the same way.
 
@@ -170,6 +170,13 @@ Integration points should match because:
 
 Return JSON with 'score' (0 or 1) and detailed rationale listing all integration mismatches found.`;
 
+/**
+ * Builds the user prompt sent to the judge for the integration-points score.
+ *
+ * @param reference - Reference diff text, inserted verbatim.
+ * @param candidateDiff - Candidate diff text, inserted verbatim.
+ * @returns The two labelled diffs followed by the comparison instruction,
+ *   each section separated by a blank line.
+ */
+export function createUserPrompt(
+  reference: string,
+  candidateDiff: string,
+): string {
+  const instruction =
+    "Compare ONLY the integration points (imports, function calls, call locations, timing). Ignore implementation details. Respond with JSON.";
+  const sections = [
+    `Reference diff:\n${reference}`,
+    `Candidate diff:\n${candidateDiff}`,
+    instruction,
+  ];
+  return sections.join("\n\n");
+}
+
 export default createScore({
   prepare: async ({ evaluation }) => {
     try {
@@ -222,7 +229,7 @@ export default createScore({
         schema: scoreResultSchema,
         system: systemPrompt,
         temperature: 0,
-        prompt: `Reference diff:\n${reference}\n\nCandidate diff:\n${candidateDiff}\n\nCompare ONLY the integration points (imports, function calls, call locations, timing). Ignore implementation details. Respond with JSON.`,
+        prompt: createUserPrompt(reference, candidateDiff),
       });
 
       return object;

diff --git a/scores/logic-equivalence.ts b/scores/logic-equivalence.ts
@@ -7,7 +7,7 @@ import { createScore, scoreResultSchema } from "~/lib/createScore.js";
 import { fetchComparisonDiff } from "~/lib/github.js";
 import { finalizeAgentChanges } from "~/lib/finalizeAgentChanges.js";
 
-const systemPrompt = `You are evaluating whether an autonomous agent reproduced the logical behavior from a reference git commit.
+export const systemPrompt = `You are evaluating whether an autonomous agent reproduced the logical behavior from a reference git commit.
 
 **YOUR ROLE**: Check if the conditional logic and control flow produce the same outcomes.
 
@@ -214,6 +214,13 @@ if failures is not None and isinstance(failures, list) {
 
 Return JSON with 'score' (0 or 1) and detailed rationale explaining any logic differences found.`;
 
+/**
+ * Builds the user prompt sent to the judge for the logic-equivalence score.
+ *
+ * @param reference - Reference diff text, inserted verbatim.
+ * @param candidateDiff - Candidate diff text, inserted verbatim.
+ * @returns The two labelled diffs followed by the comparison instruction,
+ *   each section separated by a blank line.
+ */
+export function createUserPrompt(
+  reference: string,
+  candidateDiff: string,
+): string {
+  const instruction =
+    "Compare ONLY the logical behavior (conditions, edge cases, side effects). Ignore code structure and style. Respond with JSON.";
+  const sections = [
+    `Reference diff:\n${reference}`,
+    `Candidate diff:\n${candidateDiff}`,
+    instruction,
+  ];
+  return sections.join("\n\n");
+}
+
 export default createScore({
   prepare: async ({ evaluation }) => {
     try {
@@ -266,7 +273,7 @@ export default createScore({
         schema: scoreResultSchema,
         system: systemPrompt,
         temperature: 0,
-        prompt: `Reference diff:\n${reference}\n\nCandidate diff:\n${candidateDiff}\n\nCompare ONLY the logical behavior (conditions, edge cases, side effects). Ignore code structure and style. Respond with JSON.`,
+        prompt: createUserPrompt(reference, candidateDiff),
       });
 
       return object;

diff --git a/scripts/show-test-outputs.ts b/scripts/show-test-outputs.ts
@@ -0,0 +1,111 @@
+/**
+ * Script to show full judge consistency test outputs
+ */
+import { generateObject } from "ai";
+import { scoreResultSchema } from "~/lib/createScore.js";
+import { judges } from "~/judges.js";
+import {
+  createUserPrompt as createLogicEquivalenceUserPrompt,
+  systemPrompt as logicEquivalencePrompt,
+} from "~/scores/logic-equivalence.js";
+import {
+  createUserPrompt as createApiSignatureUserPrompt,
+  systemPrompt as apiSignaturePrompt,
+} from "~/scores/api-signature.js";
+import {
+  logicEquivalenceFixtures,
+  apiSignatureFixtures,
+} from "~/tests/fixtures/judgeConsistencyFixtures.js";
+
+// Judge used for every run in this script: the first entry of `judges`.
+// NOTE(review): expected to be claude-4.5 — verify against the ordering in ~/judges.js.
+const judge = judges[0];
+
+/**
+ * Runs the judge three times on one reference/candidate pair and prints the
+ * score and rationale of every run.
+ *
+ * The user prompt is built by the score module's own `createUserPrompt`, so this
+ * script sends the same text as the scorer instead of a hand-copied literal
+ * that could drift from it.
+ *
+ * @param name - Label printed in the banner for this case.
+ * @param systemPrompt - System prompt passed to the judge.
+ * @param reference - Reference diff text.
+ * @param candidate - Candidate diff text.
+ * @param scoreType - `"logic"` or `"api"`; selects which prompt builder is used.
+ * @throws Error if `scoreType` is neither `"logic"` nor `"api"`.
+ */
+async function evaluateAndShow(
+  name: string,
+  systemPrompt: string,
+  reference: string,
+  candidate: string,
+  scoreType: string,
+): Promise<void> {
+  // Resolve the prompt first so an unsupported scoreType fails before any output
+  // or judge call is made.
+  let userPrompt: string;
+  switch (scoreType) {
+    case "logic":
+      userPrompt = createLogicEquivalenceUserPrompt(reference, candidate);
+      break;
+    case "api":
+      userPrompt = createApiSignatureUserPrompt(reference, candidate);
+      break;
+    default:
+      // Previously any unrecognised value silently fell back to the API prompt.
+      throw new Error(`Unknown scoreType: ${scoreType}`);
+  }
+
+  const divider = "=".repeat(80);
+  console.log(`\n${divider}`);
+  console.log(`TEST: ${name}`);
+  console.log(`${divider}\n`);
+
+  // Same prompt on every run, so the printed outputs can be compared run to run.
+  const runCount = 3;
+  for (let run = 1; run <= runCount; run++) {
+    console.log(`\n--- Run ${run} ---\n`);
+
+    const { object } = await generateObject({
+      model: judge.model,
+      schema: scoreResultSchema,
+      system: systemPrompt,
+      temperature: 0,
+      prompt: userPrompt,
+    });
+
+    console.log(`Score: ${object.score}`);
+    console.log(`\nRationale:\n${object.rationale}`);
+    console.log();
+  }
+}
+
+// Entry point: print a header, replay every fixture case in order, print a footer.
+(async () => {
+  console.log("JUDGE CONSISTENCY TEST OUTPUT - FULL RATIONALES");
+  console.log("Using judge: claude-4.5");
+
+  // One row per case, in execution order: banner label, system prompt,
+  // reference/candidate fixture pair, and the prompt kind for evaluateAndShow.
+  const cases = [
+    {
+      label: "Logic Equivalence - Perfect Match",
+      system: logicEquivalencePrompt,
+      fixture: logicEquivalenceFixtures.perfect,
+      kind: "logic",
+    },
+    {
+      label: "Logic Equivalence - Wrong Implementation",
+      system: logicEquivalencePrompt,
+      fixture: logicEquivalenceFixtures.wrong,
+      kind: "logic",
+    },
+    {
+      label: "Logic Equivalence - Ambiguous (Guard vs Nested)",
+      system: logicEquivalencePrompt,
+      fixture: logicEquivalenceFixtures.ambiguous,
+      kind: "logic",
+    },
+    {
+      label: "API Signature - Perfect Match",
+      system: apiSignaturePrompt,
+      fixture: apiSignatureFixtures.perfect,
+      kind: "api",
+    },
+    {
+      label: "API Signature - Wrong Implementation (Parameter Name)",
+      system: apiSignaturePrompt,
+      fixture: apiSignatureFixtures.wrong,
+      kind: "api",
+    },
+    {
+      label: "API Signature - Ambiguous (Parameter Order)",
+      system: apiSignaturePrompt,
+      fixture: apiSignatureFixtures.ambiguous,
+      kind: "api",
+    },
+  ];
+
+  // Cases run strictly one after another so their output is not interleaved.
+  for (const { label, system, fixture, kind } of cases) {
+    await evaluateAndShow(
+      label,
+      system,
+      fixture.reference,
+      fixture.candidate,
+      kind,
+    );
+  }
+
+  const divider = "=".repeat(80);
+  console.log(`\n${divider}`);
+  console.log("ALL TESTS COMPLETE");
+  console.log(`${divider}\n`);
+})();