Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 81 additions & 0 deletions .github/workflows/judge-consistency.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
name: Judge Consistency Tests

on:
push:
branches:
- "**"
pull_request:
workflow_dispatch:

permissions:
contents: read

jobs:
# Job: static type check only. It receives no secrets or environment; it
# installs dependencies and runs the `check` package script.
typecheck:
runs-on: ubuntu-latest

steps:
- name: Checkout repository
uses: actions/checkout@v4

- name: Setup Bun
uses: oven-sh/setup-bun@v1
with:
# Same Bun version as the `packageManager` field in package.json.
bun-version: 1.2.21

- name: Install dependencies
# --frozen-lockfile: install exactly what the lockfile records.
run: bun install --frozen-lockfile

- name: Type check
# `check` is defined in package.json as `tsc --noEmit`.
run: bun run check

# Job: judge compatibility tests followed by the basic consistency tests.
# It is bound to the `production` environment and has no `if:` condition, so
# it is eligible to run for every trigger of this workflow.
test-basic:
runs-on: ubuntu-latest
environment: production

steps:
- name: Checkout repository
uses: actions/checkout@v4

- name: Setup Bun
uses: oven-sh/setup-bun@v1
with:
# Same Bun version as the `packageManager` field in package.json.
bun-version: 1.2.21

- name: Install dependencies
run: bun install --frozen-lockfile

- name: Run judge compatibility tests
# Both test steps receive the same two secrets as environment variables.
env:
OPENCODE_API_KEY: ${{ secrets.OPENCODE_API_KEY }}
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
# package.json script: `bun test tests/judgeCompatibility.test.ts`.
run: bun test:compatibility

- name: Run basic consistency tests
env:
OPENCODE_API_KEY: ${{ secrets.OPENCODE_API_KEY }}
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
# package.json script: `bun test tests/judgeConsistency.test.ts`.
run: bun test:consistency

# Job: full consistency suite. Same setup as the basic job, but gated by the
# `if:` below so it only runs when the triggering ref is the `main` branch.
test-full:
runs-on: ubuntu-latest
# Skipped for every ref other than refs/heads/main.
if: github.ref == 'refs/heads/main'
environment: production

steps:
- name: Checkout repository
uses: actions/checkout@v4

- name: Setup Bun
uses: oven-sh/setup-bun@v1
with:
# Same Bun version as the `packageManager` field in package.json.
bun-version: 1.2.21

- name: Install dependencies
run: bun install --frozen-lockfile

- name: Run full consistency test suite
env:
OPENCODE_API_KEY: ${{ secrets.OPENCODE_API_KEY }}
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
# package.json script: `bun test tests/judgeConsistency*.test.ts`.
run: bun test:consistency:full
10 changes: 9 additions & 1 deletion dataset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -55,4 +55,12 @@
integration-points:
weight: 0.22
test-coverage:
weight: 0.22
weight: 0.2
checks:
weight: 0.1
args:
setup:
- npm ci
commands:
- npm run lint
- npm run build
9 changes: 8 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,14 @@
"scripts": {
"build": "bun build cli.ts --outfile dist/cli.js --target node --format esm --external node:*",
"check": "tsc --noEmit",
"dev": "bun run cli.ts"
"dev": "bun run cli.ts",
"test": "bun test",
"test:consistency": "bun test tests/judgeConsistency.test.ts",
"test:consistency:medium": "bun test tests/judgeConsistencyMedium.test.ts",
"test:consistency:complex": "bun test tests/judgeConsistencyComplex.test.ts",
"test:consistency:full": "bun test tests/judgeConsistency*.test.ts",
"test:compatibility": "bun test tests/judgeCompatibility.test.ts",
"show-test-outputs": "bun run scripts/show-test-outputs.ts"
},
"packageManager": "bun@1.2.21",
"dependencies": {
Expand Down
11 changes: 9 additions & 2 deletions scores/api-signature.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import { createScore, scoreResultSchema } from "~/lib/createScore.js";
import { fetchComparisonDiff } from "~/lib/github.js";
import { finalizeAgentChanges } from "~/lib/finalizeAgentChanges.js";

const systemPrompt = `You are evaluating whether an autonomous agent reproduced the exact API signatures from a reference git commit.
export const systemPrompt = `You are evaluating whether an autonomous agent reproduced the exact API signatures from a reference git commit.

**YOUR ROLE**: Check if function/method/class signatures match EXACTLY.

Expand Down Expand Up @@ -139,6 +139,13 @@ This evaluation is STRICT. API signatures must match EXACTLY because:

Return JSON with 'score' (0 or 1) and detailed rationale listing all signature mismatches found.`;

/**
 * Builds the user prompt sent to the API-signature judge.
 *
 * The prompt has three sections separated by blank lines: the reference diff,
 * the candidate diff, and the fixed instruction telling the judge to compare
 * signatures only. Both diffs are inserted verbatim.
 *
 * @param reference - Diff text of the reference commit.
 * @param candidateDiff - Diff text produced by the agent under evaluation.
 * @returns The assembled prompt text.
 */
export function createUserPrompt(
  reference: string,
  candidateDiff: string,
): string {
  const sections = [
    "Reference diff:\n" + reference,
    "Candidate diff:\n" + candidateDiff,
    "Compare ONLY the API signatures (function names, parameter order, parameter names). Ignore implementation details. Respond with JSON.",
  ];

  return sections.join("\n\n");
}

export default createScore({
prepare: async ({ evaluation }) => {
try {
Expand Down Expand Up @@ -191,7 +198,7 @@ export default createScore({
schema: scoreResultSchema,
system: systemPrompt,
temperature: 0,
prompt: `Reference diff:\n${reference}\n\nCandidate diff:\n${candidateDiff}\n\nCompare ONLY the API signatures (function names, parameter order, parameter names). Ignore implementation details. Respond with JSON.`,
prompt: createUserPrompt(reference, candidateDiff),
});

return object;
Expand Down
11 changes: 9 additions & 2 deletions scores/integration-points.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import { createScore, scoreResultSchema } from "~/lib/createScore.js";
import { fetchComparisonDiff } from "~/lib/github.js";
import { finalizeAgentChanges } from "~/lib/finalizeAgentChanges.js";

const systemPrompt = `You are evaluating whether an autonomous agent integrated functions in the same places as a reference git commit.
export const systemPrompt = `You are evaluating whether an autonomous agent integrated functions in the same places as a reference git commit.

**YOUR ROLE**: Check if functions are called from the same locations and in the same way.

Expand Down Expand Up @@ -170,6 +170,13 @@ Integration points should match because:

Return JSON with 'score' (0 or 1) and detailed rationale listing all integration mismatches found.`;

/**
 * Builds the user prompt sent to the integration-points judge.
 *
 * The prompt has three sections separated by blank lines: the reference diff,
 * the candidate diff, and the fixed instruction telling the judge to compare
 * integration points only. Both diffs are inserted verbatim.
 *
 * @param reference - Diff text of the reference commit.
 * @param candidateDiff - Diff text produced by the agent under evaluation.
 * @returns The assembled prompt text.
 */
export function createUserPrompt(
  reference: string,
  candidateDiff: string,
): string {
  const sections = [
    "Reference diff:\n" + reference,
    "Candidate diff:\n" + candidateDiff,
    "Compare ONLY the integration points (imports, function calls, call locations, timing). Ignore implementation details. Respond with JSON.",
  ];

  return sections.join("\n\n");
}

export default createScore({
prepare: async ({ evaluation }) => {
try {
Expand Down Expand Up @@ -222,7 +229,7 @@ export default createScore({
schema: scoreResultSchema,
system: systemPrompt,
temperature: 0,
prompt: `Reference diff:\n${reference}\n\nCandidate diff:\n${candidateDiff}\n\nCompare ONLY the integration points (imports, function calls, call locations, timing). Ignore implementation details. Respond with JSON.`,
prompt: createUserPrompt(reference, candidateDiff),
});

return object;
Expand Down
11 changes: 9 additions & 2 deletions scores/logic-equivalence.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import { createScore, scoreResultSchema } from "~/lib/createScore.js";
import { fetchComparisonDiff } from "~/lib/github.js";
import { finalizeAgentChanges } from "~/lib/finalizeAgentChanges.js";

const systemPrompt = `You are evaluating whether an autonomous agent reproduced the logical behavior from a reference git commit.
export const systemPrompt = `You are evaluating whether an autonomous agent reproduced the logical behavior from a reference git commit.

**YOUR ROLE**: Check if the conditional logic and control flow produce the same outcomes.

Expand Down Expand Up @@ -214,6 +214,13 @@ if failures is not None and isinstance(failures, list) {

Return JSON with 'score' (0 or 1) and detailed rationale explaining any logic differences found.`;

/**
 * Builds the user prompt sent to the logic-equivalence judge.
 *
 * The prompt has three sections separated by blank lines: the reference diff,
 * the candidate diff, and the fixed instruction telling the judge to compare
 * logical behavior only. Both diffs are inserted verbatim.
 *
 * @param reference - Diff text of the reference commit.
 * @param candidateDiff - Diff text produced by the agent under evaluation.
 * @returns The assembled prompt text.
 */
export function createUserPrompt(
  reference: string,
  candidateDiff: string,
): string {
  const sections = [
    "Reference diff:\n" + reference,
    "Candidate diff:\n" + candidateDiff,
    "Compare ONLY the logical behavior (conditions, edge cases, side effects). Ignore code structure and style. Respond with JSON.",
  ];

  return sections.join("\n\n");
}

export default createScore({
prepare: async ({ evaluation }) => {
try {
Expand Down Expand Up @@ -266,7 +273,7 @@ export default createScore({
schema: scoreResultSchema,
system: systemPrompt,
temperature: 0,
prompt: `Reference diff:\n${reference}\n\nCandidate diff:\n${candidateDiff}\n\nCompare ONLY the logical behavior (conditions, edge cases, side effects). Ignore code structure and style. Respond with JSON.`,
prompt: createUserPrompt(reference, candidateDiff),
});

return object;
Expand Down
111 changes: 111 additions & 0 deletions scripts/show-test-outputs.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
/**
* Script to show full judge consistency test outputs
*/
import { generateObject } from "ai";
import { scoreResultSchema } from "~/lib/createScore.js";
import { judges } from "~/judges.js";
import { systemPrompt as logicEquivalencePrompt } from "~/scores/logic-equivalence.js";
import { systemPrompt as apiSignaturePrompt } from "~/scores/api-signature.js";
import {
logicEquivalenceFixtures,
apiSignatureFixtures,
} from "~/tests/fixtures/judgeConsistencyFixtures.js";

const judge = judges[0]; // claude-4.5

/**
 * Sends one reference/candidate pair to the judge three times and prints the
 * score and rationale of every run.
 *
 * The three runs are awaited one after another, so their output appears in
 * run order. Each request uses `temperature: 0`.
 *
 * @param name - Label printed in the banner that introduces this case.
 * @param systemPrompt - System prompt passed to the judge model.
 * @param reference - Reference diff text.
 * @param candidate - Candidate diff text.
 * @param scoreType - `"logic"` selects the logic-equivalence user prompt; any
 *   other value selects the API-signature user prompt.
 */
async function evaluateAndShow(
  name: string,
  systemPrompt: string,
  reference: string,
  candidate: string,
  scoreType: string,
) {
  const banner = "=".repeat(80);
  console.log(`\n${banner}`);
  console.log(`TEST: ${name}`);
  console.log(`${banner}\n`);

  // The user prompt is fixed for the whole case; only the judge call repeats.
  let userPrompt: string;
  if (scoreType === "logic") {
    userPrompt = `Reference diff:\n${reference}\n\nCandidate diff:\n${candidate}\n\nCompare ONLY the logical behavior (conditions, edge cases, side effects). Ignore code structure and style. Respond with JSON.`;
  } else {
    userPrompt = `Reference diff:\n${reference}\n\nCandidate diff:\n${candidate}\n\nCompare ONLY the API signatures (function names, parameter order, parameter names). Ignore implementation details. Respond with JSON.`;
  }

  const totalRuns = 3;
  let run = 1;
  while (run <= totalRuns) {
    console.log(`\n--- Run ${run} ---\n`);

    const result = await generateObject({
      model: judge.model,
      schema: scoreResultSchema,
      system: systemPrompt,
      temperature: 0,
      prompt: userPrompt,
    });
    const verdict = result.object;

    console.log(`Score: ${verdict.score}`);
    console.log(`\nRationale:\n${verdict.rationale}`);
    console.log();

    run += 1;
  }
}

/**
 * Runs every showcase case in a fixed order (three logic-equivalence cases,
 * then three API-signature cases) and prints a header and footer around them.
 *
 * Cases run strictly one after another. Each fixture pair is looked up just
 * before its case runs.
 */
async function runAllCases() {
  console.log("JUDGE CONSISTENCY TEST OUTPUT - FULL RATIONALES");
  console.log("Using judge: claude-4.5");

  const cases = [
    {
      title: "Logic Equivalence - Perfect Match",
      prompt: logicEquivalencePrompt,
      fixtures: logicEquivalenceFixtures,
      variant: "perfect",
      scoreType: "logic",
    },
    {
      title: "Logic Equivalence - Wrong Implementation",
      prompt: logicEquivalencePrompt,
      fixtures: logicEquivalenceFixtures,
      variant: "wrong",
      scoreType: "logic",
    },
    {
      title: "Logic Equivalence - Ambiguous (Guard vs Nested)",
      prompt: logicEquivalencePrompt,
      fixtures: logicEquivalenceFixtures,
      variant: "ambiguous",
      scoreType: "logic",
    },
    {
      title: "API Signature - Perfect Match",
      prompt: apiSignaturePrompt,
      fixtures: apiSignatureFixtures,
      variant: "perfect",
      scoreType: "api",
    },
    {
      title: "API Signature - Wrong Implementation (Parameter Name)",
      prompt: apiSignaturePrompt,
      fixtures: apiSignatureFixtures,
      variant: "wrong",
      scoreType: "api",
    },
    {
      title: "API Signature - Ambiguous (Parameter Order)",
      prompt: apiSignaturePrompt,
      fixtures: apiSignatureFixtures,
      variant: "ambiguous",
      scoreType: "api",
    },
  ] as const;

  for (const testCase of cases) {
    const pair = testCase.fixtures[testCase.variant];
    await evaluateAndShow(
      testCase.title,
      testCase.prompt,
      pair.reference,
      pair.candidate,
      testCase.scoreType,
    );
  }

  const banner = "=".repeat(80);
  console.log(`\n${banner}`);
  console.log("ALL TESTS COMPLETE");
  console.log(`${banner}\n`);
}

// Run tests
void runAllCases();
Loading
Loading