From 9ff03ef333a6e67660d401f3a4704191e8e77738 Mon Sep 17 00:00:00 2001 From: Kristiyan Kostadinov Date: Thu, 2 Oct 2025 11:59:54 +0200 Subject: [PATCH] feat: add support for Codex Adds support for running evals using Codex. --- README.md | 2 +- package.json | 4 +- pnpm-lock.yaml | 11 +++++ report-app/src/app/shared/provider-label.ts | 1 + runner/codegen/claude-code-runner.ts | 2 + runner/codegen/codex-runner.ts | 53 +++++++++++++++++++++ runner/codegen/runner-creation.ts | 4 ++ runner/eval-cli.ts | 2 +- 8 files changed, 76 insertions(+), 3 deletions(-) create mode 100644 runner/codegen/codex-runner.ts diff --git a/README.md b/README.md index 31d9bd0..8604847 100644 --- a/README.md +++ b/README.md @@ -83,7 +83,7 @@ You can customize the `web-codegen-scorer eval` script with the following flags: - Example: `web-codegen-scorer eval --model=gemini-2.5-flash --autorater-model=gemini-2.5-flash --env=` - `--runner=`: Specifies the runner to use to execute the eval. Supported runners are - `genkit` (default), `gemini-cli` or `claude-code`. + `genkit` (default), `gemini-cli`, `claude-code` or `codex`. - `--local`: Runs the script in local mode for the initial code generation request. 
Instead of calling the LLM, it will attempt to read the initial code from a corresponding file in the diff --git a/package.json b/package.json index 93a525f..b4c003e 100644 --- a/package.json +++ b/package.json @@ -91,11 +91,13 @@ }, "optionalDependencies": { "@anthropic-ai/claude-code": "^2.0.0", - "@google/gemini-cli": "^0.5.0" + "@google/gemini-cli": "^0.5.0", + "@openai/codex": "^0.42.0" }, "devDependencies": { "@anthropic-ai/claude-code": "^2.0.0", "@google/gemini-cli": "^0.5.0", + "@openai/codex": "^0.42.0", "prettier": "^3.5.3", "tsx": "^4.20.3" } diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index c00278c..ac53c7e 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -130,6 +130,9 @@ importers: '@google/gemini-cli': specifier: ^0.5.0 version: 0.5.5(encoding@0.1.13) + '@openai/codex': + specifier: ^0.42.0 + version: 0.42.0 report-app: dependencies: @@ -1561,6 +1564,11 @@ packages: resolution: {integrity: sha512-vaQj4nccJbAslopIvd49pQH2NhUp7G9pY4byUtmwhe37ZZuubGrx0eB9hW2F37uVNRuDDK6byFGXF+7JCuMSZg==} engines: {node: ^20.17.0 || >=22.9.0} + '@openai/codex@0.42.0': + resolution: {integrity: sha512-jLpMrQuq1gIBzBKbKMwAzXOh+5uwE+ht3RHUb2Ov7P50fjAxPKDZa0+zpqkhHTspm8Rw6Vdrm4I4L+Z03usCkg==} + engines: {node: '>=20'} + hasBin: true + '@opentelemetry/api-logs@0.203.0': resolution: {integrity: sha512-9B9RU0H7Ya1Dx/Rkyc4stuBZSGVQF27WigitInx2QQoj6KUpEFYPKoWjdFTunJYxmXmh17HeBvbMa1EhGyPmqQ==} engines: {node: '>=8.0.0'} @@ -8186,6 +8194,9 @@ snapshots: transitivePeerDependencies: - supports-color + '@openai/codex@0.42.0': + optional: true + '@opentelemetry/api-logs@0.203.0': dependencies: '@opentelemetry/api': 1.9.0 diff --git a/report-app/src/app/shared/provider-label.ts b/report-app/src/app/shared/provider-label.ts index 14872ee..b3ab5af 100644 --- a/report-app/src/app/shared/provider-label.ts +++ b/report-app/src/app/shared/provider-label.ts @@ -8,6 +8,7 @@ const exactMatches: Record = { solid: 'frameworks/solid.svg', 'gemini-cli': 'gemini.webp', genkit: 'genkit.png', + 
codex: 'open-ai.png',
 };
 
 @Component({
diff --git a/runner/codegen/claude-code-runner.ts b/runner/codegen/claude-code-runner.ts
index b97a7b3..5c64bf5 100644
--- a/runner/codegen/claude-code-runner.ts
+++ b/runner/codegen/claude-code-runner.ts
@@ -16,6 +16,8 @@ export class ClaudeCodeRunner extends BaseCliAgentRunner implements LlmRunner {
   readonly hasBuiltInRepairLoop = true;
   protected ignoredFilePatterns = ['**/CLAUDE.md', '**/.claude/**'];
   protected binaryName = 'claude';
+
+  // Claude only outputs once at the end so we bump the inactivity timeout.
   protected override inactivityTimeoutMins = 10;
   protected override totalRequestTimeoutMins = 10;
 
diff --git a/runner/codegen/codex-runner.ts b/runner/codegen/codex-runner.ts
new file mode 100644
index 0000000..2c61933
--- /dev/null
+++ b/runner/codegen/codex-runner.ts
@@ -0,0 +1,53 @@
+import {LlmGenerateFilesRequestOptions, LlmRunner} from './llm-runner.js';
+import {join} from 'path';
+import {mkdirSync} from 'fs';
+import {writeFile} from 'fs/promises';
+import {BaseCliAgentRunner} from './base-cli-agent-runner.js';
+
+const MODEL_MAPPING: Record<string, string> = {
+  'openai-o3': 'o3',
+  'openai-o4-mini': 'o4-mini',
+  'openai-gpt-5': 'gpt-5-codex',
+};
+
+/** Runner that generates code using Codex. */
+export class CodexRunner extends BaseCliAgentRunner implements LlmRunner {
+  readonly id = 'codex';
+  readonly displayName = 'Codex';
+  readonly hasBuiltInRepairLoop = true;
+  protected ignoredFilePatterns = ['**/AGENTS.md', '**/.codex/**'];
+  protected binaryName = 'codex';
+
+  getSupportedModels(): string[] {
+    return Object.keys(MODEL_MAPPING);
+  }
+
+  protected getCommandLineFlags(options: LlmGenerateFilesRequestOptions): string[] {
+    return [
+      'exec',
+      '--model',
+      MODEL_MAPPING[options.model],
+      // Skip all confirmations.
+      '--dangerously-bypass-approvals-and-sandbox',
+      '--skip-git-repo-check',
+      options.context.executablePrompt,
+    ];
+  }
+
+  protected async writeAgentFiles(options: LlmGenerateFilesRequestOptions): Promise<void> {
+    const {context} = options;
+    const instructionFilePath = join(context.directory, 'AGENTS.md');
+    const settingsDir = join(context.directory, '.codex');
+
+    mkdirSync(settingsDir, {recursive: true}); // Tolerate a pre-existing `.codex` directory.
+
+    await Promise.all([
+      writeFile(join(settingsDir, 'config.toml'), this.getSettingsFile()),
+      writeFile(instructionFilePath, super.getCommonInstructions(options)),
+    ]);
+  }
+
+  private getSettingsFile(): string {
+    return ['hide_agent_reasoning = true', ''].join('\n');
+  }
+}
diff --git a/runner/codegen/runner-creation.ts b/runner/codegen/runner-creation.ts
index 4ea677c..ccfee96 100644
--- a/runner/codegen/runner-creation.ts
+++ b/runner/codegen/runner-creation.ts
@@ -2,11 +2,13 @@ import {UserFacingError} from '../utils/errors.js';
 import type {GeminiCliRunner} from './gemini-cli-runner.js';
 import type {ClaudeCodeRunner} from './claude-code-runner.js';
 import type {GenkitRunner} from './genkit/genkit-runner.js';
+import type {CodexRunner} from './codex-runner.js';
 
 interface AvailableRunners {
   genkit: GenkitRunner;
   'gemini-cli': GeminiCliRunner;
   'claude-code': ClaudeCodeRunner;
+  'codex': CodexRunner;
 }
 
 /** Names of supported runners.
*/ @@ -31,6 +33,8 @@ export async function getRunnerByName(name: T): Promise new m.ClaudeCodeRunner() as AvailableRunners[T], ); + case 'codex': + return import('./codex-runner.js').then(m => new m.CodexRunner() as AvailableRunners[T]); default: throw new UserFacingError(`Unsupported runner ${name}`); } diff --git a/runner/eval-cli.ts b/runner/eval-cli.ts index 6173b3a..8734583 100644 --- a/runner/eval-cli.ts +++ b/runner/eval-cli.ts @@ -57,7 +57,7 @@ function builder(argv: Argv): Argv { .option('runner', { type: 'string', default: 'genkit' as const, - choices: ['genkit', 'gemini-cli', 'claude-code'] as RunnerName[], + choices: ['genkit', 'gemini-cli', 'claude-code', 'codex'] as RunnerName[], description: 'Runner to use to execute the eval', }) .option('local', {