From 9ff03ef333a6e67660d401f3a4704191e8e77738 Mon Sep 17 00:00:00 2001
From: Kristiyan Kostadinov <crisbeto@abv.bg>
Date: Thu, 2 Oct 2025 11:59:54 +0200
Subject: [PATCH] feat: add support for Codex

Adds support for running evals using Codex.
---
 README.md                                   |  2 +-
 package.json                                |  4 +-
 pnpm-lock.yaml                              | 11 +++++
 report-app/src/app/shared/provider-label.ts |  1 +
 runner/codegen/claude-code-runner.ts        |  2 +
 runner/codegen/codex-runner.ts              | 53 +++++++++++++++++++++
 runner/codegen/runner-creation.ts           |  4 ++
 runner/eval-cli.ts                          |  2 +-
 8 files changed, 76 insertions(+), 3 deletions(-)
 create mode 100644 runner/codegen/codex-runner.ts
diff --git a/README.md b/README.md
index 31d9bd0..8604847 100644
--- a/README.md
+++ b/README.md
@@ -83,7 +83,7 @@ You can customize the `web-codegen-scorer eval` script with the following flags:
     - Example: `web-codegen-scorer eval --model=gemini-2.5-flash --autorater-model=gemini-2.5-flash --env=<config path>`
 
 - `--runner=<name>`: Specifies the runner to use to execute the eval. Supported runners are
-  `genkit` (default), `gemini-cli` or `claude-code`.
+  `genkit` (default), `gemini-cli`, `claude-code` or `codex`.
 
 - `--local`: Runs the script in local mode for the initial code generation request. Instead of
   calling the LLM, it will attempt to read the initial code from a corresponding file in the
diff --git a/package.json b/package.json
index 93a525f..b4c003e 100644
--- a/package.json
+++ b/package.json
@@ -91,11 +91,13 @@
   },
   "optionalDependencies": {
     "@anthropic-ai/claude-code": "^2.0.0",
-    "@google/gemini-cli": "^0.5.0"
+    "@google/gemini-cli": "^0.5.0",
+    "@openai/codex": "^0.42.0"
   },
   "devDependencies": {
     "@anthropic-ai/claude-code": "^2.0.0",
     "@google/gemini-cli": "^0.5.0",
+    "@openai/codex": "^0.42.0",
     "prettier": "^3.5.3",
     "tsx": "^4.20.3"
   }
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index c00278c..ac53c7e 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -130,6 +130,9 @@ importers:
       '@google/gemini-cli':
         specifier: ^0.5.0
         version: 0.5.5(encoding@0.1.13)
+      '@openai/codex':
+        specifier: ^0.42.0
+        version: 0.42.0
 
   report-app:
     dependencies:
@@ -1561,6 +1564,11 @@ packages:
     resolution: {integrity: sha512-vaQj4nccJbAslopIvd49pQH2NhUp7G9pY4byUtmwhe37ZZuubGrx0eB9hW2F37uVNRuDDK6byFGXF+7JCuMSZg==}
     engines: {node: ^20.17.0 || >=22.9.0}
 
+  '@openai/codex@0.42.0':
+    resolution: {integrity: sha512-jLpMrQuq1gIBzBKbKMwAzXOh+5uwE+ht3RHUb2Ov7P50fjAxPKDZa0+zpqkhHTspm8Rw6Vdrm4I4L+Z03usCkg==}
+    engines: {node: '>=20'}
+    hasBin: true
+
   '@opentelemetry/api-logs@0.203.0':
     resolution: {integrity: sha512-9B9RU0H7Ya1Dx/Rkyc4stuBZSGVQF27WigitInx2QQoj6KUpEFYPKoWjdFTunJYxmXmh17HeBvbMa1EhGyPmqQ==}
     engines: {node: '>=8.0.0'}
@@ -8186,6 +8194,9 @@ snapshots:
     transitivePeerDependencies:
       - supports-color
 
+  '@openai/codex@0.42.0':
+    optional: true
+
   '@opentelemetry/api-logs@0.203.0':
     dependencies:
       '@opentelemetry/api': 1.9.0
diff --git a/report-app/src/app/shared/provider-label.ts b/report-app/src/app/shared/provider-label.ts
index 14872ee..b3ab5af 100644
--- a/report-app/src/app/shared/provider-label.ts
+++ b/report-app/src/app/shared/provider-label.ts
@@ -8,6 +8,7 @@ const exactMatches: Record<string, string> = {
   solid: 'frameworks/solid.svg',
   'gemini-cli': 'gemini.webp',
   genkit: 'genkit.png',
+  codex: 'open-ai.png',
 };
 
 @Component({
diff --git a/runner/codegen/claude-code-runner.ts b/runner/codegen/claude-code-runner.ts
index b97a7b3..5c64bf5 100644
--- a/runner/codegen/claude-code-runner.ts
+++ b/runner/codegen/claude-code-runner.ts
@@ -16,6 +16,8 @@ export class ClaudeCodeRunner extends BaseCliAgentRunner implements LlmRunner {
   readonly hasBuiltInRepairLoop = true;
   protected ignoredFilePatterns = ['**/CLAUDE.md', '**/.claude/**'];
   protected binaryName = 'claude';
+
+  // Claude only outputs once at the end so we bump the inactivity timeout.
   protected override inactivityTimeoutMins = 10;
   protected override totalRequestTimeoutMins = 10;
 
diff --git a/runner/codegen/codex-runner.ts b/runner/codegen/codex-runner.ts
new file mode 100644
index 0000000..2c61933
--- /dev/null
+++ b/runner/codegen/codex-runner.ts
@@ -0,0 +1,59 @@
+import {LlmGenerateFilesRequestOptions, LlmRunner} from './llm-runner.js';
+import {join} from 'path';
+import {mkdirSync} from 'fs';
+import {writeFile} from 'fs/promises';
+import {BaseCliAgentRunner} from './base-cli-agent-runner.js';
+
+/** Maps the model names this runner supports to the values passed to the `--model` flag. */
+const MODEL_MAPPING: Record<string, string> = {
+  'openai-o3': 'o3',
+  'openai-o4-mini': 'o4-mini',
+  'openai-gpt-5': 'gpt-5-codex',
+};
+
+/** Runner that generates code using Codex. */
+export class CodexRunner extends BaseCliAgentRunner implements LlmRunner {
+  readonly id = 'codex';
+  readonly displayName = 'Codex';
+  readonly hasBuiltInRepairLoop = true;
+  protected ignoredFilePatterns = ['**/AGENTS.md', '**/.codex/**'];
+  protected binaryName = 'codex';
+
+  /** Returns the model names that have an entry in `MODEL_MAPPING`. */
+  getSupportedModels(): string[] {
+    return Object.keys(MODEL_MAPPING);
+  }
+
+  /** Builds the flags: the `exec` subcommand, the mapped model, the skip flags and the prompt. */
+  protected getCommandLineFlags(options: LlmGenerateFilesRequestOptions): string[] {
+    return [
+      'exec',
+      '--model',
+      MODEL_MAPPING[options.model],
+      // Skip all confirmations.
+      '--dangerously-bypass-approvals-and-sandbox',
+      '--skip-git-repo-check',
+      options.context.executablePrompt,
+    ];
+  }
+
+  /** Writes `AGENTS.md` and `.codex/config.toml` into the directory of the given context. */
+  protected async writeAgentFiles(options: LlmGenerateFilesRequestOptions): Promise<void> {
+    const {context} = options;
+    const instructionFilePath = join(context.directory, 'AGENTS.md');
+    const settingsDir = join(context.directory, '.codex');
+
+    // `recursive` keeps this from throwing `EEXIST` if the directory is already there.
+    mkdirSync(settingsDir, {recursive: true});
+
+    await Promise.all([
+      writeFile(join(settingsDir, 'config.toml'), this.getSettingsFile()),
+      writeFile(instructionFilePath, super.getCommonInstructions(options)),
+    ]);
+  }
+
+  /** Returns the contents written to the `config.toml` settings file. */
+  private getSettingsFile(): string {
+    return ['hide_agent_reasoning = true', ''].join('\n');
+  }
+}
diff --git a/runner/codegen/runner-creation.ts b/runner/codegen/runner-creation.ts
index 4ea677c..ccfee96 100644
--- a/runner/codegen/runner-creation.ts
+++ b/runner/codegen/runner-creation.ts
@@ -2,11 +2,13 @@ import {UserFacingError} from '../utils/errors.js';
 import type {GeminiCliRunner} from './gemini-cli-runner.js';
 import type {ClaudeCodeRunner} from './claude-code-runner.js';
 import type {GenkitRunner} from './genkit/genkit-runner.js';
+import type {CodexRunner} from './codex-runner.js';
 
 interface AvailableRunners {
   genkit: GenkitRunner;
   'gemini-cli': GeminiCliRunner;
   'claude-code': ClaudeCodeRunner;
+  'codex': CodexRunner;
 }
 
 /** Names of supported runners. */
@@ -31,6 +33,8 @@ export async function getRunnerByName<T extends RunnerName>(name: T): Promise<Av
       return import('./claude-code-runner.js').then(
         m => new m.ClaudeCodeRunner() as AvailableRunners[T],
       );
+    case 'codex':
+      return import('./codex-runner.js').then(m => new m.CodexRunner() as AvailableRunners[T]);
     default:
       throw new UserFacingError(`Unsupported runner ${name}`);
   }
diff --git a/runner/eval-cli.ts b/runner/eval-cli.ts
index 6173b3a..8734583 100644
--- a/runner/eval-cli.ts
+++ b/runner/eval-cli.ts
@@ -57,7 +57,7 @@ function builder(argv: Argv): Argv<Options> {
       .option('runner', {
         type: 'string',
         default: 'genkit' as const,
-        choices: ['genkit', 'gemini-cli', 'claude-code'] as RunnerName[],
+        choices: ['genkit', 'gemini-cli', 'claude-code', 'codex'] as RunnerName[],
         description: 'Runner to use to execute the eval',
       })
       .option('local', {