From c2a9743ab283a102845825a32398ca4d0b028adf Mon Sep 17 00:00:00 2001 From: Kristiyan Kostadinov Date: Thu, 4 Dec 2025 14:13:27 +0100 Subject: [PATCH 1/3] feat: add validation that ratings didn't change Adds a `ratingHash` on the environment that is generated from the current set of ratings. In addition, includes an `expectedRatingHash` field on the environment config which can be used to verify that an environment has a specific hash before it is executed. This is useful to ensure that the ratings stay stable between runs and `web-codegen-scorer` releases. --- runner/configuration/environment-config.ts | 7 ++ runner/configuration/environment.ts | 93 ++++++++++++++++------ runner/orchestration/generate-summary.ts | 1 + runner/orchestration/grouping.ts | 5 +- runner/shared-interfaces.ts | 7 ++ runner/utils/hashing.ts | 8 ++ 6 files changed, 95 insertions(+), 26 deletions(-) create mode 100644 runner/utils/hashing.ts diff --git a/runner/configuration/environment-config.ts b/runner/configuration/environment-config.ts index ed48b72..49d35a2 100644 --- a/runner/configuration/environment-config.ts +++ b/runner/configuration/environment-config.ts @@ -91,6 +91,13 @@ export const environmentConfigSchema = z.object({ }), ) .optional(), + + /** + * When an environment is created, it generates a hash based on the configured ratings. + * This field is used to validate that the generated hash matches a pre-defined one. + * It's useful to ensure that the set of ratings hasn't changed between two runs. 
+ */ + expectedRatingHash: z.string().optional(), }); /** diff --git a/runner/configuration/environment.ts b/runner/configuration/environment.ts index 851bd02..02aa0aa 100644 --- a/runner/configuration/environment.ts +++ b/runner/configuration/environment.ts @@ -15,6 +15,12 @@ import {lazy} from '../utils/lazy-creation.js'; import {EnvironmentConfig} from './environment-config.js'; import {EvalPromptWithMetadata, MultiStepPrompt} from './prompts.js'; import {renderPromptTemplate} from './prompt-templating.js'; +import {getSha256Hash} from '../utils/hashing.js'; + +interface CategoryConfig { + name: string; + maxPoints: number; +} /** Represents a single prompt evaluation environment. */ export class Environment { @@ -40,10 +46,18 @@ export class Environment { readonly promptTimeoutMinutes: number | undefined; /** Configuration for the individual rating categories. */ readonly ratingCategories: { - [RatingCategory.HIGH_IMPACT]: {name: string; maxPoints: number}; - [RatingCategory.MEDIUM_IMPACT]: {name: string; maxPoints: number}; - [RatingCategory.LOW_IMPACT]: {name: string; maxPoints: number}; + [RatingCategory.HIGH_IMPACT]: CategoryConfig; + [RatingCategory.MEDIUM_IMPACT]: CategoryConfig; + [RatingCategory.LOW_IMPACT]: CategoryConfig; }; + /** + * Hash of the environment-level ratings. Can be used to + * validate that the ratings haven't changed between runs. + */ + readonly ratingHash: string; + + /** Ratings configured at the environment level. */ + private readonly ratings: Rating[]; constructor( rootPath: string, @@ -72,11 +86,14 @@ export class Environment { this.executor = config.executor; this.promptTimeoutMinutes = config.promptTimeoutMinutes; this.ratingCategories = this.getRatingCategories(config); + this.ratings = this.resolveRatings(config); + this.ratingHash = this.getRatingHash(this.ratings, this.ratingCategories); + this.validateRatingHash(this.ratingHash, config); } /** Prompts that should be executed as a part of the evaluation. 
*/ executablePrompts = lazy(async () => { - return this.resolveExecutablePrompts(this.config.executablePrompts, this.config); + return this.resolveExecutablePrompts(this.config.executablePrompts); }); systemPromptGeneration = lazy(async () => { @@ -178,27 +195,9 @@ export class Environment { */ private async resolveExecutablePrompts( prompts: EnvironmentConfig['executablePrompts'], - config: EnvironmentConfig, ): Promise { const result: Promise[] = []; - let envRatings: Rating[]; - - if (config.ratingOverrides) { - Object.keys(config.ratingOverrides).forEach(id => { - if (!config.ratings.some(rating => rating.id === id)) { - throw new UserFacingError( - `Rating with an ID of "${id}" has not been configured. Cannot apply an override to it.`, - ); - } - }); - - envRatings = config.ratings.map(rating => { - const override = config.ratingOverrides![rating.id]; - return override ? {...rating, ...override} : rating; - }); - } else { - envRatings = config.ratings; - } + const envRatings = this.ratings; for (const def of prompts) { if (def instanceof MultiStepPrompt) { @@ -378,6 +377,25 @@ export class Environment { return result; } + private resolveRatings(config: EnvironmentConfig) { + if (!config.ratingOverrides) { + return config.ratings; + } + + Object.keys(config.ratingOverrides).forEach(id => { + if (!config.ratings.some(rating => rating.id === id)) { + throw new UserFacingError( + `Rating with an ID of "${id}" has not been configured. Cannot apply an override to it.`, + ); + } + }); + + return config.ratings.map(rating => { + const override = config.ratingOverrides![rating.id]; + return override ? 
{...rating, ...override} : rating; + }); + } + private getRatingCategories(config: EnvironmentConfig) { const overrides = config.categoryOverrides; @@ -399,4 +417,33 @@ export class Environment { }, }; } + + private getRatingHash( + ratings: Rating[], + categories: Record<RatingCategory, CategoryConfig>, + ): string { + const parts: string[] = []; + + for (const rating of ratings) { + parts.push( + `${rating.category};${categories[rating.category]?.maxPoints};` + + `${rating.id};${rating.scoreReduction};${[...(rating.groupingLabels ?? [])].sort().join(',')}`, + ); + } + + return getSha256Hash(parts.sort().join('|')); + } + + private validateRatingHash(currentHash: string, config: EnvironmentConfig) { + if (config.expectedRatingHash && config.expectedRatingHash !== currentHash) { + throw new UserFacingError( + [ + `Rating hash for environment "${this.displayName}" does not match the expectation.`, + `Expected: ${config.expectedRatingHash}`, + `Actual: ${this.ratingHash}`, + `Either update the \`expectedRatingHash\` field in the config or revert the ratings back to their previous configuration`, + ].join('\n'), + ); + } + } } diff --git a/runner/orchestration/generate-summary.ts b/runner/orchestration/generate-summary.ts index 6c077a0..935ba65 100644 --- a/runner/orchestration/generate-summary.ts +++ b/runner/orchestration/generate-summary.ts @@ -89,5 +89,6 @@ export async function prepareSummary( id: executorInfo.id, displayName: executorInfo.displayName, }, + ratingHash: env.ratingHash, } satisfies RunSummary; } diff --git a/runner/orchestration/grouping.ts b/runner/orchestration/grouping.ts index 9eb19aa..0135cf7 100644 --- a/runner/orchestration/grouping.ts +++ b/runner/orchestration/grouping.ts @@ -1,9 +1,8 @@ -import {createHash} from 'crypto'; -import type {LlmRunner} from '../codegen/llm-runner.js'; import type {Environment} from '../configuration/environment.js'; import {calculateBuildAndCheckStats} from '../ratings/stats.js'; import type {AssessmentResult, RunGroup, RunInfo} from 
'../shared-interfaces.js'; import {RunnerName} from '../codegen/runner-creation.js'; +import {getSha256Hash} from '../utils/hashing.js'; /** Generates a unique grouping ID for a run. */ export function getRunGroupId( @@ -30,7 +29,7 @@ export function getRunGroupId( `${options.labels?.sort().join('/')}/${options.model}/${options.runner}`; // The group string above can get long. Hash it to something shorter and fixed length. - return createHash('sha256').update(group).digest('hex'); + return getSha256Hash(group); } /** diff --git a/runner/shared-interfaces.ts b/runner/shared-interfaces.ts index a1d1096..c83dda1 100644 --- a/runner/shared-interfaces.ts +++ b/runner/shared-interfaces.ts @@ -442,6 +442,13 @@ export interface RunSummary { * Optional since some older reports might not have it. */ runner?: CodegenRunnerInfo; + + /** + * Hash of the environment-level ratings. Can be used to + * validate that the ratings haven't changed between runs. + * This field is optional, because older reports might not have it. + */ + ratingHash?: string; } /** diff --git a/runner/utils/hashing.ts b/runner/utils/hashing.ts new file mode 100644 index 0000000..90c28a4 --- /dev/null +++ b/runner/utils/hashing.ts @@ -0,0 +1,8 @@ +import {createHash} from 'node:crypto'; + +/** + * Returns a sha-256 hash of a string. 
+ */ +export function getSha256Hash(value: string): string { + return createHash('sha256').update(value).digest('hex'); +} From aa1f5a2e44eef6bd7a0b69b5f639929d99438d8b Mon Sep 17 00:00:00 2001 From: Kristiyan Kostadinov Date: Thu, 4 Dec 2025 14:17:02 +0100 Subject: [PATCH 2/3] build: bump version --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index 94d8c93..40158c4 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "web-codegen-scorer", - "version": "0.0.50", + "version": "0.0.51", "scripts": { "build-runner": "tsc", "release-build": "tsx ./scripts/release-build.ts", From 762c09e3462abcd620feb2eb4dbef1f67b22b4b3 Mon Sep 17 00:00:00 2001 From: Kristiyan Kostadinov Date: Thu, 4 Dec 2025 14:18:48 +0100 Subject: [PATCH 3/3] build: update scripts Formats the release scripts to match the rest of the repo. Also sets `stable` as the first selection in the release script. --- scripts/npm-publish.ts | 38 ++++++++++++------------------- scripts/release-build.ts | 49 +++++++++++++++------------------------- 2 files changed, 32 insertions(+), 55 deletions(-) diff --git a/scripts/npm-publish.ts b/scripts/npm-publish.ts index 4e26315..06b2c34 100644 --- a/scripts/npm-publish.ts +++ b/scripts/npm-publish.ts @@ -1,8 +1,8 @@ -import { join } from 'path'; -import { spawn } from 'child_process'; -import { input, select } from '@inquirer/prompts'; -import { executeCommand } from '../runner/utils/exec.js'; -import { readFile, writeFile } from 'fs/promises'; +import {join} from 'path'; +import {spawn} from 'child_process'; +import {input, select} from '@inquirer/prompts'; +import {executeCommand} from '../runner/utils/exec.js'; +import {readFile, writeFile} from 'fs/promises'; const root = join(import.meta.dirname, '..'); const distDirectory = join(root, 'dist'); @@ -22,22 +22,17 @@ const registry = 'https://wombat-dressing-room.appspot.com'; const distTag = await select({ choices: [ - { name: 
'Pre-release', value: 'next' }, - { name: 'Stable', value: 'latest' }, + {name: 'Stable', value: 'latest'}, + {name: 'Pre-release', value: 'next'}, ], message: 'Select a release channel', }); // Build the project. - await executeCommand( - `pnpm release-build --version=${version}`, - root, - undefined, - { - forwardStdoutToParent: true, - forwardStderrToParent: true, - } - ); + await executeCommand(`pnpm release-build --version=${version}`, root, undefined, { + forwardStdoutToParent: true, + forwardStderrToParent: true, + }); // Log into our registry. await spawnInteractive('npm', ['login', '--registry', registry]); @@ -50,15 +45,12 @@ const registry = 'https://wombat-dressing-room.appspot.com'; { forwardStderrToParent: true, forwardStdoutToParent: true, - } + }, ); // Write the package.json back to disk so the version is in sync. packageJson.version = version; - await writeFile( - packageJsonPath, - JSON.stringify(packageJson, undefined, 2) + '\n' - ); + await writeFile(packageJsonPath, JSON.stringify(packageJson, undefined, 2) + '\n'); console.log('Done! 🎉'); console.log('Remember to push the changed package.json!'); @@ -77,8 +69,6 @@ function spawnInteractive(command: string, args: string[]) { stdio: 'inherit', }); - childProcess.on('close', (status) => - status === 0 ? resolve() : reject(status) - ); + childProcess.on('close', status => (status === 0 ? 
resolve() : reject(status))); }); } diff --git a/scripts/release-build.ts b/scripts/release-build.ts index d01e877..18166b8 100644 --- a/scripts/release-build.ts +++ b/scripts/release-build.ts @@ -1,9 +1,9 @@ -import { join } from 'path'; -import { rm, cp, readFile, writeFile } from 'fs/promises'; +import {join} from 'path'; +import {rm, cp, readFile, writeFile} from 'fs/promises'; import yargs from 'yargs'; -import { hideBin } from 'yargs/helpers'; -import { globSync as glob } from 'tinyglobby'; -import { executeCommand } from '../runner/utils/exec.js'; +import {hideBin} from 'yargs/helpers'; +import {globSync as glob} from 'tinyglobby'; +import {executeCommand} from '../runner/utils/exec.js'; const root = join(import.meta.dirname, '..'); const runnerSource = join(root, 'runner'); @@ -28,7 +28,7 @@ const args = yargs(hideBin(process.argv)) console.log('Building release output...'); // Clear out the target directory. - await rm(targetDirectory, { recursive: true, force: true }); + await rm(targetDirectory, {recursive: true, force: true}); // Build the runner. This also creates `dist`. await executeCommand('pnpm build-runner', runnerSource, undefined, { @@ -38,7 +38,7 @@ const args = yargs(hideBin(process.argv)) // Generate the package.json. await writeFile( join(targetDirectory, 'package.json'), - await getPackageJson(join(root, 'package.json'), args.version) + await getPackageJson(join(root, 'package.json'), args.version), ); // Copy the readme and license. 
@@ -50,18 +50,10 @@ const args = yargs(hideBin(process.argv)) glob('**/*', { cwd: join(root, 'examples'), dot: true, - ignore: [ - '**/node_modules/**', - '**/dist/**', - '**/.vinxi/**', - '**/.output/**', - ], - }).map((agentFile) => - cp( - join(root, 'examples', agentFile), - join(targetDirectory, 'examples', agentFile) - ) - ) + ignore: ['**/node_modules/**', '**/dist/**', '**/.vinxi/**', '**/.output/**'], + }).map(agentFile => + cp(join(root, 'examples', agentFile), join(targetDirectory, 'examples', agentFile)), + ), ); // The user journey testing requires various files to work. @@ -71,12 +63,12 @@ const args = yargs(hideBin(process.argv)) cwd: join(root, browserAgentRelativePath), dot: true, ignore: ['*.ts', 'README.md'], - }).map((agentFile) => + }).map(agentFile => cp( join(root, browserAgentRelativePath, agentFile), - join(targetDirectory, browserAgentRelativePath, agentFile) - ) - ) + join(targetDirectory, browserAgentRelativePath, agentFile), + ), + ), ); if (!args.runnerOnly) { @@ -86,16 +78,13 @@ const args = yargs(hideBin(process.argv)) }); // Copy the report artifacts into the `dist`. - await cp(reportAppDist, targetDirectory, { recursive: true }); + await cp(reportAppDist, targetDirectory, {recursive: true}); } console.log(`Release output has been built in ${targetDirectory}`); })(); -async function getPackageJson( - path: string, - version: string | null -): Promise { +async function getPackageJson(path: string, version: string | null): Promise { const content = await readFile(path, 'utf8'); const parsed = JSON.parse(content) as { version: string; @@ -106,9 +95,7 @@ async function getPackageJson( if (version) { if (version === parsed.version) { - throw new Error( - `Specified version is the same version as the current one.` - ); + throw new Error(`Specified version is the same version as the current one.`); } else { parsed.version = version; }