Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "web-codegen-scorer",
"version": "0.0.50",
"version": "0.0.51",
"scripts": {
"build-runner": "tsc",
"release-build": "tsx ./scripts/release-build.ts",
Expand Down
7 changes: 7 additions & 0 deletions runner/configuration/environment-config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,13 @@ export const environmentConfigSchema = z.object({
}),
)
.optional(),

/**
* When an environment is created, it generates a hash based on the configured ratings.
* This field is used to validate that the generated hash matches a pre-defined one.
* It's useful to ensure that the set of ratings hasn't changed between two runs.
*/
expectedRatingHash: z.string().optional(),
});

/**
Expand Down
93 changes: 70 additions & 23 deletions runner/configuration/environment.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,12 @@ import {lazy} from '../utils/lazy-creation.js';
import {EnvironmentConfig} from './environment-config.js';
import {EvalPromptWithMetadata, MultiStepPrompt} from './prompts.js';
import {renderPromptTemplate} from './prompt-templating.js';
import {getSha256Hash} from '../utils/hashing.js';

/**
 * Shape of the configuration entry for a single rating category,
 * as stored per-category in `Environment.ratingCategories`.
 */
interface CategoryConfig {
/** Name of the category. NOTE(review): presumably the human-readable label; confirm. */
name: string;
/** Point ceiling for the category; this value is folded into the environment's rating hash. */
maxPoints: number;
}

/** Represents a single prompt evaluation environment. */
export class Environment {
Expand All @@ -40,10 +46,18 @@ export class Environment {
readonly promptTimeoutMinutes: number | undefined;
/** Configuration for the individual rating categories. */
readonly ratingCategories: {
[RatingCategory.HIGH_IMPACT]: {name: string; maxPoints: number};
[RatingCategory.MEDIUM_IMPACT]: {name: string; maxPoints: number};
[RatingCategory.LOW_IMPACT]: {name: string; maxPoints: number};
[RatingCategory.HIGH_IMPACT]: CategoryConfig;
[RatingCategory.MEDIUM_IMPACT]: CategoryConfig;
[RatingCategory.LOW_IMPACT]: CategoryConfig;
};
/**
* Hash of the environment-level ratings. Can be used to
* validate that the ratings haven't changed between runs.
*/
readonly ratingHash: string;

/** Ratings configured at the environment level. */
private readonly ratings: Rating[];

constructor(
rootPath: string,
Expand Down Expand Up @@ -72,11 +86,14 @@ export class Environment {
this.executor = config.executor;
this.promptTimeoutMinutes = config.promptTimeoutMinutes;
this.ratingCategories = this.getRatingCategories(config);
this.ratings = this.resolveRatings(config);
this.ratingHash = this.getRatingHash(this.ratings, this.ratingCategories);
this.validateRatingHash(this.ratingHash, config);
}

/** Prompts that should be executed as a part of the evaluation. */
executablePrompts = lazy(async () => {
return this.resolveExecutablePrompts(this.config.executablePrompts, this.config);
return this.resolveExecutablePrompts(this.config.executablePrompts);
});

systemPromptGeneration = lazy(async () => {
Expand Down Expand Up @@ -178,27 +195,9 @@ export class Environment {
*/
private async resolveExecutablePrompts(
prompts: EnvironmentConfig['executablePrompts'],
config: EnvironmentConfig,
): Promise<RootPromptDefinition[]> {
const result: Promise<RootPromptDefinition>[] = [];
let envRatings: Rating[];

if (config.ratingOverrides) {
Object.keys(config.ratingOverrides).forEach(id => {
if (!config.ratings.some(rating => rating.id === id)) {
throw new UserFacingError(
`Rating with an ID of "${id}" has not been configured. Cannot apply an override to it.`,
);
}
});

envRatings = config.ratings.map(rating => {
const override = config.ratingOverrides![rating.id];
return override ? {...rating, ...override} : rating;
});
} else {
envRatings = config.ratings;
}
const envRatings = this.ratings;

for (const def of prompts) {
if (def instanceof MultiStepPrompt) {
Expand Down Expand Up @@ -378,6 +377,25 @@ export class Environment {
return result;
}

/**
 * Produces the environment-level ratings with any configured overrides applied.
 *
 * When the config has no `ratingOverrides`, the configured ratings are returned as-is.
 * Otherwise each override key must match the ID of a configured rating, and the
 * override's fields are shallow-merged on top of the matching rating.
 *
 * @param config Environment configuration to read the ratings and overrides from.
 * @returns The ratings to use for the environment.
 * @throws UserFacingError if an override targets a rating ID that isn't configured.
 */
private resolveRatings(config: EnvironmentConfig) {
  const overridesById = config.ratingOverrides;

  if (!overridesById) {
    return config.ratings;
  }

  // Validate all override keys up front so that a typo in an ID fails loudly.
  const configuredIds = new Set(config.ratings.map(rating => rating.id));

  for (const id of Object.keys(overridesById)) {
    if (!configuredIds.has(id)) {
      throw new UserFacingError(
        `Rating with an ID of "${id}" has not been configured. Cannot apply an override to it.`,
      );
    }
  }

  return config.ratings.map(rating => {
    const override = overridesById[rating.id];

    if (override) {
      return {...rating, ...override};
    }

    return rating;
  });
}

private getRatingCategories(config: EnvironmentConfig) {
const overrides = config.categoryOverrides;

Expand All @@ -399,4 +417,33 @@ export class Environment {
},
};
}

/**
 * Computes a hash that identifies the current set of ratings.
 *
 * Each rating contributes one `category;maxPoints;id;scoreReduction;labels` entry.
 * Both the labels within an entry and the entries themselves are sorted before
 * hashing, so the result does not depend on the order in which ratings or
 * their grouping labels were declared.
 *
 * @param ratings Ratings to include in the hash.
 * @param categories Category configuration used to look up each rating's `maxPoints`.
 * @returns Hash of the serialized ratings, as produced by `getSha256Hash`.
 */
private getRatingHash(
  ratings: Rating[],
  categories: Record<RatingCategory, CategoryConfig>,
): string {
  const parts = ratings.map(rating => {
    // Parenthesize the fallback: `labels || [].sort().join(',')` binds as
    // `labels || ([].sort().join(','))`, which would skip the sort entirely.
    // Sort a copy so that the rating's own label array is never mutated.
    const labels = [...(rating.groupingLabels || [])].sort().join(',');

    return (
      `${rating.category};${categories[rating.category]?.maxPoints};` +
      `${rating.id};${rating.scoreReduction};${labels}`
    );
  });

  return getSha256Hash(parts.sort().join('|'));
}

/**
 * Verifies that a rating hash matches the one the config expects, if any.
 * No validation happens when `expectedRatingHash` is not set in the config.
 *
 * @param currentHash Hash to compare against the expectation.
 * @param config Environment configuration that may contain `expectedRatingHash`.
 * @throws UserFacingError if an expected hash is configured and differs from `currentHash`.
 */
private validateRatingHash(currentHash: string, config: EnvironmentConfig) {
  const expectedHash = config.expectedRatingHash;

  if (!expectedHash || expectedHash === currentHash) {
    return;
  }

  throw new UserFacingError(
    [
      `Rating hash for environment "${this.displayName}" does not match the expectation.`,
      `Expected: ${expectedHash}`,
      // Report the value that was actually compared, not the instance field.
      `Actual: ${currentHash}`,
      `Either update the \`expectedRatingHash\` field in the config or revert the ratings back to their previous configuration`,
    ].join('\n'),
  );
}
}
1 change: 1 addition & 0 deletions runner/orchestration/generate-summary.ts
Original file line number Diff line number Diff line change
Expand Up @@ -89,5 +89,6 @@ export async function prepareSummary(
id: executorInfo.id,
displayName: executorInfo.displayName,
},
ratingHash: env.ratingHash,
} satisfies RunSummary;
}
5 changes: 2 additions & 3 deletions runner/orchestration/grouping.ts
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
import {createHash} from 'crypto';
import type {LlmRunner} from '../codegen/llm-runner.js';
import type {Environment} from '../configuration/environment.js';
import {calculateBuildAndCheckStats} from '../ratings/stats.js';
import type {AssessmentResult, RunGroup, RunInfo} from '../shared-interfaces.js';
import {RunnerName} from '../codegen/runner-creation.js';
import {getSha256Hash} from '../utils/hashing.js';

/** Generates a unique grouping ID for a run. */
export function getRunGroupId(
Expand All @@ -30,7 +29,7 @@ export function getRunGroupId(
`${options.labels?.sort().join('/')}/${options.model}/${options.runner}`;

// The group string above can get long. Hash it to something shorter and fixed length.
return createHash('sha256').update(group).digest('hex');
return getSha256Hash(group);
}

/**
Expand Down
7 changes: 7 additions & 0 deletions runner/shared-interfaces.ts
Original file line number Diff line number Diff line change
Expand Up @@ -442,6 +442,13 @@ export interface RunSummary {
* Optional since some older reports might not have it.
*/
runner?: CodegenRunnerInfo;

/**
* Hash of the environment-level ratings. Can be used to
* validate that the ratings haven't changed between runs.
* This field is optional, because older reports might not have it.
*/
ratingHash?: string;
}

/**
Expand Down
8 changes: 8 additions & 0 deletions runner/utils/hashing.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
import {createHash} from 'node:crypto';

/**
 * Computes the SHA-256 digest of a string.
 *
 * @param value Text to be hashed.
 * @returns The digest, encoded as a hexadecimal string.
 */
export function getSha256Hash(value: string): string {
  const hasher = createHash('sha256');
  hasher.update(value);
  return hasher.digest('hex');
}
38 changes: 14 additions & 24 deletions scripts/npm-publish.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import { join } from 'path';
import { spawn } from 'child_process';
import { input, select } from '@inquirer/prompts';
import { executeCommand } from '../runner/utils/exec.js';
import { readFile, writeFile } from 'fs/promises';
import {join} from 'path';
import {spawn} from 'child_process';
import {input, select} from '@inquirer/prompts';
import {executeCommand} from '../runner/utils/exec.js';
import {readFile, writeFile} from 'fs/promises';

const root = join(import.meta.dirname, '..');
const distDirectory = join(root, 'dist');
Expand All @@ -22,22 +22,17 @@ const registry = 'https://wombat-dressing-room.appspot.com';

const distTag = await select({
choices: [
{ name: 'Pre-release', value: 'next' },
{ name: 'Stable', value: 'latest' },
{name: 'Stable', value: 'latest'},
{name: 'Pre-release', value: 'next'},
],
message: 'Select a release channel',
});

// Build the project.
await executeCommand(
`pnpm release-build --version=${version}`,
root,
undefined,
{
forwardStdoutToParent: true,
forwardStderrToParent: true,
}
);
await executeCommand(`pnpm release-build --version=${version}`, root, undefined, {
forwardStdoutToParent: true,
forwardStderrToParent: true,
});

// Log into our registry.
await spawnInteractive('npm', ['login', '--registry', registry]);
Expand All @@ -50,15 +45,12 @@ const registry = 'https://wombat-dressing-room.appspot.com';
{
forwardStderrToParent: true,
forwardStdoutToParent: true,
}
},
);

// Write the package.json back to disk so the version is in sync.
packageJson.version = version;
await writeFile(
packageJsonPath,
JSON.stringify(packageJson, undefined, 2) + '\n'
);
await writeFile(packageJsonPath, JSON.stringify(packageJson, undefined, 2) + '\n');

console.log('Done! 🎉');
console.log('Remember to push the changed package.json!');
Expand All @@ -77,8 +69,6 @@ function spawnInteractive(command: string, args: string[]) {
stdio: 'inherit',
});

childProcess.on('close', (status) =>
status === 0 ? resolve() : reject(status)
);
childProcess.on('close', status => (status === 0 ? resolve() : reject(status)));
});
}
49 changes: 18 additions & 31 deletions scripts/release-build.ts
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import { join } from 'path';
import { rm, cp, readFile, writeFile } from 'fs/promises';
import {join} from 'path';
import {rm, cp, readFile, writeFile} from 'fs/promises';
import yargs from 'yargs';
import { hideBin } from 'yargs/helpers';
import { globSync as glob } from 'tinyglobby';
import { executeCommand } from '../runner/utils/exec.js';
import {hideBin} from 'yargs/helpers';
import {globSync as glob} from 'tinyglobby';
import {executeCommand} from '../runner/utils/exec.js';

const root = join(import.meta.dirname, '..');
const runnerSource = join(root, 'runner');
Expand All @@ -28,7 +28,7 @@ const args = yargs(hideBin(process.argv))
console.log('Building release output...');

// Clear out the target directory.
await rm(targetDirectory, { recursive: true, force: true });
await rm(targetDirectory, {recursive: true, force: true});

// Build the runner. This also creates `dist`.
await executeCommand('pnpm build-runner', runnerSource, undefined, {
Expand All @@ -38,7 +38,7 @@ const args = yargs(hideBin(process.argv))
// Generate the package.json.
await writeFile(
join(targetDirectory, 'package.json'),
await getPackageJson(join(root, 'package.json'), args.version)
await getPackageJson(join(root, 'package.json'), args.version),
);

// Copy the readme and license.
Expand All @@ -50,18 +50,10 @@ const args = yargs(hideBin(process.argv))
glob('**/*', {
cwd: join(root, 'examples'),
dot: true,
ignore: [
'**/node_modules/**',
'**/dist/**',
'**/.vinxi/**',
'**/.output/**',
],
}).map((agentFile) =>
cp(
join(root, 'examples', agentFile),
join(targetDirectory, 'examples', agentFile)
)
)
ignore: ['**/node_modules/**', '**/dist/**', '**/.vinxi/**', '**/.output/**'],
}).map(agentFile =>
cp(join(root, 'examples', agentFile), join(targetDirectory, 'examples', agentFile)),
),
);

// The user journey testing requires various files to work.
Expand All @@ -71,12 +63,12 @@ const args = yargs(hideBin(process.argv))
cwd: join(root, browserAgentRelativePath),
dot: true,
ignore: ['*.ts', 'README.md'],
}).map((agentFile) =>
}).map(agentFile =>
cp(
join(root, browserAgentRelativePath, agentFile),
join(targetDirectory, browserAgentRelativePath, agentFile)
)
)
join(targetDirectory, browserAgentRelativePath, agentFile),
),
),
);

if (!args.runnerOnly) {
Expand All @@ -86,16 +78,13 @@ const args = yargs(hideBin(process.argv))
});

// Copy the report artifacts into the `dist`.
await cp(reportAppDist, targetDirectory, { recursive: true });
await cp(reportAppDist, targetDirectory, {recursive: true});
}

console.log(`Release output has been built in ${targetDirectory}`);
})();

async function getPackageJson(
path: string,
version: string | null
): Promise<string> {
async function getPackageJson(path: string, version: string | null): Promise<string> {
const content = await readFile(path, 'utf8');
const parsed = JSON.parse(content) as {
version: string;
Expand All @@ -106,9 +95,7 @@ async function getPackageJson(

if (version) {
if (version === parsed.version) {
throw new Error(
`Specified version is the same version as the current one.`
);
throw new Error(`Specified version is the same version as the current one.`);
} else {
parsed.version = version;
}
Expand Down
Loading