diff --git a/runner/eval-cli.ts b/runner/eval-cli.ts
index 700b93b..90ea718 100644
--- a/runner/eval-cli.ts
+++ b/runner/eval-cli.ts
@@ -2,7 +2,10 @@ import { Arguments, Argv, CommandModule } from 'yargs';
 import chalk from 'chalk';
 import { join } from 'path';
 import { assertValidModelName, LlmRunner } from './codegen/llm-runner.js';
-import { DEFAULT_MODEL_NAME } from './configuration/constants.js';
+import {
+  DEFAULT_AUTORATER_MODEL_NAME,
+  DEFAULT_MODEL_NAME,
+} from './configuration/constants.js';
 import { generateCodeAndAssess } from './orchestration/generate.js';
 import {
   logReportToConsole,
@@ -48,6 +51,7 @@ interface Options {
   skipAxeTesting?: boolean;
   enableUserJourneyTesting?: boolean;
   enableAutoCsp?: boolean;
+  autoraterModel?: string;
   logging?: 'text-only' | 'dynamic';
 }
 
@@ -156,6 +160,11 @@ function builder(argv: Argv): Argv {
       description:
         'Whether to include a automatic hash-based Content-Security-Policy and Trusted Types to find incompatibilities.',
     })
+    .option('autorater-model', {
+      type: 'string',
+      default: DEFAULT_AUTORATER_MODEL_NAME,
+      description: 'Model to use when automatically rating generated code',
+    })
     .strict()
     .version(false)
     .help()
@@ -204,6 +213,7 @@ async function handler(cliArgs: Arguments): Promise {
     enableUserJourneyTesting: cliArgs.enableUserJourneyTesting,
     enableAutoCsp: cliArgs.enableAutoCsp,
     logging: cliArgs.logging,
+    autoraterModel: cliArgs.autoraterModel,
   });
 
   logReportToConsole(runInfo);
diff --git a/runner/orchestration/generate.ts b/runner/orchestration/generate.ts
index 9f6a802..f267b6c 100644
--- a/runner/orchestration/generate.ts
+++ b/runner/orchestration/generate.ts
@@ -6,7 +6,11 @@ import PQueue from 'p-queue';
 import { basename, join } from 'path';
 import { existsSync, readdirSync } from 'fs';
 import { LlmGenerateFilesContext, LlmRunner } from '../codegen/llm-runner.js';
-import { LLM_OUTPUT_DIR, REPORT_VERSION } from '../configuration/constants.js';
+import {
+  DEFAULT_AUTORATER_MODEL_NAME,
+  LLM_OUTPUT_DIR,
+  REPORT_VERSION,
+} from '../configuration/constants.js';
 import { Environment } from '../configuration/environment.js';
 import { rateGeneratedCode } from '../ratings/rate-code.js';
 import { summarizeReportWithAI } from '../reporting/ai-summarize.js';
@@ -77,6 +81,7 @@ export async function generateCodeAndAssess(options: {
   enableUserJourneyTesting?: boolean;
   enableAutoCsp?: boolean;
   logging?: 'text-only' | 'dynamic';
+  autoraterModel?: string;
 }): Promise {
   const env = await getEnvironmentByPath(options.environmentConfigPath);
   const promptsToProcess = getCandidateExecutablePrompts(
@@ -163,7 +168,8 @@ export async function generateCodeAndAssess(options: {
             !!options.enableUserJourneyTesting,
             !!options.enableAutoCsp,
             workerConcurrencyQueue,
-            progress
+            progress,
+            options.autoraterModel || DEFAULT_AUTORATER_MODEL_NAME
           ),
         // 10min max per app evaluation. We just want to make sure it never gets stuck.
         10
@@ -291,7 +297,8 @@ async function startEvaluationTask(
   enableUserJourneyTesting: boolean,
   enableAutoCsp: boolean,
   workerConcurrencyQueue: PQueue,
-  progress: ProgressLogger
+  progress: ProgressLogger,
+  autoraterModel: string
 ): Promise {
   // Set up the project structure once for the root project.
   const { directory, cleanup } = await setupProjectStructure(
@@ -444,7 +451,8 @@ async function startEvaluationTask(
         attempt.repairAttempts,
         attempt.axeRepairAttempts,
         abortSignal,
-        progress
+        progress,
+        autoraterModel
       );
 
       results.push({
diff --git a/runner/ratings/built-in-ratings/code-quality-rating.ts b/runner/ratings/built-in-ratings/code-quality-rating.ts
index e58c85c..2f77ef7 100644
--- a/runner/ratings/built-in-ratings/code-quality-rating.ts
+++ b/runner/ratings/built-in-ratings/code-quality-rating.ts
@@ -1,4 +1,3 @@
-import { DEFAULT_AUTORATER_MODEL_NAME } from '../../configuration/constants.js';
 import { autoRateCode } from '../autoraters/code-rater.js';
 import {
   LLMBasedRating,
@@ -12,7 +11,6 @@ export const codeQualityRating: LLMBasedRating = {
   kind: RatingKind.LLM_BASED,
   name: 'Code Quality (LLM-rated)',
   description: `Rates the app's source code via LLM`,
-  model: DEFAULT_AUTORATER_MODEL_NAME,
   category: RatingCategory.MEDIUM_IMPACT,
   id: 'common-autorater-code-quality',
   scoreReduction: '30%',
diff --git a/runner/ratings/built-in-ratings/visual-appearance-rating.ts b/runner/ratings/built-in-ratings/visual-appearance-rating.ts
index 0bea82d..e2d783d 100644
--- a/runner/ratings/built-in-ratings/visual-appearance-rating.ts
+++ b/runner/ratings/built-in-ratings/visual-appearance-rating.ts
@@ -1,5 +1,4 @@
 import { TimeoutError } from 'puppeteer';
-import { DEFAULT_AUTORATER_MODEL_NAME } from '../../configuration/constants.js';
 import { AutoRateResult } from '../autoraters/auto-rate-shared.js';
 import { autoRateAppearance } from '../autoraters/visuals-rater.js';
 import {
@@ -18,7 +17,6 @@ export const visualAppearanceRating: LLMBasedRating = {
   category: RatingCategory.MEDIUM_IMPACT,
   scoreReduction: '30%',
   id: 'common-autorater-visuals',
-  model: DEFAULT_AUTORATER_MODEL_NAME,
   rate: async (ctx) => {
     if (ctx.buildResult.screenshotPngUrl === undefined) {
       return {
diff --git a/runner/ratings/rate-code.ts b/runner/ratings/rate-code.ts
index fb1ffb3..56d878b 100644
--- a/runner/ratings/rate-code.ts
+++ b/runner/ratings/rate-code.ts
@@ -54,7 +54,8 @@ export async function rateGeneratedCode(
   repairAttempts: number,
   axeRepairAttempts: number,
   abortSignal: AbortSignal,
-  progress: ProgressLogger
+  progress: ProgressLogger,
+  autoraterModel: string
 ): Promise {
   let categorizedFiles: CategorizedFiles | null = null;
   let totalPoints = 0;
@@ -107,7 +108,8 @@ export async function rateGeneratedCode(
         buildResult,
         repairAttempts,
         axeRepairAttempts,
-        abortSignal
+        abortSignal,
+        autoraterModel
       );
     } else {
       throw new UserFacingError(`Unsupported rating type ${current}`);
@@ -269,14 +271,15 @@ async function runLlmBasedRating(
   buildResult: BuildResult,
   repairAttempts: number,
   axeRepairAttempts: number,
-  abortSignal: AbortSignal
+  abortSignal: AbortSignal,
+  autoraterModel: string
 ): Promise {
   const result = await rating.rate({
     environment,
     fullPromptText,
     currentPromptDef,
     llm,
-    model: rating.model,
+    model: autoraterModel,
     outputFiles,
     buildResult,
     repairAttempts,
diff --git a/runner/ratings/rating-types.ts b/runner/ratings/rating-types.ts
index e1c0d50..c4b83b7 100644
--- a/runner/ratings/rating-types.ts
+++ b/runner/ratings/rating-types.ts
@@ -99,7 +99,6 @@ const llmBasedRatingSchema = z
   .object({
     ...ratingSchemaCommonFields,
     kind: z.literal(RatingKind.LLM_BASED),
-    model: z.string(),
     rate: z
       .function()
       .args(z.custom())
diff --git a/runner/reporting/report-logging.ts b/runner/reporting/report-logging.ts
index cb36940..bcd27f6 100644
--- a/runner/reporting/report-logging.ts
+++ b/runner/reporting/report-logging.ts
@@ -6,7 +6,10 @@ import {
   RunInfo,
   ScoreBucket,
 } from '../shared-interfaces.js';
-import { REPORTS_ROOT_DIR } from '../configuration/constants.js';
+import {
+  DEFAULT_AUTORATER_MODEL_NAME,
+  REPORTS_ROOT_DIR,
+} from '../configuration/constants.js';
 import { calculateBuildAndCheckStats } from '../ratings/stats.js';
 import { safeWriteFile } from '../file-system-utils.js';
 import { BuildResultStatus } from '../builder/builder-types.js';
@@ -160,6 +163,7 @@ export function logReportHeader(
     llm: LlmRunner;
     labels: string[];
     startMcp?: boolean;
+    autoraterModel?: string;
   }
 ): void {
   const titleCardText = [
@@ -167,6 +171,10 @@ export function logReportHeader(
     '',
     ` - Environment: ${env.displayName}`,
     ` - Model: ${options.model}`,
+    options.autoraterModel &&
+    options.autoraterModel !== DEFAULT_AUTORATER_MODEL_NAME
+      ? ` - Autorater model: ${options.autoraterModel}`
+      : null,
     ` - Runner: ${options.llm.displayName}`,
     ` - MCP servers: ${options.startMcp && env.mcpServerOptions.length ? env.mcpServerOptions.length : 'none'}`,
     options.labels.length ? ` - Labels: ${options.labels.join(', ')}` : null,