12 changes: 11 additions & 1 deletion runner/eval-cli.ts
@@ -2,7 +2,10 @@ import { Arguments, Argv, CommandModule } from 'yargs';
 import chalk from 'chalk';
 import { join } from 'path';
 import { assertValidModelName, LlmRunner } from './codegen/llm-runner.js';
-import { DEFAULT_MODEL_NAME } from './configuration/constants.js';
+import {
+  DEFAULT_AUTORATER_MODEL_NAME,
+  DEFAULT_MODEL_NAME,
+} from './configuration/constants.js';
 import { generateCodeAndAssess } from './orchestration/generate.js';
 import {
   logReportToConsole,
@@ -48,6 +51,7 @@ interface Options {
   skipAxeTesting?: boolean;
   enableUserJourneyTesting?: boolean;
   enableAutoCsp?: boolean;
+  autoraterModel?: string;
   logging?: 'text-only' | 'dynamic';
 }

@@ -156,6 +160,11 @@ function builder(argv: Argv): Argv<Options> {
      description:
        'Whether to include an automatic hash-based Content-Security-Policy and Trusted Types to find incompatibilities.',
    })
+    .option('autorater-model', {
+      type: 'string',
+      default: DEFAULT_AUTORATER_MODEL_NAME,
+      description: 'Model to use when automatically rating generated code',
+    })
    .strict()
    .version(false)
    .help()
@@ -204,6 +213,7 @@ async function handler(cliArgs: Arguments<Options>): Promise<void> {
    enableUserJourneyTesting: cliArgs.enableUserJourneyTesting,
    enableAutoCsp: cliArgs.enableAutoCsp,
    logging: cliArgs.logging,
+    autoraterModel: cliArgs.autoraterModel,
  });

  logReportToConsole(runInfo);
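Note on the new flag: yargs exposes a kebab-case option under a camelCase key as well, which is how `handler` reads `cliArgs.autoraterModel` above. A minimal standalone sketch of that behavior follows; the default value is a placeholder, since the real `DEFAULT_AUTORATER_MODEL_NAME` lives in `runner/configuration/constants.ts` and its value is not visible in this diff.

```ts
// Minimal sketch, not project code. DEFAULT_AUTORATER_MODEL_NAME below is a
// placeholder for the constant defined in runner/configuration/constants.ts.
import yargs from 'yargs';
import { hideBin } from 'yargs/helpers';

const DEFAULT_AUTORATER_MODEL_NAME = 'placeholder-default-model';

const argv = yargs(hideBin(process.argv))
  .option('autorater-model', {
    type: 'string',
    default: DEFAULT_AUTORATER_MODEL_NAME,
    description: 'Model to use when automatically rating generated code',
  })
  .parseSync();

// yargs also populates the camelCase alias at runtime (argv.autoraterModel),
// which is the key the handler reads through the Options interface.
console.log(argv['autorater-model']);
```

Invocation would look something like `node eval-cli.js --autorater-model <model-name>`; when the flag is omitted, the default constant is used.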
16 changes: 12 additions & 4 deletions runner/orchestration/generate.ts
@@ -6,7 +6,11 @@ import PQueue from 'p-queue';
 import { basename, join } from 'path';
 import { existsSync, readdirSync } from 'fs';
 import { LlmGenerateFilesContext, LlmRunner } from '../codegen/llm-runner.js';
-import { LLM_OUTPUT_DIR, REPORT_VERSION } from '../configuration/constants.js';
+import {
+  DEFAULT_AUTORATER_MODEL_NAME,
+  LLM_OUTPUT_DIR,
+  REPORT_VERSION,
+} from '../configuration/constants.js';
 import { Environment } from '../configuration/environment.js';
 import { rateGeneratedCode } from '../ratings/rate-code.js';
 import { summarizeReportWithAI } from '../reporting/ai-summarize.js';
@@ -77,6 +81,7 @@ export async function generateCodeAndAssess(options: {
  enableUserJourneyTesting?: boolean;
  enableAutoCsp?: boolean;
  logging?: 'text-only' | 'dynamic';
+  autoraterModel?: string;
 }): Promise<RunInfo> {
  const env = await getEnvironmentByPath(options.environmentConfigPath);
  const promptsToProcess = getCandidateExecutablePrompts(
@@ -163,7 +168,8 @@ export async function generateCodeAndAssess(options: {
        !!options.enableUserJourneyTesting,
        !!options.enableAutoCsp,
        workerConcurrencyQueue,
-        progress
+        progress,
+        options.autoraterModel || DEFAULT_AUTORATER_MODEL_NAME
      ),
      // 10min max per app evaluation. We just want to make sure it never gets stuck.
      10
@@ -291,7 +297,8 @@ async function startEvaluationTask(
  enableUserJourneyTesting: boolean,
  enableAutoCsp: boolean,
  workerConcurrencyQueue: PQueue,
-  progress: ProgressLogger
+  progress: ProgressLogger,
+  autoraterModel: string
 ): Promise<AssessmentResult[]> {
  // Set up the project structure once for the root project.
  const { directory, cleanup } = await setupProjectStructure(
@@ -444,7 +451,8 @@ async function startEvaluationTask(
      attempt.repairAttempts,
      attempt.axeRepairAttempts,
      abortSignal,
-      progress
+      progress,
+      autoraterModel
    );

    results.push({
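The threading here follows a common boundary pattern: the optional CLI value is resolved against the default once in the orchestration layer, so everything below it receives a plain `string`. A sketch of that pattern, with simplified signatures that are not the project's real ones:

```ts
// Simplified sketch of the option-threading pattern; the placeholder constant
// stands in for DEFAULT_AUTORATER_MODEL_NAME from configuration/constants.ts.
const DEFAULT_AUTORATER_MODEL_NAME = 'placeholder-default-model';

function generateCodeAndAssess(options: { autoraterModel?: string }): void {
  // Resolve the fallback once, at the boundary.
  startEvaluationTask(options.autoraterModel || DEFAULT_AUTORATER_MODEL_NAME);
}

function startEvaluationTask(autoraterModel: string): void {
  rateGeneratedCode(autoraterModel); // required parameter from here on down
}

function rateGeneratedCode(autoraterModel: string): void {
  console.log(`Rating generated code with ${autoraterModel}`);
}

generateCodeAndAssess({}); // -> "Rating generated code with placeholder-default-model"
```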
2 changes: 0 additions & 2 deletions runner/ratings/built-in-ratings/code-quality-rating.ts
@@ -1,4 +1,3 @@
-import { DEFAULT_AUTORATER_MODEL_NAME } from '../../configuration/constants.js';
 import { autoRateCode } from '../autoraters/code-rater.js';
 import {
   LLMBasedRating,
@@ -12,7 +11,6 @@ export const codeQualityRating: LLMBasedRating = {
  kind: RatingKind.LLM_BASED,
  name: 'Code Quality (LLM-rated)',
  description: `Rates the app's source code via LLM`,
-  model: DEFAULT_AUTORATER_MODEL_NAME,
  category: RatingCategory.MEDIUM_IMPACT,
  id: 'common-autorater-code-quality',
  scoreReduction: '30%',
2 changes: 0 additions & 2 deletions runner/ratings/built-in-ratings/visual-appearance-rating.ts
@@ -1,5 +1,4 @@
 import { TimeoutError } from 'puppeteer';
-import { DEFAULT_AUTORATER_MODEL_NAME } from '../../configuration/constants.js';
 import { AutoRateResult } from '../autoraters/auto-rate-shared.js';
 import { autoRateAppearance } from '../autoraters/visuals-rater.js';
 import {
@@ -18,7 +17,6 @@ export const visualAppearanceRating: LLMBasedRating = {
  category: RatingCategory.MEDIUM_IMPACT,
  scoreReduction: '30%',
  id: 'common-autorater-visuals',
-  model: DEFAULT_AUTORATER_MODEL_NAME,
  rate: async (ctx) => {
    if (ctx.buildResult.screenshotPngUrl === undefined) {
      return {
11 changes: 7 additions & 4 deletions runner/ratings/rate-code.ts
@@ -54,7 +54,8 @@ export async function rateGeneratedCode(
  repairAttempts: number,
  axeRepairAttempts: number,
  abortSignal: AbortSignal,
-  progress: ProgressLogger
+  progress: ProgressLogger,
+  autoraterModel: string
 ): Promise<CodeAssessmentScore> {
  let categorizedFiles: CategorizedFiles | null = null;
  let totalPoints = 0;
@@ -107,7 +108,8 @@
        buildResult,
        repairAttempts,
        axeRepairAttempts,
-        abortSignal
+        abortSignal,
+        autoraterModel
      );
    } else {
      throw new UserFacingError(`Unsupported rating type ${current}`);
@@ -269,14 +271,15 @@ async function runLlmBasedRating(
  buildResult: BuildResult,
  repairAttempts: number,
  axeRepairAttempts: number,
-  abortSignal: AbortSignal
+  abortSignal: AbortSignal,
+  autoraterModel: string
 ): Promise<IndividualAssessment | SkippedIndividualAssessment> {
  const result = await rating.rate({
    environment,
    fullPromptText,
    currentPromptDef,
    llm,
-    model: rating.model,
+    model: autoraterModel,
    outputFiles,
    buildResult,
    repairAttempts,
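Taken together with the two rating files above, this is the core of the change: ratings no longer carry their own `model` field, and the runner injects the configurable autorater model into the rating context instead. A sketch of that before/after, using trimmed-down hypothetical types rather than the real ones from `rating-types.ts`:

```ts
// Hypothetical, simplified types for illustration only.
interface RatingContext {
  model: string; // now supplied by the runner, not declared on the rating
}

interface LLMBasedRating {
  name: string;
  rate: (ctx: RatingContext) => Promise<number>;
}

const codeQualityRating: LLMBasedRating = {
  name: 'Code Quality (LLM-rated)',
  // The rating reads the model from its context instead of carrying the
  // `model: DEFAULT_AUTORATER_MODEL_NAME` field that the diffs above delete.
  rate: async (ctx) => {
    console.log(`rating with ${ctx.model}`);
    return 1;
  },
};

// Previously: rating.rate({ model: rating.model, ... })
// Now the runner passes the configurable autorater model through:
async function runLlmBasedRating(rating: LLMBasedRating, autoraterModel: string) {
  return rating.rate({ model: autoraterModel });
}

runLlmBasedRating(codeQualityRating, 'placeholder-rater-model');
```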
1 change: 0 additions & 1 deletion runner/ratings/rating-types.ts
@@ -99,7 +99,6 @@ const llmBasedRatingSchema = z
  .object({
    ...ratingSchemaCommonFields,
    kind: z.literal(RatingKind.LLM_BASED),
-    model: z.string(),
    rate: z
      .function()
      .args(z.custom<LLMBasedRatingContext>())
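The schema change is what makes the removal in the built-in ratings possible: `model` is no longer part of the validated shape of an LLM-based rating. A simplified zod sketch (the field set is trimmed; the real schema also spreads `ratingSchemaCommonFields` and types the `rate()` signature):

```ts
// Simplified sketch of the schema shape after this change; field names other
// than `kind`, `rate`, and the removed `model` are illustrative.
import { z } from 'zod';

const llmBasedRatingSchema = z.object({
  name: z.string(),
  kind: z.literal('llm-based'),
  // model: z.string(),  <- removed; the model now arrives via the rating context
  rate: z.function(),
});

// A rating defined without a `model` property validates cleanly:
llmBasedRatingSchema.parse({
  name: 'Code Quality (LLM-rated)',
  kind: 'llm-based',
  rate: async () => ({ score: 1 }),
});
```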
10 changes: 9 additions & 1 deletion runner/reporting/report-logging.ts
@@ -6,7 +6,10 @@ import {
  RunInfo,
  ScoreBucket,
 } from '../shared-interfaces.js';
-import { REPORTS_ROOT_DIR } from '../configuration/constants.js';
+import {
+  DEFAULT_AUTORATER_MODEL_NAME,
+  REPORTS_ROOT_DIR,
+} from '../configuration/constants.js';
 import { calculateBuildAndCheckStats } from '../ratings/stats.js';
 import { safeWriteFile } from '../file-system-utils.js';
 import { BuildResultStatus } from '../builder/builder-types.js';
@@ -160,13 +163,18 @@ export function logReportHeader(
    llm: LlmRunner;
    labels: string[];
    startMcp?: boolean;
+    autoraterModel?: string;
  }
 ): void {
  const titleCardText = [
    'Running a codegen evaluation with configuration:',
    '',
    ` - Environment: ${env.displayName}`,
    ` - Model: ${options.model}`,
+    options.autoraterModel &&
+    options.autoraterModel !== DEFAULT_AUTORATER_MODEL_NAME
+      ? ` - Autorater model: ${options.autoraterModel}`
+      : null,
    ` - Runner: ${options.llm.displayName}`,
    ` - MCP servers: ${options.startMcp && env.mcpServerOptions.length ? env.mcpServerOptions.length : 'none'}`,
    options.labels.length ? ` - Labels: ${options.labels.join(', ')}` : null,
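The report header only mentions the autorater model when a non-default one was chosen. A sketch of that conditional-line pattern, assuming (as the existing `labels` entry suggests) that `null` entries are filtered out of `titleCardText` before printing; names and values here are placeholders:

```ts
// Sketch of the title-card pattern; DEFAULT_AUTORATER_MODEL_NAME is a
// placeholder for the real constant.
const DEFAULT_AUTORATER_MODEL_NAME = 'placeholder-default-model';

function buildTitleCard(model: string, autoraterModel?: string): string {
  const lines = [
    'Running a codegen evaluation with configuration:',
    ` - Model: ${model}`,
    autoraterModel && autoraterModel !== DEFAULT_AUTORATER_MODEL_NAME
      ? ` - Autorater model: ${autoraterModel}`
      : null,
  ];
  // Drop the null entry when the default (or no) autorater model is used.
  return lines.filter((line): line is string => line !== null).join('\n');
}

console.log(buildTitleCard('some-model'));
// Running a codegen evaluation with configuration:
//  - Model: some-model

console.log(buildTitleCard('some-model', 'custom-rater'));
// Running a codegen evaluation with configuration:
//  - Model: some-model
//  - Autorater model: custom-rater
```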