From 9e2a7664678fdafcd02843df6406e8de86ed670e Mon Sep 17 00:00:00 2001 From: Aaron Shim <5382864+aaronshim@users.noreply.github.com> Date: Thu, 25 Sep 2025 21:18:24 +0000 Subject: [PATCH 1/5] Auto-rater allows some previous rater results to be passed in as context. --- runner/ratings/autoraters/code-rater.ts | 32 +++++++++++++++---- runner/ratings/autoraters/rate-files.ts | 15 +++++++-- .../built-in-ratings/code-quality-rating.ts | 3 +- runner/ratings/built-in.ts | 2 +- runner/ratings/rate-code.ts | 28 ++++++++++++---- runner/ratings/rating-types.ts | 19 ++++++++++- 6 files changed, 80 insertions(+), 19 deletions(-) diff --git a/runner/ratings/autoraters/code-rater.ts b/runner/ratings/autoraters/code-rater.ts index 932092f..bcd9c85 100644 --- a/runner/ratings/autoraters/code-rater.ts +++ b/runner/ratings/autoraters/code-rater.ts @@ -2,7 +2,12 @@ import { readFileSync } from 'node:fs'; import { z } from 'zod'; import { prepareContextFilesMessage } from '../../orchestration/codegen.js'; import { Environment } from '../../configuration/environment.js'; -import { LlmResponseFile } from '../../shared-interfaces.js'; +import { + IndividualAssessment, + IndividualAssessmentState, + LlmResponseFile, + SkippedIndividualAssessment, +} from '../../shared-interfaces.js'; import { AutoRateResult, getCoefficient, @@ -10,6 +15,7 @@ import { } from './auto-rate-shared.js'; import { GenkitRunner } from '../../codegen/genkit/genkit-runner.js'; import defaultCodeRaterPrompt from './code-rating-prompt.js'; +import { RatingsContext } from '../rating-types.js'; /** Framework-specific hints for the rating prompt. */ const FW_HINTS: Record = { @@ -33,6 +39,7 @@ const CACHED_RATING_PROMPTS: Record = {}; * @param environment Environment in which the rating is running. * @param files Files to be rated. * @param appPrompt Prompt to be used for the rating. + * @param ratingsContext Context containing results from previous ratings. */ export async function autoRateCode( llm: GenkitRunner, @@ -40,7 +47,8 @@ export async function autoRateCode( model: string, environment: Environment, files: LlmResponseFile[], - appPrompt: string + appPrompt: string, + ratingsContext: RatingsContext ): Promise { const contextMessage = prepareContextFilesMessage( files.map((o) => ({ @@ -61,10 +69,22 @@ export async function autoRateCode( promptText = defaultCodeRaterPrompt; } - const prompt = environment.renderPrompt(promptText, null, { - APP_PROMPT: appPrompt, - FRAMEWORK_SPECIFIC_HINTS: FW_HINTS[environment.fullStackFramework.id] ?? '', - }).result; + const safetyRating = ratingsContext['safety-web']; + const safetyWebResultsJson = + safetyRating?.state === IndividualAssessmentState.EXECUTED + ? JSON.stringify(safetyRating, null, 2) + : ''; + + const prompt = environment.renderPrompt( + promptText, + environment.codeRatingPromptPath, + { + APP_PROMPT: appPrompt, + FRAMEWORK_SPECIFIC_HINTS: + FW_HINTS[environment.fullStackFramework.id] ?? 
'', + SAFETY_WEB_RESULTS_JSON: safetyWebResultsJson, + } + ).result; const result = await llm.generateConstrained({ abortSignal, diff --git a/runner/ratings/autoraters/rate-files.ts b/runner/ratings/autoraters/rate-files.ts index 89b5836..cc66820 100644 --- a/runner/ratings/autoraters/rate-files.ts +++ b/runner/ratings/autoraters/rate-files.ts @@ -1,9 +1,15 @@ import { greenCheckmark } from '../../reporting/format.js'; -import { AutoraterRunInfo, LlmResponseFile } from '../../shared-interfaces.js'; +import { + AutoraterRunInfo, + IndividualAssessment, + LlmResponseFile, + SkippedIndividualAssessment, +} from '../../shared-interfaces.js'; import { autoRateCode } from './code-rater.js'; import { autoRateAppearance } from './visuals-rater.js'; import { Environment } from '../../configuration/environment.js'; import { GenkitRunner } from '../../codegen/genkit/genkit-runner.js'; +import { RatingsContext } from '../rating-types.js'; /** * Automatically rates the code inside of a file. @@ -13,6 +19,7 @@ import { GenkitRunner } from '../../codegen/genkit/genkit-runner.js'; * @param filePath Path to the file to be rated. * @param appPrompt Prompt that should be checked. * @param screenshotPath Path to the screenshot to use for visual rating. + * @param ratingsContext Context containing results from previous ratings. */ export async function autoRateFiles( llm: GenkitRunner, @@ -21,7 +28,8 @@ export async function autoRateFiles( environment: Environment, files: LlmResponseFile[], appPrompt: string, - screenshotPngUrl: string | null + screenshotPngUrl: string | null, + ratingsContext: RatingsContext ): Promise { console.log(`Autorater is using '${model}' model. \n`); @@ -33,7 +41,8 @@ export async function autoRateFiles( model, environment, files, - appPrompt + appPrompt, + ratingsContext ); console.log(`${greenCheckmark()} Code scoring is successful.`); diff --git a/runner/ratings/built-in-ratings/code-quality-rating.ts b/runner/ratings/built-in-ratings/code-quality-rating.ts index 2f77ef7..ee2df0a 100644 --- a/runner/ratings/built-in-ratings/code-quality-rating.ts +++ b/runner/ratings/built-in-ratings/code-quality-rating.ts @@ -21,7 +21,8 @@ export const codeQualityRating: LLMBasedRating = { ctx.model, ctx.environment, ctx.outputFiles, - ctx.fullPromptText + ctx.fullPromptText, + ctx.ratingsContext ); return { diff --git a/runner/ratings/built-in.ts b/runner/ratings/built-in.ts index 4838366..c07264a 100644 --- a/runner/ratings/built-in.ts +++ b/runner/ratings/built-in.ts @@ -20,6 +20,7 @@ import { export function getBuiltInRatings(): Rating[] { return [ successfulBuildRating, + safetyWebRating, noRuntimeExceptionsRating, sufficientCodeSizeRating, sufficientGeneratedFilesRating, @@ -27,7 +28,6 @@ export function getBuiltInRatings(): Rating[] { visualAppearanceRating, validCssRating, axeRating, - safetyWebRating, userJourneysRating, NoInnerHtmlBindingsRating, NoDangerouslySetInnerHtmlRating, diff --git a/runner/ratings/rate-code.ts b/runner/ratings/rate-code.ts index 305682c..59646c5 100644 --- a/runner/ratings/rate-code.ts +++ b/runner/ratings/rate-code.ts @@ -20,6 +20,7 @@ import { POINTS_FOR_CATEGORIES, Rating, CATEGORY_NAMES, + RatingsContext, } from './rating-types.js'; import { extractEmbeddedCodeFromTypeScript } from './embedded-languages.js'; import { Environment } from '../configuration/environment.js'; @@ -62,6 +63,7 @@ export async function rateGeneratedCode( let categorizedFiles: CategorizedFiles | null = null; let totalPoints = 0; let maxOverallPoints = 0; + const ratingsContext: 
RatingsContext = {}; // Rating may also invoke LLMs. Track the usage. const tokenUsage = { @@ -95,11 +97,16 @@ export async function rateGeneratedCode( serveTestingResult, repairAttempts, outputFiles.length, - axeRepairAttempts + axeRepairAttempts, + ratingsContext ); } else if (current.kind === RatingKind.PER_FILE) { categorizedFiles ??= splitFilesIntoCategories(outputFiles); - result = await runPerFileRating(current, categorizedFiles); + result = await runPerFileRating( + current, + categorizedFiles, + ratingsContext + ); } else if (current.kind === RatingKind.LLM_BASED) { result = await runLlmBasedRating( environment, @@ -113,7 +120,8 @@ export async function rateGeneratedCode( repairAttempts, axeRepairAttempts, abortSignal, - autoraterModel + autoraterModel, + ratingsContext ); } else { throw new UserFacingError(`Unsupported rating type ${current}`); @@ -139,6 +147,7 @@ export async function rateGeneratedCode( ); } + ratingsContext[current.id] = result; category.assessments.push(result); } @@ -178,7 +187,8 @@ function runPerBuildRating( serveResult: ServeTestingResult | null, repairAttempts: number, generatedFileCount: number, - axeRepairAttempts: number + axeRepairAttempts: number, + ratingsContext: RatingsContext ): IndividualAssessment | SkippedIndividualAssessment { const rateResult = rating.rate({ buildResult, @@ -186,6 +196,7 @@ function runPerBuildRating( repairAttempts, generatedFileCount, axeRepairAttempts, + ratingsContext, }); // If the rating was skipped (e.g., Axe test wasn't run), create a skipped assessment. @@ -203,7 +214,8 @@ function runPerBuildRating( async function runPerFileRating( rating: PerFileRating, - categorizedFiles: CategorizedFiles + categorizedFiles: CategorizedFiles, + ratingsContext: RatingsContext ): Promise { const errorMessages: string[] = []; let contentType: PerFileRatingContentType; @@ -234,7 +246,7 @@ async function runPerFileRating( // Remove comments from the code to avoid false-detection of bad patterns. // Some keywords like `NgModule` can be used in code comments. 
const code = removeComments(file.code, contentType); - const result = await rating.rate(code, file.filePath); + const result = await rating.rate(code, file.filePath, ratingsContext); let coeff: number; if (typeof result === 'number') { @@ -279,7 +291,8 @@ async function runLlmBasedRating( repairAttempts: number, axeRepairAttempts: number, abortSignal: AbortSignal, - autoraterModel: string + autoraterModel: string, + ratingsContext: RatingsContext ): Promise { const result = await rating.rate({ environment, @@ -293,6 +306,7 @@ async function runLlmBasedRating( repairAttempts, axeRepairAttempts, abortSignal, + ratingsContext, }); if (result.state === RatingState.SKIPPED) { diff --git a/runner/ratings/rating-types.ts b/runner/ratings/rating-types.ts index 0a2aa0a..09019bc 100644 --- a/runner/ratings/rating-types.ts +++ b/runner/ratings/rating-types.ts @@ -1,8 +1,10 @@ import z from 'zod'; import { BuildResult } from '../workers/builder/builder-types.js'; import type { + IndividualAssessment, LlmResponseFile, PromptDefinition, + SkippedIndividualAssessment, Usage, } from '../shared-interfaces.js'; import { Environment } from '../configuration/environment.js'; @@ -64,6 +66,9 @@ const perBuildRatingSchema = z repairAttempts: z.number(), axeRepairAttempts: z.number(), generatedFileCount: z.number(), + ratingsContext: z.record( + z.custom() + ), }) ) .returns(z.custom()), @@ -76,7 +81,13 @@ const perFileRatingSchema = z kind: z.literal(RatingKind.PER_FILE), rate: z .function() - .args(z.string(), z.string().optional()) + .args( + z.string(), + z.string().optional(), + z.record( + z.custom() + ) + ) .returns(z.custom()), filter: z.union([ z @@ -171,6 +182,11 @@ export interface ExecutedLLMBasedRating { }; } +export type RatingsContext = Record< + string, + IndividualAssessment | SkippedIndividualAssessment +>; + export interface LLMBasedRatingContext { environment: Environment; fullPromptText: string; @@ -183,6 +199,7 @@ export interface LLMBasedRatingContext { repairAttempts: number; axeRepairAttempts: number; abortSignal: AbortSignal; + ratingsContext: RatingsContext; } /** Rating that applies over build results. 
*/ From 575af8a967f11bdeb0d49382afa4065a96b59ac0 Mon Sep 17 00:00:00 2001 From: Aaron Shim <5382864+aaronshim@users.noreply.github.com> Date: Thu, 25 Sep 2025 21:32:13 +0000 Subject: [PATCH 2/5] Fix formatting --- runner/ratings/rating-types.ts | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/runner/ratings/rating-types.ts b/runner/ratings/rating-types.ts index 09019bc..ecaa587 100644 --- a/runner/ratings/rating-types.ts +++ b/runner/ratings/rating-types.ts @@ -84,9 +84,7 @@ const perFileRatingSchema = z .args( z.string(), z.string().optional(), - z.record( - z.custom() - ) + z.record(z.custom()) ) .returns(z.custom()), filter: z.union([ From 0e752f9a0a2039beae3d4d2ee8b9255f994b8b52 Mon Sep 17 00:00:00 2001 From: Aaron Shim <5382864+aaronshim@users.noreply.github.com> Date: Fri, 26 Sep 2025 18:52:21 +0000 Subject: [PATCH 3/5] Rename ratingsContext to ratingsResult --- runner/ratings/autoraters/code-rater.ts | 8 +++---- runner/ratings/autoraters/rate-files.ts | 8 +++---- .../built-in-ratings/code-quality-rating.ts | 2 +- runner/ratings/rate-code.ts | 24 +++++++++---------- runner/ratings/rating-types.ts | 6 ++--- 5 files changed, 24 insertions(+), 24 deletions(-) diff --git a/runner/ratings/autoraters/code-rater.ts b/runner/ratings/autoraters/code-rater.ts index bcd9c85..cf09a7b 100644 --- a/runner/ratings/autoraters/code-rater.ts +++ b/runner/ratings/autoraters/code-rater.ts @@ -15,7 +15,7 @@ import { } from './auto-rate-shared.js'; import { GenkitRunner } from '../../codegen/genkit/genkit-runner.js'; import defaultCodeRaterPrompt from './code-rating-prompt.js'; -import { RatingsContext } from '../rating-types.js'; +import { RatingsResult } from '../rating-types.js'; /** Framework-specific hints for the rating prompt. */ const FW_HINTS: Record = { @@ -39,7 +39,7 @@ const CACHED_RATING_PROMPTS: Record = {}; * @param environment Environment in which the rating is running. * @param files Files to be rated. * @param appPrompt Prompt to be used for the rating. - * @param ratingsContext Context containing results from previous ratings. + * @param ratingsResult Context containing results from previous ratings. */ export async function autoRateCode( llm: GenkitRunner, @@ -48,7 +48,7 @@ export async function autoRateCode( environment: Environment, files: LlmResponseFile[], appPrompt: string, - ratingsContext: RatingsContext + ratingsResult: RatingsResult ): Promise { const contextMessage = prepareContextFilesMessage( files.map((o) => ({ @@ -69,7 +69,7 @@ export async function autoRateCode( promptText = defaultCodeRaterPrompt; } - const safetyRating = ratingsContext['safety-web']; + const safetyRating = ratingsResult['safety-web']; const safetyWebResultsJson = safetyRating?.state === IndividualAssessmentState.EXECUTED ? JSON.stringify(safetyRating, null, 2) diff --git a/runner/ratings/autoraters/rate-files.ts b/runner/ratings/autoraters/rate-files.ts index cc66820..bb45da9 100644 --- a/runner/ratings/autoraters/rate-files.ts +++ b/runner/ratings/autoraters/rate-files.ts @@ -9,7 +9,7 @@ import { autoRateCode } from './code-rater.js'; import { autoRateAppearance } from './visuals-rater.js'; import { Environment } from '../../configuration/environment.js'; import { GenkitRunner } from '../../codegen/genkit/genkit-runner.js'; -import { RatingsContext } from '../rating-types.js'; +import { RatingsResult } from '../rating-types.js'; /** * Automatically rates the code inside of a file. 
@@ -19,7 +19,7 @@ import { RatingsContext } from '../rating-types.js'; * @param filePath Path to the file to be rated. * @param appPrompt Prompt that should be checked. * @param screenshotPath Path to the screenshot to use for visual rating. - * @param ratingsContext Context containing results from previous ratings. + * @param ratingsResult Context containing results from previous ratings. */ export async function autoRateFiles( llm: GenkitRunner, @@ -29,7 +29,7 @@ export async function autoRateFiles( files: LlmResponseFile[], appPrompt: string, screenshotPngUrl: string | null, - ratingsContext: RatingsContext + ratingsResult: RatingsResult, ): Promise { console.log(`Autorater is using '${model}' model. \n`); @@ -42,7 +42,7 @@ export async function autoRateFiles( environment, files, appPrompt, - ratingsContext + ratingsResult, ); console.log(`${greenCheckmark()} Code scoring is successful.`); diff --git a/runner/ratings/built-in-ratings/code-quality-rating.ts b/runner/ratings/built-in-ratings/code-quality-rating.ts index ee2df0a..6cc5f95 100644 --- a/runner/ratings/built-in-ratings/code-quality-rating.ts +++ b/runner/ratings/built-in-ratings/code-quality-rating.ts @@ -22,7 +22,7 @@ export const codeQualityRating: LLMBasedRating = { ctx.environment, ctx.outputFiles, ctx.fullPromptText, - ctx.ratingsContext + ctx.ratingsResult ); return { diff --git a/runner/ratings/rate-code.ts b/runner/ratings/rate-code.ts index 59646c5..a4f38fc 100644 --- a/runner/ratings/rate-code.ts +++ b/runner/ratings/rate-code.ts @@ -20,7 +20,7 @@ import { POINTS_FOR_CATEGORIES, Rating, CATEGORY_NAMES, - RatingsContext, + RatingsResult, } from './rating-types.js'; import { extractEmbeddedCodeFromTypeScript } from './embedded-languages.js'; import { Environment } from '../configuration/environment.js'; @@ -63,7 +63,7 @@ export async function rateGeneratedCode( let categorizedFiles: CategorizedFiles | null = null; let totalPoints = 0; let maxOverallPoints = 0; - const ratingsContext: RatingsContext = {}; + const ratingsResult: RatingsResult = {}; // Rating may also invoke LLMs. Track the usage. const tokenUsage = { @@ -98,14 +98,14 @@ export async function rateGeneratedCode( repairAttempts, outputFiles.length, axeRepairAttempts, - ratingsContext + ratingsResult ); } else if (current.kind === RatingKind.PER_FILE) { categorizedFiles ??= splitFilesIntoCategories(outputFiles); result = await runPerFileRating( current, categorizedFiles, - ratingsContext + ratingsResult ); } else if (current.kind === RatingKind.LLM_BASED) { result = await runLlmBasedRating( @@ -121,7 +121,7 @@ export async function rateGeneratedCode( axeRepairAttempts, abortSignal, autoraterModel, - ratingsContext + ratingsResult ); } else { throw new UserFacingError(`Unsupported rating type ${current}`); @@ -147,7 +147,7 @@ export async function rateGeneratedCode( ); } - ratingsContext[current.id] = result; + ratingsResult[current.id] = result; category.assessments.push(result); } @@ -188,7 +188,7 @@ function runPerBuildRating( repairAttempts: number, generatedFileCount: number, axeRepairAttempts: number, - ratingsContext: RatingsContext + ratingsResult: RatingsResult ): IndividualAssessment | SkippedIndividualAssessment { const rateResult = rating.rate({ buildResult, @@ -196,7 +196,7 @@ function runPerBuildRating( repairAttempts, generatedFileCount, axeRepairAttempts, - ratingsContext, + ratingsResult, }); // If the rating was skipped (e.g., Axe test wasn't run), create a skipped assessment. 
@@ -215,7 +215,7 @@ function runPerBuildRating( async function runPerFileRating( rating: PerFileRating, categorizedFiles: CategorizedFiles, - ratingsContext: RatingsContext + ratingsResult: RatingsResult ): Promise { const errorMessages: string[] = []; let contentType: PerFileRatingContentType; @@ -246,7 +246,7 @@ async function runPerFileRating( // Remove comments from the code to avoid false-detection of bad patterns. // Some keywords like `NgModule` can be used in code comments. const code = removeComments(file.code, contentType); - const result = await rating.rate(code, file.filePath, ratingsContext); + const result = await rating.rate(code, file.filePath, ratingsResult); let coeff: number; if (typeof result === 'number') { @@ -292,7 +292,7 @@ async function runLlmBasedRating( axeRepairAttempts: number, abortSignal: AbortSignal, autoraterModel: string, - ratingsContext: RatingsContext + ratingsResult: RatingsResult ): Promise { const result = await rating.rate({ environment, @@ -306,7 +306,7 @@ async function runLlmBasedRating( repairAttempts, axeRepairAttempts, abortSignal, - ratingsContext, + ratingsResult, }); if (result.state === RatingState.SKIPPED) { diff --git a/runner/ratings/rating-types.ts b/runner/ratings/rating-types.ts index ecaa587..2ce3f0d 100644 --- a/runner/ratings/rating-types.ts +++ b/runner/ratings/rating-types.ts @@ -66,7 +66,7 @@ const perBuildRatingSchema = z repairAttempts: z.number(), axeRepairAttempts: z.number(), generatedFileCount: z.number(), - ratingsContext: z.record( + ratingsResult: z.record( z.custom() ), }) @@ -180,7 +180,7 @@ export interface ExecutedLLMBasedRating { }; } -export type RatingsContext = Record< +export type RatingsResult = Record< string, IndividualAssessment | SkippedIndividualAssessment >; @@ -197,7 +197,7 @@ export interface LLMBasedRatingContext { repairAttempts: number; axeRepairAttempts: number; abortSignal: AbortSignal; - ratingsContext: RatingsContext; + ratingsResult: RatingsResult; } /** Rating that applies over build results. */ From 0f8f6f9f2cb377963c9c3137775b7980f35b1b80 Mon Sep 17 00:00:00 2001 From: Aaron Shim <5382864+aaronshim@users.noreply.github.com> Date: Fri, 26 Sep 2025 20:33:24 +0000 Subject: [PATCH 4/5] Formatting --- runner/ratings/autoraters/rate-files.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/runner/ratings/autoraters/rate-files.ts b/runner/ratings/autoraters/rate-files.ts index bb45da9..e5453f2 100644 --- a/runner/ratings/autoraters/rate-files.ts +++ b/runner/ratings/autoraters/rate-files.ts @@ -29,7 +29,7 @@ export async function autoRateFiles( files: LlmResponseFile[], appPrompt: string, screenshotPngUrl: string | null, - ratingsResult: RatingsResult, + ratingsResult: RatingsResult ): Promise { console.log(`Autorater is using '${model}' model. \n`); @@ -42,7 +42,7 @@ export async function autoRateFiles( environment, files, appPrompt, - ratingsResult, + ratingsResult ); console.log(`${greenCheckmark()} Code scoring is successful.`); From 97ccb799dd01015dce44651dc5c306c07a473b83 Mon Sep 17 00:00:00 2001 From: Aaron Shim <5382864+aaronshim@users.noreply.github.com> Date: Fri, 26 Sep 2025 23:24:01 +0000 Subject: [PATCH 5/5] Comment explaining default rating order. 
--- runner/ratings/autoraters/code-rater.ts | 3 +++ 1 file changed, 3 insertions(+) diff --git a/runner/ratings/autoraters/code-rater.ts b/runner/ratings/autoraters/code-rater.ts index cf09a7b..15c67ca 100644 --- a/runner/ratings/autoraters/code-rater.ts +++ b/runner/ratings/autoraters/code-rater.ts @@ -69,6 +69,9 @@ export async function autoRateCode( promptText = defaultCodeRaterPrompt; } + // At this point, we assume that safety-web checks have run. + // The order in runner/ratings/built-in.ts has been set to ensure this. + // (But it's entirely possible that a particular run has overridden this with a different order.) const safetyRating = ratingsResult['safety-web']; const safetyWebResultsJson = safetyRating?.state === IndividualAssessmentState.EXECUTED