diff --git a/runner/ratings/autoraters/code-rater.ts b/runner/ratings/autoraters/code-rater.ts index 932092f..15c67ca 100644 --- a/runner/ratings/autoraters/code-rater.ts +++ b/runner/ratings/autoraters/code-rater.ts @@ -2,7 +2,12 @@ import { readFileSync } from 'node:fs'; import { z } from 'zod'; import { prepareContextFilesMessage } from '../../orchestration/codegen.js'; import { Environment } from '../../configuration/environment.js'; -import { LlmResponseFile } from '../../shared-interfaces.js'; +import { + IndividualAssessment, + IndividualAssessmentState, + LlmResponseFile, + SkippedIndividualAssessment, +} from '../../shared-interfaces.js'; import { AutoRateResult, getCoefficient, @@ -10,6 +15,7 @@ import { } from './auto-rate-shared.js'; import { GenkitRunner } from '../../codegen/genkit/genkit-runner.js'; import defaultCodeRaterPrompt from './code-rating-prompt.js'; +import { RatingsResult } from '../rating-types.js'; /** Framework-specific hints for the rating prompt. */ const FW_HINTS: Record = { @@ -33,6 +39,7 @@ const CACHED_RATING_PROMPTS: Record = {}; * @param environment Environment in which the rating is running. * @param files Files to be rated. * @param appPrompt Prompt to be used for the rating. + * @param ratingsResult Context containing results from previous ratings. */ export async function autoRateCode( llm: GenkitRunner, @@ -40,7 +47,8 @@ export async function autoRateCode( model: string, environment: Environment, files: LlmResponseFile[], - appPrompt: string + appPrompt: string, + ratingsResult: RatingsResult ): Promise { const contextMessage = prepareContextFilesMessage( files.map((o) => ({ @@ -61,10 +69,25 @@ export async function autoRateCode( promptText = defaultCodeRaterPrompt; } - const prompt = environment.renderPrompt(promptText, null, { - APP_PROMPT: appPrompt, - FRAMEWORK_SPECIFIC_HINTS: FW_HINTS[environment.fullStackFramework.id] ?? '', - }).result; + // At this point, we assume that safety-web checks have run. 
+ // The order in runner/ratings/built-in.ts has been set to ensure this. + // (However, a particular run may have overridden this with a different order.) + const safetyRating = ratingsResult['safety-web']; + const safetyWebResultsJson = + safetyRating?.state === IndividualAssessmentState.EXECUTED + ? JSON.stringify(safetyRating, null, 2) + : ''; + + const prompt = environment.renderPrompt( + promptText, + environment.codeRatingPromptPath, + { + APP_PROMPT: appPrompt, + FRAMEWORK_SPECIFIC_HINTS: + FW_HINTS[environment.fullStackFramework.id] ?? '', + SAFETY_WEB_RESULTS_JSON: safetyWebResultsJson, + } + ).result; const result = await llm.generateConstrained({ abortSignal, diff --git a/runner/ratings/autoraters/rate-files.ts b/runner/ratings/autoraters/rate-files.ts index 89b5836..e5453f2 100644 --- a/runner/ratings/autoraters/rate-files.ts +++ b/runner/ratings/autoraters/rate-files.ts @@ -1,9 +1,15 @@ import { greenCheckmark } from '../../reporting/format.js'; -import { AutoraterRunInfo, LlmResponseFile } from '../../shared-interfaces.js'; +import { + AutoraterRunInfo, + IndividualAssessment, + LlmResponseFile, + SkippedIndividualAssessment, +} from '../../shared-interfaces.js'; import { autoRateCode } from './code-rater.js'; import { autoRateAppearance } from './visuals-rater.js'; import { Environment } from '../../configuration/environment.js'; import { GenkitRunner } from '../../codegen/genkit/genkit-runner.js'; +import { RatingsResult } from '../rating-types.js'; /** * Automatically rates the code inside of a file. @@ -13,6 +19,7 @@ import { GenkitRunner } from '../../codegen/genkit/genkit-runner.js'; * @param filePath Path to the file to be rated. * @param appPrompt Prompt that should be checked. * @param screenshotPath Path to the screenshot to use for visual rating. + * @param ratingsResult Context containing results from previous ratings. 
*/ export async function autoRateFiles( llm: GenkitRunner, @@ -21,7 +28,8 @@ export async function autoRateFiles( environment: Environment, files: LlmResponseFile[], appPrompt: string, - screenshotPngUrl: string | null + screenshotPngUrl: string | null, + ratingsResult: RatingsResult ): Promise { console.log(`Autorater is using '${model}' model. \n`); @@ -33,7 +41,8 @@ export async function autoRateFiles( model, environment, files, - appPrompt + appPrompt, + ratingsResult ); console.log(`${greenCheckmark()} Code scoring is successful.`); diff --git a/runner/ratings/built-in-ratings/code-quality-rating.ts b/runner/ratings/built-in-ratings/code-quality-rating.ts index 2f77ef7..6cc5f95 100644 --- a/runner/ratings/built-in-ratings/code-quality-rating.ts +++ b/runner/ratings/built-in-ratings/code-quality-rating.ts @@ -21,7 +21,8 @@ export const codeQualityRating: LLMBasedRating = { ctx.model, ctx.environment, ctx.outputFiles, - ctx.fullPromptText + ctx.fullPromptText, + ctx.ratingsResult ); return { diff --git a/runner/ratings/built-in.ts b/runner/ratings/built-in.ts index 4838366..c07264a 100644 --- a/runner/ratings/built-in.ts +++ b/runner/ratings/built-in.ts @@ -20,6 +20,7 @@ import { export function getBuiltInRatings(): Rating[] { return [ successfulBuildRating, + safetyWebRating, noRuntimeExceptionsRating, sufficientCodeSizeRating, sufficientGeneratedFilesRating, @@ -27,7 +28,6 @@ export function getBuiltInRatings(): Rating[] { visualAppearanceRating, validCssRating, axeRating, - safetyWebRating, userJourneysRating, NoInnerHtmlBindingsRating, NoDangerouslySetInnerHtmlRating, diff --git a/runner/ratings/rate-code.ts b/runner/ratings/rate-code.ts index 305682c..a4f38fc 100644 --- a/runner/ratings/rate-code.ts +++ b/runner/ratings/rate-code.ts @@ -20,6 +20,7 @@ import { POINTS_FOR_CATEGORIES, Rating, CATEGORY_NAMES, + RatingsResult, } from './rating-types.js'; import { extractEmbeddedCodeFromTypeScript } from './embedded-languages.js'; import { Environment } from 
'../configuration/environment.js'; @@ -62,6 +63,7 @@ export async function rateGeneratedCode( let categorizedFiles: CategorizedFiles | null = null; let totalPoints = 0; let maxOverallPoints = 0; + const ratingsResult: RatingsResult = {}; // Rating may also invoke LLMs. Track the usage. const tokenUsage = { @@ -95,11 +97,16 @@ export async function rateGeneratedCode( serveTestingResult, repairAttempts, outputFiles.length, - axeRepairAttempts + axeRepairAttempts, + ratingsResult ); } else if (current.kind === RatingKind.PER_FILE) { categorizedFiles ??= splitFilesIntoCategories(outputFiles); - result = await runPerFileRating(current, categorizedFiles); + result = await runPerFileRating( + current, + categorizedFiles, + ratingsResult + ); } else if (current.kind === RatingKind.LLM_BASED) { result = await runLlmBasedRating( environment, @@ -113,7 +120,8 @@ export async function rateGeneratedCode( repairAttempts, axeRepairAttempts, abortSignal, - autoraterModel + autoraterModel, + ratingsResult ); } else { throw new UserFacingError(`Unsupported rating type ${current}`); @@ -139,6 +147,7 @@ export async function rateGeneratedCode( ); } + ratingsResult[current.id] = result; category.assessments.push(result); } @@ -178,7 +187,8 @@ function runPerBuildRating( serveResult: ServeTestingResult | null, repairAttempts: number, generatedFileCount: number, - axeRepairAttempts: number + axeRepairAttempts: number, + ratingsResult: RatingsResult ): IndividualAssessment | SkippedIndividualAssessment { const rateResult = rating.rate({ buildResult, @@ -186,6 +196,7 @@ function runPerBuildRating( repairAttempts, generatedFileCount, axeRepairAttempts, + ratingsResult, }); // If the rating was skipped (e.g., Axe test wasn't run), create a skipped assessment. 
@@ -203,7 +214,8 @@ function runPerBuildRating( async function runPerFileRating( rating: PerFileRating, - categorizedFiles: CategorizedFiles + categorizedFiles: CategorizedFiles, + ratingsResult: RatingsResult ): Promise { const errorMessages: string[] = []; let contentType: PerFileRatingContentType; @@ -234,7 +246,7 @@ async function runPerFileRating( // Remove comments from the code to avoid false-detection of bad patterns. // Some keywords like `NgModule` can be used in code comments. const code = removeComments(file.code, contentType); - const result = await rating.rate(code, file.filePath); + const result = await rating.rate(code, file.filePath, ratingsResult); let coeff: number; if (typeof result === 'number') { @@ -279,7 +291,8 @@ async function runLlmBasedRating( repairAttempts: number, axeRepairAttempts: number, abortSignal: AbortSignal, - autoraterModel: string + autoraterModel: string, + ratingsResult: RatingsResult ): Promise { const result = await rating.rate({ environment, @@ -293,6 +306,7 @@ async function runLlmBasedRating( repairAttempts, axeRepairAttempts, abortSignal, + ratingsResult, }); if (result.state === RatingState.SKIPPED) { diff --git a/runner/ratings/rating-types.ts b/runner/ratings/rating-types.ts index 0a2aa0a..2ce3f0d 100644 --- a/runner/ratings/rating-types.ts +++ b/runner/ratings/rating-types.ts @@ -1,8 +1,10 @@ import z from 'zod'; import { BuildResult } from '../workers/builder/builder-types.js'; import type { + IndividualAssessment, LlmResponseFile, PromptDefinition, + SkippedIndividualAssessment, Usage, } from '../shared-interfaces.js'; import { Environment } from '../configuration/environment.js'; @@ -64,6 +66,9 @@ const perBuildRatingSchema = z repairAttempts: z.number(), axeRepairAttempts: z.number(), generatedFileCount: z.number(), + ratingsResult: z.record( + z.custom() + ), }) ) .returns(z.custom()), @@ -76,7 +81,11 @@ const perFileRatingSchema = z kind: z.literal(RatingKind.PER_FILE), rate: z .function() - 
.args(z.string(), z.string().optional()) + .args( + z.string(), + z.string().optional(), + z.record(z.custom()) + ) .returns(z.custom()), filter: z.union([ z @@ -171,6 +180,11 @@ export interface ExecutedLLMBasedRating { }; } +export type RatingsResult = Record< + string, + IndividualAssessment | SkippedIndividualAssessment +>; + export interface LLMBasedRatingContext { environment: Environment; fullPromptText: string; @@ -183,6 +197,7 @@ export interface LLMBasedRatingContext { repairAttempts: number; axeRepairAttempts: number; abortSignal: AbortSignal; + ratingsResult: RatingsResult; } /** Rating that applies over build results. */