Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 29 additions & 6 deletions runner/ratings/autoraters/code-rater.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,20 @@ import { readFileSync } from 'node:fs';
import { z } from 'zod';
import { prepareContextFilesMessage } from '../../orchestration/codegen.js';
import { Environment } from '../../configuration/environment.js';
import { LlmResponseFile } from '../../shared-interfaces.js';
import {
IndividualAssessment,
IndividualAssessmentState,
LlmResponseFile,
SkippedIndividualAssessment,
} from '../../shared-interfaces.js';
import {
AutoRateResult,
getCoefficient,
MAX_RATING,
} from './auto-rate-shared.js';
import { GenkitRunner } from '../../codegen/genkit/genkit-runner.js';
import defaultCodeRaterPrompt from './code-rating-prompt.js';
import { RatingsResult } from '../rating-types.js';

/** Framework-specific hints for the rating prompt. */
const FW_HINTS: Record<string, string | undefined> = {
Expand All @@ -33,14 +39,16 @@ const CACHED_RATING_PROMPTS: Record<string, string> = {};
* @param environment Environment in which the rating is running.
* @param files Files to be rated.
* @param appPrompt Prompt to be used for the rating.
* @param ratingsResult Context containing results from previous ratings.
*/
export async function autoRateCode(
llm: GenkitRunner,
abortSignal: AbortSignal,
model: string,
environment: Environment,
files: LlmResponseFile[],
appPrompt: string
appPrompt: string,
ratingsResult: RatingsResult
): Promise<AutoRateResult> {
const contextMessage = prepareContextFilesMessage(
files.map((o) => ({
Expand All @@ -61,10 +69,25 @@ export async function autoRateCode(
promptText = defaultCodeRaterPrompt;
}

const prompt = environment.renderPrompt(promptText, null, {
APP_PROMPT: appPrompt,
FRAMEWORK_SPECIFIC_HINTS: FW_HINTS[environment.fullStackFramework.id] ?? '',
}).result;
// At this point, we assume that the safety-web checks have already run.
// The order in runner/ratings/built-in.ts has been set to ensure this.
// (A particular run may still override that order; if no executed safety-web
// result is available, an empty string is passed to the prompt instead.)
const safetyRating = ratingsResult['safety-web'];
const safetyWebResultsJson =
safetyRating?.state === IndividualAssessmentState.EXECUTED
? JSON.stringify(safetyRating, null, 2)
: '';

const prompt = environment.renderPrompt(
promptText,
environment.codeRatingPromptPath,
{
APP_PROMPT: appPrompt,
FRAMEWORK_SPECIFIC_HINTS:
FW_HINTS[environment.fullStackFramework.id] ?? '',
SAFETY_WEB_RESULTS_JSON: safetyWebResultsJson,
}
).result;

const result = await llm.generateConstrained({
abortSignal,
Expand Down
15 changes: 12 additions & 3 deletions runner/ratings/autoraters/rate-files.ts
Original file line number Diff line number Diff line change
@@ -1,9 +1,15 @@
import { greenCheckmark } from '../../reporting/format.js';
import { AutoraterRunInfo, LlmResponseFile } from '../../shared-interfaces.js';
import {
AutoraterRunInfo,
IndividualAssessment,
LlmResponseFile,
SkippedIndividualAssessment,
} from '../../shared-interfaces.js';
import { autoRateCode } from './code-rater.js';
import { autoRateAppearance } from './visuals-rater.js';
import { Environment } from '../../configuration/environment.js';
import { GenkitRunner } from '../../codegen/genkit/genkit-runner.js';
import { RatingsResult } from '../rating-types.js';

/**
* Automatically rates the code inside of a file.
Expand All @@ -13,6 +19,7 @@ import { GenkitRunner } from '../../codegen/genkit/genkit-runner.js';
* @param files Files to be rated.
* @param appPrompt Prompt that should be checked.
* @param screenshotPngUrl URL of the screenshot to use for visual rating, or `null` if there is none.
* @param ratingsResult Context containing results from previous ratings.
*/
export async function autoRateFiles(
llm: GenkitRunner,
Expand All @@ -21,7 +28,8 @@ export async function autoRateFiles(
environment: Environment,
files: LlmResponseFile[],
appPrompt: string,
screenshotPngUrl: string | null
screenshotPngUrl: string | null,
ratingsResult: RatingsResult
): Promise<AutoraterRunInfo> {
console.log(`Autorater is using '${model}' model. \n`);

Expand All @@ -33,7 +41,8 @@ export async function autoRateFiles(
model,
environment,
files,
appPrompt
appPrompt,
ratingsResult
);
console.log(`${greenCheckmark()} Code scoring is successful.`);

Expand Down
3 changes: 2 additions & 1 deletion runner/ratings/built-in-ratings/code-quality-rating.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ export const codeQualityRating: LLMBasedRating = {
ctx.model,
ctx.environment,
ctx.outputFiles,
ctx.fullPromptText
ctx.fullPromptText,
ctx.ratingsResult
);

return {
Expand Down
2 changes: 1 addition & 1 deletion runner/ratings/built-in.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,14 @@ import {
export function getBuiltInRatings(): Rating[] {
return [
successfulBuildRating,
safetyWebRating,
noRuntimeExceptionsRating,
sufficientCodeSizeRating,
sufficientGeneratedFilesRating,
codeQualityRating,
visualAppearanceRating,
validCssRating,
axeRating,
safetyWebRating,
userJourneysRating,
NoInnerHtmlBindingsRating,
NoDangerouslySetInnerHtmlRating,
Expand Down
28 changes: 21 additions & 7 deletions runner/ratings/rate-code.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import {
POINTS_FOR_CATEGORIES,
Rating,
CATEGORY_NAMES,
RatingsResult,
} from './rating-types.js';
import { extractEmbeddedCodeFromTypeScript } from './embedded-languages.js';
import { Environment } from '../configuration/environment.js';
Expand Down Expand Up @@ -62,6 +63,7 @@ export async function rateGeneratedCode(
let categorizedFiles: CategorizedFiles | null = null;
let totalPoints = 0;
let maxOverallPoints = 0;
const ratingsResult: RatingsResult = {};

// Rating may also invoke LLMs. Track the usage.
const tokenUsage = {
Expand Down Expand Up @@ -95,11 +97,16 @@ export async function rateGeneratedCode(
serveTestingResult,
repairAttempts,
outputFiles.length,
axeRepairAttempts
axeRepairAttempts,
ratingsResult
);
} else if (current.kind === RatingKind.PER_FILE) {
categorizedFiles ??= splitFilesIntoCategories(outputFiles);
result = await runPerFileRating(current, categorizedFiles);
result = await runPerFileRating(
current,
categorizedFiles,
ratingsResult
);
} else if (current.kind === RatingKind.LLM_BASED) {
result = await runLlmBasedRating(
environment,
Expand All @@ -113,7 +120,8 @@ export async function rateGeneratedCode(
repairAttempts,
axeRepairAttempts,
abortSignal,
autoraterModel
autoraterModel,
ratingsResult
);
} else {
throw new UserFacingError(`Unsupported rating type ${current}`);
Expand All @@ -139,6 +147,7 @@ export async function rateGeneratedCode(
);
}

ratingsResult[current.id] = result;
category.assessments.push(result);
}

Expand Down Expand Up @@ -178,14 +187,16 @@ function runPerBuildRating(
serveResult: ServeTestingResult | null,
repairAttempts: number,
generatedFileCount: number,
axeRepairAttempts: number
axeRepairAttempts: number,
ratingsResult: RatingsResult
): IndividualAssessment | SkippedIndividualAssessment {
const rateResult = rating.rate({
buildResult,
serveResult,
repairAttempts,
generatedFileCount,
axeRepairAttempts,
ratingsResult,
});

// If the rating was skipped (e.g., Axe test wasn't run), create a skipped assessment.
Expand All @@ -203,7 +214,8 @@ function runPerBuildRating(

async function runPerFileRating(
rating: PerFileRating,
categorizedFiles: CategorizedFiles
categorizedFiles: CategorizedFiles,
ratingsResult: RatingsResult
): Promise<IndividualAssessment | SkippedIndividualAssessment> {
const errorMessages: string[] = [];
let contentType: PerFileRatingContentType;
Expand Down Expand Up @@ -234,7 +246,7 @@ async function runPerFileRating(
// Remove comments from the code to avoid false-detection of bad patterns.
// Some keywords like `NgModule` can be used in code comments.
const code = removeComments(file.code, contentType);
const result = await rating.rate(code, file.filePath);
const result = await rating.rate(code, file.filePath, ratingsResult);
let coeff: number;

if (typeof result === 'number') {
Expand Down Expand Up @@ -279,7 +291,8 @@ async function runLlmBasedRating(
repairAttempts: number,
axeRepairAttempts: number,
abortSignal: AbortSignal,
autoraterModel: string
autoraterModel: string,
ratingsResult: RatingsResult
): Promise<IndividualAssessment | SkippedIndividualAssessment> {
const result = await rating.rate({
environment,
Expand All @@ -293,6 +306,7 @@ async function runLlmBasedRating(
repairAttempts,
axeRepairAttempts,
abortSignal,
ratingsResult,
});

if (result.state === RatingState.SKIPPED) {
Expand Down
17 changes: 16 additions & 1 deletion runner/ratings/rating-types.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
import z from 'zod';
import { BuildResult } from '../workers/builder/builder-types.js';
import type {
IndividualAssessment,
LlmResponseFile,
PromptDefinition,
SkippedIndividualAssessment,
Usage,
} from '../shared-interfaces.js';
import { Environment } from '../configuration/environment.js';
Expand Down Expand Up @@ -64,6 +66,9 @@ const perBuildRatingSchema = z
repairAttempts: z.number(),
axeRepairAttempts: z.number(),
generatedFileCount: z.number(),
ratingsResult: z.record(
z.custom<IndividualAssessment | SkippedIndividualAssessment>()
),
})
)
.returns(z.custom<PerBuildRatingResult>()),
Expand All @@ -76,7 +81,11 @@ const perFileRatingSchema = z
kind: z.literal(RatingKind.PER_FILE),
rate: z
.function()
.args(z.string(), z.string().optional())
.args(
z.string(),
z.string().optional(),
z.record(z.custom<IndividualAssessment | SkippedIndividualAssessment>())
)
.returns(z.custom<PerFileRatingResult>()),
filter: z.union([
z
Expand Down Expand Up @@ -171,6 +180,11 @@ export interface ExecutedLLMBasedRating {
};
}

/**
 * Results of the ratings that have been evaluated so far, keyed by rating ID.
 * Passed to each rating's `rate` call so that a rating can take the outcome
 * of previously executed (or skipped) ratings into account.
 */
export type RatingsResult = Record<
string,
IndividualAssessment | SkippedIndividualAssessment
>;

export interface LLMBasedRatingContext {
environment: Environment;
fullPromptText: string;
Expand All @@ -183,6 +197,7 @@ export interface LLMBasedRatingContext {
repairAttempts: number;
axeRepairAttempts: number;
abortSignal: AbortSignal;
ratingsResult: RatingsResult;
}

/** Rating that applies over build results. */
Expand Down
Loading