From bfb48b2a293fba68fa45c78ce42ae83673e3e8ca Mon Sep 17 00:00:00 2001
From: Kristiyan Kostadinov
Date: Thu, 4 Dec 2025 10:05:36 +0100
Subject: [PATCH 1/2] feat: add support for rating overrides

Adds a `ratingOverrides` field to the environment config that makes it easier
to override the weight of a specific rating without having to re-define it.
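A minimal usage sketch (the rating ID and values below are hypothetical, and
the surrounding config fields are elided):

```ts
// Environment config sketch: tweak a single rating by its unique ID
// without re-declaring the whole rating definition.
export const config = {
  // ...existing fields such as `clientSideFramework` and `ratings`...
  ratingOverrides: {
    // Key: the rating's unique ID. Value: the fields to override.
    'no-inline-styles': {scoreReduction: '15%', groupingLabels: ['styling']},
  },
};
```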
---
 runner/configuration/environment-config.ts |  7 +++++-
 runner/configuration/environment.ts        | 25 ++++++++++++++++++----
 runner/ratings/rating-types.ts             |  6 ++++++
 3 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/runner/configuration/environment-config.ts b/runner/configuration/environment-config.ts
index 3842b14..a422eb5 100644
--- a/runner/configuration/environment-config.ts
+++ b/runner/configuration/environment-config.ts
@@ -1,7 +1,7 @@
 import z from 'zod';
 import {createMessageBuilder, fromError} from 'zod-validation-error/v3';
 import {UserFacingError} from '../utils/errors.js';
-import {ratingSchema} from '../ratings/rating-types.js';
+import {ratingOverrideSchema, ratingSchema} from '../ratings/rating-types.js';
 import {EvalPrompt, EvalPromptWithMetadata, MultiStepPrompt} from './prompts.js';
 import {executorSchema} from '../orchestration/executors/executor.js';
 import {
@@ -21,6 +21,11 @@ export const environmentConfigSchema = z.object({
   clientSideFramework: z.string(),
   /** Ratings to run when evaluating the environment. */
   ratings: z.array(ratingSchema),
+  /**
+   * Map used to override fields for specific ratings. The key is the unique ID of
+   * the rating and the value is the set of fields to override.
+   */
+  ratingOverrides: z.record(z.string(), ratingOverrideSchema).optional(),
   /** Path to the prompt used by the LLM for generating files. */
   generationSystemPrompt: z.string(),
   /**
diff --git a/runner/configuration/environment.ts b/runner/configuration/environment.ts
index 7411263..b5cb1bf 100644
--- a/runner/configuration/environment.ts
+++ b/runner/configuration/environment.ts
@@ -69,7 +69,7 @@
 
   /** Prompts that should be executed as a part of the evaluation. */
   executablePrompts = lazy(async () => {
-    return this.resolveExecutablePrompts(this.config.executablePrompts, this.config.ratings);
+    return this.resolveExecutablePrompts(this.config.executablePrompts, this.config);
   });
 
   systemPromptGeneration = lazy(async () => {
@@ -166,15 +166,32 @@
 
   /**
    * Resolves the prompt configuration into prompt definitions.
-   * @param rootPath Root path of the project.
    * @param prompts Prompts to be resolved.
-   * @param envRatings Environment-level ratings.
+   * @param config Configuration for the environment.
    */
   private async resolveExecutablePrompts(
     prompts: EnvironmentConfig['executablePrompts'],
-    envRatings: Rating[],
+    config: EnvironmentConfig,
   ): Promise {
     const result: Promise[] = [];
+    let envRatings: Rating[];
+
+    if (config.ratingOverrides) {
+      Object.keys(config.ratingOverrides).forEach(id => {
+        if (!config.ratings.some(rating => rating.id === id)) {
+          throw new UserFacingError(
+            `Rating with an ID of "${id}" has not been configured. Cannot apply an override to it.`,
+          );
+        }
+      });
+
+      envRatings = config.ratings.map(rating => {
+        const override = config.ratingOverrides![rating.id];
+        return override ? {...rating, ...override} : rating;
+      });
+    } else {
+      envRatings = config.ratings;
+    }
 
     for (const def of prompts) {
       if (def instanceof MultiStepPrompt) {
diff --git a/runner/ratings/rating-types.ts b/runner/ratings/rating-types.ts
index 71dbd7c..bb5e353 100644
--- a/runner/ratings/rating-types.ts
+++ b/runner/ratings/rating-types.ts
@@ -126,6 +126,12 @@ export const ratingSchema = z.union([
   llmBasedRatingSchema,
 ]);
 
+export const ratingOverrideSchema = z.object({
+  category: z.custom<RatingCategory>().optional(),
+  scoreReduction: z.custom<`${number}%`>().optional(),
+  groupingLabels: z.array(z.string()).optional(),
+});
+
 /** Result of a per-build rating. */
 export type PerBuildRatingResult =
   | {

From 5f20a82a2284b95968277fbb96258e27e20ad9de Mon Sep 17 00:00:00 2001
From: Kristiyan Kostadinov
Date: Thu, 4 Dec 2025 10:48:43 +0100
Subject: [PATCH 2/2] feat: add config option to override categories

Adds a config option that allows users to override the configuration (display
name and point budget) of a specific rating category.
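A minimal usage sketch (the point values are hypothetical, both `name` and
`maxPoints` are optional, and the import path depends on where the config
lives):

```ts
import {RatingCategory} from './runner/ratings/rating-types.js';

// Environment config sketch: rename the high-impact category and move
// points from the low-impact budget into it (defaults are 60/30/10).
export const config = {
  // ...existing fields such as `ratings` and `ratingOverrides`...
  categoryOverrides: {
    [RatingCategory.HIGH_IMPACT]: {name: 'Critical', maxPoints: 65},
    [RatingCategory.LOW_IMPACT]: {maxPoints: 5},
  },
};
```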
---
 runner/configuration/environment-config.ts | 16 ++++++-
 runner/configuration/environment.ts        | 31 +++++++++++-
 runner/ratings/rate-code.ts                |  5 +-
 runner/ratings/rating-types.ts             | 14 ------
 runner/reporting/report-ai-chat.ts         | 56 +++++++++++++---------
 runner/shared-interfaces.ts                |  2 +-
 6 files changed, 81 insertions(+), 43 deletions(-)

diff --git a/runner/configuration/environment-config.ts b/runner/configuration/environment-config.ts
index a422eb5..ed48b72 100644
--- a/runner/configuration/environment-config.ts
+++ b/runner/configuration/environment-config.ts
@@ -1,7 +1,7 @@
 import z from 'zod';
 import {createMessageBuilder, fromError} from 'zod-validation-error/v3';
 import {UserFacingError} from '../utils/errors.js';
-import {ratingOverrideSchema, ratingSchema} from '../ratings/rating-types.js';
+import {RatingCategory, ratingOverrideSchema, ratingSchema} from '../ratings/rating-types.js';
 import {EvalPrompt, EvalPromptWithMetadata, MultiStepPrompt} from './prompts.js';
 import {executorSchema} from '../orchestration/executors/executor.js';
 import {
@@ -77,6 +77,20 @@
     'Executor to be used for this environment. ' +
       'If unset, a local executor is derived from the full environment configuration.',
   ),
+
+  /**
+   * Map used to override fields for specific rating categories. The key is the unique ID of
+   * the category and the value is the set of fields to override.
+   */
+  categoryOverrides: z
+    .record(
+      z.custom<RatingCategory>(),
+      z.object({
+        name: z.string().optional(),
+        maxPoints: z.number().optional(),
+      }),
+    )
+    .optional(),
 });
diff --git a/runner/configuration/environment.ts b/runner/configuration/environment.ts
index b5cb1bf..851bd02 100644
--- a/runner/configuration/environment.ts
+++ b/runner/configuration/environment.ts
@@ -2,7 +2,7 @@ import {readdirSync, readFileSync, statSync} from 'fs';
 import {basename, extname, join, resolve} from 'path';
 import {globSync} from 'tinyglobby';
 import {Executor} from '../orchestration/executors/executor.js';
-import {Rating} from '../ratings/rating-types.js';
+import {Rating, RatingCategory} from '../ratings/rating-types.js';
 import {
   FrameworkInfo,
   MultiStepPromptDefinition,
@@ -38,6 +38,12 @@ export class Environment {
   readonly executor: Executor;
   /** Timeout for a single eval prompt in minutes. */
   readonly promptTimeoutMinutes: number | undefined;
+  /** Configuration for the individual rating categories. */
+  readonly ratingCategories: {
+    [RatingCategory.HIGH_IMPACT]: {name: string; maxPoints: number};
+    [RatingCategory.MEDIUM_IMPACT]: {name: string; maxPoints: number};
+    [RatingCategory.LOW_IMPACT]: {name: string; maxPoints: number};
+  };
 
   constructor(
     rootPath: string,
@@ -65,6 +71,7 @@
     this.isBuiltIn = rootPath.includes('node_modules');
     this.executor = config.executor;
     this.promptTimeoutMinutes = config.promptTimeoutMinutes;
+    this.ratingCategories = this.getRatingCategories(config);
   }
 
   /** Prompts that should be executed as a part of the evaluation. */
@@ -370,4 +377,26 @@ export class Environment {
 
     return result;
   }
+
+  private getRatingCategories(config: EnvironmentConfig) {
+    const overrides = config.categoryOverrides;
+
+    return {
+      [RatingCategory.HIGH_IMPACT]: {
+        name: 'High Impact',
+        maxPoints: 60,
+        ...overrides?.[RatingCategory.HIGH_IMPACT],
+      },
+      [RatingCategory.MEDIUM_IMPACT]: {
+        name: 'Medium Impact',
+        maxPoints: 30,
+        ...overrides?.[RatingCategory.MEDIUM_IMPACT],
+      },
+      [RatingCategory.LOW_IMPACT]: {
+        name: 'Low Impact',
+        maxPoints: 10,
+        ...overrides?.[RatingCategory.LOW_IMPACT],
+      },
+    };
+  }
 }
diff --git a/runner/ratings/rate-code.ts b/runner/ratings/rate-code.ts
index 2067e87..14d772c 100644
--- a/runner/ratings/rate-code.ts
+++ b/runner/ratings/rate-code.ts
@@ -19,9 +19,7 @@ import {
   PerFileRatingContentType,
   RatingKind,
   RatingCategory,
-  POINTS_FOR_CATEGORIES,
   Rating,
-  CATEGORY_NAMES,
   RatingsResult,
 } from './rating-types.js';
 import {extractEmbeddedCodeFromTypeScript} from './embedded-languages.js';
@@ -82,10 +80,9 @@
     RatingCategory.MEDIUM_IMPACT,
     RatingCategory.LOW_IMPACT,
   ].map(category => ({
+    ...environment.ratingCategories[category],
     id: category,
-    name: CATEGORY_NAMES[category],
     points: 0,
-    maxPoints: POINTS_FOR_CATEGORIES[category],
     assessments: [],
   }));
 
diff --git a/runner/ratings/rating-types.ts b/runner/ratings/rating-types.ts
index bb5e353..71e6426 100644
--- a/runner/ratings/rating-types.ts
+++ b/runner/ratings/rating-types.ts
@@ -32,20 +32,6 @@ export enum RatingCategory {
   LOW_IMPACT = 'low-impact',
 }
 
-/** Points correspond to each `RatingCategory`. */
-export const POINTS_FOR_CATEGORIES = {
-  [RatingCategory.HIGH_IMPACT]: 60,
-  [RatingCategory.MEDIUM_IMPACT]: 30,
-  [RatingCategory.LOW_IMPACT]: 10,
-};
-
-/** Display names for each `RatingCategory`. */
-export const CATEGORY_NAMES = {
-  [RatingCategory.HIGH_IMPACT]: 'High Impact',
-  [RatingCategory.MEDIUM_IMPACT]: 'Medium Impact',
-  [RatingCategory.LOW_IMPACT]: 'Low Impact',
-};
-
 const ratingCommonContextFields = {
   ratingsResult: z.record(z.custom()),
   prompt: z.custom(),
diff --git a/runner/reporting/report-ai-chat.ts b/runner/reporting/report-ai-chat.ts
index af7e023..53c1ec6 100644
--- a/runner/reporting/report-ai-chat.ts
+++ b/runner/reporting/report-ai-chat.ts
@@ -12,27 +12,6 @@ import {
 } from '../shared-interfaces.js';
 import {BuildResultStatus} from '../workers/builder/builder-types.js';
 import {BUCKET_CONFIG} from '../ratings/stats.js';
-import {POINTS_FOR_CATEGORIES} from '../ratings/rating-types.js';
-
-export const reportLlmEvalsToolContext = `## What is a report?
-A report consists of many apps that were LLM generated. You will have information
-about checks that failed for this LLM generated app.
-
-Note that there may be multiple attempts for an app. E.g. an initial build may fail and
-another attempt might have repaired the build failure. The last attempt reflects the final
-state of the app. E.g. whether it does build, or if there are runtime errors.
-
-## Scoring mechanism
-Apps are rated based on their scores in the following buckets:
-${BUCKET_CONFIG.map(b => `* ${b.name}: ${b.min}-${b.max}`).join('\n')}
-
-The overall score of an app is determined based on score reductions.
-There are three pillars: ${Object.keys(POINTS_FOR_CATEGORIES).join(', ')}
-Pillars are a split up of a 100% perfect score, allowing for individual ratings
-to be less impactful than others. The pillars are distributed as follows:
-${Object.entries(POINTS_FOR_CATEGORIES).map(e => `* ${e[0]}: ${e[1]} points.`)}
-Within pillars, the available score can be reduced by individual ratings.
-`;
 
 const defaultAiChatPrompt = `Strictly follow the instructions here.
 - You are an expert in LLM-based code generation evaluation and quality assessments.
@@ -90,7 +69,7 @@ export async function chatWithReportAI(
 ${message}
 \`\`\`
 
-${reportLlmEvalsToolContext}
+${getContextPrompt(assessmentsToProcess)}
 
 ### How many apps are there?
 There are ${allAssessments.length} apps in this report.
@@ -193,3 +172,36 @@ function isAssessmentResultWithID(
 ): value is AssessmentResultFromReportServer {
   return (value as Partial<AssessmentResultFromReportServer>).id !== undefined;
 }
+
+function getContextPrompt(assessments: AssessmentResultFromReportServer[] | AssessmentResult[]) {
+  let categoryCount = 0;
+  const pointsForCategories = {} as Record<string, number>;
+
+  // Deduce the categories from the first result since they're the same for the entire run.
+  if (assessments.length) {
+    assessments[0].score.categories.forEach(category => {
+      categoryCount++;
+      pointsForCategories[category.id] = category.maxPoints;
+    });
+  }
+
+  return `## What is a report?
+A report consists of many apps that were LLM-generated. You will have information
+about checks that failed for this LLM-generated app.
+
+Note that there may be multiple attempts for an app. E.g. an initial build may fail and
+another attempt might have repaired the build failure. The last attempt reflects the final
+state of the app, e.g. whether it builds or whether there are runtime errors.
+
+## Scoring mechanism
+Apps are rated based on their scores in the following buckets:
+${BUCKET_CONFIG.map(b => `* ${b.name}: ${b.min}-${b.max}`).join('\n')}
+
+The overall score of an app is determined based on score reductions.
+There are ${categoryCount} pillars: ${Object.keys(pointsForCategories).join(', ')}
+Pillars split up a 100% perfect score, allowing individual ratings
+to be less impactful than others. The pillars are distributed as follows:
+${Object.entries(pointsForCategories).map(e => `* ${e[0]}: ${e[1]} points.`).join('\n')}
+Within pillars, the available score can be reduced by individual ratings.
+`;
+}
diff --git a/runner/shared-interfaces.ts b/runner/shared-interfaces.ts
index 72d541e..a1d1096 100644
--- a/runner/shared-interfaces.ts
+++ b/runner/shared-interfaces.ts
@@ -174,7 +174,7 @@ export interface LlmContextFile {
 export interface AssessmentCategory {
   /** Unique ID of the category. */
   id: RatingCategory;
-  /** Display name of the cateogry. */
+  /** Display name of the category. */
   name: string;
   /** Points that have been awarded to the category. */
   points: number;