From bfb48b2a293fba68fa45c78ce42ae83673e3e8ca Mon Sep 17 00:00:00 2001
From: Kristiyan Kostadinov
Date: Thu, 4 Dec 2025 10:05:36 +0100
Subject: [PATCH 1/2] feat: add support for rating overrides

Adds a `ratingOverrides` field to the environment config that makes it easier
to override the weight of a specific rating without having to re-define it.
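A minimal usage sketch (the rating ID and values below are hypothetical, and
the surrounding config fields are elided):

```ts
// Environment config sketch: tweak a single rating by its unique ID
// without re-declaring the whole rating definition.
export const config = {
  // ...existing fields such as `clientSideFramework` and `ratings`...
  ratingOverrides: {
    // Key: the rating's unique ID. Value: the fields to override.
    'no-inline-styles': {scoreReduction: '15%', groupingLabels: ['styling']},
  },
};
```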
---
 runner/configuration/environment-config.ts |  7 +++++-
 runner/configuration/environment.ts        | 25 ++++++++++++++++++----
 runner/ratings/rating-types.ts             |  6 ++++++
 3 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/runner/configuration/environment-config.ts b/runner/configuration/environment-config.ts
index 3842b14..a422eb5 100644
--- a/runner/configuration/environment-config.ts
+++ b/runner/configuration/environment-config.ts
@@ -1,7 +1,7 @@
 import z from 'zod';
 import {createMessageBuilder, fromError} from 'zod-validation-error/v3';
 import {UserFacingError} from '../utils/errors.js';
-import {ratingSchema} from '../ratings/rating-types.js';
+import {ratingOverrideSchema, ratingSchema} from '../ratings/rating-types.js';
 import {EvalPrompt, EvalPromptWithMetadata, MultiStepPrompt} from './prompts.js';
 import {executorSchema} from '../orchestration/executors/executor.js';
 import {
@@ -21,6 +21,11 @@ export const environmentConfigSchema = z.object({
   clientSideFramework: z.string(),
   /** Ratings to run when evaluating the environment. */
   ratings: z.array(ratingSchema),
+  /**
+   * Map used to override fields for specific ratings. The key is the unique ID of
+   * the rating and the value is the set of fields to override.
+   */
+  ratingOverrides: z.record(z.string(), ratingOverrideSchema).optional(),
   /** Path to the prompt used by the LLM for generating files. */
   generationSystemPrompt: z.string(),
   /**
diff --git a/runner/configuration/environment.ts b/runner/configuration/environment.ts
index 7411263..b5cb1bf 100644
--- a/runner/configuration/environment.ts
+++ b/runner/configuration/environment.ts
@@ -69,7 +69,7 @@
 
   /** Prompts that should be executed as a part of the evaluation. */
   executablePrompts = lazy(async () => {
-    return this.resolveExecutablePrompts(this.config.executablePrompts, this.config.ratings);
+    return this.resolveExecutablePrompts(this.config.executablePrompts, this.config);
   });
 
   systemPromptGeneration = lazy(async () => {
@@ -166,15 +166,32 @@
 
   /**
    * Resolves the prompt configuration into prompt definitions.
-   * @param rootPath Root path of the project.
    * @param prompts Prompts to be resolved.
-   * @param envRatings Environment-level ratings.
+   * @param config Configuration for the environment.
    */
   private async resolveExecutablePrompts(
     prompts: EnvironmentConfig['executablePrompts'],
-    envRatings: Rating[],
+    config: EnvironmentConfig,
   ): Promise {
     const result: Promise[] = [];
+    let envRatings: Rating[];
+
+    if (config.ratingOverrides) {
+      Object.keys(config.ratingOverrides).forEach(id => {
+        if (!config.ratings.some(rating => rating.id === id)) {
+          throw new UserFacingError(
+            `Rating with an ID of "${id}" has not been configured. Cannot apply an override to it.`,
+          );
+        }
+      });
+
+      envRatings = config.ratings.map(rating => {
+        const override = config.ratingOverrides![rating.id];
+        return override ? {...rating, ...override} : rating;
+      });
+    } else {
+      envRatings = config.ratings;
+    }
 
     for (const def of prompts) {
       if (def instanceof MultiStepPrompt) {
diff --git a/runner/ratings/rating-types.ts b/runner/ratings/rating-types.ts
index 71dbd7c..bb5e353 100644
--- a/runner/ratings/rating-types.ts
+++ b/runner/ratings/rating-types.ts
@@ -126,6 +126,12 @@ export const ratingSchema = z.union([
   llmBasedRatingSchema,
 ]);
 
+export const ratingOverrideSchema = z.object({
+  category: z.custom<RatingCategory>().optional(),
+  scoreReduction: z.custom<`${number}%`>().optional(),
+  groupingLabels: z.array(z.string()).optional(),
+});
+
 /** Result of a per-build rating. */
 export type PerBuildRatingResult =
   | {

From 5f20a82a2284b95968277fbb96258e27e20ad9de Mon Sep 17 00:00:00 2001
From: Kristiyan Kostadinov
Date: Thu, 4 Dec 2025 10:48:43 +0100
Subject: [PATCH 2/2] feat: add config option to override categories

Adds a config option that allows users to override the configuration (display
name and point budget) of a specific rating category.
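A minimal usage sketch (the point values are hypothetical, both `name` and
`maxPoints` are optional, and the import path depends on where the config
lives):

```ts
import {RatingCategory} from './runner/ratings/rating-types.js';

// Environment config sketch: rename the high-impact category and move
// points from the low-impact budget into it (defaults are 60/30/10).
export const config = {
  // ...existing fields such as `ratings` and `ratingOverrides`...
  categoryOverrides: {
    [RatingCategory.HIGH_IMPACT]: {name: 'Critical', maxPoints: 65},
    [RatingCategory.LOW_IMPACT]: {maxPoints: 5},
  },
};
```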
---
 runner/configuration/environment-config.ts | 16 ++++++-
 runner/configuration/environment.ts        | 31 +++++++++++-
 runner/ratings/rate-code.ts                |  5 +-
 runner/ratings/rating-types.ts             | 14 ------
 runner/reporting/report-ai-chat.ts         | 56 +++++++++++++---------
 runner/shared-interfaces.ts                |  2 +-
 6 files changed, 81 insertions(+), 43 deletions(-)

diff --git a/runner/configuration/environment-config.ts b/runner/configuration/environment-config.ts
index a422eb5..ed48b72 100644
--- a/runner/configuration/environment-config.ts
+++ b/runner/configuration/environment-config.ts
@@ -1,7 +1,7 @@
 import z from 'zod';
 import {createMessageBuilder, fromError} from 'zod-validation-error/v3';
 import {UserFacingError} from '../utils/errors.js';
-import {ratingOverrideSchema, ratingSchema} from '../ratings/rating-types.js';
+import {RatingCategory, ratingOverrideSchema, ratingSchema} from '../ratings/rating-types.js';
 import {EvalPrompt, EvalPromptWithMetadata, MultiStepPrompt} from './prompts.js';
 import {executorSchema} from '../orchestration/executors/executor.js';
 import {
@@ -77,6 +77,20 @@
     'Executor to be used for this environment. ' +
       'If unset, a local executor is derived from the full environment configuration.',
   ),
+
+  /**
+   * Map used to override fields for specific rating categories. The key is the unique ID of
+   * the category and the value is the set of fields to override.
+   */
+  categoryOverrides: z
+    .record(
+      z.custom<RatingCategory>(),
+      z.object({
+        name: z.string().optional(),
+        maxPoints: z.number().optional(),
+      }),
+    )
+    .optional(),
 });
diff --git a/runner/configuration/environment.ts b/runner/configuration/environment.ts
index b5cb1bf..851bd02 100644
--- a/runner/configuration/environment.ts
+++ b/runner/configuration/environment.ts
@@ -2,7 +2,7 @@ import {readdirSync, readFileSync, statSync} from 'fs';
 import {basename, extname, join, resolve} from 'path';
 import {globSync} from 'tinyglobby';
 import {Executor} from '../orchestration/executors/executor.js';
-import {Rating} from '../ratings/rating-types.js';
+import {Rating, RatingCategory} from '../ratings/rating-types.js';
 import {
   FrameworkInfo,
   MultiStepPromptDefinition,
@@ -38,6 +38,12 @@ export class Environment {
   readonly executor: Executor;
   /** Timeout for a single eval prompt in minutes. */
   readonly promptTimeoutMinutes: number | undefined;
+  /** Configuration for the individual rating categories. */
+  readonly ratingCategories: {
+    [RatingCategory.HIGH_IMPACT]: {name: string; maxPoints: number};
+    [RatingCategory.MEDIUM_IMPACT]: {name: string; maxPoints: number};
+    [RatingCategory.LOW_IMPACT]: {name: string; maxPoints: number};
+  };
 
   constructor(
     rootPath: string,
@@ -65,6 +71,7 @@
     this.isBuiltIn = rootPath.includes('node_modules');
     this.executor = config.executor;
     this.promptTimeoutMinutes = config.promptTimeoutMinutes;
+    this.ratingCategories = this.getRatingCategories(config);
   }
 
   /** Prompts that should be executed as a part of the evaluation. */
@@ -370,4 +377,26 @@ export class Environment {
 
     return result;
   }
+
+  private getRatingCategories(config: EnvironmentConfig) {
+    const overrides = config.categoryOverrides;
+
+    return {
+      [RatingCategory.HIGH_IMPACT]: {
+        name: 'High Impact',
+        maxPoints: 60,
+        ...overrides?.[RatingCategory.HIGH_IMPACT],
+      },
+      [RatingCategory.MEDIUM_IMPACT]: {
+        name: 'Medium Impact',
+        maxPoints: 30,
+        ...overrides?.[RatingCategory.MEDIUM_IMPACT],
+      },
+      [RatingCategory.LOW_IMPACT]: {
+        name: 'Low Impact',
+        maxPoints: 10,
+        ...overrides?.[RatingCategory.LOW_IMPACT],
+      },
+    };
+  }
 }
diff --git a/runner/ratings/rate-code.ts b/runner/ratings/rate-code.ts
index 2067e87..14d772c 100644
--- a/runner/ratings/rate-code.ts
+++ b/runner/ratings/rate-code.ts
@@ -19,9 +19,7 @@ import {
   PerFileRatingContentType,
   RatingKind,
   RatingCategory,
-  POINTS_FOR_CATEGORIES,
   Rating,
-  CATEGORY_NAMES,
   RatingsResult,
 } from './rating-types.js';
 import {extractEmbeddedCodeFromTypeScript} from './embedded-languages.js';
@@ -82,10 +80,9 @@
     RatingCategory.MEDIUM_IMPACT,
     RatingCategory.LOW_IMPACT,
   ].map(category => ({
+    ...environment.ratingCategories[category],
     id: category,
-    name: CATEGORY_NAMES[category],
     points: 0,
-    maxPoints: POINTS_FOR_CATEGORIES[category],
     assessments: [],
   }));
 
diff --git a/runner/ratings/rating-types.ts b/runner/ratings/rating-types.ts
index bb5e353..71e6426 100644
--- a/runner/ratings/rating-types.ts
+++ b/runner/ratings/rating-types.ts
@@ -32,20 +32,6 @@ export enum RatingCategory {
   LOW_IMPACT = 'low-impact',
 }
 
-/** Points correspond to each `RatingCategory`. */
-export const POINTS_FOR_CATEGORIES = {
-  [RatingCategory.HIGH_IMPACT]: 60,
-  [RatingCategory.MEDIUM_IMPACT]: 30,
-  [RatingCategory.LOW_IMPACT]: 10,
-};
-
-/** Display names for each `RatingCategory`. */
-export const CATEGORY_NAMES = {
-  [RatingCategory.HIGH_IMPACT]: 'High Impact',
-  [RatingCategory.MEDIUM_IMPACT]: 'Medium Impact',
-  [RatingCategory.LOW_IMPACT]: 'Low Impact',
-};
-
 const ratingCommonContextFields = {
   ratingsResult: z.record(z.custom()),
   prompt: z.custom(),
diff --git a/runner/reporting/report-ai-chat.ts b/runner/reporting/report-ai-chat.ts
index af7e023..53c1ec6 100644
--- a/runner/reporting/report-ai-chat.ts
+++ b/runner/reporting/report-ai-chat.ts
@@ -12,27 +12,6 @@ import {
 } from '../shared-interfaces.js';
 import {BuildResultStatus} from '../workers/builder/builder-types.js';
 import {BUCKET_CONFIG} from '../ratings/stats.js';
-import {POINTS_FOR_CATEGORIES} from '../ratings/rating-types.js';
-
-export const reportLlmEvalsToolContext = `## What is a report?
-A report consists of many apps that were LLM generated. You will have information
-about checks that failed for this LLM generated app.
-
-Note that there may be multiple attempts for an app. E.g. an initial build may fail and
-another attempt might have repaired the build failure. The last attempt reflects the final
-state of the app. E.g. whether it does build, or if there are runtime errors.
-
-## Scoring mechanism
-Apps are rated based on their scores in the following buckets:
-${BUCKET_CONFIG.map(b => `* ${b.name}: ${b.min}-${b.max}`).join('\n')}
-
-The overall score of an app is determined based on score reductions.
-There are three pillars: ${Object.keys(POINTS_FOR_CATEGORIES).join(', ')}
-Pillars are a split up of a 100% perfect score, allowing for individual ratings
-to be less impactful than others. The pillars are distributed as follows:
-${Object.entries(POINTS_FOR_CATEGORIES).map(e => `* ${e[0]}: ${e[1]} points.`)}
-Within pillars, the available score can be reduced by individual ratings.
-`;
 
 const defaultAiChatPrompt = `Strictly follow the instructions here.
 - You are an expert in LLM-based code generation evaluation and quality assessments.
@@ -90,7 +69,7 @@ export async function chatWithReportAI(
 ${message}
 \`\`\`
 
-${reportLlmEvalsToolContext}
+${getContextPrompt(assessmentsToProcess)}
 
 ### How many apps are there?
 There are ${allAssessments.length} apps in this report.
@@ -193,3 +172,36 @@ function isAssessmentResultWithID(
 ): value is AssessmentResultFromReportServer {
   return (value as Partial<AssessmentResultFromReportServer>).id !== undefined;
 }
+
+function getContextPrompt(assessments: AssessmentResultFromReportServer[] | AssessmentResult[]) {
+  let categoryCount = 0;
+  const pointsForCategories = {} as Record<string, number>;
+
+  // Deduce the categories from the first result since they're the same for the entire run.
+  if (assessments.length) {
+    assessments[0].score.categories.forEach(category => {
+      categoryCount++;
+      pointsForCategories[category.id] = category.maxPoints;
+    });
+  }
+
+  return `## What is a report?
+A report consists of many apps that were LLM-generated. You will have information
+about checks that failed for this LLM-generated app.
+
+Note that there may be multiple attempts for an app. E.g. an initial build may fail and
+another attempt might have repaired the build failure. The last attempt reflects the final
+state of the app, e.g. whether it builds or whether there are runtime errors.
+
+## Scoring mechanism
+Apps are rated based on their scores in the following buckets:
+${BUCKET_CONFIG.map(b => `* ${b.name}: ${b.min}-${b.max}`).join('\n')}
+
+The overall score of an app is determined based on score reductions.
+There are ${categoryCount} pillars: ${Object.keys(pointsForCategories).join(', ')}
+Pillars split up a 100% perfect score, allowing individual ratings
+to be less impactful than others. The pillars are distributed as follows:
+${Object.entries(pointsForCategories).map(e => `* ${e[0]}: ${e[1]} points.`).join('\n')}
+Within pillars, the available score can be reduced by individual ratings.
+`;
+}
diff --git a/runner/shared-interfaces.ts b/runner/shared-interfaces.ts
index 72d541e..a1d1096 100644
--- a/runner/shared-interfaces.ts
+++ b/runner/shared-interfaces.ts
@@ -174,7 +174,7 @@ export interface LlmContextFile {
 export interface AssessmentCategory {
   /** Unique ID of the category. */
   id: RatingCategory;
-  /** Display name of the cateogry. */
+  /** Display name of the category. */
   name: string;
   /** Points that have been awarded to the category. */
   points: number;