From 2ae1a63e3fbb9af7096cbc7306c535ce95d01643 Mon Sep 17 00:00:00 2001 From: Christopher Ehrlich Date: Tue, 11 Nov 2025 15:50:57 +0700 Subject: [PATCH 01/11] initial eval docs --- ai-engineering/measure.mdx | 431 +++++++++++++++++++++++++++++++++++-- 1 file changed, 408 insertions(+), 23 deletions(-) diff --git a/ai-engineering/measure.mdx b/ai-engineering/measure.mdx index b288e8b0f..df51ac929 100644 --- a/ai-engineering/measure.mdx +++ b/ai-engineering/measure.mdx @@ -1,34 +1,45 @@ --- title: "Measure" description: "Learn how to measure the quality of your AI capabilities by running evaluations against ground truth data." -keywords: ["ai engineering", "AI engineering", "measure", "evals", "evaluation", "scoring", "graders"] +keywords: ["ai engineering", "AI engineering", "measure", "evals", "evaluation", "scoring", "scorers"] --- import { Badge } from "/snippets/badge.jsx" import { definitions } from '/snippets/definitions.mdx' -The evaluation framework described here is in active development. Axiom is working with design partners to shape what’s built. [Contact Axiom](https://www.axiom.co/contact) to get early access and join a focused group of teams shaping these tools. +The evaluation framework described here is in active development. Axiom is working with design partners to shape what's built. [Contact Axiom](https://www.axiom.co/contact) to get early access and join a focused group of teams shaping these tools. -The **Measure** stage is where you quantify the quality and effectiveness of your AI capability. Instead of relying on anecdotal checks, this stage uses a systematic process called an eval to score your capability’s performance against a known set of correct examples (ground truth). This provides a data-driven benchmark to ensure a capability is ready for production and to track its quality over time. +The **Measure** stage is where you quantify the quality and effectiveness of your AI capability. 
Instead of relying on anecdotal checks, this stage uses a systematic process called an eval to score your capability's performance against a known set of correct examples (ground truth). This provides a data-driven benchmark to ensure a capability is ready for production and to track its quality over time. + +Evaluations (evals) are systematic tests that measure how well your AI features perform. Instead of manually testing AI outputs, evals automatically run your AI code against test datasets and score the results using custom metrics. This lets you catch regressions, compare different approaches, and confidently improve your AI features over time. ## The `Eval` function -Coming soon The primary tool for the Measure stage is the `Eval` function, which will be available in the `axiom/ai` package. It provides a simple, declarative way to define a test suite for your capability directly in your codebase. +The primary tool for the Measure stage is the `Eval` function, available in the `axiom/ai/evals` package. It provides a simple, declarative way to define a test suite for your capability directly in your codebase. An `Eval` is structured around a few key parameters: -* `data`: An async function that returns your `collection` of `{ input, expected }` pairs, which serve as your ground truth. +* `data`: An async function that returns your collection of `{ input, expected }` pairs, which serve as your ground truth. * `task`: The function that executes your AI capability, taking an `input` and producing an `output`. -* `scorers`: An array of `grader` functions that score the `output` against the `expected` value. -* `threshold`: A score between 0 and 1 that determines the pass/fail condition for the evaluation. +* `scorers`: An array of scorer functions that score the `output` against the `expected` value. +* `metadata`: Optional metadata for the evaluation, such as a description. 
Here is an example of a complete evaluation suite: ```ts /evals/text-match.eval.ts -import { Levenshtein } from 'autoevals'; -import { Eval } from 'axiom/ai/evals'; +import { Eval, Scorer } from 'axiom/ai/evals'; + +const LevenshteinScorer = Scorer( + 'Levenshtein', + ({ output, expected }: { output: string; expected: string }) => { + // Calculate Levenshtein distance score + const distance = calculateLevenshtein(output, expected); + const maxLen = Math.max(output.length, expected.length); + return maxLen === 0 ? 1 : 1 - distance / maxLen; + } +); Eval('text-match-eval', { // 1. Your ground truth dataset @@ -46,40 +57,414 @@ Eval('text-match-eval', { }, // 2. The task that runs your capability - task: async (input: string) => { + task: async ({ input }) => { return `hi, ${input}!`; }, // 3. The scorers that grade the output - scorers: [Levenshtein], + scorers: [LevenshteinScorer], +}); +``` + +## Getting Started + +### Prerequisites + +- Node.js 22.20 or higher +- Existing AI SDK setup (e.g., `@ai-sdk/openai`, `ai`) +- Axiom account with API token and dataset + +### Installation + +```bash +npm install axiom +npm install --save-dev autoevals +``` + +Install required OpenTelemetry dependencies: + +```bash +npm install @opentelemetry/api \ + @opentelemetry/exporter-trace-otlp-http \ + @opentelemetry/resources \ + @opentelemetry/sdk-trace-node \ + @opentelemetry/semantic-conventions +``` + +### Configuration + +#### 1. Set up environment variables + +Create a `.env` file: + +```bash +AXIOM_URL="https://api.axiom.co" +AXIOM_TOKEN="xaat-******" +AXIOM_DATASET="my_dataset" +``` + +#### 2. Create instrumentation setup (optional) + +If you are evaluating components of a production application that is instrumented with OpenTelemetry, you can see your application spans in Axiom. In order to enable this, your instrumentation setup must be a function that can be passed in `axiom.config.ts`. An example is shown below. 
+ +Create `src/instrumentation.node.ts`: + +```typescript +import { OTLPTraceExporter } from '@opentelemetry/exporter-trace-otlp-http'; +import { resourceFromAttributes } from '@opentelemetry/resources'; +import { BatchSpanProcessor, NodeTracerProvider } from '@opentelemetry/sdk-trace-node'; +import { ATTR_SERVICE_NAME } from '@opentelemetry/semantic-conventions'; +import { initAxiomAI, RedactionPolicy } from 'axiom/ai'; +import type { AxiomEvalInstrumentationHook } from 'axiom/ai/config'; +import { tracer } from './tracer'; + +let provider: NodeTracerProvider | undefined; + +export const setupAppInstrumentation: AxiomEvalInstrumentationHook = async ({ + dataset, + url, + token, +}) => { + if (provider) { + return { provider }; + } + + if (!dataset || !url || !token) { + throw new Error('Missing environment variables'); + } + + const exporter = new OTLPTraceExporter({ + url: `${url}/v1/traces`, + headers: { + Authorization: `Bearer ${token}`, + 'X-Axiom-Dataset': dataset, + }, + }); + + provider = new NodeTracerProvider({ + resource: resourceFromAttributes({ + [ATTR_SERVICE_NAME]: 'my-app', + }), + spanProcessors: [new BatchSpanProcessor(exporter)], + }); + + provider.register(); + initAxiomAI({ tracer, redactionPolicy: RedactionPolicy.AxiomDefault }); + + return { provider }; +}; +``` + +Create `src/tracer.ts`: + +```typescript +import { trace } from '@opentelemetry/api'; + +export const tracer = trace.getTracer('my-tracer'); +``` + +#### 1. 
Create `axiom.config.ts` + +Create a configuration file at the root of your project: + +```typescript +import { defineConfig } from 'axiom/ai/config'; +import { setupAppInstrumentation } from './src/instrumentation.node'; + +export default defineConfig({ + eval: { + url: process.env.AXIOM_URL, + token: process.env.AXIOM_TOKEN, + dataset: process.env.AXIOM_DATASET, + + // Optional: customize which files to run + include: ['**/*.eval.{ts,js}'], + + // Optional: exclude patterns + exclude: [], + + // Optional: timeout for eval execution + timeoutMs: 60_000, + + // Optional: instrumentation hook for OpenTelemetry + instrumentation: ({ url, token, dataset }) => + setupAppInstrumentation({ url, token, dataset }), + }, +}); +``` + +## Setting up Flags + +Flags let you parameterize your AI behavior (like model choice or prompting strategies) and run experiments with different configurations. They're type-safe via Zod schemas and can be overridden at runtime. + +Create `src/lib/app-scope.ts`: + +```typescript +import { createAppScope } from 'axiom/ai/evals'; +import { z } from 'zod'; + +export const flagSchema = z.object({ + ticketClassification: z.object({ + model: z.string().default('gpt-4o-mini'), + }), +}); + +const { flag, pickFlags } = createAppScope({ flagSchema }); + +export { flag, pickFlags }; +``` + +## Writing a Real-World Eval + +Let's build a practical evaluation for a support ticket classification system. 
+ +Create an eval file `src/evals/ticket-classification.eval.ts`: + +```typescript +import { experimental_Eval as Eval, Scorer } from 'axiom/ai/evals'; +import { generateObject } from 'ai'; +import { openai } from '@ai-sdk/openai'; +import { wrapAISDKModel } from 'axiom/ai'; +import { flag, pickFlags } from '../lib/app-scope'; +import { z } from 'zod'; +import { ExactMatch } from 'autoevals'; + +// Define your schemas +const ticketCategorySchema = z.enum(['spam', 'question', 'feature_request', 'bug_report']); +const ticketResponseSchema = z.object({ + category: ticketCategorySchema, + response: z.string(), +}); + +// The function you want to evaluate +async function classifyTicket({ subject, content }: { subject?: string; content: string }) { + const model = flag('ticketClassification.model'); + + const result = await generateObject({ + model: wrapAISDKModel(openai(model)), + messages: [ + { + role: 'system', + content: `You are a customer support engineer classifying tickets as: spam, question, feature_request, or bug_report. + +If spam, return a polite auto-close message. Otherwise, say a team member will respond shortly.`, + }, + { + role: 'user', + content: subject ? `Subject: ${subject}\n\n${content}` : content, + }, + ], + schema: ticketResponseSchema, + }); + + return result.object; +} - // 4. The pass/fail threshold for the scores - threshold: 1, +// Custom exact-match scorer +const ExactMatchScorer = Scorer( + 'Exact-Match', + ({ output, expected }: { output: { response: string }; expected: { response: string } }) => { + return ExactMatch({ + output: output.response, + expected: expected.response, + }); + } +); + +// Custom spam classification scorer +const SpamClassificationScorer = Scorer( + 'Spam-Classification', + ({ output, expected }: { + output: { category: string }; + expected: { category: string }; + }) => { + return (expected.category === 'spam') === (output.category === 'spam') ? 
1 : 0; + } +); + +// Define the evaluation +Eval('spam-classification', { + // Specify which flags this eval uses + configFlags: pickFlags('ticketClassification'), + + // Test data with input/expected pairs + data: () => [ + { + input: { + subject: "Congratulations! You've Been Selected for an Exclusive Reward", + content: 'Claim your $500 gift card now by clicking this link!', + }, + expected: { + category: 'spam', + response: "We're sorry, but your message has been automatically closed.", + }, + }, + { + input: { + subject: 'FREE V1AGRA', + content: 'BUY NOW ON WWW.BEST-DEALS.COM!', + }, + expected: { + category: 'spam', + response: "We're sorry, but your message has been automatically closed.", + }, + }, + ], + + // The task to run for each test case + task: async ({ input }) => { + return await classifyTicket(input); + }, + + // Scorers to measure performance + scorers: [SpamClassificationScorer, ExactMatchScorer], + + // Optional metadata + metadata: { + description: 'Classify support tickets as spam or not spam', + }, }); ``` -## Grading with scorers +## Scoring with Scorers -Coming soon A grader is a function that scores a capability’s output. Axiom will provide a library of built-in scorers for common tasks (e.g., checking for semantic similarity, factual correctness, or JSON validity). You can also provide your own custom functions to measure domain-specific logic. Each scorer receives the `input`, the generated `output`, and the `expected` value, and must return a score. +A scorer is a function that scores a capability's output. Scorers receive the `input`, the generated `output`, and the `expected` value, and return a score (typically 0-1). + +### Simple Custom Scorer + +```typescript +import { Scorer } from 'axiom/ai/evals'; + +const ExactMatchScorer = Scorer( + 'Exact-Match', + ({ output, expected }: { output: string; expected: string }) => { + return output === expected ? 
1 : 0; + } +); +``` -## Running evaluations +### Using AutoEvals Library -Coming soon You will run your evaluation suites from your terminal using the `axiom` CLI. +The `autoevals` library provides pre-built scorers for common tasks like semantic similarity, factual correctness, and text matching: + +```typescript +import { Scorer } from 'axiom/ai/evals'; +import { ExactMatch } from 'autoevals'; + +const WrappedExactMatch = Scorer( + 'Exact-Match', + ({ output, expected }: { output: string; expected: string }) => { + return ExactMatch({ output, expected }); + } +); +``` + +### Scorer with Metadata + +Scorers can return additional metadata alongside the score: + +```typescript +const CustomScorer = Scorer( + 'Custom-Scorer', + ({ output, expected }) => { + const score = computeScore(output, expected); + return { + score, + metadata: { + details: 'Additional info about this score', + }, + }; + } +); +``` + +## Running Evaluations + +You run your evaluation suites from your terminal using the `axiom` CLI. + +### Run all evals ```bash -axiom run evals/text-match.eval.ts +axiom eval ``` -This command will execute the specified test file using `vitest` in the background. Note that `vitest` will be a peer dependency for this functionality. +This finds and runs all files matching `**/*.eval.{ts,js}`. + +### Run specific eval file + +```bash +axiom eval src/evals/ticket-classification.eval.ts +``` -## Analyzing results in the console +### Run evals matching a glob pattern + +```bash +axiom eval "**/*spam*.eval.ts" +``` + +### Run eval by name + +```bash +axiom eval "spam-classification" +``` + +### List available evals without running + +```bash +axiom eval --list +``` + +## Overriding Flags + +Flags allow you to run experiments by testing different configurations without changing code. 
+ +### From CLI (dot notation) + +Override individual flags: + +```bash +axiom eval --flag.ticketClassification.model=gpt-4o +``` + +### From JSON file + +Create `experiment.json`: + +```json +{ + "ticketClassification": { + "model": "gpt-4o" + } +} +``` + +Then run: + +```bash +axiom eval --flags-config=experiment.json +``` + +## Analyzing Results in the Console Coming soon When you run an eval, the Axiom SDK captures a detailed OpenTelemetry trace for the entire run. This includes parent spans for the evaluation suite and child spans for each individual test case, task execution, and scorer result. These traces are enriched with `eval.*` attributes, allowing you to deeply analyze results in the Axiom Console. +After running evals, you'll see: +- Pass/fail status for each test case +- Scores from each scorer +- Comparison to baseline (if available) +- Links to view detailed traces in Axiom + +Results are also sent to your Axiom dataset for long-term tracking and analysis. + The Console will feature leaderboards and comparison views to track score progression across different versions of a capability, helping you verify that your changes are leading to measurable improvements. -## What’s next? +## What's Next? + +Once your capability meets your quality benchmarks in the Measure stage, it's ready to be deployed. Additional next steps include: -Once your capability meets your quality benchmarks in the Measure stage, it’s ready to be deployed. The next step is to monitor its performance with real-world traffic. +- **Baseline Comparisons**: Run evals multiple times to track regression over time +- **Experiment with Flags**: Test different models or strategies using flag overrides +- **Advanced Scorers**: Build custom scorers for domain-specific metrics +- **CI/CD Integration**: Add `axiom eval` to your CI pipeline to catch regressions -Learn more about this step of the AI engineering workflow in the [Observe](/ai-engineering/observe) docs. 
\ No newline at end of file +The next step is to monitor its performance with real-world traffic. Learn more about this step of the AI engineering workflow in the [Observe](/ai-engineering/observe) docs. From a082b905e392d6d7641dc7f18d8b63df5f604a80 Mon Sep 17 00:00:00 2001 From: Christopher Ehrlich Date: Tue, 11 Nov 2025 15:53:36 +0700 Subject: [PATCH 02/11] add note about instrumentation fn --- ai-engineering/measure.mdx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ai-engineering/measure.mdx b/ai-engineering/measure.mdx index df51ac929..28a0d6840 100644 --- a/ai-engineering/measure.mdx +++ b/ai-engineering/measure.mdx @@ -103,7 +103,7 @@ AXIOM_TOKEN="xaat-******" AXIOM_DATASET="my_dataset" ``` -#### 2. Create instrumentation setup (optional) +#### 2. Create instrumentation setup(optional) If you are evaluating components of a production application that is instrumented with OpenTelemetry, you can see your application spans in Axiom. In order to enable this, your instrumentation setup must be a function that can be passed in `axiom.config.ts`. An example is shown below. 
@@ -187,6 +187,7 @@ export default defineConfig({ timeoutMs: 60_000, // Optional: instrumentation hook for OpenTelemetry + // (created this in the "Create instrumentation setup" step) instrumentation: ({ url, token, dataset }) => setupAppInstrumentation({ url, token, dataset }), }, From 7df0bdb082039969ffbcf194a15404a19e0ec45d Mon Sep 17 00:00:00 2001 From: Mano Toth Date: Tue, 11 Nov 2025 16:12:58 +0100 Subject: [PATCH 03/11] Stylistic fixes --- ai-engineering/measure.mdx | 54 +++++++++++++++++++------------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/ai-engineering/measure.mdx b/ai-engineering/measure.mdx index 28a0d6840..d706ee0c4 100644 --- a/ai-engineering/measure.mdx +++ b/ai-engineering/measure.mdx @@ -5,17 +5,17 @@ keywords: ["ai engineering", "AI engineering", "measure", "evals", "evaluation", --- import { Badge } from "/snippets/badge.jsx" -import { definitions } from '/snippets/definitions.mdx' +import { definitions } from "/snippets/definitions.mdx" -The evaluation framework described here is in active development. Axiom is working with design partners to shape what's built. [Contact Axiom](https://www.axiom.co/contact) to get early access and join a focused group of teams shaping these tools. +The evaluation framework described here is in active development. Axiom is working with design partners to shape what’s built. [Contact Axiom](https://www.axiom.co/contact) to get early access and join a focused group of teams shaping these tools. -The **Measure** stage is where you quantify the quality and effectiveness of your AI capability. Instead of relying on anecdotal checks, this stage uses a systematic process called an eval to score your capability's performance against a known set of correct examples (ground truth). This provides a data-driven benchmark to ensure a capability is ready for production and to track its quality over time. 
+The **Measure** stage is where you quantify the quality and effectiveness of your AI capability. Instead of relying on anecdotal checks, this stage uses a systematic process called an eval to score your capability’s performance against a known set of correct examples (ground truth). This provides a data-driven benchmark to ensure a capability is ready for production and to track its quality over time. Evaluations (evals) are systematic tests that measure how well your AI features perform. Instead of manually testing AI outputs, evals automatically run your AI code against test datasets and score the results using custom metrics. This lets you catch regressions, compare different approaches, and confidently improve your AI features over time. -## The `Eval` function +## `Eval` function The primary tool for the Measure stage is the `Eval` function, available in the `axiom/ai/evals` package. It provides a simple, declarative way to define a test suite for your capability directly in your codebase. @@ -66,7 +66,7 @@ Eval('text-match-eval', { }); ``` -## Getting Started +## Get started ### Prerequisites @@ -74,7 +74,7 @@ Eval('text-match-eval', { - Existing AI SDK setup (e.g., `@ai-sdk/openai`, `ai`) - Axiom account with API token and dataset -### Installation +### Install dependencies ```bash npm install axiom @@ -91,7 +91,7 @@ npm install @opentelemetry/api \ @opentelemetry/semantic-conventions ``` -### Configuration +### Configure #### 1. Set up environment variables @@ -103,7 +103,7 @@ AXIOM_TOKEN="xaat-******" AXIOM_DATASET="my_dataset" ``` -#### 2. Create instrumentation setup(optional) +#### 2. Create instrumentation setup (optional) If you are evaluating components of a production application that is instrumented with OpenTelemetry, you can see your application spans in Axiom. In order to enable this, your instrumentation setup must be a function that can be passed in `axiom.config.ts`. An example is shown below. 
@@ -194,9 +194,9 @@ export default defineConfig({ }); ``` -## Setting up Flags +## Set up flags -Flags let you parameterize your AI behavior (like model choice or prompting strategies) and run experiments with different configurations. They're type-safe via Zod schemas and can be overridden at runtime. +Flags let you parameterize your AI behavior (like model choice or prompting strategies) and run experiments with different configurations. They’re type-safe via Zod schemas and can be overridden at runtime. Create `src/lib/app-scope.ts`: @@ -215,9 +215,9 @@ const { flag, pickFlags } = createAppScope({ flagSchema }); export { flag, pickFlags }; ``` -## Writing a Real-World Eval +## Write real-world evals -Let's build a practical evaluation for a support ticket classification system. +Let’s build a practical evaluation for a support ticket classification system. Create an eval file `src/evals/ticket-classification.eval.ts`: @@ -327,11 +327,11 @@ Eval('spam-classification', { }); ``` -## Scoring with Scorers +## Score with scorers -A scorer is a function that scores a capability's output. Scorers receive the `input`, the generated `output`, and the `expected` value, and return a score (typically 0-1). +A scorer is a function that scores a capability’s output. Scorers receive the `input`, the generated `output`, and the `expected` value, and return a score (typically 0-1). 
-### Simple Custom Scorer +### Simple custom scorer ```typescript import { Scorer } from 'axiom/ai/evals'; @@ -344,7 +344,7 @@ const ExactMatchScorer = Scorer( ); ``` -### Using AutoEvals Library +### Use AutoEvals library The `autoevals` library provides pre-built scorers for common tasks like semantic similarity, factual correctness, and text matching: @@ -360,7 +360,7 @@ const WrappedExactMatch = Scorer( ); ``` -### Scorer with Metadata +### Scorer with metadata Scorers can return additional metadata alongside the score: @@ -379,7 +379,7 @@ const CustomScorer = Scorer( ); ``` -## Running Evaluations +## Run evaluations You run your evaluation suites from your terminal using the `axiom` CLI. @@ -415,7 +415,7 @@ axiom eval "spam-classification" axiom eval --list ``` -## Overriding Flags +## Override flags Flags allow you to run experiments by testing different configurations without changing code. @@ -445,11 +445,11 @@ Then run: axiom eval --flags-config=experiment.json ``` -## Analyzing Results in the Console +## Analyze results in Console Coming soon When you run an eval, the Axiom SDK captures a detailed OpenTelemetry trace for the entire run. This includes parent spans for the evaluation suite and child spans for each individual test case, task execution, and scorer result. These traces are enriched with `eval.*` attributes, allowing you to deeply analyze results in the Axiom Console. -After running evals, you'll see: +After running evals, you’ll see: - Pass/fail status for each test case - Scores from each scorer - Comparison to baseline (if available) @@ -459,13 +459,13 @@ Results are also sent to your Axiom dataset for long-term tracking and analysis. The Console will feature leaderboards and comparison views to track score progression across different versions of a capability, helping you verify that your changes are leading to measurable improvements. -## What's Next? +## What’s next? 
-Once your capability meets your quality benchmarks in the Measure stage, it's ready to be deployed. Additional next steps include: +Once your capability meets your quality benchmarks in the Measure stage, it’s ready to be deployed. Additional next steps include: -- **Baseline Comparisons**: Run evals multiple times to track regression over time -- **Experiment with Flags**: Test different models or strategies using flag overrides -- **Advanced Scorers**: Build custom scorers for domain-specific metrics -- **CI/CD Integration**: Add `axiom eval` to your CI pipeline to catch regressions +- **Baseline comparisons**: Run evals multiple times to track regression over time +- **Experiment with flags**: Test different models or strategies using flag overrides +- **Advanced scorers**: Build custom scorers for domain-specific metrics +- **CI/CD integration**: Add `axiom eval` to your CI pipeline to catch regressions The next step is to monitor its performance with real-world traffic. Learn more about this step of the AI engineering workflow in the [Observe](/ai-engineering/observe) docs. From 0254557d225c1f28c3f7fae4e6901a10f9eb5c4e Mon Sep 17 00:00:00 2001 From: Mano Toth Date: Thu, 13 Nov 2025 10:09:21 +0100 Subject: [PATCH 04/11] Quick fixes --- ai-engineering/measure.mdx | 20 ++++++++++---------- ai-engineering/quickstart.mdx | 3 +++ 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/ai-engineering/measure.mdx b/ai-engineering/measure.mdx index d706ee0c4..37cd0e96b 100644 --- a/ai-engineering/measure.mdx +++ b/ai-engineering/measure.mdx @@ -93,7 +93,7 @@ npm install @opentelemetry/api \ ### Configure -#### 1. Set up environment variables +#### Set up environment variables Create a `.env` file: @@ -103,13 +103,13 @@ AXIOM_TOKEN="xaat-******" AXIOM_DATASET="my_dataset" ``` -#### 2. 
Create instrumentation setup (optional) +#### Create instrumentation setup (optional) If you are evaluating components of a production application that is instrumented with OpenTelemetry, you can see your application spans in Axiom. In order to enable this, your instrumentation setup must be a function that can be passed in `axiom.config.ts`. An example is shown below. Create `src/instrumentation.node.ts`: -```typescript +```ts /src/instrumentation.node.ts import { OTLPTraceExporter } from '@opentelemetry/exporter-trace-otlp-http'; import { resourceFromAttributes } from '@opentelemetry/resources'; import { BatchSpanProcessor, NodeTracerProvider } from '@opentelemetry/sdk-trace-node'; @@ -163,11 +163,11 @@ import { trace } from '@opentelemetry/api'; export const tracer = trace.getTracer('my-tracer'); ``` -#### 1. Create `axiom.config.ts` +#### Create `axiom.config.ts` Create a configuration file at the root of your project: -```typescript +```ts /axiom.config.ts import { defineConfig } from 'axiom/ai/config'; import { setupAppInstrumentation } from './src/instrumentation.node'; @@ -200,7 +200,7 @@ Flags let you parameterize your AI behavior (like model choice or prompting stra Create `src/lib/app-scope.ts`: -```typescript +```ts /src/lib/app-scope.ts import { createAppScope } from 'axiom/ai/evals'; import { z } from 'zod'; @@ -333,7 +333,7 @@ A scorer is a function that scores a capability’s output. 
Scorers receive the ### Simple custom scorer -```typescript +```ts import { Scorer } from 'axiom/ai/evals'; const ExactMatchScorer = Scorer( @@ -348,7 +348,7 @@ const ExactMatchScorer = Scorer( The `autoevals` library provides pre-built scorers for common tasks like semantic similarity, factual correctness, and text matching: -```typescript +```ts import { Scorer } from 'axiom/ai/evals'; import { ExactMatch } from 'autoevals'; @@ -364,7 +364,7 @@ const WrappedExactMatch = Scorer( Scorers can return additional metadata alongside the score: -```typescript +```ts const CustomScorer = Scorer( 'Custom-Scorer', ({ output, expected }) => { @@ -381,7 +381,7 @@ const CustomScorer = Scorer( ## Run evaluations -You run your evaluation suites from your terminal using the `axiom` CLI. +To run your evaluation suites from your terminal, [install the Axiom CLI](/reference/cli) and use the following commands. ### Run all evals diff --git a/ai-engineering/quickstart.mdx b/ai-engineering/quickstart.mdx index 72058199b..c301da429 100644 --- a/ai-engineering/quickstart.mdx +++ b/ai-engineering/quickstart.mdx @@ -5,6 +5,7 @@ keywords: ["ai engineering", "getting started", "install", "setup", "configurati --- import ReplaceDatasetToken from "/snippets/replace-dataset-token.mdx" +import ReplaceDomain from "/snippets/replace-domain.mdx" import Prerequisites from "/snippets/standard-prerequisites.mdx" import AIInstrumentationApproaches from "/snippets/ai-instrumentation-approaches.mdx" @@ -148,6 +149,7 @@ For more information on specifying redaction policies, see [Redaction policies]( Store environment variables in an `.env` file in the root of your project: ```bash .env +AXIOM_URL="AXIOM_DOMAIN" AXIOM_TOKEN="API_TOKEN" AXIOM_DATASET="DATASET_NAME" OPENAI_API_KEY="" @@ -158,6 +160,7 @@ ANTHROPIC_API_KEY="" + Enter the API keys for the LLMs you want to work with. 
From 7b8bd252794f3c312514e5e7bcb33d0edc47ff08 Mon Sep 17 00:00:00 2001 From: Mano Toth Date: Thu, 13 Nov 2025 14:31:42 +0100 Subject: [PATCH 05/11] Fixes --- ai-engineering/create.mdx | 2 +- ai-engineering/observe/manual-instrumentation.mdx | 4 ++-- ai-engineering/quickstart.mdx | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/ai-engineering/create.mdx b/ai-engineering/create.mdx index 9f9f23232..5827e0a96 100644 --- a/ai-engineering/create.mdx +++ b/ai-engineering/create.mdx @@ -9,7 +9,7 @@ import { definitions } from '/snippets/definitions.mdx' The **Create** stage is about defining a new AI capability as a structured, version-able asset in your codebase. The goal is to move away from scattered, hard-coded string prompts and toward a more disciplined and organized approach to prompt engineering. -### Defining a capability as a prompt object +### Define a capability as a prompt object In Axiom AI engineering, every capability is represented by a `Prompt` object. This object serves as the single source of truth for the capability’s logic, including its messages, metadata, and the schema for its arguments. 
diff --git a/ai-engineering/observe/manual-instrumentation.mdx b/ai-engineering/observe/manual-instrumentation.mdx index 514281586..9e097aeb6 100644 --- a/ai-engineering/observe/manual-instrumentation.mdx +++ b/ai-engineering/observe/manual-instrumentation.mdx @@ -188,7 +188,7 @@ Example of a properly structured chat completion trace: ```typescript TypeScript expandable import { trace, SpanKind, SpanStatusCode } from '@opentelemetry/api'; -const tracer = trace.getTracer('my-ai-app'); +const tracer = trace.getTracer('my-app'); // Create a span for the AI operation return tracer.startActiveSpan('chat gpt-4', { @@ -233,7 +233,7 @@ from opentelemetry import trace from opentelemetry.trace import SpanKind import json -tracer = trace.get_tracer("my-ai-app") +tracer = trace.get_tracer("my-app") # Create a span for the AI operation with tracer.start_as_current_span("chat gpt-4", kind=SpanKind.CLIENT) as span: diff --git a/ai-engineering/quickstart.mdx b/ai-engineering/quickstart.mdx index c301da429..36a3a1ed3 100644 --- a/ai-engineering/quickstart.mdx +++ b/ai-engineering/quickstart.mdx @@ -117,7 +117,7 @@ To send data to Axiom, configure a tracer. For example, use a dedicated instrume // Configure the provider to export traces to your Axiom dataset const provider = new NodeTracerProvider({ resource: resourceFromAttributes({ - [ATTR_SERVICE_NAME]: 'my-ai-app', // Replace with your service name + [ATTR_SERVICE_NAME]: 'my-app', // Replace with your service name }, { // Use the latest schema version @@ -126,7 +126,7 @@ To send data to Axiom, configure a tracer. 
For example, use a dedicated instrume }), spanProcessor: new SimpleSpanProcessor( new OTLPTraceExporter({ - url: `https://api.axiom.co/v1/traces`, + url: `${process.env.AXIOM_URL}/v1/traces`, headers: { Authorization: `Bearer ${process.env.AXIOM_TOKEN}`, 'X-Axiom-Dataset': process.env.AXIOM_DATASET!, From 2251591b9ecef6bdd95fb2f5d1d84c15768bf997 Mon Sep 17 00:00:00 2001 From: Mano Toth Date: Fri, 14 Nov 2025 12:12:07 +0100 Subject: [PATCH 06/11] Add keywords --- ai-engineering/measure.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ai-engineering/measure.mdx b/ai-engineering/measure.mdx index 37cd0e96b..0db2d3042 100644 --- a/ai-engineering/measure.mdx +++ b/ai-engineering/measure.mdx @@ -1,7 +1,7 @@ --- title: "Measure" description: "Learn how to measure the quality of your AI capabilities by running evaluations against ground truth data." -keywords: ["ai engineering", "AI engineering", "measure", "evals", "evaluation", "scoring", "scorers"] +keywords: ["ai engineering", "AI engineering", "measure", "evals", "evaluation", "scoring", "scorers", "graders", "scores"] --- import { Badge } from "/snippets/badge.jsx" From 2c662b2c491c808bb242e2bd5b8a15126ae4ea55 Mon Sep 17 00:00:00 2001 From: Mano Toth Date: Mon, 17 Nov 2025 08:50:45 +0100 Subject: [PATCH 07/11] Restructure Measure page --- ai-engineering/measure.mdx | 343 ++++++++++++---------------------- ai-engineering/quickstart.mdx | 26 +-- 2 files changed, 136 insertions(+), 233 deletions(-) diff --git a/ai-engineering/measure.mdx b/ai-engineering/measure.mdx index 0db2d3042..fd482c2cf 100644 --- a/ai-engineering/measure.mdx +++ b/ai-engineering/measure.mdx @@ -15,111 +15,36 @@ The **Measure** stage is where you quantify the quality and effectiveness of you Evaluations (evals) are systematic tests that measure how well your AI features perform. Instead of manually testing AI outputs, evals automatically run your AI code against test datasets and score the results using custom metrics. 
This lets you catch regressions, compare different approaches, and confidently improve your AI features over time. -## `Eval` function +## Initial setup -The primary tool for the Measure stage is the `Eval` function, available in the `axiom/ai/evals` package. It provides a simple, declarative way to define a test suite for your capability directly in your codebase. +1. Follow the [Quickstart](/ai-engineering/quickstart) to set up instrumentation for your app. +1. Run the following command to install the `autoevals` library. This library provides pre-built scorers for common tasks like semantic similarity, factual correctness, and text matching. -An `Eval` is structured around a few key parameters: + ```bash + npm install --save-dev autoevals + ``` -* `data`: An async function that returns your collection of `{ input, expected }` pairs, which serve as your ground truth. -* `task`: The function that executes your AI capability, taking an `input` and producing an `output`. -* `scorers`: An array of scorer functions that score the `output` against the `expected` value. -* `metadata`: Optional metadata for the evaluation, such as a description. +### Change instrumentation -Here is an example of a complete evaluation suite: +Change the instrumentation setup in the `src/instrumentation.ts` file that you have previously created in the [Quickstart](/ai-engineering/quickstart). -```ts /evals/text-match.eval.ts -import { Eval, Scorer } from 'axiom/ai/evals'; - -const LevenshteinScorer = Scorer( - 'Levenshtein', - ({ output, expected }: { output: string; expected: string }) => { - // Calculate Levenshtein distance score - const distance = calculateLevenshtein(output, expected); - const maxLen = Math.max(output.length, expected.length); - return maxLen === 0 ? 1 : 1 - distance / maxLen; - } -); - -Eval('text-match-eval', { - // 1. 
Your ground truth dataset - data: async () => { - return [ - { - input: 'test', - expected: 'hi, test!', - }, - { - input: 'foobar', - expected: 'hello, foobar!', - }, - ]; - }, - - // 2. The task that runs your capability - task: async ({ input }) => { - return `hi, ${input}!`; - }, - - // 3. The scorers that grade the output - scorers: [LevenshteinScorer], -}); -``` - -## Get started - -### Prerequisites - -- Node.js 22.20 or higher -- Existing AI SDK setup (e.g., `@ai-sdk/openai`, `ai`) -- Axiom account with API token and dataset - -### Install dependencies - -```bash -npm install axiom -npm install --save-dev autoevals -``` - -Install required OpenTelemetry dependencies: - -```bash -npm install @opentelemetry/api \ - @opentelemetry/exporter-trace-otlp-http \ - @opentelemetry/resources \ - @opentelemetry/sdk-trace-node \ - @opentelemetry/semantic-conventions -``` - -### Configure - -#### Set up environment variables - -Create a `.env` file: - -```bash -AXIOM_URL="https://api.axiom.co" -AXIOM_TOKEN="xaat-******" -AXIOM_DATASET="my_dataset" -``` - -#### Create instrumentation setup (optional) - -If you are evaluating components of a production application that is instrumented with OpenTelemetry, you can see your application spans in Axiom. In order to enable this, your instrumentation setup must be a function that can be passed in `axiom.config.ts`. An example is shown below. 
- -Create `src/instrumentation.node.ts`: - -```ts /src/instrumentation.node.ts +```ts /src/instrumentation.ts lines highlight={1,9-10,16-28,58-59} +// Remove `import 'dotenv/config';` import { OTLPTraceExporter } from '@opentelemetry/exporter-trace-otlp-http'; import { resourceFromAttributes } from '@opentelemetry/resources'; -import { BatchSpanProcessor, NodeTracerProvider } from '@opentelemetry/sdk-trace-node'; +import { SimpleSpanProcessor, NodeTracerProvider } from '@opentelemetry/sdk-trace-node'; import { ATTR_SERVICE_NAME } from '@opentelemetry/semantic-conventions'; +import { trace } from "@opentelemetry/api"; import { initAxiomAI, RedactionPolicy } from 'axiom/ai'; + +// Import the type for the AxiomEvalInstrumentationHook import type { AxiomEvalInstrumentationHook } from 'axiom/ai/config'; -import { tracer } from './tracer'; + +const tracer = trace.getTracer("my-tracer"); let provider: NodeTracerProvider | undefined; +// Wrap your logic in the AxiomEvalInstrumentationHook function export const setupAppInstrumentation: AxiomEvalInstrumentationHook = async ({ dataset, url, @@ -133,43 +58,45 @@ export const setupAppInstrumentation: AxiomEvalInstrumentationHook = async ({ throw new Error('Missing environment variables'); } + // Replace the environment variables with the parameters passed to the function const exporter = new OTLPTraceExporter({ url: `${url}/v1/traces`, headers: { Authorization: `Bearer ${token}`, 'X-Axiom-Dataset': dataset, }, - }); + }) + // Configure the provider to export traces to your Axiom dataset provider = new NodeTracerProvider({ resource: resourceFromAttributes({ - [ATTR_SERVICE_NAME]: 'my-app', + [ATTR_SERVICE_NAME]: 'my-app', // Replace with your service name + }, + { + // Use the latest schema version + // Info: https://opentelemetry.io/docs/specs/semconv/ + schemaUrl: 'https://opentelemetry.io/schemas/1.37.0', }), - spanProcessors: [new BatchSpanProcessor(exporter)], + spanProcessor: new SimpleSpanProcessor(exporter), }); + // 
Register the provider provider.register(); + + // Initialize Axiom AI SDK with the configured tracer initAxiomAI({ tracer, redactionPolicy: RedactionPolicy.AxiomDefault }); return { provider }; }; ``` -Create `src/tracer.ts`: +### Create Axiom configuration file -```typescript -import { trace } from '@opentelemetry/api'; - -export const tracer = trace.getTracer('my-tracer'); -``` - -#### Create `axiom.config.ts` - -Create a configuration file at the root of your project: +At the root of your project, create the Axiom configuration file `/axiom.config.ts`: ```ts /axiom.config.ts import { defineConfig } from 'axiom/ai/config'; -import { setupAppInstrumentation } from './src/instrumentation.node'; +import { setupAppInstrumentation } from './src/instrumentation'; export default defineConfig({ eval: { @@ -194,34 +121,20 @@ export default defineConfig({ }); -## Set up flags +## Write evaluation function -Flags let you parameterize your AI behavior (like model choice or prompting strategies) and run experiments with different configurations. They’re type-safe via Zod schemas and can be overridden at runtime. +The `Eval` function provides a simple, declarative way to define a test suite for your capability directly in your codebase. -Create `src/lib/app-scope.ts`: +The key parameters of the `Eval` function: -```ts /src/lib/app-scope.ts -import { createAppScope } from 'axiom/ai/evals'; -import { z } from 'zod'; +- `data`: An async function that returns your collection of `{ input, expected }` pairs, which serve as your ground truth. +- `task`: The function that executes your AI capability, taking an `input` and producing an `output`. +- `scorers`: An array of scorer functions that score the `output` against the `expected` value. +- `metadata`: Optional metadata for the evaluation, such as a description. 
-export const flagSchema = z.object({ - ticketClassification: z.object({ - model: z.string().default('gpt-4o-mini'), - }), -}); +Create an evaluation for a support ticket classification system in the file `/src/evals/ticket-classification.eval.ts`. -const { flag, pickFlags } = createAppScope({ flagSchema }); - -export { flag, pickFlags }; -``` - -## Write real-world evals - -Let’s build a practical evaluation for a support ticket classification system. - -Create an eval file `src/evals/ticket-classification.eval.ts`: - -```typescript +```ts /src/evals/ticket-classification.eval.ts expandable import { experimental_Eval as Eval, Scorer } from 'axiom/ai/evals'; import { generateObject } from 'ai'; import { openai } from '@ai-sdk/openai'; @@ -302,7 +215,7 @@ Eval('spam-classification', { }, { input: { - subject: 'FREE V1AGRA', + subject: 'FREE CA$H', content: 'BUY NOW ON WWW.BEST-DEALS.COM!', }, expected: { @@ -327,111 +240,101 @@ Eval('spam-classification', { }); ``` -## Score with scorers +## Set up scorers A scorer is a function that scores a capability’s output. Scorers receive the `input`, the generated `output`, and the `expected` value, and return a score (typically 0-1). -### Simple custom scorer - -```ts -import { Scorer } from 'axiom/ai/evals'; - -const ExactMatchScorer = Scorer( - 'Exact-Match', - ({ output, expected }: { output: string; expected: string }) => { - return output === expected ? 
1 : 0; - } -); -``` - -### Use AutoEvals library - -The `autoevals` library provides pre-built scorers for common tasks like semantic similarity, factual correctness, and text matching: - -```ts -import { Scorer } from 'axiom/ai/evals'; -import { ExactMatch } from 'autoevals'; - -const WrappedExactMatch = Scorer( - 'Exact-Match', - ({ output, expected }: { output: string; expected: string }) => { - return ExactMatch({ output, expected }); - } -); -``` - -### Scorer with metadata - -Scorers can return additional metadata alongside the score: - -```ts -const CustomScorer = Scorer( - 'Custom-Scorer', - ({ output, expected }) => { - const score = computeScore(output, expected); - return { - score, - metadata: { - details: 'Additional info about this score', - }, - }; - } -); -``` +Examples to set up scorers: +- Use a simple exact match scorer that returns 1 if the output matches the expected value, and 0 otherwise: + + ```ts + import { Scorer } from 'axiom/ai/evals'; + + const ExactMatchScorer = Scorer( + 'Exact-Match', + ({ output, expected }: { output: string; expected: string }) => { + return output === expected ? 
1 : 0; } ); ``` - +- Use the `autoevals` library that provides prebuilt scorers for common tasks like semantic similarity, factual correctness, and text matching: + + ```ts + import { Scorer } from 'axiom/ai/evals'; + import { ExactMatch } from 'autoevals'; + + const WrappedExactMatch = Scorer( + 'Exact-Match', + ({ output, expected }: { output: string; expected: string }) => { + return ExactMatch({ output, expected }); + } + ); + ``` + +- Use a custom scorer that returns metadata alongside the score: + + ```ts + const CustomScorer = Scorer( + 'Custom-Scorer', + ({ output, expected }) => { + const score = computeScore(output, expected); + return { + score, + metadata: { + details: 'Additional info about this score', + }, + }; + } + ); + ``` ## Run evaluations To run your evaluation suites from your terminal, [install the Axiom CLI](/reference/cli) and use the following commands. -### Run all evals - -```bash -axiom eval -``` - -This finds and runs all files matching `**/*.eval.{ts,js}`. +| Description | Command | +| ----------- | ------- | +| Run all evals | `axiom eval` | +| Run specific eval file | `axiom eval src/evals/ticket-classification.eval.ts` | +| Run evals matching a glob pattern | `axiom eval "**/*spam*.eval.ts"` | +| Run eval by name | `axiom eval "spam-classification"` | +| List available evals without running | `axiom eval --list` | -### Run specific eval file +## Run experiments -```bash -axiom eval src/evals/ticket-classification.eval.ts -``` +Flags let you parameterize your AI behavior (like model choice or prompting strategies) and run experiments with different configurations. They’re type-safe via Zod schemas, and you can override them at runtime. -### Run evals matching a glob pattern +### Set up flags -```bash -axiom eval "**/*spam*.eval.ts" -``` +Create the file `src/lib/app-scope.ts` that uses the `ticketClassification` flag to test different language models. 
-### Run eval by name +```ts /src/lib/app-scope.ts +import { createAppScope } from 'axiom/ai/evals'; +import { z } from 'zod'; -```bash -axiom eval "spam-classification" -``` +export const flagSchema = z.object({ + ticketClassification: z.object({ + model: z.string().default('gpt-4o-mini'), + }), +}); -### List available evals without running +const { flag, pickFlags } = createAppScope({ flagSchema }); -```bash -axiom eval --list +export { flag, pickFlags }; ``` -## Override flags +### Override flags at runtime -Flags allow you to run experiments by testing different configurations without changing code. - -### From CLI (dot notation) - -Override individual flags: +Override flags directly when you run the eval: ```bash axiom eval --flag.ticketClassification.model=gpt-4o ``` -### From JSON file - -Create `experiment.json`: +Alternatively, specify the flag overrides in a JSON file. -```json +```json experiment.json { "ticketClassification": { "model": "gpt-4o" @@ -439,7 +342,7 @@ Create `experiment.json`: } ``` -Then run: +And then specify the JSON file as the value of the `flags-config` parameter when you run the eval: ```bash axiom eval --flags-config=experiment.json @@ -447,25 +350,25 @@ axiom eval --flags-config=experiment.json ## Analyze results in Console -Coming soon When you run an eval, the Axiom SDK captures a detailed OpenTelemetry trace for the entire run. This includes parent spans for the evaluation suite and child spans for each individual test case, task execution, and scorer result. These traces are enriched with `eval.*` attributes, allowing you to deeply analyze results in the Axiom Console. +Coming soon When you run an eval, Axiom AI SDK captures a detailed OpenTelemetry trace for the entire run. This includes parent spans for the evaluation suite and child spans for each individual test case, task execution, and scorer result. Axiom enriches the traces with `eval.*` attributes, allowing you to deeply analyze results in the Axiom Console. 
-After running evals, you’ll see: +The results of evals: - Pass/fail status for each test case - Scores from each scorer - Comparison to baseline (if available) - Links to view detailed traces in Axiom -Results are also sent to your Axiom dataset for long-term tracking and analysis. +Additionally, the AI SDK sends the results to your Axiom dataset for long-term tracking and analysis. The Console will feature leaderboards and comparison views to track score progression across different versions of a capability, helping you verify that your changes are leading to measurable improvements. ## What’s next? -Once your capability meets your quality benchmarks in the Measure stage, it’s ready to be deployed. Additional next steps include: +A capability is ready to be deployed when it meets your quality benchmarks. After deployment, the next steps can be the following: -- **Baseline comparisons**: Run evals multiple times to track regression over time -- **Experiment with flags**: Test different models or strategies using flag overrides -- **Advanced scorers**: Build custom scorers for domain-specific metrics -- **CI/CD integration**: Add `axiom eval` to your CI pipeline to catch regressions +- **Baseline comparisons**: Run evals multiple times to track regression over time. +- **Experiment with flags**: Test different models or strategies using flag overrides. +- **Advanced scorers**: Build custom scorers for domain-specific metrics. +- **CI/CD integration**: Add `axiom eval` to your CI pipeline to catch regressions. -The next step is to monitor its performance with real-world traffic. Learn more about this step of the AI engineering workflow in the [Observe](/ai-engineering/observe) docs. +The next step is to monitor your capability’s performance with real-world traffic. To learn more about this step of the AI engineering workflow, see [Observe](/ai-engineering/observe). 
diff --git a/ai-engineering/quickstart.mdx b/ai-engineering/quickstart.mdx index 36a3a1ed3..67624e927 100644 --- a/ai-engineering/quickstart.mdx +++ b/ai-engineering/quickstart.mdx @@ -102,20 +102,28 @@ To send data to Axiom, configure a tracer. For example, use a dedicated instrume 1. Create instrumentation file: ```typescript /src/instrumentation.ts - import 'dotenv/config'; // Make sure to load environment variables import { OTLPTraceExporter } from '@opentelemetry/exporter-trace-otlp-http'; import { resourceFromAttributes } from '@opentelemetry/resources'; - import { NodeTracerProvider } from '@opentelemetry/sdk-trace-node'; - import { SimpleSpanProcessor } from '@opentelemetry/sdk-trace-node'; + import { SimpleSpanProcessor, NodeTracerProvider } from '@opentelemetry/sdk-trace-node'; import { ATTR_SERVICE_NAME } from '@opentelemetry/semantic-conventions'; import { trace } from "@opentelemetry/api"; import { initAxiomAI, RedactionPolicy } from 'axiom/ai'; const tracer = trace.getTracer("my-tracer"); + let provider: NodeTracerProvider | undefined; + + const exporter = new OTLPTraceExporter({ + url: `${process.env.AXIOM_URL}/v1/traces`, + headers: { + Authorization: `Bearer ${process.env.AXIOM_TOKEN}`, + 'X-Axiom-Dataset': process.env.AXIOM_DATASET!, + }, + }) + // Configure the provider to export traces to your Axiom dataset - const provider = new NodeTracerProvider({ + provider = new NodeTracerProvider({ resource: resourceFromAttributes({ [ATTR_SERVICE_NAME]: 'my-app', // Replace with your service name }, @@ -124,15 +132,7 @@ To send data to Axiom, configure a tracer. 
For example, use a dedicated instrume // Info: https://opentelemetry.io/docs/specs/semconv/ schemaUrl: 'https://opentelemetry.io/schemas/1.37.0', }), - spanProcessor: new SimpleSpanProcessor( - new OTLPTraceExporter({ - url: `${process.env.AXIOM_URL}/v1/traces`, - headers: { - Authorization: `Bearer ${process.env.AXIOM_TOKEN}`, - 'X-Axiom-Dataset': process.env.AXIOM_DATASET!, - }, - }) - ), + spanProcessor: new SimpleSpanProcessor(exporter), }); // Register the provider From 95d4c5cef415bfbf37e36f5671ae69f0ce2fcee1 Mon Sep 17 00:00:00 2001 From: Mano Toth Date: Mon, 17 Nov 2025 13:10:31 +0100 Subject: [PATCH 08/11] Implement review --- ai-engineering/measure.mdx | 238 +++++++++++++++---------------------- 1 file changed, 94 insertions(+), 144 deletions(-) diff --git a/ai-engineering/measure.mdx b/ai-engineering/measure.mdx index fd482c2cf..b31298abc 100644 --- a/ai-engineering/measure.mdx +++ b/ai-engineering/measure.mdx @@ -15,81 +15,73 @@ The **Measure** stage is where you quantify the quality and effectiveness of you Evaluations (evals) are systematic tests that measure how well your AI features perform. Instead of manually testing AI outputs, evals automatically run your AI code against test datasets and score the results using custom metrics. This lets you catch regressions, compare different approaches, and confidently improve your AI features over time. -## Initial setup +## Prerequisites + +Follow the [Quickstart](/ai-engineering/quickstart) to set up instrumentation for your app. +- To run evals without an existing AI app, skip the part in the Quickstart about instrumentalising your app. 
+- To run evals within the context of an existing AI app, use the following instrumentation setup in the `src/instrumentation.ts` file that you have previously created in the [Quickstart](/ai-engineering/quickstart): + + ```ts /src/instrumentation.ts expandable + import { OTLPTraceExporter } from '@opentelemetry/exporter-trace-otlp-http'; + import { resourceFromAttributes } from '@opentelemetry/resources'; + import { BatchSpanProcessor, NodeTracerProvider } from '@opentelemetry/sdk-trace-node'; + import { ATTR_SERVICE_NAME } from '@opentelemetry/semantic-conventions'; + import { trace } from "@opentelemetry/api"; + import { initAxiomAI, RedactionPolicy } from 'axiom/ai'; + + // Import the type for the AxiomEvalInstrumentationHook + import type { AxiomEvalInstrumentationHook } from 'axiom/ai/config'; + + const tracer = trace.getTracer("my-tracer"); + + let provider: NodeTracerProvider | undefined; + + // Wrap your logic in the AxiomEvalInstrumentationHook function + export const setupAppInstrumentation: AxiomEvalInstrumentationHook = async ({ + dataset, + url, + token, + }) => { + if (provider) { + return { provider }; + } -1. Follow the [Quickstart](/ai-engineering/quickstart) to set up instrumentation for your app. -1. Run the following command to istall the `autoevals` library. This library provides pre-built scorers for common tasks like semantic similarity, factual correctness, and text matching. 
+ if (!dataset || !url || !token) { + throw new Error('Missing environment variables'); + } - ```bash - npm install --save-dev autoevals + // Replace the environment variables with the parameters passed to the function + const exporter = new OTLPTraceExporter({ + url: `${url}/v1/traces`, + headers: { + Authorization: `Bearer ${token}`, + 'X-Axiom-Dataset': dataset, + }, + }) + + // Configure the provider to export traces to your Axiom dataset + provider = new NodeTracerProvider({ + resource: resourceFromAttributes({ + [ATTR_SERVICE_NAME]: 'my-app', // Replace with your service name + }, + { + // Use the latest schema version + // Info: https://opentelemetry.io/docs/specs/semconv/ + schemaUrl: 'https://opentelemetry.io/schemas/1.37.0', + }), + spanProcessor: new BatchSpanProcessor(exporter), + }); + + // Register the provider + provider.register(); + + // Initialize Axiom AI SDK with the configured tracer + initAxiomAI({ tracer, redactionPolicy: RedactionPolicy.AxiomDefault }); + + return { provider }; + }; ``` -### Change instrumentation - -Change the instrumentation setup in the `src/instrumentation.ts` file that you have previously created in the [Quickstart](/ai-engineering/quickstart). 
- -```ts /src/instrumentation.ts lines highlight={1,9-10,16-28,58-59} -// Remove `import 'dotenv/config';` -import { OTLPTraceExporter } from '@opentelemetry/exporter-trace-otlp-http'; -import { resourceFromAttributes } from '@opentelemetry/resources'; -import { SimpleSpanProcessor, NodeTracerProvider } from '@opentelemetry/sdk-trace-node'; -import { ATTR_SERVICE_NAME } from '@opentelemetry/semantic-conventions'; -import { trace } from "@opentelemetry/api"; -import { initAxiomAI, RedactionPolicy } from 'axiom/ai'; - -// Import the type for the AxiomEvalInstrumentationHook -import type { AxiomEvalInstrumentationHook } from 'axiom/ai/config'; - -const tracer = trace.getTracer("my-tracer"); - -let provider: NodeTracerProvider | undefined; - -// Wrap your logic in the AxiomEvalInstrumentationHook function -export const setupAppInstrumentation: AxiomEvalInstrumentationHook = async ({ - dataset, - url, - token, -}) => { - if (provider) { - return { provider }; - } - - if (!dataset || !url || !token) { - throw new Error('Missing environment variables'); - } - - // Replace the environment variables with the parameters passed to the function - const exporter = new OTLPTraceExporter({ - url: `${url}/v1/traces`, - headers: { - Authorization: `Bearer ${token}`, - 'X-Axiom-Dataset': dataset, - }, - }) - - // Configure the provider to export traces to your Axiom dataset - provider = new NodeTracerProvider({ - resource: resourceFromAttributes({ - [ATTR_SERVICE_NAME]: 'my-app', // Replace with your service name - }, - { - // Use the latest schema version - // Info: https://opentelemetry.io/docs/specs/semconv/ - schemaUrl: 'https://opentelemetry.io/schemas/1.37.0', - }), - spanProcessor: new SimpleSpanProcessor(exporter), - }); - - // Register the provider - provider.register(); - - // Initialize Axiom AI SDK with the configured tracer - initAxiomAI({ tracer, redactionPolicy: RedactionPolicy.AxiomDefault }); - - return { provider }; -}; -``` - ### Create Axiom configuration file At 
the root of your project, create the Axiom configuration file `/axiom.config.ts`: @@ -132,7 +124,7 @@ The key parameters of the `Eval` function: - `scorers`: An array of scorer functions that score the `output` against the `expected` value. - `metadata`: Optional metadata for the evaluation, such as a description. -Create an evaluation for a support ticket classification system in the file `/src/evals/ticket-classification.eval.ts`. +As an example, create an evaluation for a support ticket classification system in the file `/src/evals/ticket-classification.eval.ts`. ```ts /src/evals/ticket-classification.eval.ts expandable import { experimental_Eval as Eval, Scorer } from 'axiom/ai/evals'; @@ -141,14 +133,6 @@ import { openai } from '@ai-sdk/openai'; import { wrapAISDKModel } from 'axiom/ai'; import { flag, pickFlags } from '../lib/app-scope'; import { z } from 'zod'; -import { ExactMatch } from 'autoevals'; - -// Define your schemas -const ticketCategorySchema = z.enum(['spam', 'question', 'feature_request', 'bug_report']); -const ticketResponseSchema = z.object({ - category: ticketCategorySchema, - response: z.string(), -}); // The function you want to evaluate async function classifyTicket({ subject, content }: { subject?: string; content: string }) { @@ -160,39 +144,48 @@ async function classifyTicket({ subject, content }: { subject?: string; content: { role: 'system', content: `You are a customer support engineer classifying tickets as: spam, question, feature_request, or bug_report. - -If spam, return a polite auto-close message. Otherwise, say a team member will respond shortly.`, + If spam, return a polite auto-close message. Otherwise, say a team member will respond shortly.`, }, { role: 'user', content: subject ? 
`Subject: ${subject}\n\n${content}` : content, }, ], - schema: ticketResponseSchema, + schema: z.object({ + category: z.enum(['spam', 'question', 'feature_request', 'bug_report']), + response: z.string() + }), }); return result.object; } -// Custom exact-match scorer +// Custom exact-match scorer that returns metadata const ExactMatchScorer = Scorer( 'Exact-Match', ({ output, expected }: { output: { response: string }; expected: { response: string } }) => { - return ExactMatch({ - output: output.response, - expected: expected.response, + const normalizedOutput = output.response.trim().toLowerCase(); + const normalizedExpected = expected.response.trim().toLowerCase(); + + return { + score: normalizedOutput === normalizedExpected, + metadata: { + details: 'Additional info about this score', + }, + }; }); } ); // Custom spam classification scorer const SpamClassificationScorer = Scorer( - 'Spam-Classification', - ({ output, expected }: { - output: { category: string }; + "Spam-Classification", + ({ output, expected }: { + output: { category: string }; expected: { category: string }; }) => { - return (expected.category === 'spam') === (output.category === 'spam') ? 1 : 0; + const isSpam = (x: { category: string }) => x.category === "spam"; + return isSpam(output) === isSpam(expected) ? 1 : 0; } ); @@ -240,55 +233,6 @@ Eval('spam-classification', { }); ``` -## Set up scorers - -A scorer is a function that scores a capability’s output. Scorers receive the `input`, the generated `output`, and the `expected` value, and return a score (typically 0-1). - -Examples to set up scorers: -- Use a simple exact match scorer that returns 1 if the output matches the expected value, and 0 otherwise: - - ```ts - import { Scorer } from 'axiom/ai/evals'; - - const ExactMatchScorer = Scorer( - 'Exact-Match', - ({ output, expected }: { output: string; expected: string }) => { - return output === expected ? 
1 : 0; - } - ); - ``` - -- Use the `autoevals` library that provides prebuilt scorers for common tasks like semantic similarity, factual correctness, and text matching: - - ```ts - import { Scorer } from 'axiom/ai/evals'; - import { ExactMatch } from 'autoevals'; - - const WrappedExactMatch = Scorer( - 'Exact-Match', - ({ output, expected }: { output: string; expected: string }) => { - return ExactMatch({ output, expected }); - } - ); - ``` - -- Use a custom scorer that returns metadata alongside the score: - - ```ts - const CustomScorer = Scorer( - 'Custom-Scorer', - ({ output, expected }) => { - const score = computeScore(output, expected); - return { - score, - metadata: { - details: 'Additional info about this score', - }, - }; - } - ); - ``` - ## Run evaluations To run your evaluation suites from your terminal, [install the Axiom CLI](/reference/cli) and use the following commands. @@ -301,6 +245,14 @@ To run your evaluation suites from your terminal, [install the Axiom CLI](/refer | Run eval by name | `axiom eval "spam-classification"` | | List available evals without running | `axiom eval --list` | +## Custom scorers + +A scorer is a function that scores a capability’s output. Scorers receive the `input`, the generated `output`, and the `expected` value, and return a score. + +The `autoevals` library that provides prebuilt scorers for common tasks like semantic similarity, factual correctness, and text matching + +Use a custom scorer that returns metadata alongside the score: + ## Run experiments Flags let you parameterize your AI behavior (like model choice or prompting strategies) and run experiments with different configurations. They’re type-safe via Zod schemas, and you can override them at runtime. @@ -350,7 +302,7 @@ axiom eval --flags-config=experiment.json ## Analyze results in Console -Coming soon When you run an eval, Axiom AI SDK captures a detailed OpenTelemetry trace for the entire run. 
This includes parent spans for the evaluation suite and child spans for each individual test case, task execution, and scorer result. Axiom enriches the traces with `eval.*` attributes, allowing you to deeply analyze results in the Axiom Console. +When you run an eval, Axiom AI SDK captures a detailed OpenTelemetry trace for the entire run. This includes parent spans for the evaluation suite and child spans for each individual test case, task execution, and scorer result. Axiom enriches the traces with `eval.*` attributes, allowing you to deeply analyze results in the Axiom Console. The results of evals: - Pass/fail status for each test case @@ -358,9 +310,7 @@ The results of evals: - Comparison to baseline (if available) - Links to view detailed traces in Axiom -Additionally, the AI SDK sends the results to your Axiom dataset for long-term tracking and analysis. - -The Console will feature leaderboards and comparison views to track score progression across different versions of a capability, helping you verify that your changes are leading to measurable improvements. +The Console features leaderboards and comparison views to track score progression across different versions of a capability, helping you verify that your changes are leading to measurable improvements. ## What’s next? 
From 55e6bf4dc0e93a9b24ac8a013e385cb7fc4d126e Mon Sep 17 00:00:00 2001
From: Mano Toth
Date: Mon, 17 Nov 2025 14:34:45 +0100
Subject: [PATCH 09/11] Refactor

---
 ai-engineering/measure.mdx    | 198 ++++++++++------------------------
 ai-engineering/quickstart.mdx | 101 ++++++++++++-----
 2 files changed, 129 insertions(+), 170 deletions(-)

diff --git a/ai-engineering/measure.mdx b/ai-engineering/measure.mdx
index b31298abc..c9a9205ab 100644
--- a/ai-engineering/measure.mdx
+++ b/ai-engineering/measure.mdx
@@ -17,101 +17,9 @@ Evaluations (evals) are systematic tests that measure how well your AI features
 
 ## Prerequisites
 
-Follow the [Quickstart](/ai-engineering/quickstart) to set up instrumentation for your app.
+Follow the [Quickstart](/ai-engineering/quickstart):
+- To run evals within the context of an existing AI app, follow the instrumentation setup in the [Quickstart](/ai-engineering/quickstart):
+- To run evals without an existing AI app, skip the part in the Quickstart about instrumenting your app.
-- To run evals within the context of an existing AI app, use the following instrumentation setup in the `src/instrumentation.ts` file that you have previously created in the [Quickstart](/ai-engineering/quickstart): - - ```ts /src/instrumentation.ts expandable - import { OTLPTraceExporter } from '@opentelemetry/exporter-trace-otlp-http'; - import { resourceFromAttributes } from '@opentelemetry/resources'; - import { BatchSpanProcessor, NodeTracerProvider } from '@opentelemetry/sdk-trace-node'; - import { ATTR_SERVICE_NAME } from '@opentelemetry/semantic-conventions'; - import { trace } from "@opentelemetry/api"; - import { initAxiomAI, RedactionPolicy } from 'axiom/ai'; - - // Import the type for the AxiomEvalInstrumentationHook - import type { AxiomEvalInstrumentationHook } from 'axiom/ai/config'; - - const tracer = trace.getTracer("my-tracer"); - - let provider: NodeTracerProvider | undefined; - - // Wrap your logic in the AxiomEvalInstrumentationHook function - export const setupAppInstrumentation: AxiomEvalInstrumentationHook = async ({ - dataset, - url, - token, - }) => { - if (provider) { - return { provider }; - } - - if (!dataset || !url || !token) { - throw new Error('Missing environment variables'); - } - - // Replace the environment variables with the parameters passed to the function - const exporter = new OTLPTraceExporter({ - url: `${url}/v1/traces`, - headers: { - Authorization: `Bearer ${token}`, - 'X-Axiom-Dataset': dataset, - }, - }) - - // Configure the provider to export traces to your Axiom dataset - provider = new NodeTracerProvider({ - resource: resourceFromAttributes({ - [ATTR_SERVICE_NAME]: 'my-app', // Replace with your service name - }, - { - // Use the latest schema version - // Info: https://opentelemetry.io/docs/specs/semconv/ - schemaUrl: 'https://opentelemetry.io/schemas/1.37.0', - }), - spanProcessor: new BatchSpanProcessor(exporter), - }); - - // Register the provider - provider.register(); - - // Initialize Axiom AI SDK with the 
configured tracer - initAxiomAI({ tracer, redactionPolicy: RedactionPolicy.AxiomDefault }); - - return { provider }; - }; - ``` - -### Create Axiom configuration file - -At the root of your project, create the Axiom configuration file `/axiom.config.ts`: - -```ts /axiom.config.ts -import { defineConfig } from 'axiom/ai/config'; -import { setupAppInstrumentation } from './src/instrumentation'; - -export default defineConfig({ - eval: { - url: process.env.AXIOM_URL, - token: process.env.AXIOM_TOKEN, - dataset: process.env.AXIOM_DATASET, - - // Optional: customize which files to run - include: ['**/*.eval.{ts,js}'], - - // Optional: exclude patterns - exclude: [], - - // Optional: timeout for eval execution - timeoutMs: 60_000, - - // Optional: instrumentation hook for OpenTelemetry - // (created this in the "Create instrumentation setup" step) - instrumentation: ({ url, token, dataset }) => - setupAppInstrumentation({ url, token, dataset }), - }, -}); -``` ## Write evalulation function @@ -124,7 +32,7 @@ The key parameters of the `Eval` function: - `scorers`: An array of scorer functions that score the `output` against the `expected` value. - `metadata`: Optional metadata for the evaluation, such as a description. -As an example, create an evaluation for a support ticket classification system in the file `/src/evals/ticket-classification.eval.ts`. +The example below creates an evaluation for a support ticket classification system in the file `/src/evals/ticket-classification.eval.ts`. 
```ts /src/evals/ticket-classification.eval.ts expandable import { experimental_Eval as Eval, Scorer } from 'axiom/ai/evals'; @@ -160,7 +68,7 @@ async function classifyTicket({ subject, content }: { subject?: string; content: return result.object; } -// Custom exact-match scorer that returns metadata +// Custom exact-match scorer that returns score and metadata const ExactMatchScorer = Scorer( 'Exact-Match', ({ output, expected }: { output: { response: string }; expected: { response: string } }) => { @@ -170,7 +78,7 @@ const ExactMatchScorer = Scorer( return { score: normalizedOutput === normalizedExpected, metadata: { - details: 'Additional info about this score', + details: 'A scorer that checks for exact match', }, }; }); @@ -233,33 +141,9 @@ Eval('spam-classification', { }); ``` -## Run evaluations +## Set up flags -To run your evaluation suites from your terminal, [install the Axiom CLI](/reference/cli) and use the following commands. - -| Description | Command | -| ----------- | ------- | -| Run all evals | `axiom eval` | -| Run specific eval file | `axiom eval src/evals/ticket-classification.eval.ts` | -| Run evals matching a glob pattern | `axiom eval "**/*spam*.eval.ts"` | -| Run eval by name | `axiom eval "spam-classification"` | -| List available evals without running | `axiom eval --list` | - -## Custom scorers - -A scorer is a function that scores a capability’s output. Scorers receive the `input`, the generated `output`, and the `expected` value, and return a score. - -The `autoevals` library that provides prebuilt scorers for common tasks like semantic similarity, factual correctness, and text matching - -Use a custom scorer that returns metadata alongside the score: - -## Run experiments - -Flags let you parameterize your AI behavior (like model choice or prompting strategies) and run experiments with different configurations. They’re type-safe via Zod schemas, and you can override them at runtime. 
- -### Set up flags - -Create the file `src/lib/app-scope.ts` that uses the `ticketClassification` flagto test different language models. +Create the file `src/lib/app-scope.ts`: ```ts /src/lib/app-scope.ts import { createAppScope } from 'axiom/ai/evals'; @@ -276,29 +160,17 @@ const { flag, pickFlags } = createAppScope({ flagSchema }); export { flag, pickFlags }; ``` -### Override flags at runtime - -Override flags directly when you run the eval: - -```bash -axiom eval --flag.ticketClassification.model=gpt-4o -``` - -Alternatively, specify the flag overrides in a JSON file. - -```json experiment.json -{ - "ticketClassification": { - "model": "gpt-4o" - } -} -``` +## Run evaluations -And then specify the JSON file as the value of the `flags-config` parameter when you run the eval: +To run your evaluation suites from your terminal, [install the Axiom CLI](/reference/cli) and use the following commands. -```bash -axiom eval --flags-config=experiment.json -``` +| Description | Command | +| ----------- | ------- | +| Run all evals | `axiom eval` | +| Run specific eval file | `axiom eval src/evals/ticket-classification.eval.ts` | +| Run evals matching a glob pattern | `axiom eval "**/*spam*.eval.ts"` | +| Run eval by name | `axiom eval "spam-classification"` | +| List available evals without running | `axiom eval --list` | ## Analyze results in Console @@ -312,6 +184,44 @@ The results of evals: The Console features leaderboards and comparison views to track score progression across different versions of a capability, helping you verify that your changes are leading to measurable improvements. +## Additional configuration options + +### Custom scorers + +A scorer is a function that scores a capability’s output. Scorers receive the `input`, the generated `output`, and the `expected` value, and return a score. + +The example above uses two custom scorers. Scorers can return metadata alongside the score. 
+ +You can use the [`autoevals` library](https://github.com/braintrustdata/autoevals) instead of custom scorers. `autoevals` provides prebuilt scorers for common tasks like semantic similarity, factual correctness, and text matching. + +### Run experiments + +Flags let you parameterize your AI behavior (like model choice or prompting strategies) and run experiments with different configurations. They’re type-safe via Zod schemas, and you can override them at runtime. + +The example above uses the `ticketClassification` flag to test different language models. Flags have a default value that you can override at runtime in one of the following ways: + +- Override flags directly when you run the eval: + + ```bash + axiom eval --flag.ticketClassification.model=gpt-4o + ``` + +- Alternatively, specify the flag overrides in a JSON file. + + ```json experiment.json + { + "ticketClassification": { + "model": "gpt-4o" + } + } + ``` + + And then specify the JSON file as the value of the `flags-config` parameter when you run the eval: + + ```bash + axiom eval --flags-config=experiment.json + ``` + ## What’s next? A capability is ready to be deployed when it meets your quality benchmarks. After deployment, the next steps can be the following: diff --git a/ai-engineering/quickstart.mdx b/ai-engineering/quickstart.mdx index 67624e927..857907e87 100644 --- a/ai-engineering/quickstart.mdx +++ b/ai-engineering/quickstart.mdx @@ -99,51 +99,100 @@ To send data to Axiom, configure a tracer. For example, use a dedicated instrume -1. Create instrumentation file: +1. 
Create an instrumentation file: - ```typescript /src/instrumentation.ts - import 'dotenv/config'; // Make sure to load environment variables + ```ts /src/instrumentation.ts expandable import { OTLPTraceExporter } from '@opentelemetry/exporter-trace-otlp-http'; import { resourceFromAttributes } from '@opentelemetry/resources'; - import { SimpleSpanProcessor, NodeTracerProvider } from '@opentelemetry/sdk-trace-node'; + import { BatchSpanProcessor, NodeTracerProvider } from '@opentelemetry/sdk-trace-node'; import { ATTR_SERVICE_NAME } from '@opentelemetry/semantic-conventions'; import { trace } from "@opentelemetry/api"; import { initAxiomAI, RedactionPolicy } from 'axiom/ai'; + import type { AxiomEvalInstrumentationHook } from 'axiom/ai/config'; const tracer = trace.getTracer("my-tracer"); let provider: NodeTracerProvider | undefined; - const exporter = new OTLPTraceExporter({ - url: `${process.env.AXIOM_URL}/v1/traces`, + // Wrap your logic in the AxiomEvalInstrumentationHook function + export const setupAppInstrumentation: AxiomEvalInstrumentationHook = async ({ + dataset, + url, + token, + }) => { + if (provider) { + return { provider }; + } + + if (!dataset || !url || !token) { + throw new Error('Missing environment variables'); + } + + // Replace the environment variables with the parameters passed to the function + const exporter = new OTLPTraceExporter({ + url: `${url}/v1/traces`, headers: { - Authorization: `Bearer ${process.env.AXIOM_TOKEN}`, - 'X-Axiom-Dataset': process.env.AXIOM_DATASET!, + Authorization: `Bearer ${token}`, + 'X-Axiom-Dataset': dataset, }, }) - // Configure the provider to export traces to your Axiom dataset - provider = new NodeTracerProvider({ - resource: resourceFromAttributes({ - [ATTR_SERVICE_NAME]: 'my-app', // Replace with your service name - }, - { - // Use the latest schema version - // Info: https://opentelemetry.io/docs/specs/semconv/ - schemaUrl: 'https://opentelemetry.io/schemas/1.37.0', - }), - spanProcessor: new 
SimpleSpanProcessor(exporter), - }); - - // Register the provider - provider.register(); - - // Initialize Axiom AI SDK with the configured tracer - initAxiomAI({ tracer, redactionPolicy: RedactionPolicy.AxiomDefault }); + // Configure the provider to export traces to your Axiom dataset + provider = new NodeTracerProvider({ + resource: resourceFromAttributes({ + [ATTR_SERVICE_NAME]: 'my-app', // Replace with your service name + }, + { + // Use the latest schema version + // Info: https://opentelemetry.io/docs/specs/semconv/ + schemaUrl: 'https://opentelemetry.io/schemas/1.37.0', + }), + spanProcessor: new BatchSpanProcessor(exporter), + }); + + // Register the provider + provider.register(); + + // Initialize Axiom AI SDK with the configured tracer + initAxiomAI({ tracer, redactionPolicy: RedactionPolicy.AxiomDefault }); + + return { provider }; + }; ``` For more information on specifying redaction policies, see [Redaction policies](/ai-engineering/redaction-policies). +### Create Axiom configuration file + +At the root of your project, create the Axiom configuration file `/axiom.config.ts`: + +```ts /axiom.config.ts +import { defineConfig } from 'axiom/ai/config'; +import { setupAppInstrumentation } from './src/instrumentation'; + +export default defineConfig({ + eval: { + url: process.env.AXIOM_URL, + token: process.env.AXIOM_TOKEN, + dataset: process.env.AXIOM_DATASET, + + // Optional: customize which files to run + include: ['**/*.eval.{ts,js}'], + + // Optional: exclude patterns + exclude: [], + + // Optional: timeout for eval execution + timeoutMs: 60_000, + + // Optional: instrumentation hook for OpenTelemetry + // (created this in the "Create instrumentation setup" step) + instrumentation: ({ url, token, dataset }) => + setupAppInstrumentation({ url, token, dataset }), + }, +}); +``` + ## Store environment variables Store environment variables in an `.env` file in the root of your project: From 3e3050cde1ad48271f8da30f86ea1bd262b285ef Mon Sep 17 00:00:00 
2001
From: Mano Toth
Date: Mon, 17 Nov 2025 14:57:34 +0100
Subject: [PATCH 10/11] Update measure.mdx

---
 ai-engineering/measure.mdx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ai-engineering/measure.mdx b/ai-engineering/measure.mdx
index c9a9205ab..c1e961be0 100644
--- a/ai-engineering/measure.mdx
+++ b/ai-engineering/measure.mdx
@@ -18,7 +18,7 @@ Evaluations (evals) are systematic tests that measure how well your AI features
 ## Prerequisites
 
 Follow the [Quickstart](/ai-engineering/quickstart):
-- To run evals within the context of an existing AI app, follow the instrumentation setup in the [Quickstart](/ai-engineering/quickstart):
+- To run evals within the context of an existing AI app, follow the instrumentation setup in the [Quickstart](/ai-engineering/quickstart).
 - To run evals without an existing AI app, skip the part in the Quickstart about instrumenting your app.
 
 ## Write evaluation function

From 89ce5ca171fa3236cfffd0404f201195fefbf795 Mon Sep 17 00:00:00 2001
From: Mano Toth
Date: Tue, 18 Nov 2025 09:26:41 +0000
Subject: [PATCH 11/11] Update measure.mdx

---
 ai-engineering/measure.mdx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ai-engineering/measure.mdx b/ai-engineering/measure.mdx
index c1e961be0..64a1872f2 100644
--- a/ai-engineering/measure.mdx
+++ b/ai-engineering/measure.mdx
@@ -92,7 +92,7 @@ const SpamClassificationScorer = Scorer(
     output: { category: string };
     expected: { category: string };
   }) => {
-    const isSpam = (x: { category: string }) => x.category === "spam";
+    const isSpam = (item: { category: string }) => item.category === "spam";
     return isSpam(output) === isSpam(expected) ? 1 : 0;
   }
 );