From 2ae1a63e3fbb9af7096cbc7306c535ce95d01643 Mon Sep 17 00:00:00 2001
From: Christopher Ehrlich <ehrlich.christopher@gmail.com>
Date: Tue, 11 Nov 2025 15:50:57 +0700
Subject: [PATCH 1/7] initial eval docs

---
 ai-engineering/measure.mdx | 431 +++++++++++++++++++++++++++++++++++--
 1 file changed, 408 insertions(+), 23 deletions(-)
diff --git a/ai-engineering/measure.mdx b/ai-engineering/measure.mdx
index b288e8b0f..df51ac929 100644
--- a/ai-engineering/measure.mdx
+++ b/ai-engineering/measure.mdx
@@ -1,34 +1,45 @@
 ---
 title: "Measure"
 description: "Learn how to measure the quality of your AI capabilities by running evaluations against ground truth data."
-keywords: ["ai engineering", "AI engineering", "measure", "evals", "evaluation", "scoring", "graders"]
+keywords: ["ai engineering", "AI engineering", "measure", "evals", "evaluation", "scoring", "scorers"]
 ---
 
 import { Badge } from "/snippets/badge.jsx"
 import { definitions } from '/snippets/definitions.mdx'
 
 <Warning>
-The evaluation framework described here is in active development. Axiom is working with design partners to shape what’s built. [Contact Axiom](https://www.axiom.co/contact) to get early access and join a focused group of teams shaping these tools.
+The evaluation framework described here is in active development. Axiom is working with design partners to shape what's built. [Contact Axiom](https://www.axiom.co/contact) to get early access and join a focused group of teams shaping these tools.
 </Warning>
 
-The **Measure** stage is where you quantify the quality and effectiveness of your AI <Tooltip tip={definitions.Capability}>capability</Tooltip>. Instead of relying on anecdotal checks, this stage uses a systematic process called an <Tooltip tip={definitions.Eval}>eval</Tooltip> to score your capability’s performance against a known set of correct examples (<Tooltip tip={definitions.GroundTruth}>ground truth</Tooltip>). This provides a data-driven benchmark to ensure a capability is ready for production and to track its quality over time.
+The **Measure** stage is where you quantify the quality and effectiveness of your AI <Tooltip tip={definitions.Capability}>capability</Tooltip>. Instead of relying on anecdotal checks, this stage uses a systematic process called an <Tooltip tip={definitions.Eval}>eval</Tooltip> to score your capability's performance against a known set of correct examples (<Tooltip tip={definitions.GroundTruth}>ground truth</Tooltip>). This provides a data-driven benchmark to ensure a capability is ready for production and to track its quality over time.
+
+Evaluations (evals) are systematic tests that measure how well your AI features perform. Instead of manually testing AI outputs, evals automatically run your AI code against test datasets and score the results using custom metrics. This lets you catch regressions, compare different approaches, and confidently improve your AI features over time.
 
 ## The `Eval` function
 
-<Badge>Coming soon</Badge> The primary tool for the Measure stage is the `Eval` function, which will be available in the `axiom/ai` package. It provides a simple, declarative way to define a test suite for your capability directly in your codebase.
+The primary tool for the Measure stage is the `Eval` function, available in the `axiom/ai/evals` package. It provides a simple, declarative way to define a test suite for your capability directly in your codebase.
 
 An `Eval` is structured around a few key parameters:
 
-* `data`: An async function that returns your `collection` of `{ input, expected }` pairs, which serve as your ground truth.
+* `data`: An async function that returns your collection of `{ input, expected }` pairs, which serve as your ground truth.
 * `task`: The function that executes your AI capability, taking an `input` and producing an `output`.
-* `scorers`: An array of `grader` functions that score the `output` against the `expected` value.
-* `threshold`: A score between 0 and 1 that determines the pass/fail condition for the evaluation.
+* `scorers`: An array of scorer functions that score the `output` against the `expected` value.
+* `metadata`: Optional metadata for the evaluation, such as a description.
 
 Here is an example of a complete evaluation suite:
 
 ```ts /evals/text-match.eval.ts
-import { Levenshtein } from 'autoevals';
-import { Eval } from 'axiom/ai/evals';
+import { Eval, Scorer } from 'axiom/ai/evals';
+
+const LevenshteinScorer = Scorer(
+  'Levenshtein',
+  ({ output, expected }: { output: string; expected: string }) => {
+    // Calculate Levenshtein distance score
+    const distance = calculateLevenshtein(output, expected);
+    const maxLen = Math.max(output.length, expected.length);
+    return maxLen === 0 ? 1 : 1 - distance / maxLen;
+  }
+);
 
 Eval('text-match-eval', {
   // 1. Your ground truth dataset
@@ -46,40 +57,414 @@ Eval('text-match-eval', {
   },
 
   // 2. The task that runs your capability
-  task: async (input: string) => {
+  task: async ({ input }) => {
     return `hi, ${input}!`;
   },
 
   // 3. The scorers that grade the output
-  scorers: [Levenshtein],
+  scorers: [LevenshteinScorer],
+});
+```
+
+## Getting Started
+
+### Prerequisites
+
+- Node.js 22.20 or higher
+- Existing AI SDK setup (e.g., `@ai-sdk/openai`, `ai`)
+- Axiom account with API token and dataset
+
+### Installation
+
+```bash
+npm install axiom
+npm install --save-dev autoevals
+```
+
+Install required OpenTelemetry dependencies:
+
+```bash
+npm install @opentelemetry/api \
+            @opentelemetry/exporter-trace-otlp-http \
+            @opentelemetry/resources \
+            @opentelemetry/sdk-trace-node \
+            @opentelemetry/semantic-conventions
+```
+
+### Configuration
+
+#### 1. Set up environment variables
+
+Create a `.env` file:
+
+```bash
+AXIOM_URL="https://api.axiom.co"
+AXIOM_TOKEN="xaat-******"
+AXIOM_DATASET="my_dataset"
+```
+
+#### 2. Create instrumentation setup (optional)
+
+If you are evaluating components of a production application that is instrumented with OpenTelemetry, you can see your application spans in Axiom. In order to enable this, your instrumentation setup must be a function that can be passed in `axiom.config.ts`. An example is shown below.
+
+Create `src/instrumentation.node.ts`:
+
+```typescript
+import { OTLPTraceExporter } from '@opentelemetry/exporter-trace-otlp-http';
+import { resourceFromAttributes } from '@opentelemetry/resources';
+import { BatchSpanProcessor, NodeTracerProvider } from '@opentelemetry/sdk-trace-node';
+import { ATTR_SERVICE_NAME } from '@opentelemetry/semantic-conventions';
+import { initAxiomAI, RedactionPolicy } from 'axiom/ai';
+import type { AxiomEvalInstrumentationHook } from 'axiom/ai/config';
+import { tracer } from './tracer';
+
+let provider: NodeTracerProvider | undefined;
+
+export const setupAppInstrumentation: AxiomEvalInstrumentationHook = async ({
+  dataset,
+  url,
+  token,
+}) => {
+  if (provider) {
+    return { provider };
+  }
+
+  if (!dataset || !url || !token) {
+    throw new Error('Missing environment variables');
+  }
+
+  const exporter = new OTLPTraceExporter({
+    url: `${url}/v1/traces`,
+    headers: {
+      Authorization: `Bearer ${token}`,
+      'X-Axiom-Dataset': dataset,
+    },
+  });
+
+  provider = new NodeTracerProvider({
+    resource: resourceFromAttributes({
+      [ATTR_SERVICE_NAME]: 'my-app',
+    }),
+    spanProcessors: [new BatchSpanProcessor(exporter)],
+  });
+
+  provider.register();
+  initAxiomAI({ tracer, redactionPolicy: RedactionPolicy.AxiomDefault });
+
+  return { provider };
+};
+```
+
+Create `src/tracer.ts`:
+
+```typescript
+import { trace } from '@opentelemetry/api';
+
+export const tracer = trace.getTracer('my-tracer');
+```
+
+#### 1. Create `axiom.config.ts`
+
+Create a configuration file at the root of your project:
+
+```typescript
+import { defineConfig } from 'axiom/ai/config';
+import { setupAppInstrumentation } from './src/instrumentation.node';
+
+export default defineConfig({
+  eval: {
+    url: process.env.AXIOM_URL,
+    token: process.env.AXIOM_TOKEN,
+    dataset: process.env.AXIOM_DATASET,
+    
+    // Optional: customize which files to run
+    include: ['**/*.eval.{ts,js}'],
+    
+    // Optional: exclude patterns
+    exclude: [],
+    
+    // Optional: timeout for eval execution
+    timeoutMs: 60_000,
+    
+    // Optional: instrumentation hook for OpenTelemetry
+    instrumentation: ({ url, token, dataset }) => 
+      setupAppInstrumentation({ url, token, dataset }),
+  },
+});
+```
+
+## Setting up Flags
+
+Flags let you parameterize your AI behavior (like model choice or prompting strategies) and run experiments with different configurations. They're type-safe via Zod schemas and can be overridden at runtime.
+
+Create `src/lib/app-scope.ts`:
+
+```typescript
+import { createAppScope } from 'axiom/ai/evals';
+import { z } from 'zod';
+
+export const flagSchema = z.object({
+  ticketClassification: z.object({
+    model: z.string().default('gpt-4o-mini'),
+  }),
+});
+
+const { flag, pickFlags } = createAppScope({ flagSchema });
+
+export { flag, pickFlags };
+```
+
+## Writing a Real-World Eval
+
+Let's build a practical evaluation for a support ticket classification system.
+
+Create an eval file `src/evals/ticket-classification.eval.ts`:
+
+```typescript
+import { experimental_Eval as Eval, Scorer } from 'axiom/ai/evals';
+import { generateObject } from 'ai';
+import { openai } from '@ai-sdk/openai';
+import { wrapAISDKModel } from 'axiom/ai';
+import { flag, pickFlags } from '../lib/app-scope';
+import { z } from 'zod';
+import { ExactMatch } from 'autoevals';
+
+// Define your schemas
+const ticketCategorySchema = z.enum(['spam', 'question', 'feature_request', 'bug_report']);
+const ticketResponseSchema = z.object({
+  category: ticketCategorySchema,
+  response: z.string(),
+});
+
+// The function you want to evaluate
+async function classifyTicket({ subject, content }: { subject?: string; content: string }) {
+  const model = flag('ticketClassification.model');
+  
+  const result = await generateObject({
+    model: wrapAISDKModel(openai(model)),
+    messages: [
+      {
+        role: 'system',
+        content: `You are a customer support engineer classifying tickets as: spam, question, feature_request, or bug_report.
+        
+If spam, return a polite auto-close message. Otherwise, say a team member will respond shortly.`,
+      },
+      {
+        role: 'user',
+        content: subject ? `Subject: ${subject}\n\n${content}` : content,
+      },
+    ],
+    schema: ticketResponseSchema,
+  });
+
+  return result.object;
+}
 
-  // 4. The pass/fail threshold for the scores
-  threshold: 1,
+// Custom exact-match scorer
+const ExactMatchScorer = Scorer(
+  'Exact-Match',
+  ({ output, expected }: { output: { response: string }; expected: { response: string } }) => {
+    return ExactMatch({
+      output: output.response,
+      expected: expected.response,
+    });
+  }
+);
+
+// Custom spam classification scorer
+const SpamClassificationScorer = Scorer(
+  'Spam-Classification',
+  ({ output, expected }: { 
+    output: { category: string }; 
+    expected: { category: string };
+  }) => {
+    return (expected.category === 'spam') === (output.category === 'spam') ? 1 : 0;
+  }
+);
+
+// Define the evaluation
+Eval('spam-classification', {
+  // Specify which flags this eval uses
+  configFlags: pickFlags('ticketClassification'),
+  
+  // Test data with input/expected pairs
+  data: () => [
+    {
+      input: {
+        subject: "Congratulations! You've Been Selected for an Exclusive Reward",
+        content: 'Claim your $500 gift card now by clicking this link!',
+      },
+      expected: {
+        category: 'spam',
+        response: "We're sorry, but your message has been automatically closed.",
+      },
+    },
+    {
+      input: {
+        subject: 'FREE V1AGRA',
+        content: 'BUY NOW ON WWW.BEST-DEALS.COM!',
+      },
+      expected: {
+        category: 'spam',
+        response: "We're sorry, but your message has been automatically closed.",
+      },
+    },
+  ],
+  
+  // The task to run for each test case
+  task: async ({ input }) => {
+    return await classifyTicket(input);
+  },
+  
+  // Scorers to measure performance
+  scorers: [SpamClassificationScorer, ExactMatchScorer],
+  
+  // Optional metadata
+  metadata: {
+    description: 'Classify support tickets as spam or not spam',
+  },
 });
 ```
 
-## Grading with scorers
+## Scoring with Scorers
 
-<Badge>Coming soon</Badge> A <Tooltip tip={definitions.Grader}>grader</Tooltip> is a function that scores a capability’s output. Axiom will provide a library of built-in scorers for common tasks (e.g., checking for semantic similarity, factual correctness, or JSON validity). You can also provide your own custom functions to measure domain-specific logic. Each scorer receives the `input`, the generated `output`, and the `expected` value, and must return a score.
+A scorer is a function that scores a capability's output. Scorers receive the `input`, the generated `output`, and the `expected` value, and return a score (typically 0-1).
+
+### Simple Custom Scorer
+
+```typescript
+import { Scorer } from 'axiom/ai/evals';
+
+const ExactMatchScorer = Scorer(
+  'Exact-Match',
+  ({ output, expected }: { output: string; expected: string }) => {
+    return output === expected ? 1 : 0;
+  }
+);
+```
 
-## Running evaluations
+### Using AutoEvals Library
 
-<Badge>Coming soon</Badge> You will run your evaluation suites from your terminal using the `axiom` CLI.
+The `autoevals` library provides pre-built scorers for common tasks like semantic similarity, factual correctness, and text matching:
+
+```typescript
+import { Scorer } from 'axiom/ai/evals';
+import { ExactMatch } from 'autoevals';
+
+const WrappedExactMatch = Scorer(
+  'Exact-Match',
+  ({ output, expected }: { output: string; expected: string }) => {
+    return ExactMatch({ output, expected });
+  }
+);
+```
+
+### Scorer with Metadata
+
+Scorers can return additional metadata alongside the score:
+
+```typescript
+const CustomScorer = Scorer(
+  'Custom-Scorer',
+  ({ output, expected }) => {
+    const score = computeScore(output, expected);
+    return {
+      score,
+      metadata: {
+        details: 'Additional info about this score',
+      },
+    };
+  }
+);
+```
+
+## Running Evaluations
+
+You run your evaluation suites from your terminal using the `axiom` CLI.
+
+### Run all evals
 
 ```bash
-axiom run evals/text-match.eval.ts
+axiom eval
 ```
 
-This command will execute the specified test file using `vitest` in the background. Note that `vitest` will be a peer dependency for this functionality.
+This command finds and runs all files matching `**/*.eval.{ts,js}`, the `include` pattern set in `axiom.config.ts`.
+
+### Run specific eval file
+
+```bash
+axiom eval src/evals/ticket-classification.eval.ts
+```
 
-## Analyzing results in the console
+### Run evals matching a glob pattern
+
+```bash
+axiom eval "**/*spam*.eval.ts"
+```
+
+### Run eval by name
+
+```bash
+axiom eval "spam-classification"
+```
+
+### List available evals without running
+
+```bash
+axiom eval --list
+```
+
+## Overriding Flags
+
+Flags allow you to run experiments by testing different configurations without changing code.
+
+### From CLI (dot notation)
+
+Override individual flags:
+
+```bash
+axiom eval --flag.ticketClassification.model=gpt-4o
+```
+
+### From JSON file
+
+Create `experiment.json`:
+
+```json
+{
+  "ticketClassification": {
+    "model": "gpt-4o"
+  }
+}
+```
+
+Then run:
+
+```bash
+axiom eval --flags-config=experiment.json
+```
+
+## Analyzing Results in the Console
 
 <Badge>Coming soon</Badge> When you run an <Tooltip tip={definitions.Eval}>eval</Tooltip>, the Axiom SDK captures a detailed OpenTelemetry trace for the entire run. This includes parent spans for the evaluation suite and child spans for each individual test case, task execution, and scorer result. These traces are enriched with `eval.*` attributes, allowing you to deeply analyze results in the Axiom Console.
 
+After running evals, you'll see:
+- Pass/fail status for each test case
+- Scores from each scorer
+- Comparison to baseline (if available)
+- Links to view detailed traces in Axiom
+
+Results are also sent to your Axiom dataset for long-term tracking and analysis.
+
 The Console will feature leaderboards and comparison views to track score progression across different versions of a capability, helping you verify that your changes are leading to measurable improvements.
 
-## What’s next?
+## What's Next?
+
+Once your capability meets your quality benchmarks in the Measure stage, it's ready to be deployed. Additional next steps include:
 
-Once your capability meets your quality benchmarks in the Measure stage, it’s ready to be deployed. The next step is to monitor its performance with real-world traffic.
+- **Baseline Comparisons**: Run evals multiple times to track regression over time
+- **Experiment with Flags**: Test different models or strategies using flag overrides
+- **Advanced Scorers**: Build custom scorers for domain-specific metrics
+- **CI/CD Integration**: Add `axiom eval` to your CI pipeline to catch regressions
 
-Learn more about this step of the AI engineering workflow in the [Observe](/ai-engineering/observe) docs.
\ No newline at end of file
+The next step is to monitor its performance with real-world traffic. Learn more about this step of the AI engineering workflow in the [Observe](/ai-engineering/observe) docs.

From a082b905e392d6d7641dc7f18d8b63df5f604a80 Mon Sep 17 00:00:00 2001
From: Christopher Ehrlich <ehrlich.christopher@gmail.com>
Date: Tue, 11 Nov 2025 15:53:36 +0700
Subject: [PATCH 2/7] add note about instrumentation fn

---
 ai-engineering/measure.mdx | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/ai-engineering/measure.mdx b/ai-engineering/measure.mdx
index df51ac929..28a0d6840 100644
--- a/ai-engineering/measure.mdx
+++ b/ai-engineering/measure.mdx
@@ -103,7 +103,7 @@ AXIOM_TOKEN="xaat-******"
 AXIOM_DATASET="my_dataset"
 ```
 
-#### 2. Create instrumentation setup (optional)
+#### 2. Create instrumentation setup(optional)
 
 If you are evaluating components of a production application that is instrumented with OpenTelemetry, you can see your application spans in Axiom. In order to enable this, your instrumentation setup must be a function that can be passed in `axiom.config.ts`. An example is shown below.
 
@@ -187,6 +187,7 @@ export default defineConfig({
     timeoutMs: 60_000,
     
     // Optional: instrumentation hook for OpenTelemetry
+    // (defined in the "Create instrumentation setup" step)
     instrumentation: ({ url, token, dataset }) => 
       setupAppInstrumentation({ url, token, dataset }),
   },

From 7df0bdb082039969ffbcf194a15404a19e0ec45d Mon Sep 17 00:00:00 2001
From: Mano Toth <mano@axiom.co>
Date: Tue, 11 Nov 2025 16:12:58 +0100
Subject: [PATCH 3/7] Stylistic fixes

---
 ai-engineering/measure.mdx | 54 +++++++++++++++++++-------------------
 1 file changed, 27 insertions(+), 27 deletions(-)

diff --git a/ai-engineering/measure.mdx b/ai-engineering/measure.mdx
index 28a0d6840..d706ee0c4 100644
--- a/ai-engineering/measure.mdx
+++ b/ai-engineering/measure.mdx
@@ -5,17 +5,17 @@ keywords: ["ai engineering", "AI engineering", "measure", "evals", "evaluation",
 ---
 
 import { Badge } from "/snippets/badge.jsx"
-import { definitions } from '/snippets/definitions.mdx'
+import { definitions } from "/snippets/definitions.mdx"
 
 <Warning>
-The evaluation framework described here is in active development. Axiom is working with design partners to shape what's built. [Contact Axiom](https://www.axiom.co/contact) to get early access and join a focused group of teams shaping these tools.
+The evaluation framework described here is in active development. Axiom is working with design partners to shape what’s built. [Contact Axiom](https://www.axiom.co/contact) to get early access and join a focused group of teams shaping these tools.
 </Warning>
 
-The **Measure** stage is where you quantify the quality and effectiveness of your AI <Tooltip tip={definitions.Capability}>capability</Tooltip>. Instead of relying on anecdotal checks, this stage uses a systematic process called an <Tooltip tip={definitions.Eval}>eval</Tooltip> to score your capability's performance against a known set of correct examples (<Tooltip tip={definitions.GroundTruth}>ground truth</Tooltip>). This provides a data-driven benchmark to ensure a capability is ready for production and to track its quality over time.
+The **Measure** stage is where you quantify the quality and effectiveness of your AI <Tooltip tip={definitions.Capability}>capability</Tooltip>. Instead of relying on anecdotal checks, this stage uses a systematic process called an <Tooltip tip={definitions.Eval}>eval</Tooltip> to score your capability’s performance against a known set of correct examples (<Tooltip tip={definitions.GroundTruth}>ground truth</Tooltip>). This provides a data-driven benchmark to ensure a capability is ready for production and to track its quality over time.
 
 Evaluations (evals) are systematic tests that measure how well your AI features perform. Instead of manually testing AI outputs, evals automatically run your AI code against test datasets and score the results using custom metrics. This lets you catch regressions, compare different approaches, and confidently improve your AI features over time.
 
-## The `Eval` function
+## `Eval` function
 
 The primary tool for the Measure stage is the `Eval` function, available in the `axiom/ai/evals` package. It provides a simple, declarative way to define a test suite for your capability directly in your codebase.
 
@@ -66,7 +66,7 @@ Eval('text-match-eval', {
 });
 ```
 
-## Getting Started
+## Get started
 
 ### Prerequisites
 
@@ -74,7 +74,7 @@ Eval('text-match-eval', {
 - Existing AI SDK setup (e.g., `@ai-sdk/openai`, `ai`)
 - Axiom account with API token and dataset
 
-### Installation
+### Install dependencies
 
 ```bash
 npm install axiom
@@ -91,7 +91,7 @@ npm install @opentelemetry/api \
             @opentelemetry/semantic-conventions
 ```
 
-### Configuration
+### Configure
 
 #### 1. Set up environment variables
 
@@ -103,7 +103,7 @@ AXIOM_TOKEN="xaat-******"
 AXIOM_DATASET="my_dataset"
 ```
 
-#### 2. Create instrumentation setup(optional)
+#### 2. Create instrumentation setup (optional)
 
 If you are evaluating components of a production application that is instrumented with OpenTelemetry, you can see your application spans in Axiom. In order to enable this, your instrumentation setup must be a function that can be passed in `axiom.config.ts`. An example is shown below.
 
@@ -194,9 +194,9 @@ export default defineConfig({
 });
 ```
 
-## Setting up Flags
+## Set up flags
 
-Flags let you parameterize your AI behavior (like model choice or prompting strategies) and run experiments with different configurations. They're type-safe via Zod schemas and can be overridden at runtime.
+Flags let you parameterize your AI behavior (like model choice or prompting strategies) and run experiments with different configurations. They’re type-safe via Zod schemas and can be overridden at runtime.
 
 Create `src/lib/app-scope.ts`:
 
@@ -215,9 +215,9 @@ const { flag, pickFlags } = createAppScope({ flagSchema });
 export { flag, pickFlags };
 ```
 
-## Writing a Real-World Eval
+## Write real-world evals
 
-Let's build a practical evaluation for a support ticket classification system.
+Let’s build a practical evaluation for a support ticket classification system.
 
 Create an eval file `src/evals/ticket-classification.eval.ts`:
 
@@ -327,11 +327,11 @@ Eval('spam-classification', {
 });
 ```
 
-## Scoring with Scorers
+## Score with scorers
 
-A scorer is a function that scores a capability's output. Scorers receive the `input`, the generated `output`, and the `expected` value, and return a score (typically 0-1).
+A scorer is a function that scores a capability’s output. Scorers receive the `input`, the generated `output`, and the `expected` value, and return a score (typically a number between 0 and 1).
 
-### Simple Custom Scorer
+### Simple custom scorer
 
 ```typescript
 import { Scorer } from 'axiom/ai/evals';
@@ -344,7 +344,7 @@ const ExactMatchScorer = Scorer(
 );
 ```
 
-### Using AutoEvals Library
+### Use AutoEvals library
 
 The `autoevals` library provides pre-built scorers for common tasks like semantic similarity, factual correctness, and text matching:
 
@@ -360,7 +360,7 @@ const WrappedExactMatch = Scorer(
 );
 ```
 
-### Scorer with Metadata
+### Scorer with metadata
 
 Scorers can return additional metadata alongside the score:
 
@@ -379,7 +379,7 @@ const CustomScorer = Scorer(
 );
 ```
 
-## Running Evaluations
+## Run evaluations
 
 You run your evaluation suites from your terminal using the `axiom` CLI.
 
@@ -415,7 +415,7 @@ axiom eval "spam-classification"
 axiom eval --list
 ```
 
-## Overriding Flags
+## Override flags
 
 Flags allow you to run experiments by testing different configurations without changing code.
 
@@ -445,11 +445,11 @@ Then run:
 axiom eval --flags-config=experiment.json
 ```
 
-## Analyzing Results in the Console
+## Analyze results in Console
 
 <Badge>Coming soon</Badge> When you run an <Tooltip tip={definitions.Eval}>eval</Tooltip>, the Axiom SDK captures a detailed OpenTelemetry trace for the entire run. This includes parent spans for the evaluation suite and child spans for each individual test case, task execution, and scorer result. These traces are enriched with `eval.*` attributes, allowing you to deeply analyze results in the Axiom Console.
 
-After running evals, you'll see:
+After running evals, you’ll see:
 - Pass/fail status for each test case
 - Scores from each scorer
 - Comparison to baseline (if available)
@@ -459,13 +459,13 @@ Results are also sent to your Axiom dataset for long-term tracking and analysis.
 
 The Console will feature leaderboards and comparison views to track score progression across different versions of a capability, helping you verify that your changes are leading to measurable improvements.
 
-## What's Next?
+## What’s next?
 
-Once your capability meets your quality benchmarks in the Measure stage, it's ready to be deployed. Additional next steps include:
+Once your capability meets your quality benchmarks in the Measure stage, it’s ready to be deployed. Additional next steps include:
 
-- **Baseline Comparisons**: Run evals multiple times to track regression over time
-- **Experiment with Flags**: Test different models or strategies using flag overrides
-- **Advanced Scorers**: Build custom scorers for domain-specific metrics
-- **CI/CD Integration**: Add `axiom eval` to your CI pipeline to catch regressions
+- **Baseline comparisons**: Run evals multiple times to track regressions over time
+- **Experiment with flags**: Test different models or strategies using flag overrides
+- **Advanced scorers**: Build custom scorers for domain-specific metrics
+- **CI/CD integration**: Add `axiom eval` to your CI pipeline to catch regressions
 
 The next step is to monitor its performance with real-world traffic. Learn more about this step of the AI engineering workflow in the [Observe](/ai-engineering/observe) docs.

From 0254557d225c1f28c3f7fae4e6901a10f9eb5c4e Mon Sep 17 00:00:00 2001
From: Mano Toth <mano@axiom.co>
Date: Thu, 13 Nov 2025 10:09:21 +0100
Subject: [PATCH 4/7] Quick fixes

---
 ai-engineering/measure.mdx    | 20 ++++++++++----------
 ai-engineering/quickstart.mdx |  3 +++
 2 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/ai-engineering/measure.mdx b/ai-engineering/measure.mdx
index d706ee0c4..37cd0e96b 100644
--- a/ai-engineering/measure.mdx
+++ b/ai-engineering/measure.mdx
@@ -93,7 +93,7 @@ npm install @opentelemetry/api \
 
 ### Configure
 
-#### 1. Set up environment variables
+#### Set up environment variables
 
 Create a `.env` file:
 
@@ -103,13 +103,13 @@ AXIOM_TOKEN="xaat-******"
 AXIOM_DATASET="my_dataset"
 ```
 
-#### 2. Create instrumentation setup (optional)
+#### Create instrumentation setup (optional)
 
 If you are evaluating components of a production application that is instrumented with OpenTelemetry, you can see your application spans in Axiom. In order to enable this, your instrumentation setup must be a function that can be passed in `axiom.config.ts`. An example is shown below.
 
 Create `src/instrumentation.node.ts`:
 
-```typescript
+```ts /src/instrumentation.node.ts
 import { OTLPTraceExporter } from '@opentelemetry/exporter-trace-otlp-http';
 import { resourceFromAttributes } from '@opentelemetry/resources';
 import { BatchSpanProcessor, NodeTracerProvider } from '@opentelemetry/sdk-trace-node';
@@ -163,11 +163,11 @@ import { trace } from '@opentelemetry/api';
 export const tracer = trace.getTracer('my-tracer');
 ```
 
-#### 1. Create `axiom.config.ts`
+#### Create `axiom.config.ts`
 
 Create a configuration file at the root of your project:
 
-```typescript
+```ts /axiom.config.ts
 import { defineConfig } from 'axiom/ai/config';
 import { setupAppInstrumentation } from './src/instrumentation.node';
 
@@ -200,7 +200,7 @@ Flags let you parameterize your AI behavior (like model choice or prompting stra
 
 Create `src/lib/app-scope.ts`:
 
-```typescript
+```ts /src/lib/app-scope.ts
 import { createAppScope } from 'axiom/ai/evals';
 import { z } from 'zod';
 
@@ -333,7 +333,7 @@ A scorer is a function that scores a capability’s output. Scorers receive the
 
 ### Simple custom scorer
 
-```typescript
+```ts
 import { Scorer } from 'axiom/ai/evals';
 
 const ExactMatchScorer = Scorer(
@@ -348,7 +348,7 @@ const ExactMatchScorer = Scorer(
 
 The `autoevals` library provides pre-built scorers for common tasks like semantic similarity, factual correctness, and text matching:
 
-```typescript
+```ts
 import { Scorer } from 'axiom/ai/evals';
 import { ExactMatch } from 'autoevals';
 
@@ -364,7 +364,7 @@ const WrappedExactMatch = Scorer(
 
 Scorers can return additional metadata alongside the score:
 
-```typescript
+```ts
 const CustomScorer = Scorer(
   'Custom-Scorer',
   ({ output, expected }) => {
@@ -381,7 +381,7 @@ const CustomScorer = Scorer(
 
 ## Run evaluations
 
-You run your evaluation suites from your terminal using the `axiom` CLI.
+To run your evaluation suites from your terminal, [install the Axiom CLI](/reference/cli) and use the following commands.
 
 ### Run all evals
 
diff --git a/ai-engineering/quickstart.mdx b/ai-engineering/quickstart.mdx
index 72058199b..c301da429 100644
--- a/ai-engineering/quickstart.mdx
+++ b/ai-engineering/quickstart.mdx
@@ -5,6 +5,7 @@ keywords: ["ai engineering", "getting started", "install", "setup", "configurati
 ---
 
 import ReplaceDatasetToken from "/snippets/replace-dataset-token.mdx"
+import ReplaceDomain from "/snippets/replace-domain.mdx"
 import Prerequisites from "/snippets/standard-prerequisites.mdx"
 import AIInstrumentationApproaches from "/snippets/ai-instrumentation-approaches.mdx"
 
@@ -148,6 +149,7 @@ For more information on specifying redaction policies, see [Redaction policies](
 Store environment variables in an `.env` file in the root of your project:
 
 ```bash .env
+AXIOM_URL="AXIOM_DOMAIN"
 AXIOM_TOKEN="API_TOKEN"
 AXIOM_DATASET="DATASET_NAME"
 OPENAI_API_KEY=""
@@ -158,6 +160,7 @@ ANTHROPIC_API_KEY=""
 
 <Info>
 <ReplaceDatasetToken />
+<ReplaceDomain />
 
 Enter the API keys for the LLMs you want to work with.
 </Info>

From 7b8bd252794f3c312514e5e7bcb33d0edc47ff08 Mon Sep 17 00:00:00 2001
From: Mano Toth <mano@axiom.co>
Date: Thu, 13 Nov 2025 14:31:42 +0100
Subject: [PATCH 5/7] Fixes

---
 ai-engineering/create.mdx                         | 2 +-
 ai-engineering/observe/manual-instrumentation.mdx | 4 ++--
 ai-engineering/quickstart.mdx                     | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/ai-engineering/create.mdx b/ai-engineering/create.mdx
index 9f9f23232..5827e0a96 100644
--- a/ai-engineering/create.mdx
+++ b/ai-engineering/create.mdx
@@ -9,7 +9,7 @@ import { definitions } from '/snippets/definitions.mdx'
 
 The **Create** stage is about defining a new AI <Tooltip tip={definitions.Capability}>capability</Tooltip> as a structured, version-able asset in your codebase. The goal is to move away from scattered, hard-coded string prompts and toward a more disciplined and organized approach to prompt engineering.
 
-### Defining a capability as a prompt object
+### Define a capability as a prompt object
 
 In Axiom AI engineering, every capability is represented by a `Prompt` object. This object serves as the single source of truth for the capability’s logic, including its messages, metadata, and the schema for its arguments.
 
diff --git a/ai-engineering/observe/manual-instrumentation.mdx b/ai-engineering/observe/manual-instrumentation.mdx
index 514281586..9e097aeb6 100644
--- a/ai-engineering/observe/manual-instrumentation.mdx
+++ b/ai-engineering/observe/manual-instrumentation.mdx
@@ -188,7 +188,7 @@ Example of a properly structured chat completion trace:
 ```typescript TypeScript expandable
 import { trace, SpanKind, SpanStatusCode } from '@opentelemetry/api';
 
-const tracer = trace.getTracer('my-ai-app');
+const tracer = trace.getTracer('my-app');
 
 // Create a span for the AI operation
 return tracer.startActiveSpan('chat gpt-4', {
@@ -233,7 +233,7 @@ from opentelemetry import trace
 from opentelemetry.trace import SpanKind
 import json
 
-tracer = trace.get_tracer("my-ai-app")
+tracer = trace.get_tracer("my-app")
 
 # Create a span for the AI operation
 with tracer.start_as_current_span("chat gpt-4", kind=SpanKind.CLIENT) as span:
diff --git a/ai-engineering/quickstart.mdx b/ai-engineering/quickstart.mdx
index c301da429..36a3a1ed3 100644
--- a/ai-engineering/quickstart.mdx
+++ b/ai-engineering/quickstart.mdx
@@ -117,7 +117,7 @@ To send data to Axiom, configure a tracer. For example, use a dedicated instrume
     // Configure the provider to export traces to your Axiom dataset
     const provider = new NodeTracerProvider({
       resource: resourceFromAttributes({
-        [ATTR_SERVICE_NAME]: 'my-ai-app', // Replace with your service name
+        [ATTR_SERVICE_NAME]: 'my-app', // Replace with your service name
       },
       {
         // Use the latest schema version
@@ -126,7 +126,7 @@ To send data to Axiom, configure a tracer. For example, use a dedicated instrume
       }),
       spanProcessor: new SimpleSpanProcessor(
         new OTLPTraceExporter({
-          url: `https://api.axiom.co/v1/traces`,
+          url: `${process.env.AXIOM_URL}/v1/traces`,
           headers: {
             Authorization: `Bearer ${process.env.AXIOM_TOKEN}`,
             'X-Axiom-Dataset': process.env.AXIOM_DATASET!,

From 2251591b9ecef6bdd95fb2f5d1d84c15768bf997 Mon Sep 17 00:00:00 2001
From: Mano Toth <mano@axiom.co>
Date: Fri, 14 Nov 2025 12:12:07 +0100
Subject: [PATCH 6/7] Add keywords

---
 ai-engineering/measure.mdx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ai-engineering/measure.mdx b/ai-engineering/measure.mdx
index 37cd0e96b..0db2d3042 100644
--- a/ai-engineering/measure.mdx
+++ b/ai-engineering/measure.mdx
@@ -1,7 +1,7 @@
 ---
 title: "Measure"
 description: "Learn how to measure the quality of your AI capabilities by running evaluations against ground truth data."
-keywords: ["ai engineering", "AI engineering", "measure", "evals", "evaluation", "scoring", "scorers"]
+keywords: ["ai engineering", "AI engineering", "measure", "evals", "evaluation", "scoring", "scorers", "graders", "scores"]
 ---
 
 import { Badge } from "/snippets/badge.jsx"

From d21a94136e337fbb61f297eff296a4634d9692c5 Mon Sep 17 00:00:00 2001
From: Christopher Ehrlich <ehrlich.christopher@gmail.com>
Date: Mon, 17 Nov 2025 10:09:15 +0100
Subject: [PATCH 7/7] Update ai-engineering/measure.mdx

Co-authored-by: Islam Shehata <islam@axiom.co>
---
 ai-engineering/measure.mdx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ai-engineering/measure.mdx b/ai-engineering/measure.mdx
index 0db2d3042..7518d26d9 100644
--- a/ai-engineering/measure.mdx
+++ b/ai-engineering/measure.mdx
@@ -222,7 +222,7 @@ Let’s build a practical evaluation for a support ticket classification system.
 Create an eval file `src/evals/ticket-classification.eval.ts`:
 
 ```typescript
-import { experimental_Eval as Eval, Scorer } from 'axiom/ai/evals';
+import { Eval, Scorer } from 'axiom/ai/evals';
 import { generateObject } from 'ai';
 import { openai } from '@ai-sdk/openai';
 import { wrapAISDKModel } from 'axiom/ai';