In [1]:
pip install braintrust autoevals pydantic

Note: you may need to restart the kernel to use updated packages.


In [2]:
from dotenv import load_dotenv
import braintrust as bt
from braintrust import wrap_litellm
from litellm import completion
from pydantic import BaseModel, Field
from typing import List, Optional, Dict, Any
from datetime import datetime
import json
# Load variables from a local .env file into the process environment.
# override=True is passed so values from .env take precedence over variables
# that are already set.
load_dotenv(override=True)
# LiteLLM model identifier: Anthropic Claude 3 Sonnet routed through the "bedrock/" provider prefix.
MODEL_NAME="bedrock/anthropic.claude-3-sonnet-20240229-v1:0"

In [3]:
# Braintrust project that holds this notebook's prompts and experiments.
BT_PROJECT_NAME = "customer-service-agent-eval"
# NOTE(review): MAX_WORKERS is not referenced by any cell visible in this notebook — confirm it is still needed.
MAX_WORKERS = 5

# Project handle; later cells call bt_project.prompts.create(...) and bt_project.publish() on it.
bt_project = bt.projects.create(name=BT_PROJECT_NAME)

---

# Understanding the Problem

## Instructions:
1. **Read the objective carefully**: We need ~100 diverse, realistic customer support requests
2. **Key insight**: "Naively generated queries tend to be generic, repetitive, and fail to capture real usage patterns"
3. **Solution**: Use a systematic approach with defined dimensions

## Workshop Discussion Points:
- Why is synthetic data important when real data isn't available?
- What makes a query "realistic" vs "generic"?
- How can we ensure diversity in our generated data?

# Define Evaluation Dimensions

A dimension is a way to categorize one aspect of a user query. Each dimension represents ONE axis of variation. For our example customer service chatbot, typical dimensions are:

- **Feature**: the task or enquiry the user wants to perform, e.g. order cancellation
- **Persona**: the type of client, e.g. first-time buyer, existing buyer
- **Scenario**: how clearly the user states their intent, e.g. concise or verbose

## Instructions:
1. **Study each dimension carefully**:
   - **Intent**: What the user wants to accomplish
   - **Complexity**: How difficult the query is to handle
   - **Persona**: What type of user is making the request
   - **Language Style**: How the user communicates

2. **Critical principle**: "Choose dimensions that describe where the AI application is likely to fail"

3. **Review the Pydantic models**: Notice how we structure our data for validation

## Workshop Activity:
What other dimensions might be relevant for your specific use case?

# Dimension Tuple Generation Function

## Instructions:
1. **Examine the prompt structure**: Notice how we:
   - Provide clear instructions for balanced coverage
   - Include realistic constraints (e.g., new customers rarely have returns)
   - Request specific numbers of combinations

2. **Understand the parallel processing**: We make multiple calls and deduplicate results

3. **Run this cell**: It defines the function but doesn't execute it yet

## Key Concept:
Good prompt engineering includes constraints and examples to guide the LLM toward realistic outputs.

In [4]:
#### Define Pydantic Models for structured Output

# One combination of dimension values describing a single synthetic customer query.
# Every field is a free-form string: the Field descriptions list example values but
# nothing here restricts the model's output to them.
# NOTE(review): the `intent` examples omit `request_for_action_or_service` and
# `cancel_order`, which TUPLES_GEN_PROMPT lists as allowed values — confirm whether
# the description should be extended.
# (Comments are used instead of a class docstring on purpose: this model is passed
# as `response_format`, and a docstring would become part of its JSON schema.)
class DimensionTuple(BaseModel):
    # What the customer is trying to get done.
    intent: str = Field(
        description="The user's primary goal or task (e.g., product_inquiry, order_status_check, return_request, technical_support, account_management, general_info)."
    )
    # How hard / how structured the query is.
    complexity: str = Field(
        description="The difficulty and structure of the query (e.g., simple, multi-turn, ambiguous)."
    )
    # Who is asking.
    persona: str = Field(
        description="The type of user based on their behavior or relationship with the store (e.g., new_customer, repeat_customer, frustrated_customer, loyalty_member)."
    )
    # How the query is worded.
    language_style: str = Field(
        description="Linguistic characteristics of the query (e.g., formal, informal, contains_slang, includes_typos, verbose, concise)."
    )

# Wrapper object holding a list of DimensionTuple items; used as the structured
# `response_format` for the tuple-generation LLM call and to validate its JSON reply.
class DimensionTuples(BaseModel):
    tuples: List[DimensionTuple]

# NOTE(review): same single field as DimensionTuples and not referenced by any cell
# visible in this notebook — confirm whether it can be removed.
class DimensionTuplesList(BaseModel):
    tuples: List[DimensionTuple]


In [5]:
# Template for the dimension-tuple generation prompt. The triple-brace placeholder
# {{{num_tuples_to_generate}}} appears twice and is filled in when the stored prompt
# is rendered with Prompt.build(num_tuples_to_generate=...).
# The body keeps its source indentation, so every rendered line starts with those
# spaces (visible in the printed messages further down).
TUPLES_GEN_PROMPT="""\
        I am designing a customer support chatbot for a retail company and need to generate a diverse set of synthetic test data to evaluate its performance. I've provided you with the key dimensions that make up a customer's inquiry, along with a list of possible values for each.
        
        ## Instructions
        
        Generate {{{num_tuples_to_generate}}} unique combinations of dimension values based on the dimensions provided below.
        
        * Each combination should represent a distinct customer support scenario.
        * Ensure **balanced coverage** across all dimensions; avoid over-representing any single value or combination.
        * The generated tuples should be as realistic and varied as possible. For example, a frustrated customer is likely to use informal language and ask a complex question about a return.
        * Never generate a tuple where the persona is 'new_customer' and the intent is 'return_request' or 'order_status_check' unless the complexity is multi-turn to simulate a scenario where they are new to this process.
        
        ## Dimensions
        
        * **intent**: What kind of inquiry are they making?
            * product_inquiry
            * order_status_check
            * request_for_action_or_service
            * return_request
            * cancel_order
            * technical_support
            * account_management
            * general_info
        * **complexity**: The difficulty and structure of the query.
            * simple
            * multi-turn
            * ambiguous
        * **persona**: The type of user making the request
            * new_customer
            * repeat_customer
            * frustrated_customer
            * loyalty_member
        * **language_style**: The linguistic characteristics of the query
            * formal
            * informal
            * contains_slang
            * includes_typos
        
        Generate {{{num_tuples_to_generate}}} unique dimension tuples.
    """

In [6]:
def get_or_create_tuples_prompt():
    """Return the Braintrust prompt used to generate dimension tuples.

    The prompt stored under the slug ``dimension-tuples-gen-prompt`` is loaded
    and rendered once as a sanity check. If either step raises, the prompt is
    (re)created from ``TUPLES_GEN_PROMPT``, the project is published, and the
    prompt is loaded again.

    Returns:
        The prompt object returned by ``bt.load_prompt``.
    """
    slug = "dimension-tuples-gen-prompt"

    try:
        existing = bt.load_prompt(project=BT_PROJECT_NAME, slug=slug)
        # Rendering with a sample value doubles as the "does it work" check.
        existing.build(num_tuples_to_generate=20)
    except Exception:
        # Any failure above is treated as "prompt missing or unusable":
        # register it from the local template and publish the project.
        bt_project.prompts.create(
            name="DimensionTuplesGenPrompt",
            slug=slug,
            description="Prompt for generating dimension tuples",
            model="claude-4-sonnet-20250514",
            messages=[{"role": "user", "content": TUPLES_GEN_PROMPT}],
            if_exists="replace",
        )
        bt_project.publish()
        return bt.load_prompt(project=BT_PROJECT_NAME, slug=slug)
    else:
        return existing

# Module-level prompt handle; generate_synth_data_dimension_tuples renders it with .build(...).
tuples_gen_prompt = get_or_create_tuples_prompt()

In [7]:
# Fetch the existing prompt by slug, rebinding tuples_gen_prompt.
# NOTE(review): get_or_create_tuples_prompt() already returns the loaded prompt, so
# this cell looks redundant unless the prompt was edited in the Braintrust UI in between.
tuples_gen_prompt = bt.load_prompt(project=BT_PROJECT_NAME, slug="dimension-tuples-gen-prompt")


In [8]:
# Bare expression: shows the prompt object's repr as the cell output.
tuples_gen_prompt

<braintrust.logger.Prompt at 0x7ffb89e2a450>

In [9]:
# Render the stored prompt with a sample value and print the resulting chat
# messages to check that the placeholder was substituted.
_p = tuples_gen_prompt.build(num_tuples_to_generate=20)
print(_p["messages"])

[{'content': "        I am designing a customer support chatbot for a retail company and need to generate a diverse set of synthetic test data to evaluate its performance. I've provided you with the key dimensions that make up a customer's inquiry, along with a list of possible values for each.\n        \n        ## Instructions\n        \n        Generate 20 unique combinations of dimension values based on the dimensions provided below.\n        \n        * Each combination should represent a distinct customer support scenario.\n        * Ensure **balanced coverage** across all dimensions; avoid over-representing any single value or combination.\n        * The generated tuples should be as realistic and varied as possible. For example, a frustrated customer is likely to use informal language and ask a complex question about a return.\n        * Never generate a tuple where the persona is 'new_customer' and the intent is 'return_request' or 'order_status_check' unless the complexity is

![title](images/braintrust_playgrounds.png)

---

# Generate Dimension Combinations

## Instructions:
1. **Execute this cell**: It will generate diverse dimension combinations
2. **Watch the output**: You should see parallel generation happening
3. **Review the results**: Examine the generated tuples for:
   - Realistic combinations
   - Balanced coverage across dimensions
   - Absence of impossible scenarios

## Expected Output:
- "Generated X total tuples, Y unique"
- A list of DimensionTuple objects with varied combinations

---

In [10]:
def generate_synth_data_dimension_tuples(
    num_tuples: int = 20,
    model_kwargs: Optional[dict] = None,
    model: Optional[str] = None,
):
    """Generate dimension tuples with an LLM and log the unique ones to Braintrust.

    Args:
        num_tuples: Number of tuples requested from the model; substituted into
            the stored prompt as ``num_tuples_to_generate``.
        model_kwargs: Extra keyword arguments forwarded to ``completion``.
            ``None`` (the default) means no extras.
        model: LiteLLM model identifier. Defaults to the notebook-level
            ``MODEL_NAME`` when omitted.

    Returns:
        A ``(summary, tuples_list)`` pair: the summary returned by the Braintrust
        experiment and the validated ``DimensionTuples`` exactly as the model
        produced them (duplicates included). Only the de-duplicated tuples are
        logged to the experiment.

    Raises:
        pydantic.ValidationError: If the model reply is not valid JSON for
            ``DimensionTuples``.
    """
    # Copy so a caller-supplied dict is never shared with the logged metadata,
    # and so the signature carries no mutable default.
    model_kwargs = dict(model_kwargs or {})

    timestamp = datetime.now().strftime("%Y%m%d_%H%M")
    prompt = tuples_gen_prompt.build(num_tuples_to_generate=num_tuples)
    rsp = completion(
        model=model or MODEL_NAME,
        messages=prompt["messages"],
        response_format=DimensionTuples,
        **model_kwargs,
    )

    # Parse and validate the JSON reply in one step.
    tuples_list = DimensionTuples.model_validate_json(rsp.choices[0].message.content)

    # De-duplicate on the serialized form, keeping the first occurrence and the
    # original order (dicts preserve insertion order).
    unique_by_json = {}
    for tup in tuples_list.tuples:
        unique_by_json.setdefault(tup.model_dump_json(), tup)
    unique_tuples = list(unique_by_json.values())

    # Progress line promised by the notebook's "Expected Output" section.
    print(f"Generated {len(tuples_list.tuples)} total tuples, {len(unique_tuples)} unique")

    # One span per unique tuple so each can be scored individually in human review.
    bt_experiment = bt.init(project=BT_PROJECT_NAME, experiment=f"synth_tuples_it_{timestamp}")
    for uniq_tup in unique_tuples:
        with bt_experiment.start_span(name="generate_dimension_tuples") as span:
            span.log(input=prompt["messages"], output=uniq_tup, metadata=dict(model_kwargs=model_kwargs))

    summary = bt_experiment.summarize(summarize_scores=False)
    return summary, tuples_list


In [22]:
# Request 20 tuples. This performs an LLM call and creates a new Braintrust experiment;
# the result is the (summary, tuples) pair returned by the function.
generated_data=generate_synth_data_dimension_tuples(num_tuples=20)

In [23]:
# Display the experiment summary (which includes the experiment id and URL) and the generated tuples.
generated_data

(ExperimentSummary(project_name='customer-service-agent-eval', project_id='8a70e092-c540-4db6-bb9d-2fa3da5d1e2e', experiment_id='99ba88b1-58d5-4c8f-848e-1d899253bb29', experiment_name='synth_tuples_it_20250906_1933', project_url='https://www.braintrust.dev/app/aiopsdream/p/customer-service-agent-eval', experiment_url='https://www.braintrust.dev/app/aiopsdream/p/customer-service-agent-eval/experiments/synth_tuples_it_20250906_1933', comparison_experiment_name=None, scores={}, metrics={}),
 DimensionTuples(tuples=[DimensionTuple(intent='product_inquiry', complexity='simple', persona='new_customer', language_style='formal'), DimensionTuple(intent='order_status_check', complexity='multi-turn', persona='repeat_customer', language_style='informal'), DimensionTuple(intent='return_request', complexity='ambiguous', persona='frustrated_customer', language_style='contains_slang'), DimensionTuple(intent='technical_support', complexity='simple', persona='loyalty_member', language_style='formal'), D

### Configuring Human Review
To set up human review, define the scores you want to collect in your project's Configuration tab. In our example the score name is "dimension_human_review"

![title](images/create_human_review.png)

---

# Review Generated Tuples

## Instructions:
1. **Examine the output**: Look at the variety of combinations generated
2. **Quality check**: Verify that combinations make sense (e.g., frustrated customers with complex queries)
3. **Note the balance**: See how different dimensions are represented

---

![title](images/dimension_human_review.png)

## Workshop Discussion:
- Which combinations seem most realistic?
- Are there any combinations that seem problematic?
- How does this compare to manually brainstorming scenarios?

## Key Takeaways
- We use braintrust.init to manually create a new experiment.
- Remove invalid dimensions with human review

In [32]:
import os

# Confirm the key is configured WITHOUT echoing it: printed values are saved in
# the notebook's outputs and leak as soon as the file is shared or committed.
print("BRAINTRUST_API_KEY is set" if os.getenv('BRAINTRUST_API_KEY') else "BRAINTRUST_API_KEY is NOT set")

[REDACTED: an API key was printed here. Rotate that key and never print or commit secrets.]


### Use BTQL to select the dimension tuples that passed human review

In [108]:
import requests
from textwrap import dedent
import json
import os
import random

def get_valid_dimension_tuples(
    experiment_id: str = "99ba88b1-58d5-4c8f-848e-1d899253bb29",
    score_name: str = "dimension_human_review",
    timeout: float = 30.0,
):
    """Fetch the dimension tuples that reviewers approved in a Braintrust experiment.

    Sends one BTQL query to the Braintrust API selecting the experiment rows whose
    human-review score equals 1, and returns the ``output`` of each row.

    Args:
        experiment_id: Id of the experiment to query (copy it from the
            experiments console). Defaults to the experiment created earlier in
            this notebook.
        score_name: Name of the human-review score to filter on.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A list with the ``output`` value of every row in the response's ``data``.
        Only that single response is read; the ``cursor`` field it may carry is
        not followed, so a result set spanning several pages is truncated.

    Raises:
        RuntimeError: If ``BRAINTRUST_API_KEY`` is not set.
        requests.HTTPError: If the API answers with an error status.
    """
    bearer_token = os.getenv('BRAINTRUST_API_KEY')
    if not bearer_token:
        # Fail early with an actionable message instead of sending "Bearer None".
        raise RuntimeError(
            "BRAINTRUST_API_KEY is not set; add it to your environment or .env file."
        )

    url = "https://api.braintrust.dev/btql"
    payload = json.dumps({
        "query": f"select: * | from: experiment('{experiment_id}') filter: scores.\"{score_name}\" = 1"
    })
    headers = {
        'Authorization': f'Bearer {bearer_token}',
        'Content-Type': 'application/json',
    }

    # A timeout keeps the notebook from hanging on a stalled connection.
    response = requests.post(url, headers=headers, data=payload, timeout=timeout)
    response.raise_for_status()
    return [row["output"] for row in response.json().get("data", [])]

# Fetch the approved tuples once; generate_synth_queries samples from this module-level list.
valid_dim_tuples = get_valid_dimension_tuples()
print(len(valid_dim_tuples))
# Peek at the first two rows to confirm their structure.
valid_dim_tuples[:2]

15


[{'complexity': 'simple',
  'intent': 'technical_support',
  'language_style': 'formal',
  'persona': 'new_customer'},
 {'complexity': 'multi-turn',
  'intent': 'return_request',
  'language_style': 'informal',
  'persona': 'loyalty_member'}]

# Query Generation Functions

## Instructions:
1. **Study the prompt template**: Notice how we:
   - Inject the dimension tuple into the prompt
   - Provide specific formatting instructions
   - Include examples of realistic variations
   - Request natural, conversational language

2. **Understand the parallel processing setup**: We'll generate multiple queries per tuple simultaneously

3. **Run this cell below**: This defines the functions but doesn't execute generation yet

## Key Concept:
Converting structured dimensions to natural language requires careful prompt engineering with examples and constraints.

---

In [109]:
# Template for turning dimension tuples into natural-language customer queries.
# Placeholders: {{{dimension_tuple_json}}}, {{{num_of_unique_queries_per_dimension}}}
# and {{{num_queries_to_generate}}}.
# The dimensions named in instruction 3 and in the examples match the four fields of
# DimensionTuple (intent, complexity, persona, language_style).
# NOTE: a prompt already stored in Braintrust under "synth-query-gen-prompt" is not
# updated by editing this constant; it is only used when the prompt has to be created.
SYNTH_DATA_GEN_PROMPT="""\
You are an AI assistant tasked with generating natural language queries for an online apparel retailer chatbot. Your goal is to create realistic, varied queries that match specific characteristics. Here's what you need to do:

First, here is the dimension tuple that defines the characteristics for this set of queries:
<dimension_json>
{{{dimension_tuple_json}}}
</dimension_json>
        
Follow these instructions to generate the queries:
        
1. Generate exactly {{{num_of_unique_queries_per_dimension}}} unique queries that perfectly match the specified dimensions in the dimension tuple.        
2. Focus on realism: The queries should sound like something a real person would type into a chatbot. They should be natural and conversational.        
3. Incorporate all dimensions: The language and content of each query must naturally reflect all four dimensions (intent, complexity, persona, and language_style) as specified in the dimension tuple.        
4. Vary the style: Within the specified 'language_style', introduce natural variations such as:
           - Common misspellings
           - Missing punctuation
           - Different capitalization
           - Use of emojis
           - Text-speak (e.g., 'thx', 'pls', 'gonna')
        
5. Be concise and direct: Each query should be to the point and reflect how a real user would interact with a chatbot.
        
Here are some examples of realistic variations to guide you:
    - For a "Frustrated Customer" with "Technical Support" intent and "Informal" language style:
          "my discount code aint working its stupid"
          "why is the site crashing"
          "i need help with the cart but its broken"
        
    - For a "New Customer" with "Product Inquiry" intent, "Simple" complexity, and "Formal" language style:
          "Could you please tell me about the sizing for the women's jackets?"
          "Hello, I have a question regarding the material of the new shirt."
        
        
Generate {{{num_queries_to_generate}}} unique queries,varying the text style naturally.
"""

In [110]:
# Local preview of the template: substitute each placeholder with str.replace
# (no Braintrust call) and print the result. The first four approved tuples are
# injected as a JSON list.
print(
     (
         SYNTH_DATA_GEN_PROMPT.replace("{{{num_queries_to_generate}}}", "4")
         .replace("{{{dimension_tuple_json}}}", json.dumps(valid_dim_tuples[:4], indent=2))
         .replace("{{{num_of_unique_queries_per_dimension}}}", "2")
     ))

You are an AI assistant tasked with generating natural language queries for an online apparel retailer chatbot. Your goal is to create realistic, varied queries that match specific characteristics. Here's what you need to do:

First, here is the dimension tuple that defines the characteristics for this set of queries:
<dimension_json>
[
  {
    "complexity": "simple",
    "intent": "technical_support",
    "language_style": "formal",
    "persona": "new_customer"
  },
  {
    "complexity": "multi-turn",
    "intent": "return_request",
    "language_style": "informal",
    "persona": "loyalty_member"
  },
  {
    "complexity": "simple",
    "intent": "order_status_check",
    "language_style": "formal",
    "persona": "repeat_customer"
  },
  {
    "complexity": "multi-turn",
    "intent": "product_inquiry",
    "language_style": "includes_typos",
    "persona": "loyalty_member"
  }
]
</dimension_json>
        
Follow these instructions to generate the queries:
        
1. Generate exac

In [111]:
# Load the stored synthetic-query prompt, creating it from SYNTH_DATA_GEN_PROMPT
# when it cannot be loaded. The load call sits inside the `try` (as in
# get_or_create_tuples_prompt) so a failure at either step falls through to creation.
try:
    synth_query_gen_prompt = bt.load_prompt(project=BT_PROJECT_NAME, slug="synth-query-gen-prompt")
    # Attribute access is the existence check used by the original cell —
    # presumably it forces the prompt to be fetched; confirm against the SDK.
    synth_query_gen_prompt.prompt
except Exception:
    bt_syth_query_gen_prompt = bt_project.prompts.create(
        name="synthetic_query_prompts",
        slug="synth-query-gen-prompt",
        description="Prompt for generating synthetic queries",
        model="claude-4-sonnet-20250514",
        messages=[{"role": "user", "content": SYNTH_DATA_GEN_PROMPT}],
        if_exists="replace",
    )

    bt_project.publish()
    synth_query_gen_prompt = bt.load_prompt(project=BT_PROJECT_NAME, slug="synth-query-gen-prompt")

In [112]:
# Structured-output schema for the query-generation call: a flat list of query strings.
# (Kept as a comment rather than a class docstring because the model is passed as
# `response_format`, and a docstring would be added to its JSON schema.)
class QueryList(BaseModel):
    queries: list[str]

In [154]:
def generate_synth_queries(
    num_queries: int = 10,
    model_kwargs: Optional[dict] = None,
    model: Optional[str] = None,
) -> dict:
    """Generate synthetic customer queries for a random sample of approved tuples.

    Args:
        num_queries: Number of queries to request. It is also the maximum number
            of dimension tuples sampled from ``valid_dim_tuples``.
        model_kwargs: Extra keyword arguments forwarded to ``completion``.
            ``None`` (the default) means no extras.
        model: LiteLLM model identifier. Defaults to the notebook-level
            ``MODEL_NAME`` when omitted.

    Returns:
        A dict with three keys: ``"prompt"`` (content of the first rendered
        message), ``"sampled_dim_tuples"`` (the tuples injected into the prompt)
        and ``"synth_queries"`` (the list of generated query strings).

    Raises:
        ValueError: If ``valid_dim_tuples`` is empty, so there is nothing to
            condition the queries on.
        pydantic.ValidationError: If the model reply is not valid JSON for
            ``QueryList``.
    """
    if not valid_dim_tuples:
        raise ValueError(
            "valid_dim_tuples is empty; approve at least one dimension tuple in human review first."
        )

    # Copy so the signature carries no mutable default and caller dicts are untouched.
    model_kwargs = dict(model_kwargs or {})

    # Never ask random.sample for more items than the pool contains.
    sample_size = min(num_queries, len(valid_dim_tuples))
    random_dim_tuples = random.sample(valid_dim_tuples, sample_size)

    prompt = synth_query_gen_prompt.build(
        num_queries_to_generate=num_queries,
        num_of_unique_queries_per_dimension=num_queries*2,
        dimension_tuple_json=json.dumps(random_dim_tuples, indent=2),
    )

    rsp = completion(
        model=model or MODEL_NAME,
        messages=prompt["messages"],
        response_format=QueryList,
        **model_kwargs,
    )

    # The reply content is a JSON string; parse and validate it in one step.
    raw_content: str = rsp.choices[0].message.content
    query_list = QueryList.model_validate_json(raw_content)

    return {
        "prompt": prompt["messages"][0]['content'],
        "sampled_dim_tuples": random_dim_tuples,
        "synth_queries": query_list.queries
    }

In [155]:
# Smoke test: generate two queries and display the returned dict (prompt, sampled tuples, queries).
generate_synth_queries(num_queries=2)

{'prompt': 'You are an AI assistant tasked with generating natural language queries for an online apparel retailer chatbot. Your goal is to create realistic, varied queries that match specific characteristics. Here\'s what you need to do:\n\nFirst, here is the dimension tuple that defines the characteristics for this set of queries:\n<dimension_json>\n[\n  {\n    "complexity": "multi-turn",\n    "intent": "return_request",\n    "language_style": "informal",\n    "persona": "loyalty_member"\n  },\n  {\n    "complexity": "simple",\n    "intent": "cancel_order",\n    "language_style": "formal",\n    "persona": "new_customer"\n  }\n]\n</dimension_json>\n        \nFollow these instructions to generate the queries:\n        \n1. Generate exactly 2 unique queries that perfectly match the specified dimensions in the dimension tuple.        \n2. Focus on realism: The queries should sound like something a real person would type into a chatbot. They should be natural and conversational.        \n

In [156]:
# NOTE(review): this issues another LLM call only to print the return type — looks
# like leftover debugging; confirm whether the cell is still wanted.
print(type(generate_synth_queries(num_queries=2)))

<class 'dict'>


In [164]:
#Add to braintrust

def add_to_braintrust_experiment(num_queries: int = 2,):
    """Generate synthetic queries and log each one to a new Braintrust experiment.

    A fresh experiment named ``add_queries_it_<timestamp>`` is created and one
    ``add_query`` span is logged per generated query, with the rendered prompt as
    input, the query as output and a sequential id in the metadata.

    Args:
        num_queries: Forwarded to ``generate_synth_queries``.

    Returns:
        The summary returned by the experiment's ``summarize`` call, mirroring
        ``generate_synth_data_dimension_tuples`` so the caller gets the
        experiment name and URL back.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M")
    all_queries = generate_synth_queries(num_queries=num_queries)
    bt_experiment = bt.init(project=BT_PROJECT_NAME, experiment=f"add_queries_it_{timestamp}")

    # Ids are 1-based and zero-padded, e.g. "<timestamp>_001".
    for query_id, query_item in enumerate(all_queries["synth_queries"], start=1):
        with bt_experiment.start_span(name="add_query") as span:
            span.log(
                input=all_queries["prompt"],
                output=query_item,
                metadata={
                    "id": f"{timestamp}_{query_id:03d}",
                }
            )

    return bt_experiment.summarize(summarize_scores=False)

In [166]:
# Generate queries (num_queries=20) and log them to a new experiment for human/SME review.
add_to_braintrust_experiment(num_queries=20)

### Remove invalid queries (human/SME review)

![title](images/huma_rewiew_queries.png)