# 01 - Experiments tracking - AWS Strands Agents  

This notebook demonstrates the unified testing approach using the `run_test()` method with human-readable results.

## Setup
______________________________________________________________

In [1]:
%pip install ipywidgets  "strands-agents" "strands-agents-tools" -q

Note: you may need to restart the kernel to use updated packages.


In [2]:
import json
import time
import uuid
import boto3
import yaml
from datetime import datetime
from utils import UnifiedTester


In [3]:
# Initialize Unified Tester
tester = UnifiedTester()

# Load configuration.
# The encoding is pinned to UTF-8 so the YAML (which holds prompt and query
# text) is decoded the same way on every platform instead of depending on the
# locale's default encoding.
with open('config_experiments.yml', 'r', encoding='utf-8') as config_file:
    config = yaml.safe_load(config_file)

# Both sections are required: a missing key raises KeyError right here rather
# than surfacing later inside a test run.
prompts = config['system_prompts']   # mapping: prompt version name -> prompt
test_queries = config['test_queries']  # sized, indexable collection of queries

print("‚úÖ Unified Tester initialized!")
print(f"‚úÖ Available Prompts: {list(prompts.keys())}")
print(f"‚úÖ Number of Test queries: {len(test_queries)}")

‚úÖ Unified Tester initialized!
‚úÖ Available Prompts: ['version1', 'version2']
‚úÖ Number of Test queries: 2


## Tools Definition
__________________________________________________________________________________

### Dependencies setup

In [4]:
from strands_tools import retrieve, current_time
from strands import Agent, tool
from strands.models.bedrock import BedrockModel
import os

In [5]:
# Prefix shared by the SSM parameter names looked up below.
kb_name = "restaurant-assistant"

smm_client = boto3.client("ssm")
dynamodb = boto3.resource("dynamodb")

# The full get_parameter responses are kept (not just the values) because
# later cells read kb_id["Parameter"]["Value"] directly.
table_name = smm_client.get_parameter(
    Name=f"{kb_name}-table-name",
    WithDecryption=False,
)
kb_id = smm_client.get_parameter(
    Name=f"{kb_name}-kb-id",
    WithDecryption=False,
)

# Handle to the bookings table named by the SSM parameter.
table = dynamodb.Table(table_name["Parameter"]["Value"])

print("DynamoDB table:", table_name["Parameter"]["Value"])
print("Knowledge Base Id:", kb_id["Parameter"]["Value"])

DynamoDB table: restaurant-bookings
Knowledge Base Id: XGIDQ0ALGL


#### Tools setup

In [6]:
%%writefile tool_get_booking_details.py

from strands import tool
import boto3 

@tool
def tool_booking_details(booking_id: str, restaurant_name: str) -> dict:
    """Get the relevant details for booking_id in restaurant_name
    Args:
        booking_id: the id of the reservation
        restaurant_name: name of the restaurant handling the reservation

    Returns:
        booking_details: the details of the booking in JSON format
    """
    # NOTE(review): the docstring above is kept verbatim because the @tool
    # decorator presumably derives the model-facing tool description from it
    # -- confirm against the Strands documentation before rewording it.
    #
    # This file is produced by %%writefile and imported as a separate module,
    # so names defined in notebook cells (such as a `table` global) are NOT
    # visible here. The table handle is therefore resolved inside the tool,
    # the same way the create/delete booking tools do it.
    kb_name = 'restaurant-assistant'

    try:
        dynamodb = boto3.resource('dynamodb')
        ssm_client = boto3.client('ssm')
        # The table name is stored in SSM under "<kb_name>-table-name".
        table_name = ssm_client.get_parameter(
            Name=f'{kb_name}-table-name',
            WithDecryption=False
        )
        table = dynamodb.Table(table_name["Parameter"]["Value"])

        response = table.get_item(
            Key={"booking_id": booking_id, "restaurant_name": restaurant_name}
        )
        # get_item omits "Item" when no record matches the key.
        if "Item" in response:
            return response["Item"]
        return f"No booking found with ID {booking_id}"
    except Exception as e:
        # Failures (including the AWS lookups above) are reported back to the
        # caller as text instead of being raised.
        return str(e)

Overwriting tool_get_booking_details.py


In [7]:
%%writefile tool_delete_booking.py

from strands import tool
import boto3 

@tool
def tool_delete_booking(booking_id: str, restaurant_name:str) -> str:
    """delete an existing booking_id at restaurant_name
    Args:
        booking_id: the id of the reservation
        restaurant_name: name of the restaurant handling the reservation

    Returns:
        confirmation_message: confirmation message
    """
    # NOTE(review): the docstring above is kept verbatim because the @tool
    # decorator presumably derives the model-facing tool description from it
    # -- confirm against the Strands documentation before rewording it.
    kb_name = 'restaurant-assistant'

    try:
        # AWS setup lives inside the try so that SSM/credential failures are
        # reported as a message, like any other failure of this tool.
        dynamodb = boto3.resource('dynamodb')
        smm_client = boto3.client('ssm')
        table_name = smm_client.get_parameter(
            Name=f'{kb_name}-table-name',
            WithDecryption=False
        )
        table = dynamodb.Table(table_name["Parameter"]["Value"])

        # DeleteItem succeeds (HTTP 200) even when no item matches the key.
        # Asking for the old attributes back is what lets us tell "deleted"
        # apart from "there was nothing to delete".
        response = table.delete_item(
            Key={'booking_id': booking_id, 'restaurant_name': restaurant_name},
            ReturnValues='ALL_OLD',
        )
        if response['ResponseMetadata']['HTTPStatusCode'] != 200:
            return f'Failed to delete booking with ID {booking_id}'
        if 'Attributes' not in response:
            # Nothing was stored under this key, so nothing was removed.
            return f'No booking found with ID {booking_id}'
        return f'Booking with ID {booking_id} deleted successfully'
    except Exception as e:
        return str(e)

Overwriting tool_delete_booking.py


In [8]:
%%writefile tool_create_booking.py

# Alternatively, you can use the TOOL_SPEC approach when defining your tool

from typing import Any
from strands.types.tools import ToolResult, ToolUse
import boto3
import uuid


# Declarative specification for the module-based tool defined in this file:
# the tool's name, a description, and a JSON schema for its input.
# The "name" value is the same identifier as the handler function defined
# after this dictionary.
TOOL_SPEC = {
    "name": "tool_create_booking",
    "description": "Create a new booking at restaurant_name",
    "inputSchema": {
        "json": {
            "type": "object",
            "properties": {
                # The description strings below are part of the spec itself
                # (runtime text), so they are left exactly as written.
                "date": {
                    "type": "string",
                    "description": """The date of the booking in the format YYYY-MM-DD. 
                    Do NOT accept relative dates like today or tomorrow. 
                    Ask for today's date for relative date."""
                },
                "hour": {
                    "type": "string",
                    "description": "the hour of the booking in the format HH:MM"
                },
                "restaurant_name": {
                    "type": "string",
                    "description": "name of the restaurant handling the reservation"
                },
                "guest_name": {
                    "type": "string",
                    "description": "The name of the customer to have in the reservation"
                },
                "num_guests": {
                    "type": "integer",
                    "description": "The number of guests for the booking"
                }
            },
            # Every property is mandatory.
            "required": ["date", "hour", "restaurant_name", "guest_name", "num_guests"]
        }
    }
}
# Function name must match tool name
def tool_create_booking(tool: ToolUse, **kwargs: Any) -> ToolResult:
    """Create a booking record in the restaurant bookings DynamoDB table.

    Args:
        tool: the tool-use request; ``tool["toolUseId"]`` identifies the call
            and ``tool["input"]`` carries ``date``, ``hour``,
            ``restaurant_name``, ``guest_name`` and ``num_guests``.
        **kwargs: accepted for interface compatibility; not used here.

    Returns:
        A result dict with ``toolUseId``, ``status`` ("success" or "error")
        and a single text ``content`` entry. On success the text contains the
        generated booking id; on error it contains the failure description.
    """
    tool_use_id = tool["toolUseId"]

    # Reject incomplete requests with an explicit message instead of letting
    # a bare KeyError escape from the lookups further down.
    booking_input = tool.get("input") or {}
    required_fields = ("date", "hour", "restaurant_name", "guest_name", "num_guests")
    missing_fields = [name for name in required_fields if name not in booking_input]
    if missing_fields:
        return {
            "toolUseId": tool_use_id,
            "status": "error",
            "content": [{"text": f"Missing required input field(s): {', '.join(missing_fields)}"}]
        }

    date = booking_input["date"]
    hour = booking_input["hour"]
    restaurant_name = booking_input["restaurant_name"]
    guest_name = booking_input["guest_name"]
    num_guests = booking_input["num_guests"]

    try:
        # AWS setup is inside the try so SSM/credential failures come back as
        # an "error" result, exactly like a failed put_item.
        kb_name = 'restaurant-assistant'
        dynamodb = boto3.resource('dynamodb')
        smm_client = boto3.client('ssm')
        table_name = smm_client.get_parameter(
            Name=f'{kb_name}-table-name',
            WithDecryption=False
        )
        table = dynamodb.Table(table_name["Parameter"]["Value"])

        results = f"Creating reservation for {num_guests} people at {restaurant_name}, " \
                  f"{date} at {hour} in the name of {guest_name}"
        print(results)

        # Short id: the first 8 characters of a random UUID.
        booking_id = str(uuid.uuid4())[:8]
        table.put_item(
            Item={
                'booking_id': booking_id,
                'restaurant_name': restaurant_name,
                'date': date,
                'name': guest_name,
                'hour': hour,
                'num_guests': num_guests
            }
        )
        return {
            "toolUseId": tool_use_id,
            "status": "success",
            "content": [{"text": f"Reservation created with booking id: {booking_id}"}]
        }
    except Exception as e:
        return {
            "toolUseId": tool_use_id,
            "status": "error",
            "content": [{"text": str(e)}]
        }

Overwriting tool_create_booking.py


In [9]:
import tool_create_booking
import tool_delete_booking
import tool_get_booking_details

#### Tool List

In [10]:
# Knowledge Base: export the id fetched from SSM through an environment
# variable (presumably read by the `retrieve` tool -- confirm).
os.environ["KNOWLEDGE_BASE_ID"] = kb_id["Parameter"]["Value"]

# Tools list: the two strands_tools imports followed by the three booking
# tool modules imported above.
tool_list = [
    retrieve,
    current_time,
    tool_get_booking_details,
    tool_create_booking,
    tool_delete_booking,
]

## Tests Setup
_______________________________________________________________________________________________

In [11]:
# Test 1: Multiple models, single system prompt, single query
#
# Note: CSV results are automatically saved to test_results/ folder
# (save_to_csv=True by default)

# LiteLLM endpoints compared in this test
test1_models = [
    "bedrock/openai.gpt-oss-120b-1:0",
    "bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0",
]

results1 = tester.run_test(
    models=test1_models,
    system_prompts=["version2"],  # system prompt versions to test
    queries=test_queries[0],      # first query from the config file
    prompts_dict=prompts,         # prompt versions loaded from the config
    tool=tool_list,               # tools made available during the test
    save_to_csv=True,             # default True
)

tester.display_results(results1)


üöÄ Starting LiteLLM Test Suite
üìä Total combinations to test: 2
ü§ñ Models: ['bedrock/openai.gpt-oss-120b-1:0', 'bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0']
üìù Prompts: ['version2']
‚ùì Queries: 1 query(ies)

[1/2] Testing: bedrock/openai.gpt-oss-120b-1:0 | version2
Query: Make a reservation for tonight at 'The Smoking Ember' for 5 persons At 8pm in the name of Andres
------------------------------------------------------------
üîß Using AWS region: us-east-1
<answer>Restaurant Helper  
Your reservation at The Smoking Ember has been confirmed for 5 guests on 2025-11-10 at 20:00 under the name Andres. Should you need to modify or cancel this booking, please let us know.  

For any further assistance, you may contact us at +1 999 999 99 9999.</answer>
Tool #1: retrieve

Tool #2: current_time

Tool #3: current_time

Tool #4: tool_create_booking
Creating reservation for 5 people at The Smoking Ember, 2025-11-10 at 20:00 in the name of Andres
Restaurant Helper

Your rese

In [16]:
# Test 2: Single model, multiple prompts, single query
results2 = tester.run_test(
    # Single LiteLLM endpoint
    models=["bedrock/us.anthropic.claude-opus-4-1-20250805-v1:0"],
    # Both prompt versions are run against the same query
    system_prompts=["version1", "version2"],
    # Single query combining a menu lookup with a cancellation
    queries="bring me the menu for Rice and Spice and cancel the reservation for Andres tonight at 'The Smoking Ember'for 5 persons At 8pm  ",
    prompts_dict=prompts,
    tool=tool_list,
)

tester.display_results(results2)


üöÄ Starting LiteLLM Test Suite
üìä Total combinations to test: 2
ü§ñ Models: ['bedrock/us.anthropic.claude-opus-4-1-20250805-v1:0']
üìù Prompts: ['version1', 'version2']
‚ùì Queries: 1 query(ies)

[1/2] Testing: bedrock/us.anthropic.claude-opus-4-1-20250805-v1:0 | version1
Query: bring me the menu for Rice and Spice and cancel the reservation for Andres tonight at 'The Smoking Ember'for 5 persons At 8pm  
------------------------------------------------------------
üîß Using AWS region: us-east-1
I'll help you with both requests right away.
Tool #1: retrieve

Tool #2: tool_delete_booking
<answer>
Hello, this is Restaurant Helper. I've got the menu for Rice & Spice for you, and I attempted to cancel the reservation but encountered an issue.

**Rice & Spice Menu** (Pan-Asian Fusion):

**SMALL BITES**
- Pork Belly Bao Buns with hoisin and pickled daikon - $12
- Korean Fried Chicken with gochujang glaze - $14
- Green Papaya Salad with peanuts and chili lime dressing - $10

**MAIN DI





## [OPTIONAL] 🧪 Test Case Evaluation with LLM-as-Judge

This section demonstrates how to run structured test case evaluation using the `run_evaluation()` method. This method:

- Loads test cases from `config_evaluation.yml`
- Runs multi-turn conversations for each test case
- Uses an LLM-as-judge to evaluate responses against expected results
- Provides detailed scoring and analysis
- Optionally integrates with Langfuse for tracing

## Agent Evaluation (csv output)

In [19]:
# Load system prompts for evaluation.
# The encoding is pinned to UTF-8 so the prompt text is decoded the same way
# on every platform instead of depending on the locale's default encoding.
with open('config_experiments.yml', 'r', encoding='utf-8') as config_file:
    config = yaml.safe_load(config_file)

prompts = config['system_prompts']

# Re-established here so this section can be run without the test cells above.
os.environ["KNOWLEDGE_BASE_ID"] = kb_id["Parameter"]["Value"]
tool_list = [retrieve, current_time, tool_get_booking_details, tool_create_booking, tool_delete_booking]

# Run evaluation with test cases
evaluation_results = tester.run_evaluation(
    models=[
        "bedrock/openai.gpt-oss-120b-1:0",
        "bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0"
    ],  # Multiple models
    system_prompts=["version2"],   # Single prompt version; add more to compare prompts
    prompts_dict=prompts,
    tool=tool_list,
    test_cases_path="config_evaluation.yml",
    save_to_csv=True
)

print(f"\n‚úÖ Evaluation completed with {len(evaluation_results)} test case results")


üß™ Starting Test Case Evaluation
üìä Total combinations: 4
ü§ñ Models: ['bedrock/openai.gpt-oss-120b-1:0', 'bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0']
üìù Prompts: ['version2']
üìã Test Cases: 2 test case(s)

[1/4] Evaluating: bedrock/openai.gpt-oss-120b-1:0 | version2 | restaurant_booking_flow_with_menu_inquiry
------------------------------------------------------------
üìù Test Case: restaurant_booking_flow_with_menu_inquiry

  Turn 1: Hi, I'd like to make a reservation at Bistro Parisienne for ...

[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.

    ‚ùå Error in question 1: litellm.BadRequestError: BedrockException - validationException {"message":"The model returned the following errors: Mantle streaming error for requestId efb85564-9003-4933-b767-31abf75ca929: ErrorEvent { error: APIError { type: \"BadRequestError\", code: Some(400), message: \"Un

### Multi-Model Evaluation Comparison

Compare multiple models and prompts across all test cases:

In [None]:
# Comprehensive evaluation across multiple configurations
# WARNING!!: This will take longer as it tests all combinations

comprehensive_evaluation = tester.run_evaluation(
    # NOTE(review): these ids lack the "bedrock/" prefix used by the executed
    # cells in this notebook -- confirm that UnifiedTester resolves them.
    models=["claude-4-sonnet", "qwen3-235b"],  # Multiple models
    system_prompts=["version2"],   # Single prompt version; add more to compare prompts
    prompts_dict=prompts,
    tool=tool_list,
    # Same test-case file as the evaluation cell that was actually executed
    # in this notebook (".yml"; this cell previously pointed at ".yaml").
    test_cases_path="config_evaluation.yml",
    save_to_csv=True
)

print(f"\nüéØ Comprehensive evaluation completed with {len(comprehensive_evaluation)} results")
print("\nüìä Detailed analysis shows model and prompt performance across all test cases")