# 01 - Experiments tracking - AWS Strands Agents  

This notebook demonstrates the unified testing approach using the `run_test()` method with human-readable results.

## Setup
______________________________________________________________

In [13]:
%pip install ipywidgets  "strands-agents" "strands-agents-tools" -q

Note: you may need to restart the kernel to use updated packages.


In [14]:
import json
import time
import uuid
import boto3
import yaml
from datetime import datetime
from utils import UnifiedTester


In [15]:
# Initialize Unified Tester
tester = UnifiedTester()

# Load configuration.
# The encoding is given explicitly so that non-ASCII text in the prompts is read
# the same way on every platform instead of depending on the locale default.
with open('config_experiments.yml', 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

# Prompt versions keyed by name, and the list of queries to run against them
prompts = config['system_prompts']
test_queries = config['test_queries']

# Status lines: the check-mark emoji was stored mis-encoded and printed as garbage
print("✅ Unified Tester initialized!")
print(f"✅ Available Prompts: {list(prompts.keys())}")
print(f"✅ Number of Test queries: {len(test_queries)}")

✅ Unified Tester initialized!
✅ Available Prompts: ['version1', 'version2']
✅ Number of Test queries: 2


## Tools Definition
__________________________________________________________________________________

### Dependencies setup

In [16]:
from strands_tools import retrieve, current_time
from strands import Agent, tool
from strands.models.bedrock import BedrockModel
import os

In [17]:
# Resource identifiers are read from SSM Parameter Store under "<kb_name>-..." names
kb_name = "restaurant-assistant"
dynamodb = boto3.resource("dynamodb")
smm_client = boto3.client("ssm")

# Both variables hold the full get_parameter response; the value itself is
# found under ["Parameter"]["Value"].
table_name = smm_client.get_parameter(Name=f"{kb_name}-table-name", WithDecryption=False)
kb_id = smm_client.get_parameter(Name=f"{kb_name}-kb-id", WithDecryption=False)

# Handle on the bookings table
table = dynamodb.Table(table_name["Parameter"]["Value"])

# Knowledge Base id exported through the environment
# (presumably consumed by the `retrieve` tool - confirm)
os.environ["KNOWLEDGE_BASE_ID"] = kb_id["Parameter"]["Value"]

print("DynamoDB table:", table_name["Parameter"]["Value"])
print("Knowledge Base Id:", kb_id["Parameter"]["Value"])

DynamoDB table: restaurant-bookings
Knowledge Base Id: XGIDQ0ALGL


#### Tools setup

In [18]:
%%writefile tool_get_booking_details.py

from strands import tool
import boto3 

@tool
def tool_booking_details(booking_id: str, restaurant_name: str) -> dict:
    """Get the relevant details for booking_id in restaurant_name
    Args:
        booking_id: the id of the reservation
        restaurant_name: name of the restaurant handling the reservation

    Returns:
        booking_details: the details of the booking in JSON format
    """
    # The docstring above is intentionally left untouched (presumably the @tool
    # decorator exposes it as the tool description - confirm against Strands docs).
    #
    # This file is produced by %%writefile and imported as a standalone module,
    # so the notebook-level `table` variable is NOT in scope here. Previously
    # every call hit a NameError that the except clause turned into the text
    # "name 'table' is not defined". The table is now resolved inside the
    # function, the same way the delete and create booking tools do it.
    kb_name = 'restaurant-assistant'

    try:
        dynamodb = boto3.resource('dynamodb')
        smm_client = boto3.client('ssm')
        # The table name is stored in SSM Parameter Store
        table_name = smm_client.get_parameter(
            Name=f'{kb_name}-table-name',
            WithDecryption=False
        )
        table = dynamodb.Table(table_name["Parameter"]["Value"])

        response = table.get_item(
            Key={"booking_id": booking_id, "restaurant_name": restaurant_name}
        )
        if "Item" in response:
            return response["Item"]
        # No matching item: report it as text
        return f"No booking found with ID {booking_id}"
    except Exception as e:
        # Any failure (lookup or read) is returned as text instead of raising,
        # matching the original never-raise behaviour of this tool
        return str(e)

Overwriting tool_get_booking_details.py


In [19]:
%%writefile tool_delete_booking.py

from strands import tool
import boto3 

@tool
def tool_delete_booking(booking_id: str, restaurant_name:str) -> str:
    """delete an existing booking_id at restaurant_name
    Args:
        booking_id: the id of the reservation
        restaurant_name: name of the restaurant handling the reservation

    Returns:
        confirmation_message: confirmation message
    """
    # Docstring kept verbatim on purpose (presumably surfaced by @tool as the
    # tool description - confirm against Strands docs).

    # Resolve the bookings table on every call: its name lives in SSM Parameter Store
    assistant_name = 'restaurant-assistant'
    bookings_db = boto3.resource('dynamodb')
    ssm = boto3.client('ssm')
    table_param = ssm.get_parameter(Name=f'{assistant_name}-table-name', WithDecryption=False)
    bookings = bookings_db.Table(table_param["Parameter"]["Value"])

    # The item is addressed by both key attributes
    booking_key = {'booking_id': booking_id, 'restaurant_name': restaurant_name}
    try:
        outcome = bookings.delete_item(Key=booking_key)
        # Anything other than HTTP 200 is reported as a failure
        if outcome['ResponseMetadata']['HTTPStatusCode'] != 200:
            return f'Failed to delete booking with ID {booking_id}'
        return f'Booking with ID {booking_id} deleted successfully'
    except Exception as e:
        # Errors are handed back as text rather than raised
        return str(e)

Overwriting tool_delete_booking.py


In [20]:
%%writefile tool_create_booking.py

# Alternatively, you can use the TOOL_SPEC approach when defining your tool

from typing import Any
from strands.types.tools import ToolResult, ToolUse
import boto3
import uuid


# Tool specification: the tool's name, a short description, and a JSON-schema
# style description of the input fields it accepts.
TOOL_SPEC = {
    # Same identifier as the handler function defined in this file
    "name": "tool_create_booking",
    "description": "Create a new booking at restaurant_name",
    "inputSchema": {
        "json": {
            "type": "object",
            "properties": {
                # NOTE: the triple-quoted description below keeps its line breaks
                # and leading spaces as part of the string value.
                "date": {
                    "type": "string",
                    "description": """The date of the booking in the format YYYY-MM-DD. 
                    Do NOT accept relative dates like today or tomorrow. 
                    Ask for today's date for relative date."""
                },
                "hour": {
                    "type": "string",
                    "description": "the hour of the booking in the format HH:MM"
                },
                "restaurant_name": {
                    "type": "string",
                    "description": "name of the restaurant handling the reservation"
                },
                "guest_name": {
                    "type": "string",
                    "description": "The name of the customer to have in the reservation"
                },
                "num_guests": {
                    "type": "integer",
                    "description": "The number of guests for the booking"
                }
            },
            # All five fields are listed as required
            "required": ["date", "hour", "restaurant_name", "guest_name", "num_guests"]
        }
    }
}
# Function name must match tool name
def tool_create_booking(tool: ToolUse, **kwargs: Any) -> ToolResult:
    """Write a new reservation to the bookings DynamoDB table.

    Args:
        tool: the tool-use request. Reads ``toolUseId`` and, under ``input``,
            the ``date``, ``hour``, ``restaurant_name``, ``guest_name`` and
            ``num_guests`` fields.
        **kwargs: accepted but not used.

    Returns:
        A result dict echoing ``toolUseId``. ``status`` is "success" with the
        generated booking id in the text, or "error" with the exception text
        when the write fails.
    """
    # Resolve the bookings table; its name is stored in SSM Parameter Store
    assistant_name = 'restaurant-assistant'
    bookings_db = boto3.resource('dynamodb')
    ssm = boto3.client('ssm')
    table_param = ssm.get_parameter(Name=f'{assistant_name}-table-name', WithDecryption=False)
    bookings = bookings_db.Table(table_param["Parameter"]["Value"])

    # Unpack the request
    request_id = tool["toolUseId"]
    params = tool["input"]
    date = params["date"]
    hour = params["hour"]
    restaurant_name = params["restaurant_name"]
    guest_name = params["guest_name"]
    num_guests = params["num_guests"]

    def reply(status, message):
        # Result envelope shared by the success and error paths
        return {
            "toolUseId": request_id,
            "status": status,
            "content": [{"text": message}]
        }

    # Progress line printed before the write is attempted
    print(
        f"Creating reservation for {num_guests} people at {restaurant_name}, "
        f"{date} at {hour} in the name of {guest_name}"
    )

    try:
        # Booking id: first 8 characters of a random UUID
        booking_id = str(uuid.uuid4())[:8]
        new_item = {
            'booking_id': booking_id,
            'restaurant_name': restaurant_name,
            'date': date,
            'name': guest_name,
            'hour': hour,
            'num_guests': num_guests
        }
        bookings.put_item(Item=new_item)
        return reply("success", f"Reservation created with booking id: {booking_id}")
    except Exception as e:
        return reply("error", str(e))

Overwriting tool_create_booking.py


In [21]:
import tool_create_booking
import tool_delete_booking
import tool_get_booking_details

#### Tool List

In [22]:

# Tools handed to the tester: the two strands_tools imports plus the three
# booking tool modules imported above
tool_list = [
    retrieve,
    current_time,
    tool_get_booking_details,
    tool_create_booking,
    tool_delete_booking,
]

## Tests Setup
_______________________________________________________________________________________________

In [24]:
# Test 1: several models, one system prompt, every query from the config file

# CSV results are saved automatically to the test_results/ folder
# (save_to_csv is True by default)
results1 = tester.run_test(
    # LiteLLM endpoints of the models to compare
    models=[
        "bedrock/openai.gpt-oss-120b-1:0",
        "bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0",
    ],
    # System prompts to test, taken from config_experiments.yml
    system_prompts=["version2"],
    # All queries from config_experiments.yml
    queries=test_queries,
    # Dictionary of prompts
    prompts_dict=prompts,
    # Tools to test
    tool=tool_list,
    # Same as the default
    save_to_csv=True,
    # Last N conversation interactions kept as context (default 5)
    conversation_window=5,
)

tester.display_results(results1)


üöÄ Starting LiteLLM Test Suite
üìä Total combinations to test: 4
ü§ñ Models: ['bedrock/openai.gpt-oss-120b-1:0', 'bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0']
üìù Prompts: ['version2']
‚ùì Queries: 2 query(ies)

[1/4] Testing: bedrock/openai.gpt-oss-120b-1:0 | version2
Query: Make a reservation for tonight at 'The Smoking Ember' for 5 persons At 8pm in the name of Andres
------------------------------------------------------------
üîß Using AWS region: us-east-1

[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.

‚ö†Ô∏è Streaming issue detected, trying direct LiteLLM call with stream=False
‚úÖ Direct LiteLLM call successful
<answer>Restaurant Helper

Good evening. I will proceed with your reservation request. Please allow me a moment to confirm that ‚ÄúThe Smoking Ember‚Äù is listed in our directory.</answer>
‚úÖ SUCCESS | Time: 2.78s

[2/4] Testing: bedrock/

In [25]:
# Test 2: one model, both prompt versions, one ad-hoc query

results2 = tester.run_test(
    # Single LiteLLM endpoint
    models=["bedrock/us.anthropic.claude-opus-4-1-20250805-v1:0"],
    # Multiple prompts
    system_prompts=["version1", "version2"],
    # Custom query passed as a plain string instead of the configured list
    queries="Bring me the menu for Rice & Spice and do a reservation there for Diana for 5 persons At 8pm",
    prompts_dict=prompts,
    tool=tool_list,
    conversation_window=5,
)

tester.display_results(results2)


üöÄ Starting LiteLLM Test Suite
üìä Total combinations to test: 2
ü§ñ Models: ['bedrock/us.anthropic.claude-opus-4-1-20250805-v1:0']
üìù Prompts: ['version1', 'version2']
‚ùì Queries: 1 query(ies)

[1/2] Testing: bedrock/us.anthropic.claude-opus-4-1-20250805-v1:0 | version1
Query: Bring me the menu for Rice & Spice and do a reservation there for Diana for 5 persons At 8pm
------------------------------------------------------------
üîß Using AWS region: us-east-1
I'll help you get the menu for Rice & Spice and make a reservation for Diana. Let me retrieve that information and create the booking for you.
Tool #1: retrieve
Now I need to create the reservation. However, I need to know the specific date for your reservation. Could you please provide the date you'd like to book for?

In the meantime, let me get today's date to help with the booking:
Tool #2: current_time
To complete your reservation at Rice & Spice for Diana (5 people at 8pm), I need to know which date you'd like. Wou





## [OPTIONAL] 🧪 Test Case Evaluation with LLM-as-Judge

This section demonstrates how to run structured test case evaluation using the `run_evaluation()` method. This method:

- Loads test cases from `config_evaluation.yml`
- Runs multi-turn conversations for each test case
- Uses an LLM-as-judge to evaluate responses against expected results
- Provides detailed scoring and analysis
- Optionally integrates with Langfuse for tracing

## Agent Evaluation (csv output)

In [26]:
# Reload the experiment configuration (explicit UTF-8 so the read does not
# depend on the platform default encoding)
with open('config_experiments.yml', 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

# `prompts`, `tool_list` and the KNOWLEDGE_BASE_ID environment variable are
# expected to be set by the setup cells earlier in the notebook.

# Run evaluation with test cases
evaluation_results = tester.run_evaluation(
    models=[
        "bedrock/us.amazon.nova-pro-v1:0"
    ],  # Models to evaluate (a single one here)
    system_prompts=["version2"],   # Prompt versions to evaluate (a single one here)
    prompts_dict=prompts,
    tool=tool_list,
    test_cases_path="config_evaluation.yml",
    save_to_csv=True,
    conversation_window=10
)

# The check-mark emoji was stored mis-encoded and printed as garbage
print(f"\n✅ Evaluation completed with {len(evaluation_results)} test case results")


üß™ Starting Test Case Evaluation
üìä Total combinations: 2
ü§ñ Models: ['bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0']
üìù Prompts: ['version2']
üìã Test Cases: 2 test case(s)

[1/2] Evaluating: bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0 | version2 | restaurant_booking_flow_with_menu_inquiry
------------------------------------------------------------
üìù Test Case: restaurant_booking_flow_with_menu_inquiry

  Turn 1: Hi, I'd like to make a reservation at Bistro Parisienne for ...
Restaurant Helper: Good day and welcome! I'd be delighted to help you with your reservation at Bistro Parisienne and show you their menu.

First, let me check if Bistro Parisienne is in our directory and retrieve their menu for you.
Tool #1: retrieve
Now, I'd like to get the current date to help with your reservation for tonight.
Tool #2: current_time
<answer>
Restaurant Helper: I found Bistro Parisienne in our directory. Here's their menu:

HORS D'OEUVRES:
‚Ä¢ Escargots de Bourgogn

### Multi-Model Evaluation Comparison

Compare multiple models and prompts across all test cases:

In [None]:
# Comprehensive evaluation across multiple configurations
# WARNING!!: This will take longer as it tests all combinations

# Re-establish the evaluation inputs; this still needs `config` and `kb_id`
# to be defined by earlier cells.
prompts = config['system_prompts']
os.environ["KNOWLEDGE_BASE_ID"] = kb_id["Parameter"]["Value"]
tool_list = [retrieve, current_time, tool_get_booking_details, tool_create_booking, tool_delete_booking]

# Run evaluation with test cases

evaluation_results = tester.run_evaluation(
    models=[
        "bedrock/us.anthropic.claude-opus-4-20250514-v1:0",
        "bedrock/us.amazon.nova-pro-v1:0"
    ],  # Multiple models
    system_prompts=["version2"],   # Single prompt version; list more keys of prompts_dict to compare prompts
    prompts_dict=prompts,
    tool=tool_list,
    test_cases_path="config_evaluation.yml",
    save_to_csv=True,
    conversation_window=5
)

# The check-mark emoji was stored mis-encoded and printed as garbage
print(f"\n✅ Evaluation completed with {len(evaluation_results)} test case results")