# 01 - Experiments tracking - AWS Strands Agents  

This notebook demonstrates the unified testing approach using the `run_test()` method with human-readable results.

## Setup
______________________________________________________________

In [1]:
# Install ipywidgets plus the Strands Agents SDK and its tools package (-q keeps pip output quiet).
%pip install ipywidgets  "strands-agents" "strands-agents-tools" -q

Note: you may need to restart the kernel to use updated packages.


In [2]:
import json
import time
import uuid
import boto3
import yaml
from datetime import datetime
from utils import UnifiedTester


In [3]:
# Create the tester object used by every experiment cell below.
tester = UnifiedTester()

# Parse the experiment configuration; it supplies the system prompts and the test queries.
with open('config_experiments.yml', 'r') as config_file:
    config = yaml.safe_load(config_file)

prompts = config['system_prompts']
test_queries = config['test_queries']

# Report what was loaded, one fact per line.
print("✅ Unified Tester initialized!")
model_count = len(tester.models)
print(f"✅ Available Models: {model_count}")
prompt_names = list(prompts.keys())
print(f"✅ Available Prompts: {prompt_names}")
query_count = len(test_queries)
print(f"✅ Number of Test queries: {query_count}")

✅ Unified Tester initialized!
✅ Available Models: 97
✅ Available Prompts: ['version1', 'version2']
✅ Number of Test queries: 2


## Tools Definition
__________________________________________________________________________________

### Dependencies setup

In [4]:
from strands_tools import retrieve, current_time
from strands import Agent, tool
from strands.models.bedrock import BedrockModel
import os

In [5]:
# Prefix shared by the SSM parameter names read below.
kb_name = "restaurant-assistant"

# AWS handles: DynamoDB for the bookings table, SSM for the parameter lookups.
dynamodb = boto3.resource("dynamodb")
smm_client = boto3.client("ssm")

# Each lookup keeps the full SSM response; the value sits under ["Parameter"]["Value"].
table_name = smm_client.get_parameter(Name=f"{kb_name}-table-name", WithDecryption=False)
table = dynamodb.Table(table_name["Parameter"]["Value"])
kb_id = smm_client.get_parameter(
    Name=f"{kb_name}-kb-id",
    WithDecryption=False,
)

print("DynamoDB table:", table_name["Parameter"]["Value"])
print("Knowledge Base Id:", kb_id["Parameter"]["Value"])

DynamoDB table: restaurant-assistant-bookings
Knowledge Base Id: EQGAONOILG


#### Tools setup

In [6]:
%%writefile tool_get_booking_details.py

from strands import tool
import boto3 

@tool
def tool_booking_details(booking_id: str, restaurant_name: str) -> dict:
    """Get the relevant details for booking_id in restaurant_name
    Args:
        booking_id: the id of the reservation
        restaurant_name: name of the restaurant handling the reservation

    Returns:
        booking_details: the details of the booking in JSON format, or a
            text message when no booking matches or the lookup fails
    """

    try:
        # This file is written out and imported as its own module, so a
        # `table` object created in the notebook namespace is not visible
        # here. Resolve the bookings table from its SSM parameter instead of
        # relying on an undefined global.
        kb_name = 'restaurant-assistant'
        dynamodb = boto3.resource('dynamodb')
        smm_client = boto3.client('ssm')
        table_name = smm_client.get_parameter(
            Name=f'{kb_name}-table-name',
            WithDecryption=False
        )
        table = dynamodb.Table(table_name["Parameter"]["Value"])

        response = table.get_item(
            Key={"booking_id": booking_id, "restaurant_name": restaurant_name}
        )
        if "Item" in response:
            return response["Item"]
        else:
            return f"No booking found with ID {booking_id}"
    except Exception as e:
        # Failures are handed back as text so the caller sees the reason.
        return str(e)

Overwriting tool_get_booking_details.py


In [7]:
%%writefile tool_delete_booking.py

from strands import tool
import boto3 

@tool
def tool_delete_booking(booking_id: str, restaurant_name:str) -> str:
    """delete an existing booking_id at restaurant_name
    Args:
        booking_id: the id of the reservation
        restaurant_name: name of the restaurant handling the reservation

    Returns:
        confirmation_message: confirmation message, or a message saying that
            no matching booking exists or that the deletion failed
    """
    kb_name = 'restaurant-assistant'
    dynamodb = boto3.resource('dynamodb')
    smm_client = boto3.client('ssm')
    table_name = smm_client.get_parameter(
        Name=f'{kb_name}-table-name',
        WithDecryption=False
    )
    table = dynamodb.Table(table_name["Parameter"]["Value"])
    try:
        # DeleteItem answers HTTP 200 even when no item matches the key, so
        # ask for the removed item back and use its presence to tell a real
        # deletion apart from a no-op.
        response = table.delete_item(
            Key={'booking_id': booking_id, 'restaurant_name': restaurant_name},
            ReturnValues='ALL_OLD'
        )
        if response['ResponseMetadata']['HTTPStatusCode'] != 200:
            return f'Failed to delete booking with ID {booking_id}'
        if not response.get('Attributes'):
            return f'No booking found with ID {booking_id}'
        return f'Booking with ID {booking_id} deleted successfully'
    except Exception as e:
        # Failures are handed back as text so the caller sees the reason.
        return str(e)

Overwriting tool_delete_booking.py


In [8]:
%%writefile tool_create_booking.py

# Alternatively, you can use the TOOL_SPEC approach when defining your tool

from typing import Any
from strands.types.tools import ToolResult, ToolUse
import boto3
import uuid


# Declarative description of the tool implemented by the same-named function
# in this file: its name, a description, and the JSON schema of its input.
TOOL_SPEC = {
    "name": "tool_create_booking",
    "description": "Create a new booking at restaurant_name",
    "inputSchema": {
        "json": {
            "type": "object",
            "properties": {
                # Absolute calendar date; the description text tells the model
                # to refuse relative dates such as "today" or "tomorrow".
                "date": {
                    "type": "string",
                    "description": """The date of the booking in the format YYYY-MM-DD. 
                    Do NOT accept relative dates like today or tomorrow. 
                    Ask for today's date for relative date."""
                },
                # Time of day as an HH:MM string.
                "hour": {
                    "type": "string",
                    "description": "the hour of the booking in the format HH:MM"
                },
                "restaurant_name": {
                    "type": "string",
                    "description": "name of the restaurant handling the reservation"
                },
                "guest_name": {
                    "type": "string",
                    "description": "The name of the customer to have in the reservation"
                },
                # The only non-string field in the schema.
                "num_guests": {
                    "type": "integer",
                    "description": "The number of guests for the booking"
                }
            },
            # Every declared property is mandatory.
            "required": ["date", "hour", "restaurant_name", "guest_name", "num_guests"]
        }
    }
}
# Function name must match tool name
def tool_create_booking(tool: ToolUse, **kwargs: Any) -> ToolResult:
    """Store a new reservation in the bookings table.

    Args:
        tool: the tool-use request; its ``toolUseId`` and the ``input``
            fields declared in TOOL_SPEC are read from it
        **kwargs: accepted but not used by this function

    Returns:
        A dict carrying the request's ``toolUseId``, a ``status`` of
        "success" or "error", and one text content entry
    """
    # Find the bookings table through its SSM parameter and open it.
    parameter_prefix = 'restaurant-assistant'
    dynamo = boto3.resource('dynamodb')
    ssm = boto3.client('ssm')
    table_parameter = ssm.get_parameter(Name=f'{parameter_prefix}-table-name', WithDecryption=False)
    bookings = dynamo.Table(table_parameter["Parameter"]["Value"])

    # Unpack the request.
    request_id = tool["toolUseId"]
    date = tool["input"]["date"]
    hour = tool["input"]["hour"]
    restaurant_name = tool["input"]["restaurant_name"]
    guest_name = tool["input"]["guest_name"]
    num_guests = tool["input"]["num_guests"]

    def _result(status, text):
        # Every outcome shares the same envelope; only status and text vary.
        return {
            "toolUseId": request_id,
            "status": status,
            "content": [{"text": text}]
        }

    print(
        f"Creating reservation for {num_guests} people at {restaurant_name}, "
        f"{date} at {hour} in the name of {guest_name}"
    )

    try:
        # Booking ids are the first 8 characters of a random UUID string.
        booking_id = str(uuid.uuid4())[:8]
        record = {
            'booking_id': booking_id,
            'restaurant_name': restaurant_name,
            'date': date,
            'name': guest_name,
            'hour': hour,
            'num_guests': num_guests
        }
        bookings.put_item(Item=record)
    except Exception as e:
        return _result("error", str(e))
    return _result("success", f"Reservation created with booking id: {booking_id}")

Overwriting tool_create_booking.py


In [9]:
import tool_create_booking
import tool_delete_booking
import tool_get_booking_details

#### Tool List

In [10]:
# Knowledge Base: expose the id fetched from SSM through the environment
# (presumably read by the `retrieve` tool; confirm against strands_tools).
os.environ["KNOWLEDGE_BASE_ID"] = kb_id["Parameter"]["Value"]

# Tools list: two imported tools followed by the three booking modules.
tool_list = [
    retrieve,
    current_time,
    tool_get_booking_details,
    tool_create_booking,
    tool_delete_booking,
]

## Explore Available Models
-----------------------------------------------------------------------------

In [11]:
# List all available models grouped by provider.
# The printed table includes a TOOL_SUPPORT column for each model.
tester.list_models_by_provider()


📋 AVAILABLE MODELS (97 total)

🏢 AI21 (2 models)
------------------------------------------------------------------------------------------
  TYPE| MODEL_NAME                | ENDPOINT                        | REGION  | TOOL_SUPPORT
------------------------------------------------------------------------------------------
  📍    jamba-1.5-large           | ai21.jamba-1-5-large-v1:0       | us-east-1 | ❌
  📍    jamba-1.5-mini            | ai21.jamba-1-5-mini-v1:0        | us-east-1 | ❌

🏢 Amazon (22 models)
------------------------------------------------------------------------------------------
  TYPE| MODEL_NAME                | ENDPOINT                        | REGION  | TOOL_SUPPORT
------------------------------------------------------------------------------------------
  📍    amazon-rerank             | amazon.rerank-v1:0              | us-west-2 | ❌
  🌍    nova-lite                 | us.amazon.nova-lite-v1:0        | us-east-1 | 🔧
  📍    nova-lite-east-1          | amazon.nova

## Tests Setup
_______________________________________________________________________________________________

In [14]:
# Test 1: Multiple models, single system prompt, every query in test_queries
# (the whole list is passed below, so each model runs once per query)

# Note: CSV results are automatically saved to test_results/ folder (save_to_csv=True by default)
results1 = tester.run_test(
    models=["nova-pro", "qwen3-235b"],  # Model list to test
    system_prompts=["version2"],  # System prompts list to test
    queries=test_queries,  # Full query list loaded from config_experiments.yml
    prompts_dict=prompts,  # Dictionary of prompts
    tool=tool_list,  # Tool list built in the "Tool List" cell
    save_to_csv=True # Default True
)

tester.display_results(results1)


🚀 Starting Unified Test Suite
📊 Total combinations to test: 4
🤖 Models: ['nova-pro', 'qwen3-235b']
📝 Prompts: ['version2']
❓ Queries: 2 query(ies)

[1/4] Testing: nova-pro | version2
Query: Make a reservation for tonight at Rice & Spice for 5 persons...
------------------------------------------------------------
<thinking> The user has requested to make a reservation for tonight at Rice & Spice for 5 persons at 8pm in the name of Andres. I need to verify that Rice & Spice exists in the directory before proceeding with the booking. </thinking>

Tool #1: retrieve
<thinking> Rice & Spice exists in the directory. I can now proceed with creating the booking for tonight at 8pm for 5 persons in the name of Andres. </thinking> 
Tool #2: tool_create_booking
Creating reservation for 5 people at Rice & Spice, 2023-10-02 at 20:00 in the name of Andres
<answer> Restaurant Helper: Your reservation for tonight at 8pm for 5 persons at Rice & Spice under the name Andres has been successfully created.

In [15]:
# Test 2: Single model, multiple prompts, single query
# save_to_csv is not passed here, so run_test uses its default.
results2 = tester.run_test(
    models=["claude-4-sonnet"],
    system_prompts=["version1", "version2"],  # Both prompt versions from the config
    queries=test_queries[0],  # Single query
    prompts_dict=prompts,
    tool=tool_list
)

tester.display_results(results2)


🚀 Starting Unified Test Suite
📊 Total combinations to test: 2
🤖 Models: ['claude-4-sonnet']
📝 Prompts: ['version1', 'version2']
❓ Queries: 1 query(ies)

[1/2] Testing: claude-4-sonnet | version1
Query: Make a reservation for tonight at Rice & Spice for 5 persons...
------------------------------------------------------------
Hello! I'm Restaurant Helper, and I'd be happy to help you make a reservation at Rice & Spice.

To complete your booking for 5 people at 8pm under the name Andres, I need to know today's date since you mentioned "tonight". Could you please provide today's date in the format YYYY-MM-DD?

Once I have that information, I'll be able to create your reservation right away!
Tool #1: current_time
Perfect! I can see that today is October 23rd, 2025. Let me create your reservation for tonight at Rice & Spice.
Tool #2: tool_create_booking
Creating reservation for 5 people at Rice & Spice, 2025-10-23 at 20:00 in the name of Andres
<answer>
Excellent! I've successfully created





## [OPTIONAL] 🧪 Test Case Evaluation with LLM-as-Judge 

This section demonstrates how to run structured test case evaluation using the `run_evaluation()` method. This method:

- Loads test cases from `config_evaluation.yaml`
- Runs multi-turn conversations for each test case
- Uses an LLM-as-judge to evaluate responses against expected results
- Provides detailed scoring and analysis
- Optionally integrates with Langfuse for tracing

## Agent Evaluation (CSV output)

In [None]:
# Load system prompts for evaluation
# (repeats the config, environment and tool-list setup from the earlier cells;
# it still relies on kb_id and the tool imports defined there)
with open('config_experiments.yml', 'r') as f:
    config = yaml.safe_load(f)

prompts = config['system_prompts']
os.environ["KNOWLEDGE_BASE_ID"] = kb_id["Parameter"]["Value"]
tool_list = [retrieve, current_time, tool_get_booking_details, tool_create_booking, tool_delete_booking]

# Run evaluation with test cases
# NOTE(review): this cell reads "config_evaluation.yml", while the multi-model
# cell further down reads "config_evaluation.yaml". Confirm which file exists
# and align the two paths.
evaluation_results = tester.run_evaluation(
    models=["claude-4-sonnet"],  # Single model for faster evaluation
    system_prompts=["version2"],  # Single prompt version
    prompts_dict=prompts,
    tool=tool_list,
    test_cases_path="config_evaluation.yml",  # Test cases file
    save_to_csv=True  # Save results to CSV
)

print(f"\n✅ Evaluation completed with {len(evaluation_results)} test case results")

### Multi-Model Evaluation Comparison

Compare multiple models (and, optionally, several prompt versions) across all test cases:

In [None]:
# Comprehensive evaluation across multiple configurations
# WARNING!!: This will take longer as it tests all combinations
# Relies on prompts and tool_list defined in earlier cells.
# NOTE(review): this cell reads "config_evaluation.yaml", while the single-model
# evaluation cell above reads "config_evaluation.yml". Confirm which file
# exists and align the two paths.

comprehensive_evaluation = tester.run_evaluation(
    models=["claude-4-sonnet", "qwen3-235b"],  # Multiple models
    system_prompts=["version2"],   # Only one prompt version is listed; add more keys from prompts to compare prompts
    prompts_dict=prompts,
    tool=tool_list,
    test_cases_path="config_evaluation.yaml",
    save_to_csv=True
)

print(f"\n🎯 Comprehensive evaluation completed with {len(comprehensive_evaluation)} results")
print("\n📊 Detailed analysis shows model and prompt performance across all test cases")