# Test Cases Evaluation with LLM-as-a-Judge - AWS Strands Agents SDK

This notebook demonstrates the unified testing approach, using the `run_evaluation()` method to evaluate agent test cases with an LLM-as-a-judge and report human-readable results.

## Setup
______________________________________________________________

In [1]:
# Install the Strands Agents SDK and tools, Langfuse, and the OpenTelemetry API/SDK/OTLP exporter.
# NOTE(review): only langfuse is pinned; consider pinning the other packages so a fresh run is reproducible.
%pip install "strands-agents" "strands-agents-tools" "langfuse==3.1.1" opentelemetry-api opentelemetry-sdk opentelemetry-exporter-otlp -q

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os, base64
import time, uuid, boto3
import yaml
from datetime import datetime
from utils import UnifiedTester



In [3]:
# Create the tester that the rest of the notebook drives.
tester = UnifiedTester()

# Read the experiment configuration and keep the two sections used below.
with open('config_experiments.yml', 'r') as config_file:
    config = yaml.safe_load(config_file)
prompts, test_queries = config['system_prompts'], config['test_queries']

# One-line-per-fact summary of what was loaded.
print(
    "✅ Unified Tester initialized!",
    f"✅ Available Models: {len(tester.models)}",
    f"✅ Available Prompts: {list(prompts.keys())}",
    f"✅ Number of Test queries: {len(test_queries)}",
    sep="\n",
)

✅ Unified Tester initialized!
✅ Available Models: 97
✅ Available Prompts: ['version1', 'version2']
✅ Number of Test queries: 2


## Tools Definition
__________________________________________________________________________________

#### Dependencies setup

In [4]:
from strands_tools import retrieve, current_time
from strands import Agent, tool
from strands.models.bedrock import BedrockModel
import os

#### AWS Clients setup (Bedrock KnowledgeBase and DynamoDB)

In [5]:
# Prefix shared by the two SSM parameter names read below.
kb_name = "restaurant-assistant"

dynamodb = boto3.resource("dynamodb")
smm_client = boto3.client("ssm")

# Both lookups return the full SSM response; the value sits under ["Parameter"]["Value"].
table_name = smm_client.get_parameter(Name=f"{kb_name}-table-name", WithDecryption=False)
kb_id = smm_client.get_parameter(Name=f"{kb_name}-kb-id", WithDecryption=False)

# Handle to the DynamoDB table named by the first parameter.
table = dynamodb.Table(table_name["Parameter"]["Value"])

print("DynamoDB table:", table_name["Parameter"]["Value"])
print("Knowledge Base Id:", kb_id["Parameter"]["Value"])

# Fresh random identifier generated each time this cell runs.
import uuid
session_id = uuid.uuid4()

DynamoDB table: restaurant-assistant-bookings
Knowledge Base Id: EQGAONOILG


### Tools setup

In [6]:
%%writefile tool_get_booking_details.py

from strands import tool
import boto3 

@tool
def tool_booking_details(booking_id: str, restaurant_name: str) -> dict:
    """Get the relevant details for booking_id in restaurant_name
    Args:
        booking_id: the id of the reservation
        restaurant_name: name of the restaurant handling the reservation

    Returns:
        booking_details: the details of the booking in JSON format
    """
    # This file is written out by %%writefile and imported as a standalone
    # module, so it cannot see the notebook's `table` variable. The table handle
    # is therefore resolved here, the same way the delete/create tools do it:
    # the table name is read from the SSM parameter "<kb_name>-table-name".
    #
    # On success the stored item (a dict) is returned. When no item matches, or
    # when any step fails, a plain string describing the problem is returned
    # instead of raising, so the caller always gets a value back.
    try:
        kb_name = 'restaurant-assistant'
        dynamodb = boto3.resource('dynamodb')
        ssm_client = boto3.client('ssm')
        table_name = ssm_client.get_parameter(
            Name=f'{kb_name}-table-name',
            WithDecryption=False
        )
        table = dynamodb.Table(table_name["Parameter"]["Value"])

        response = table.get_item(
            Key={"booking_id": booking_id, "restaurant_name": restaurant_name}
        )
        # get_item only includes an "Item" entry when the key matched a record.
        if "Item" in response:
            return response["Item"]
        else:
            return f"No booking found with ID {booking_id}"
    except Exception as e:
        return str(e)

Overwriting tool_get_booking_details.py


In [7]:
%%writefile tool_delete_booking.py

from strands import tool
import boto3 

@tool
def tool_delete_booking(booking_id: str, restaurant_name:str) -> str:
    """delete an existing booking_id at restaurant_name
    Args:
        booking_id: the id of the reservation
        restaurant_name: name of the restaurant handling the reservation

    Returns:
        confirmation_message: confirmation message
    """
    # Docstring left verbatim: the @tool decorator presumably derives the tool
    # description from it - confirm before rewording.

    # The bookings table name is stored in SSM under "<kb_name>-table-name".
    kb_name = 'restaurant-assistant'
    dynamodb = boto3.resource('dynamodb')
    ssm = boto3.client('ssm')
    table_parameter = ssm.get_parameter(Name=f'{kb_name}-table-name', WithDecryption=False)
    bookings = dynamodb.Table(table_parameter["Parameter"]["Value"])

    booking_key = {'booking_id': booking_id, 'restaurant_name': restaurant_name}
    try:
        outcome = bookings.delete_item(Key=booking_key)
        deleted = outcome['ResponseMetadata']['HTTPStatusCode'] == 200
    except Exception as e:
        # Any failure of the delete call is reported back as text rather than raised.
        return str(e)

    if deleted:
        return f'Booking with ID {booking_id} deleted successfully'
    return f'Failed to delete booking with ID {booking_id}'

Overwriting tool_delete_booking.py


In [8]:
%%writefile tool_create_booking.py

# Alternatively, you can use the TOOL_SPEC approach when defining your tool

from typing import Any
from strands.types.tools import ToolResult, ToolUse
import boto3
import uuid


# Declarative description of the tool implemented by tool_create_booking() below:
# its name, a human-readable description, and a JSON schema for its input.
# All five properties are listed as required; num_guests is the only non-string field.
TOOL_SPEC = {
    "name": "tool_create_booking",
    "description": "Create a new booking at restaurant_name",
    "inputSchema": {
        "json": {
            "type": "object",
            "properties": {
                "date": {
                    "type": "string",
                    "description": """The date of the booking in the format YYYY-MM-DD. 
                    Do NOT accept relative dates like today or tomorrow. 
                    Ask for today's date for relative date."""
                },
                "hour": {
                    "type": "string",
                    "description": "the hour of the booking in the format HH:MM"
                },
                "restaurant_name": {
                    "type": "string",
                    "description": "name of the restaurant handling the reservation"
                },
                "guest_name": {
                    "type": "string",
                    "description": "The name of the customer to have in the reservation"
                },
                "num_guests": {
                    "type": "integer",
                    "description": "The number of guests for the booking"
                }
            },
            "required": ["date", "hour", "restaurant_name", "guest_name", "num_guests"]
        }
    }
}
# The function has to be named exactly like TOOL_SPEC["name"].
def tool_create_booking(tool: ToolUse, **kwargs: Any) -> ToolResult:
    """Store a new booking in DynamoDB and report the outcome.

    Args:
        tool: the tool-use request; ``tool["toolUseId"]`` and the five fields
            under ``tool["input"]`` (date, hour, restaurant_name, guest_name,
            num_guests) are read.
        **kwargs: accepted but not used by this function.

    Returns:
        A dict with ``toolUseId``, ``status`` ("success" or "error") and a
        one-element ``content`` list holding the message text.
    """
    # The bookings table name is stored in SSM under "<kb_name>-table-name".
    kb_name = 'restaurant-assistant'
    dynamodb = boto3.resource('dynamodb')
    ssm = boto3.client('ssm')
    table_parameter = ssm.get_parameter(Name=f'{kb_name}-table-name', WithDecryption=False)
    bookings = dynamodb.Table(table_parameter["Parameter"]["Value"])

    tool_use_id = tool["toolUseId"]
    date, hour, restaurant_name, guest_name, num_guests = (
        tool["input"][field]
        for field in ("date", "hour", "restaurant_name", "guest_name", "num_guests")
    )

    print(
        f"Creating reservation for {num_guests} people at {restaurant_name}, "
        f"{date} at {hour} in the name of {guest_name}"
    )

    def _tool_result(status, message):
        # Both outcomes share the same envelope; only status and text differ.
        return {
            "toolUseId": tool_use_id,
            "status": status,
            "content": [{"text": message}]
        }

    try:
        # Booking ids are the first 8 characters of a random UUID.
        booking_id = str(uuid.uuid4())[:8]
        new_booking = {
            'booking_id': booking_id,
            'restaurant_name': restaurant_name,
            'date': date,
            'name': guest_name,
            'hour': hour,
            'num_guests': num_guests
        }
        bookings.put_item(Item=new_booking)
    except Exception as e:
        return _tool_result("error", str(e))
    return _tool_result("success", f"Reservation created with booking id: {booking_id}")

Overwriting tool_create_booking.py


In [9]:
import tool_create_booking
import tool_delete_booking
import tool_get_booking_details

#### Set Tools List and KnowledgeBase Id capture

In [10]:
# Publish the Knowledge Base id (taken from the SSM response fetched earlier) as an
# environment variable - presumably read by the `retrieve` tool; confirm.
knowledge_base_id = kb_id["Parameter"]["Value"]
os.environ["KNOWLEDGE_BASE_ID"] = knowledge_base_id

# Tools handed to the evaluation runs: two packaged tools plus the three booking modules.
tool_list = [
    retrieve,
    current_time,
    tool_get_booking_details,
    tool_create_booking,
    tool_delete_booking,
]

## Explore Available Models
-----------------------------------------------------------------------------

In [11]:
# Print the tester's model catalogue grouped by provider; each row shows the
# model name, endpoint, region and whether the model supports tools.
tester.list_models_by_provider()


📋 AVAILABLE MODELS (97 total)

🏢 AI21 (2 models)
------------------------------------------------------------------------------------------
  TYPE| MODEL_NAME                | ENDPOINT                        | REGION  | TOOL_SUPPORT
------------------------------------------------------------------------------------------
  📍    jamba-1.5-large           | ai21.jamba-1-5-large-v1:0       | us-east-1 | ❌
  📍    jamba-1.5-mini            | ai21.jamba-1-5-mini-v1:0        | us-east-1 | ❌

🏢 Amazon (22 models)
------------------------------------------------------------------------------------------
  TYPE| MODEL_NAME                | ENDPOINT                        | REGION  | TOOL_SUPPORT
------------------------------------------------------------------------------------------
  📍    amazon-rerank             | amazon.rerank-v1:0              | us-west-2 | ❌
  🌍    nova-lite                 | us.amazon.nova-lite-v1:0        | us-east-1 | 🔧
  📍    nova-lite-east-1          | amazon.nova

## Test Case Evaluation with LLM-as-Judge 

This section demonstrates how to run structured test case evaluation using the `run_evaluation()` method. This method:

- Loads test cases from `config_evaluation.yml`
- Runs multi-turn conversations for each test case
- Uses an LLM-as-judge to evaluate responses against expected results
- Provides detailed scoring and analysis
- Optionally integrates with Langfuse for tracing

### Evaluation with Langfuse Tracing 
If you have Langfuse credentials, you can enable tracing for detailed observability:

In [13]:
# Langfuse credentials must not be hardcoded in the notebook: anything typed here
# ends up in version control and in shared copies of the file.
# Values already present in the environment are used as-is; a missing key is
# requested interactively with getpass, which does not echo or store the input
# in the notebook.
# NOTE(review): any key that was ever committed in this cell should be treated
# as compromised and rotated in Langfuse.
from getpass import getpass

if not os.environ.get("LANGFUSE_SECRET_KEY"):
    os.environ["LANGFUSE_SECRET_KEY"] = getpass("Langfuse secret key (starts with sk-): ")
if not os.environ.get("LANGFUSE_PUBLIC_KEY"):
    os.environ["LANGFUSE_PUBLIC_KEY"] = getpass("Langfuse public key (starts with pk-): ")

# Keep a host that was configured outside the notebook; otherwise use the US cloud endpoint.
os.environ.setdefault("LANGFUSE_HOST", "https://us.cloud.langfuse.com")

In [None]:
# Re-read the system prompts so this cell does not depend on earlier edits to `prompts`.
with open('config_experiments.yml', 'r') as config_file:
    config = yaml.safe_load(config_file)
prompts = config['system_prompts']

# Fresh tester instance for the traced run.
tester = UnifiedTester()

# Langfuse connection settings come from the environment variables set above.
langfuse_settings = {
    "langfuse_public_key": os.environ["LANGFUSE_PUBLIC_KEY"],
    "langfuse_secret_key": os.environ["LANGFUSE_SECRET_KEY"],
    "langfuse_api_url": os.environ["LANGFUSE_HOST"],
}

# Evaluate one model / one prompt version against the test cases, with tracing and CSV export.
evaluation_results_traced = tester.run_evaluation(
    models=["claude-4-sonnet"],
    system_prompts=["version1"],
    prompts_dict=prompts,
    tool=tool_list,
    test_cases_path="config_evaluation.yml",
    save_to_csv=True,
    **langfuse_settings,
)

### Agent Evaluation (csv output ONLY)

In [None]:
# Re-read the system prompts used by the evaluation.
with open('config_experiments.yml', 'r') as config_file:
    config = yaml.safe_load(config_file)
prompts = config['system_prompts']

# One model and one prompt version keep the run short; no Langfuse settings
# are passed here, results are only written to CSV.
evaluation_settings = dict(
    models=["nova-pro"],
    system_prompts=["version1"],
    prompts_dict=prompts,
    tool=tool_list,
    test_cases_path="config_evaluation.yml",
    save_to_csv=True,
)
evaluation_results = tester.run_evaluation(**evaluation_settings)

result_count = len(evaluation_results)
print(f"\n✅ Evaluation completed with {result_count} test case results")