# 01 - Experiments tracking - AWS Strands Agents with LiteLLM and LangFuse observability

This notebook demonstrates a unified testing approach for LiteLLM endpoints: the `run_test()` and `run_evaluation()` methods run the experiments and report human-readable results.

## Setup
______________________________________________________________

In [1]:
%pip install "strands-agents" "strands-agents-tools" "langfuse==3.1.1" "litellm" opentelemetry-api opentelemetry-sdk opentelemetry-exporter-otlp -q

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os, base64
import time, uuid, boto3
import yaml
from datetime import datetime
from utils import UnifiedTester

In [3]:
# Initialize LiteLLM Unified Tester
# UnifiedTester is imported from the local `utils` module; `tester` is reused by the
# test and evaluation cells further down.
tester = UnifiedTester()

# Load configuration
# The YAML file must provide the `system_prompts` and `test_queries` keys read below.
with open('config_experiments.yml', 'r') as f:
    config = yaml.safe_load(f)

prompts = config['system_prompts']  # its keys are listed in the summary printed below
test_queries = config['test_queries']  # its length is reported in the summary printed below

# Short summary of what was loaded.
print("‚úÖ LiteLLM Unified Tester initialized!")
print(f"‚úÖ Available Prompts: {list(prompts.keys())}")
print(f"‚úÖ Number of Test queries: {len(test_queries)}")

‚úÖ LiteLLM Unified Tester initialized!
‚úÖ Available Prompts: ['version1', 'version2']
‚úÖ Number of Test queries: 2


### LangFuse Setup

In [None]:
## 1. Set general environment variables first
# The values below are placeholders: replace them with your own project's keys and host
# before running this cell, and avoid committing real keys together with the notebook.

os.environ["LANGFUSE_SECRET_KEY"] = "sk-lf-xxxxxxxxxx" # Your Langfuse project secret key
os.environ["LANGFUSE_PUBLIC_KEY"] = "pk-lf-xxxxxxxxxx" # Your Langfuse project public key
os.environ["LANGFUSE_HOST"] = "https://us.cloud.langfuse.com/" # Langfuse domain


def setup_langfuse_v3(langfuse_public_key, langfuse_secret_key, langfuse_api_url):
    """Configure the OpenTelemetry OTLP exporter environment for LangFuse v3.

    Sets ``OTEL_EXPORTER_OTLP_ENDPOINT`` and ``OTEL_EXPORTER_OTLP_HEADERS`` in
    ``os.environ`` and prints a short summary.

    Args:
        langfuse_public_key: LangFuse project public key.
        langfuse_secret_key: LangFuse project secret key.
        langfuse_api_url: Base URL of the LangFuse host, with or without a
            trailing slash.

    Returns:
        True once the environment variables have been set.
    """
    # Drop trailing slashes so a host such as "https://host/" does not produce
    # "https://host//api/..." when the path is appended.
    base_url = langfuse_api_url.rstrip("/")

    # 2. Set up OpenTelemetry endpoint with proper authentication
    otel_endpoint = f"{base_url}/api/public/otel/v1/traces"
    # HTTP Basic credentials: base64 of "<public key>:<secret key>".
    auth_token = base64.b64encode(
        f"{langfuse_public_key}:{langfuse_secret_key}".encode()
    ).decode()

    os.environ["OTEL_EXPORTER_OTLP_ENDPOINT"] = otel_endpoint
    os.environ["OTEL_EXPORTER_OTLP_HEADERS"] = f"Authorization=Basic {auth_token}"

    # The token itself is deliberately not printed.
    print("‚úÖ LangFuse v3 Environment Configured:")
    print(f"   Host: {base_url}")
    print(f"   OTEL Endpoint: {otel_endpoint}")
    print("   Authentication: Configured")

    return True

# Set up LangFuse
# Reads the three LANGFUSE_* environment variables; a missing one raises KeyError here.
setup_langfuse_v3(os.environ["LANGFUSE_PUBLIC_KEY"], os.environ["LANGFUSE_SECRET_KEY"], os.environ["LANGFUSE_HOST"])

## Tools Definition
__________________________________________________________________________________

#### Dependencies setup

In [5]:
from strands_tools import retrieve, current_time
from strands import Agent, tool
from strands.models.litellm import LiteLLMModel
import os

#### AWS Clients setup (Bedrock KnowledgeBase and DynamoDB)

In [6]:
# Prefix of the SSM parameter names that hold the DynamoDB table name and the Knowledge Base id.
kb_name = "restaurant-assistant"
dynamodb = boto3.resource("dynamodb")
smm_client = boto3.client("ssm")

# Look up the bookings table name in SSM Parameter Store and bind the table resource.
table_name = smm_client.get_parameter(
    Name=f"{kb_name}-table-name", WithDecryption=False
)
table = dynamodb.Table(table_name["Parameter"]["Value"])

# Look up the Knowledge Base id in SSM Parameter Store.
kb_id = smm_client.get_parameter(Name=f"{kb_name}-kb-id", WithDecryption=False)
print("DynamoDB table:", table_name["Parameter"]["Value"])
print("Knowledge Base Id:", kb_id["Parameter"]["Value"])

# Kept as a string: the test cells send this value as the "session.id" trace attribute,
# and trace/span attribute values are expected to be primitives, not UUID objects.
session_id = str(uuid.uuid4())

#Knowledge Base
os.environ["KNOWLEDGE_BASE_ID"] = kb_id["Parameter"]["Value"]

DynamoDB table: restaurant-bookings
Knowledge Base Id: XGIDQ0ALGL


### Tools setup

In [7]:
%%writefile tool_get_booking_details.py

from strands import tool
import boto3 

@tool
def tool_booking_details(booking_id: str, restaurant_name: str) -> dict:
    """Get the relevant details for booking_id in restaurant_name
    Args:
        booking_id: the id of the reservation
        restaurant_name: name of the restaurant handling the reservation

    Returns:
        booking_details: the details of the booking in JSON format
    """

    try:
        # This file is written by %%writefile and imported as its own module, so no
        # `table` variable exists in this namespace. Resolve the bookings table here,
        # from the SSM parameter that stores its name.
        kb_name = 'restaurant-assistant'
        dynamodb = boto3.resource('dynamodb')
        smm_client = boto3.client('ssm')
        table_name = smm_client.get_parameter(
            Name=f'{kb_name}-table-name',
            WithDecryption=False
        )
        table = dynamodb.Table(table_name["Parameter"]["Value"])

        response = table.get_item(
            Key={"booking_id": booking_id, "restaurant_name": restaurant_name}
        )
        # get_item omits "Item" when nothing matches the key.
        if "Item" in response:
            return response["Item"]
        else:
            return f"No booking found with ID {booking_id}"
    except Exception as e:
        # Failures are returned as text rather than raised.
        return str(e)

Overwriting tool_get_booking_details.py


In [8]:
%%writefile tool_delete_booking.py

from strands import tool
import boto3 

@tool
def tool_delete_booking(booking_id: str, restaurant_name:str) -> str:
    """delete an existing booking_id at restaurant_name
    Args:
        booking_id: the id of the reservation
        restaurant_name: name of the restaurant handling the reservation

    Returns:
        confirmation_message: confirmation message
    """
    # Resolve the bookings table from the SSM parameter that stores its name.
    kb_name = 'restaurant-assistant'
    dynamodb = boto3.resource('dynamodb')
    smm_client = boto3.client('ssm')
    table_name = smm_client.get_parameter(
        Name=f'{kb_name}-table-name',
        WithDecryption=False
    )
    table = dynamodb.Table(table_name["Parameter"]["Value"])
    try:
        # DeleteItem succeeds (HTTP 200) even when no item matches the key. Asking for
        # the old item back lets us tell a real deletion from a no-op.
        response = table.delete_item(
            Key={'booking_id': booking_id, 'restaurant_name': restaurant_name},
            ReturnValues='ALL_OLD'
        )
        if response['ResponseMetadata']['HTTPStatusCode'] != 200:
            return f'Failed to delete booking with ID {booking_id}'
        # "Attributes" is only present when an item existed and was removed.
        if 'Attributes' not in response:
            return f'No booking found with ID {booking_id}'
        return f'Booking with ID {booking_id} deleted successfully'
    except Exception as e:
        # Failures are returned as text rather than raised.
        return str(e)

Overwriting tool_delete_booking.py


In [9]:
%%writefile tool_create_booking.py

# Alternatively, you can use the TOOL_SPEC approach when defining your tool

from typing import Any
from strands.types.tools import ToolResult, ToolUse
import boto3
import uuid


# Specification of the booking-creation tool: its name, its description, and the JSON
# schema of its input. All five properties are listed as required.
TOOL_SPEC = {
    "name": "tool_create_booking",
    "description": "Create a new booking at restaurant_name",
    "inputSchema": {
        "json": {
            "type": "object",
            "properties": {
                "date": {
                    "type": "string",
                    "description": """The date of the booking in the format YYYY-MM-DD. 
                    Do NOT accept relative dates like today or tomorrow. 
                    Ask for today's date for relative date."""
                },
                "hour": {
                    "type": "string",
                    "description": "the hour of the booking in the format HH:MM"
                },
                "restaurant_name": {
                    "type": "string",
                    "description": "name of the restaurant handling the reservation"
                },
                "guest_name": {
                    "type": "string",
                    "description": "The name of the customer to have in the reservation"
                },
                "num_guests": {
                    "type": "integer",
                    "description": "The number of guests for the booking"
                }
            },
            "required": ["date", "hour", "restaurant_name", "guest_name", "num_guests"]
        }
    }
}
# Function name must match tool name
def tool_create_booking(tool: ToolUse, **kwargs: Any) -> ToolResult:
    """Store a new reservation in the bookings table.

    Args:
        tool: tool-use request; ``tool["toolUseId"]`` and the five fields of
            ``tool["input"]`` (date, hour, restaurant_name, guest_name,
            num_guests) are read.
        **kwargs: accepted but not used.

    Returns:
        A result dict with ``toolUseId``, ``status`` ("success" or "error") and
        a single text ``content`` entry.
    """
    # Resolve the bookings table from the SSM parameter that stores its name.
    kb_name = 'restaurant-assistant'
    dynamo_resource = boto3.resource('dynamodb')
    ssm = boto3.client('ssm')
    table_param = ssm.get_parameter(
        Name=f'{kb_name}-table-name',
        WithDecryption=False
    )
    bookings_table = dynamo_resource.Table(table_param["Parameter"]["Value"])

    # Unpack the request.
    tool_use_id = tool["toolUseId"]
    request = tool["input"]
    date, hour, restaurant_name, guest_name, num_guests = (
        request[field]
        for field in ("date", "hour", "restaurant_name", "guest_name", "num_guests")
    )

    def _result(status, text):
        # Build the result payload for this tool invocation.
        return {
            "toolUseId": tool_use_id,
            "status": status,
            "content": [{"text": text}]
        }

    print(
        f"Creating reservation for {num_guests} people at {restaurant_name}, "
        f"{date} at {hour} in the name of {guest_name}"
    )

    try:
        # Booking id: the first 8 characters of a random UUID.
        booking_id = str(uuid.uuid4())[:8]
        new_item = {
            'booking_id': booking_id,
            'restaurant_name': restaurant_name,
            'date': date,
            'name': guest_name,
            'hour': hour,
            'num_guests': num_guests
        }
        bookings_table.put_item(Item=new_item)
    except Exception as e:
        return _result("error", str(e))

    return _result("success", f"Reservation created with booking id: {booking_id}")

Overwriting tool_create_booking.py


In [10]:
import tool_create_booking
import tool_delete_booking
import tool_get_booking_details

#### Set Agent Tools List

In [11]:
#Tools list
# `retrieve` and `current_time` are imported from strands_tools; the remaining three
# entries are the tool modules imported in the previous cell.

tool_list = [retrieve, current_time, tool_get_booking_details, tool_create_booking, tool_delete_booking]

## Tests Setup
_______________________________________________________________________________________________

In [13]:
test_queries

["Make a reservation for tonight at 'The Smoking Ember' for 5 persons At 8pm in the name of Andres",
 "Cancel the reservation for Andres tonight at 'The Smoking Ember'for 5 persons At 8pm"]

In [None]:
# Test 1: Multiple LiteLLM models, single system prompt, multiple queries

#Test name
test_name = "Restaurant helper Test"

#Langfuse trace attributes
trace_attributes = {
    "operation.name": test_name,
    "langfuse.trace.name": test_name,
    # session_id comes from uuid.uuid4(); convert it to a string because trace attribute
    # values are expected to be primitives, and a UUID object may be dropped.
    "session.id": str(session_id),
    "user.id": "xxxxxx@amazon.com",
    "langfuse.tags": [
        f"Agent-{test_name}"
    ],
    "langfuse.environment": "development"
}

# Test Definition and Execution using LiteLLM endpoints

results1 = tester.run_test(
    models=[
        "bedrock/us.amazon.nova-pro-v1:0",
        "bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0"
    ],  # LiteLLM endpoints
    system_prompts=["version2"],  # System prompts to test from config_experiments.yml
    queries=test_queries,  #  Test all queries from config_experiments.yml
    prompts_dict=prompts,  # Dictionary of prompts
    tool=tool_list,  # Tools to test
    save_to_csv=True, # Default True
    trace_attributes=trace_attributes, # Custom tracing metadata (tags)
    conversation_window=5 # Last N conversation interactions to store as context. Default of 5
)

tester.display_results(results1)

‚úÖ Langfuse V3 tracing enabled

üöÄ Starting LiteLLM Test Suite
üìä Total combinations to test: 4
ü§ñ Models: ['bedrock/us.amazon.nova-pro-v1:0', 'bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0']
üìù Prompts: ['version2']
‚ùì Queries: 2 query(ies)

[1/4] Testing: bedrock/us.amazon.nova-pro-v1:0 | version2
Query: Make a reservation for tonight at 'The Smoking Ember' for 5 persons At 8pm in the name of Andres
------------------------------------------------------------
üîß Using AWS region: us-east-1
<thinking> The user wants to make a reservation for tonight at 'The Smoking Ember' for 5 persons at 8pm in the name of Andres. First, I need to verify if 'The Smoking Ember' exists in the restaurant directory. If it does, I will proceed to create the booking. </thinking>

Tool #1: retrieve
<thinking> The restaurant 'The Smoking Ember' exists in the directory. Now I need to get the current time to determine the exact date for tonight's reservation. </thinking> 
Tool #2: current_ti

In [None]:
# Test 2: Single LiteLLM model, multiple prompts, all queries

#Test name
test_name = "Restaurant helper Test"

#Langfuse trace attributes
trace_attributes = {
    "operation.name": test_name,
    "langfuse.trace.name": test_name,
    # session_id comes from uuid.uuid4(); convert it to a string because trace attribute
    # values are expected to be primitives, and a UUID object may be dropped.
    "session.id": str(session_id),
    "user.id": "xxxxx@amazon.com",
    "langfuse.tags": [
        f"Agent-{test_name}"
    ],
    "langfuse.environment": "development"
}

results2 = tester.run_test(
    models=["bedrock/us.anthropic.claude-opus-4-1-20250805-v1:0"],  # Single LiteLLM endpoint
    prompts_dict=prompts,
    system_prompts=["version1","version2"],  # Multiple prompts
    queries=test_queries,  # All queries
    tool=tool_list,
    trace_attributes=trace_attributes,
    conversation_window=10
)

tester.display_results(results2)

‚úÖ Langfuse V3 tracing enabled

üöÄ Starting LiteLLM Test Suite
üìä Total combinations to test: 4
ü§ñ Models: ['bedrock/us.anthropic.claude-opus-4-1-20250805-v1:0']
üìù Prompts: ['version1', 'version2']
‚ùì Queries: 2 query(ies)

[1/4] Testing: bedrock/us.anthropic.claude-opus-4-1-20250805-v1:0 | version1
Query: Make a reservation for tonight at 'The Smoking Ember' for 5 persons At 8pm in the name of Andres
------------------------------------------------------------
üîß Using AWS region: us-east-1
I'll help you make a reservation at The Smoking Ember for tonight. Let me first get today's date and then create the booking for you.
Tool #1: current_time
Now let me create the reservation for you at The Smoking Ember for tonight (2025-11-13) at 8:00 PM for 5 guests under the name Andres.
Tool #2: tool_create_booking
Creating reservation for 5 people at The Smoking Ember, 2025-11-13 at 20:00 in the name of Andres
Let me check if The Smoking Ember is in our restaurant directory.
Tool #

## [OPTIONAL] 🧪 Test Case Evaluation with LLM-as-Judge

This section demonstrates how to run structured test case evaluation using the `run_evaluation()` method with LiteLLM endpoints. This method:

- Loads test cases from `config_evaluation.yml`
- Runs multi-turn conversations for each test case
- Uses an LLM-as-judge to evaluate responses against expected results
- Provides detailed scoring and analysis
- Optionally integrates with Langfuse for tracing

## Agent Evaluation (csv output)

In [17]:
# Re-read the experiments configuration file.
# NOTE: `config` is not used again in this cell; the `prompts` and `tool_list` passed to
# run_evaluation() below are the variables defined in earlier cells.
with open('config_experiments.yml', 'r') as f:
    config = yaml.safe_load(f)

# Optional overrides, left disabled:
#prompts = config['system_prompts']
#os.environ["KNOWLEDGE_BASE_ID"] = kb_id["Parameter"]["Value"]
#tool_list = [retrieve, current_time, tool_get_booking_details, tool_create_booking, tool_delete_booking]

# Run evaluation with test cases using LiteLLM
evaluation_results = tester.run_evaluation(
    models=["bedrock/us.amazon.nova-pro-v1:0"],  # LiteLLM endpoint
    system_prompts=["version2"],  # Single prompt version
    prompts_dict=prompts,
    tool=tool_list,
    test_cases_path="config_evaluation.yml",  # Test cases file
    save_to_csv=True,  # Save results to CSV
    conversation_window=5
)

# Report how many results were returned.
print(f"\n‚úÖ Evaluation completed with {len(evaluation_results)} test case results")


üß™ Starting Test Case Evaluation
üìä Total combinations: 2
ü§ñ Models: ['bedrock/us.amazon.nova-pro-v1:0']
üìù Prompts: ['version2']
üìã Test Cases: 2 test case(s)

[1/2] Evaluating: bedrock/us.amazon.nova-pro-v1:0 | version2 | restaurant_booking_flow_with_menu_inquiry
------------------------------------------------------------
üìù Test Case: restaurant_booking_flow_with_menu_inquiry

  Turn 1: Hi, I'd like to make a reservation at Bistro Parisienne for ...
<thinking> The user wants to make a reservation at Bistro Parisienne and view their menu first. I need to verify if Bistro Parisienne exists in the directory and then retrieve the menu. </thinking>

Tool #1: retrieve
<thinking> Bistro Parisienne exists in the directory, and I have retrieved the menu. Now, I will ask the user for the required details to create the booking. </thinking>
<answer> Restaurant Helper: Good day! Here is the menu for Bistro Parisienne:

**HORS D'OEUVRES**
¬∑ Escargots de Bourgogne - $14
¬∑ French On