# Evaluation Integration Tests - Notebook API

This notebook tests evaluation using the notebook interface that mirrors CLI commands.

## Setup
Configure your agent ID, session ID, and AWS region in the configuration cell that follows the Quick Start section.

## 🚀 Quick Start

**New to evaluation?** Here's the minimal code to get started:

```python
from bedrock_agentcore_starter_toolkit import Evaluation

# Initialize
eval_client = Evaluation(agent_id="YOUR_AGENT_ID", region="us-east-1")

# Run evaluation
results = eval_client.run(session_id="YOUR_SESSION_ID")

# View results
for result in results.get_successful_results():
    print(f"{result.evaluator_name}: {result.value:.2f} - {result.label}")
```

---

In [None]:
from bedrock_agentcore_starter_toolkit import Evaluation

# Shared test settings - UPDATE ALL THREE VALUES before running.
# Later cells read these names instead of repeating the literals.
TEST_AGENT_ID, TEST_SESSION_ID, TEST_REGION = (
    "test_eval_1-Ux9OE986P4",  # Replace with your agent ID
    "cc8a8e69-8bed-4e5f-9a06-9a58550fd713",  # Replace with your session ID
    "us-east-1",  # Update with your AWS region
)

# Echo the active settings so the run log records what was tested.
print(
    "‚úÖ Configuration:",
    f"  Agent ID: {TEST_AGENT_ID}",
    f"  Session ID: {TEST_SESSION_ID}",
    f"  Region: {TEST_REGION}",
    sep="\n",
)

## Test 1: Initialize Evaluation

Create evaluation instance with agent_id and region.

In [None]:
# Build the client that the following test cells reuse, passing the agent and
# region explicitly.
eval_client = Evaluation(region=TEST_REGION, agent_id=TEST_AGENT_ID)

# Read the settings back from the client to show what it was created with.
print("‚úÖ Test 1 PASSED: Evaluation initialized")
print("Agent ID: {}".format(eval_client.agent_id))
print("Region: {}".format(eval_client.region))

## Test 2: eval_client.list_evaluators() - List All Evaluators

List all available evaluators (builtin and custom).

Equivalent to: `agentcore eval evaluator list`

In [None]:
# Ask the service for its evaluator listing.
evaluators_response = eval_client.list_evaluators()

# A response without an 'evaluators' key is treated as an empty listing.
evaluators = evaluators_response.get('evaluators', [])
evaluator_count = len(evaluators)
print("\n‚úÖ Test 2 PASSED: Found {} evaluators".format(evaluator_count))

## Test 3: eval_client.get_evaluator() - Get Evaluator Details

Get detailed information about a specific evaluator.

Equivalent to: `agentcore eval evaluator get Builtin.Helpfulness`

In [None]:
## Test 4: eval_client.run() - Run Evaluation with Default Evaluator
#
# Run evaluation on a session with default evaluator (Builtin.GoalSuccessRate).
#
# Equivalent to: `agentcore eval run --session-id <session>`
#
# This description lives inside a code cell, so each line of it has to be a
# Python comment; as bare prose it prevents the cell from parsing.

# Run evaluation with default evaluator (no `evaluators` argument is passed)
results = eval_client.run(session_id=TEST_SESSION_ID)

print(f"\n‚úÖ Test 4 PASSED: Evaluation completed")
print(f"Session ID: {results.session_id}")
print(f"Results count: {len(results.results)}")

# Show successful results with details
successful = results.get_successful_results()
print(f"Successful: {len(successful)}")

if successful:
    # Only the first successful result is shown in detail.
    result = successful[0]
    print(f"\nüìä Evaluation Result:")
    print(f"  Evaluator: {result.evaluator_name}")
    print(f"  Score: {result.value:.2f}")
    print(f"  Label: {result.label}")
    if result.explanation:
        # Keep the log short: at most the first 200 characters are printed.
        print(f"  Explanation: {result.explanation[:200]}...")
    if result.token_usage:
        # A missing 'totalTokens' entry is reported as 0.
        print(f"  Tokens: {result.token_usage.get('totalTokens', 0):,}")

In [None]:
## Test 5: eval_client.run() - Run with Multiple Evaluators
#
# Run evaluation with multiple evaluators.
#
# Equivalent to: `agentcore eval run -e Builtin.GoalSuccessRate -e Builtin.Accuracy`
#
# This description lives inside a code cell, so each line of it has to be a
# Python comment; as bare prose it prevents the cell from parsing.

# Run with multiple evaluators
results = eval_client.run(
    session_id=TEST_SESSION_ID,
    evaluators=["Builtin.GoalSuccessRate", "Builtin.Accuracy"]
)

print(f"\n‚úÖ Test 5 PASSED: Multi-evaluator run completed")
print(f"Results count: {len(results.results)}")

# Show comparison of multiple evaluators
successful = results.get_successful_results()
if successful:
    # Fixed-width columns (30 / 10 / 20) under a 60-character rule.
    print(f"\nüìä Evaluator Comparison:")
    print(f"{'Evaluator':<30} {'Score':<10} {'Label':<20}")
    print("-" * 60)
    for result in successful:
        print(f"{result.evaluator_name:<30} {result.value:<10.2f} {result.label:<20}")

In [None]:
# Second multi-evaluator pass, this time pairing Helpfulness with Accuracy.
results = eval_client.run(
    evaluators=["Builtin.Helpfulness", "Builtin.Accuracy"],
    session_id=TEST_SESSION_ID,
)

print("\n‚úÖ Test 5 PASSED: Multi-evaluator run completed")
print("Results count: {}".format(len(results.results)))

# Collapse the results to the distinct evaluator names that appear in them.
evaluator_names = set(entry.evaluator_name for entry in results.results)
print("Evaluators used: {}".format(evaluator_names))

## Test 6: eval_client.run() - Evaluate Specific Trace

Evaluate only a specific trace (with previous traces for context).

Equivalent to: `agentcore eval run --trace-id <trace>`

In [None]:
# A trace ID is needed first, so look up the session's traces via observability.
from bedrock_agentcore_starter_toolkit import Observability

obs = Observability(agent_id=TEST_AGENT_ID, region=TEST_REGION)
trace_data = obs.list(session_id=TEST_SESSION_ID)
trace_ids = [*trace_data.traces.keys()]

if not trace_ids:
    # Nothing to evaluate; leave TEST_TRACE_ID defined but empty.
    print("‚ö†Ô∏è  Test 6 SKIPPED: No traces found")
    TEST_TRACE_ID = None
else:
    # Use the first trace the session lists.
    TEST_TRACE_ID = trace_ids[0]
    print("Using trace ID: {}".format(TEST_TRACE_ID))

    # Evaluate just that trace within the session.
    results = eval_client.run(
        trace_id=TEST_TRACE_ID,
        session_id=TEST_SESSION_ID,
    )

    print("\n‚úÖ Test 6 PASSED: Trace-specific evaluation completed")
    print("Trace ID: {}".format(results.trace_id))

## Test 7: eval_client.run() - Export to JSON

Run evaluation and export results to JSON file.

Equivalent to: `agentcore eval run --output results.json`

In [None]:
import tempfile
from pathlib import Path

# Results go to a fixed file name inside the system temp directory.
output_file = Path(tempfile.gettempdir()) / "test_eval_results.json"

# A file left behind by an earlier run would satisfy the existence check below
# even if this run wrote nothing, so remove it before evaluating.
if output_file.exists():
    output_file.unlink()

# Run evaluation with output
results = eval_client.run(
    session_id=TEST_SESSION_ID,
    output=str(output_file)
)

# Verify the file was produced by this run
assert output_file.exists(), "Output file not created"

print(f"\n‚úÖ Test 7 PASSED: Results exported to {output_file}")
print(f"File size: {output_file.stat().st_size} bytes")

In [None]:
# Prepare the judge prompt and the rating scale separately, then pass both to
# the config helper.
# NOTE(review): create_evaluator_config is not imported or defined in any
# visible cell - confirm where it comes from before running on a fresh kernel.
judge_instructions = (
    "You are an objective judge evaluating the conciseness of an AI assistant's response. "
    "Your task is to assess whether the response is appropriately concise and to the point "
    "without losing essential information. IMPORTANT: Evaluate based on information density "
    "and relevance, not on length alone. A longer response can still be concise if all "
    "information is essential. # Conversation Context: ## Previous turns: {context} "
    "## Target turn to evaluate: {assistant_turn}"
)

# Each entry is (score, label, definition), ordered from worst to best.
rating_levels = [
    (0.0, "Very Verbose", "Response is overly wordy and rambling. Contains excessive unnecessary details."),
    (0.33, "Somewhat Verbose", "Response has some unnecessary details but generally stays on topic."),
    (0.67, "Somewhat Concise", "Response is reasonably concise with minimal unnecessary information."),
    (1.0, "Very Concise", "Response is perfectly concise and to the point without losing essential information."),
]

custom_config = create_evaluator_config(
    instructions=judge_instructions,
    rating_scale=rating_levels,
)

# The current Unix time (whole seconds) is appended to the evaluator name.
import time
custom_name = "test_conciseness_{}".format(int(time.time()))

response = eval_client.create_evaluator(
    description="Test evaluator for conciseness (created by notebook test)",
    level="TRACE",
    config=custom_config,
    name=custom_name,
)

# Later cells look for this ID to decide whether to run or skip.
CUSTOM_EVALUATOR_ID = response.get('evaluatorId')
print("\n‚úÖ Test 8 PASSED: Custom evaluator created")
print("Evaluator ID: {}".format(CUSTOM_EVALUATOR_ID))
print("\nüí° Tip: Use create_evaluator_config() helper to simplify config creation")

## Test 8: Create Custom Evaluator

Create a custom evaluator with configuration.

Equivalent to: `agentcore eval evaluator create`

In [None]:
# Hand-written evaluator config, assembled from three named parts: the judge
# model settings, the numerical rating scale, and the judge instructions.
judge_model_config = {
    "bedrockEvaluatorModelConfig": {
        "modelId": "global.anthropic.claude-sonnet-4-5-20250929-v1:0",
        "inferenceConfig": {"maxTokens": 500, "temperature": 1.0},
    }
}

# One dict per rating level, built from (value, definition, label) triples.
numerical_scale = [
    {"value": level_value, "definition": level_definition, "label": level_label}
    for level_value, level_definition, level_label in (
        (0.0, "Response is overly wordy and rambling. Contains excessive unnecessary details.", "Very Verbose"),
        (0.33, "Response has some unnecessary details but generally stays on topic.", "Somewhat Verbose"),
        (0.67, "Response is reasonably concise with minimal unnecessary information.", "Somewhat Concise"),
        (1.0, "Response is perfectly concise and to the point without losing essential information.", "Very Concise"),
    )
]

judge_prompt = "You are an objective judge evaluating the conciseness of an AI assistant's response. Your task is to assess whether the response is appropriately concise and to the point without losing essential information. IMPORTANT: Evaluate based on information density and relevance, not on length alone. A longer response can still be concise if all information is essential. # Conversation Context: ## Previous turns: {context} ## Target turn to evaluate: {assistant_turn}"

custom_config = {
    "llmAsAJudge": {
        "modelConfig": judge_model_config,
        "ratingScale": {"numerical": numerical_scale},
        "instructions": judge_prompt,
    }
}

# The current Unix time (whole seconds) is appended to the evaluator name.
import time
custom_name = "test_conciseness_{}".format(int(time.time()))

response = eval_client.create_evaluator(
    description="Test evaluator for conciseness (created by notebook test)",
    level="TRACE",
    config=custom_config,
    name=custom_name,
)

# Later cells look for this ID to decide whether to run or skip.
CUSTOM_EVALUATOR_ID = response.get('evaluatorId')
print("\n‚úÖ Test 8 PASSED: Custom evaluator created")
print("Evaluator ID: {}".format(CUSTOM_EVALUATOR_ID))

## Test 9: Run Evaluation with Custom Evaluator

Use the newly created custom evaluator.

In [None]:
# Needs the evaluator ID set by the create cell; skip when it is missing or empty.
if not globals().get('CUSTOM_EVALUATOR_ID'):
    print("‚ö†Ô∏è  Test 9 SKIPPED: Custom evaluator not available")
else:
    # Evaluate the session with the custom evaluator only.
    results = eval_client.run(
        evaluators=[CUSTOM_EVALUATOR_ID],
        session_id=TEST_SESSION_ID,
    )

    print("\n‚úÖ Test 9 PASSED: Custom evaluator executed")
    print("Results count: {}".format(len(results.results)))

    # Split the outcome into successes and failures and report both counts.
    successful = results.get_successful_results()
    failed = results.get_failed_results()

    print("Successful evaluations: {}".format(len(successful)))
    print("Failed evaluations: {}".format(len(failed)))

    if successful:
        # Only the first successful result is shown in detail.
        result = successful[0]
        print("\nüìä Custom Evaluator Result:")
        print("  Evaluator: {}".format(result.evaluator_name))
        print("  Score: {}".format(result.value))
        print("  Label: {}".format(result.label))
        if result.explanation:
            print("  Explanation: {}...".format(result.explanation[:150]))

    if failed:
        # List every failure with the error it reported.
        print("\n‚ö†Ô∏è  Some evaluations failed:")
        for result in failed:
            print("  - {}: {}".format(result.evaluator_name, result.error))

## Test 10: Update Custom Evaluator

Update the custom evaluator description.

Equivalent to: `agentcore eval evaluator update`

In [None]:
# Change only the description of the evaluator created earlier; skip when the
# ID is missing or empty.
if not globals().get('CUSTOM_EVALUATOR_ID'):
    print("‚ö†Ô∏è  Test 10 SKIPPED: Custom evaluator not available")
else:
    response = eval_client.update_evaluator(
        description="Updated: Test evaluator for conciseness (modified by notebook test)",
        evaluator_id=CUSTOM_EVALUATOR_ID,
    )

    print("\n‚úÖ Test 10 PASSED: Evaluator updated")
    print("Updated at: {}".format(response.get('updatedAt')))

## Test 11: Get Custom Evaluator Details

Retrieve details of the custom evaluator.

In [None]:
# Fetch the custom evaluator by ID and print its name and description; skip
# when the ID is missing or empty.
if not globals().get('CUSTOM_EVALUATOR_ID'):
    print("‚ö†Ô∏è  Test 11 SKIPPED: Custom evaluator not available")
else:
    details = eval_client.get_evaluator(CUSTOM_EVALUATOR_ID)

    print("\n‚úÖ Test 11 PASSED: Retrieved custom evaluator details")
    print("Name: {}".format(details.get('evaluatorName')))
    print("Description: {}".format(details.get('description')))

## Test 12: Delete Custom Evaluator

Clean up by deleting the test evaluator.

Equivalent to: `agentcore eval evaluator delete`

In [None]:
# Clean up the evaluator created by the create cell; skip when the ID is
# missing or empty.
if 'CUSTOM_EVALUATOR_ID' in locals() and CUSTOM_EVALUATOR_ID:
    # Delete evaluator
    eval_client.delete_evaluator(CUSTOM_EVALUATOR_ID)

    print(f"\n‚úÖ Test 12 PASSED: Evaluator deleted")
    print(f"Deleted evaluator ID: {CUSTOM_EVALUATOR_ID}")

    # Forget the ID now that the evaluator is gone, so re-running this cell or
    # the run/update/get cells skips instead of targeting a deleted evaluator.
    CUSTOM_EVALUATOR_ID = None
else:
    print("‚ö†Ô∏è  Test 12 SKIPPED: Custom evaluator not available")

## Test 13: Initialize from Config

Test initializing evaluation client from config file.

In [None]:
# Build a client from configuration instead of explicit arguments. No arguments
# are passed to from_config().
# NOTE(review): where from_config() looks for its configuration is not visible
# in this notebook - confirm.
try:
    eval_from_config = Evaluation.from_config()
    print(f"\n‚úÖ Test 13 PASSED: Initialized from config")
    # Show the values the client picked up.
    print(f"Agent ID: {eval_from_config.agent_id}")
    print(f"Session ID: {eval_from_config.session_id}")
except Exception as e:
    # Any exception raised above - by from_config() or by the attribute reads -
    # is reported as a skip, so the notebook keeps running.
    print(f"‚ö†Ô∏è  Test 13 SKIPPED: No config found ({e})")

## Test 14: Export Evaluator Config to JSON

Export evaluator configuration to JSON file.

In [None]:
# Export builtin evaluator config
# Imported here as well so this cell does not rely on an earlier cell having
# brought tempfile and Path into the kernel.
import tempfile
from pathlib import Path

# The exported config goes to a fixed file name inside the system temp directory.
export_file = Path(tempfile.gettempdir()) / "test_evaluator_config.json"

# A file left behind by an earlier run would satisfy the existence check below
# even if this run wrote nothing, so remove it before exporting.
if export_file.exists():
    export_file.unlink()

# Fetch the builtin evaluator and ask for its config to be written to the file.
details = eval_client.get_evaluator(
    "Builtin.Helpfulness",
    output=str(export_file)
)

# Verify the file was produced by this run
assert export_file.exists(), "Export file not created"

print(f"\n‚úÖ Test 14 PASSED: Evaluator config exported")
print(f"File: {export_file}")

## Summary

Display test results summary.

In [None]:
# Closing banner: an 80-character rule above and below the title.
banner_rule = "=" * 80
print("\n" + banner_rule)
print("üéâ EVALUATION INTEGRATION TEST SUITE COMPLETE")
print(banner_rule)

# The API calls this notebook exercised, one line per call.
tested_command_lines = (
    "\nTested Notebook API Commands:",
    "  ‚úÖ eval_client.run(session_id)                    ‚Üí Run evaluation",
    "  ‚úÖ eval_client.run(evaluators=[...])              ‚Üí Multiple evaluators",
    "  ‚úÖ eval_client.run(trace_id=...)                  ‚Üí Trace-specific eval",
    "  ‚úÖ eval_client.run(output='file.json')            ‚Üí Export results",
    "  ‚úÖ eval_client.list_evaluators()                  ‚Üí List evaluators",
    "  ‚úÖ eval_client.get_evaluator(id)                  ‚Üí Get details",
    "  ‚úÖ eval_client.create_evaluator(name, config)     ‚Üí Create custom",
    "  ‚úÖ eval_client.update_evaluator(id, ...)          ‚Üí Update custom",
    "  ‚úÖ eval_client.delete_evaluator(id)               ‚Üí Delete custom",
    "  ‚úÖ Evaluation.from_config()                       ‚Üí Init from config",
)
print("\n".join(tested_command_lines))

# Repeat the settings the suite ran with.
configuration_lines = (
    "\nConfiguration:",
    "  Agent ID: {}".format(TEST_AGENT_ID),
    "  Session ID: {}".format(TEST_SESSION_ID),
    "  Region: {}".format(TEST_REGION),
)
print("\n".join(configuration_lines))

# Side-by-side CLI and API examples; the empty strings print as blank lines.
cli_mapping_lines = (
    "\nüí° API matches CLI commands:",
    "  CLI: agentcore eval run --session-id abc123",
    "  API: eval_client.run(session_id='abc123')",
    "",
    "  CLI: agentcore eval evaluator list",
    "  API: eval_client.list_evaluators()",
    "",
    "  CLI: agentcore eval evaluator create my-eval --config config.json",
    "  API: eval_client.create_evaluator('my-eval', config)",
)
print("\n".join(cli_mapping_lines))