# MLflow 3.3+ Prompt Management with Amazon Bedrock Converse API

This notebook demonstrates how to use MLflow 3.3+'s prompt management features with Amazon Bedrock models using the Converse API.


## Setup and Installation

In [None]:
# Install required packages - MLflow 3.3+ with enhanced prompt management
!pip install "mlflow>=3.3.0" "boto3>=1.34.0" "botocore>=1.34.0"

In [None]:
# Restore `tracking_server_arn` (the MLflow tracking server ARN) from IPython's
# %store database. It must have been saved earlier with `%store tracking_server_arn`;
# the value is passed to mlflow.set_tracking_uri() further down.
%store -r tracking_server_arn

In [None]:
# Import required libraries
import mlflow
import boto3
import json
from typing import List, Dict, Optional
import warnings
# Silence library warnings so the notebook output stays readable.
warnings.filterwarnings('ignore')

print(f"MLflow version: {mlflow.__version__}")
print(f"Boto3 version: {boto3.__version__}")

# The cells below reach the prompt registry through the mlflow.genai namespace
# (e.g. mlflow.genai.load_prompt), so advertise the functions under that name.
print("\n✅ MLflow 3.3+ prompt management functions available:")
print("- mlflow.genai.register_prompt()")
print("- mlflow.genai.load_prompt()")
print("- mlflow.genai.search_prompts()")

In [None]:
# Resolve the AWS region from the default boto3 session. region_name is None when
# no default region is configured; stop here with a clear message instead of
# letting the Bedrock client creation in the next cell fail less obviously.
region = boto3.Session().region_name
if region is None:
    raise RuntimeError(
        "No AWS region is configured. Set AWS_DEFAULT_REGION or configure a "
        "default region for this environment, then re-run this cell."
    )
print(f"Using AWS Region: {region}")

In [None]:
# Bedrock runtime client used by every Converse API call in this notebook
bedrock_runtime = boto3.client(service_name='bedrock-runtime', region_name=region)

# Send all MLflow runs and prompt registrations to the tracking server
# identified by the ARN restored earlier
mlflow.set_tracking_uri(tracking_server_arn)

print(
    "✅ AWS Bedrock client initialized",
    "✅ MLflow 3.3+ tracking configured",
    sep="\n",
)

## 1. Create Experiment

In [None]:
# Create the experiment if it does not exist yet, otherwise reuse it.
# Looking the experiment up first (instead of catching the exception raised by
# create_experiment) keeps unrelated MLflow errors, such as an unreachable
# tracking server, visible and avoids dereferencing a None lookup result.
experiment_name = "bedrock-prompt-management"

experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None:
    experiment_id = mlflow.create_experiment(experiment_name)
    print(f"Created new experiment: {experiment_name}")
else:
    experiment_id = experiment.experiment_id
    print(f"Using existing experiment: {experiment_name}")

# Make this experiment the default target for every mlflow.start_run() below.
mlflow.set_experiment(experiment_name)

## 2. Register Prompt Template (MLflow 3.3+ API)

In [None]:
# Register the customer support prompt template in the MLflow Prompt Registry.
print("Registering customer support prompt template...")

# Note: MLflow 3.3+ uses {{variable}} syntax (double curly braces)
prompt_template = (
    "You are a helpful customer support agent. "
    "Customer question: {{question}}\n"
    "Product context: {{product_info}}\n"
    "Please provide a helpful and professional response."
)

try:
    # Use the mlflow.genai namespace, matching the other registry calls in this
    # notebook (mlflow.genai.load_prompt / register_prompt / search_prompts).
    # NOTE(review): registering under an existing name appears to add a new
    # version rather than fail, so re-running this cell may shift the version
    # numbers that later cells load explicitly - confirm before re-running.
    customer_support_prompt = mlflow.genai.register_prompt(
        name="customer-support-prompt",
        template=prompt_template,
        commit_message="Initial customer support prompt template",
        tags={"use_case": "customer_support", "version": "v1"}
    )
    print(f"✅ Registered prompt: {customer_support_prompt.name} (version {customer_support_prompt.version})")

except Exception as e:
    # Best effort: report the problem and let the notebook continue.
    print(f"Prompt registration error: {e}")
    print("This might be expected if the prompt already exists.")

## 3. Use Prompt Template with Bedrock Converse API

In [None]:
# Send the registered customer-support prompt to Bedrock through the Converse API
# and record the prompt, parameters, response and token usage in one MLflow run.

with mlflow.start_run(run_name="basic_customer_support"):
    # Prefer the registry copy (version 1); if loading or formatting fails, fall
    # back to an equivalent hard-coded prompt so the cell can still proceed.
    try:
        prompt = mlflow.genai.load_prompt("customer-support-prompt", version=1)
        print(f"Loaded prompt: {prompt.name} (version {prompt.version})")

        template_values = {
            "question": "How do I return a defective product?",
            "product_info": "Electronics with 30-day return policy",
        }
        filled_prompt = prompt.format(**template_values)

    except Exception as e:
        print(f"Error loading prompt: {e}")
        filled_prompt = (
            "You are a helpful customer support agent. "
            "Customer question: How do I return a defective product?\n"
            "Product context: Electronics with 30-day return policy\n"
            "Please provide a helpful and professional response."
        )

    print(f"Filled prompt:\n{filled_prompt}\n")

    # Record the run configuration and the exact prompt text that is sent
    model_id = "us.anthropic.claude-sonnet-4-20250514-v1:0"
    mlflow.log_params({
        "model_id": model_id,
        "prompt_name": "customer-support-prompt",
        "max_tokens": 1000,
        "temperature": 0.7,
    })
    mlflow.log_text(filled_prompt, "input_prompt.txt")

    # Single user turn carrying the filled prompt
    user_turn = {"role": "user", "content": [{"text": filled_prompt}]}
    sampling = {"maxTokens": 1000, "temperature": 0.7, "topP": 0.9}

    try:
        response = bedrock_runtime.converse(
            modelId=model_id,
            messages=[user_turn],
            inferenceConfig=sampling,
        )

        # The reply text is read from the first content block of the output message
        response_text = response['output']['message']['content'][0]['text']
        usage = response.get('usage', {})
        stop_reason = response['stopReason']

        mlflow.log_text(response_text, "model_response.txt")
        mlflow.log_metrics({
            "response_length": len(response_text),
            "prompt_length": len(filled_prompt),
        })
        mlflow.log_param("stop_reason", stop_reason)

        # Token counts are logged only when the response carries a usage section
        if usage:
            mlflow.log_metrics({
                "input_tokens": usage.get("inputTokens", 0),
                "output_tokens": usage.get("outputTokens", 0),
                "total_tokens": usage.get("totalTokens", 0),
            })

        print(f"Model response:\n{response_text}")

    except Exception as e:
        # Keep the failure with the run as an artifact instead of aborting the cell
        error_msg = f"Error invoking Bedrock model: {e}"
        print(error_msg)
        mlflow.log_text(error_msg, "error_log.txt")

## 4. System Prompt Example

In this section we examine the different metrics that can be tracked in MLflow to evaluate prompt performance.

In [None]:
# Register the reusable system prompt for the teaching example.
# {{age_group}} is the only template variable and is filled in when the prompt is used.
teaching_template = (
    "You are an excellent teacher who explains complex concepts in simple, engaging ways. "
    "Always use analogies and examples that a {{age_group}} can understand."
)

try:
    system_prompt_template = mlflow.genai.register_prompt(
        name="teaching-system-prompt",
        template=teaching_template,
        commit_message="System prompt for teaching complex topics",
        tags={"type": "system_prompt", "domain": "education"},
    )
    print(f"✅ Registered system prompt: {system_prompt_template.name}")
except Exception as e:
    # Best effort: report the problem and carry on
    print(f"System prompt registration: {e}")

In [None]:
# Converse API call that pairs a registry-managed system prompt with a plain
# user message, tracked as its own MLflow run.
with mlflow.start_run(run_name="system_prompt_example"):
    user_message = "Explain machine learning in simple terms."

    # Build the system prompt from the registry (version 1); fall back to a
    # fixed string if it cannot be loaded or formatted.
    try:
        system_prompt_template = mlflow.genai.load_prompt("teaching-system-prompt", version=1)
        system_prompt = system_prompt_template.format(age_group="12-year-old")
    except Exception as e:
        print(f"Using fallback system prompt: {e}")
        system_prompt = "You are an excellent teacher who explains complex concepts simply."

    # Run configuration
    model_id = "us.anthropic.claude-3-5-sonnet-20241022-v2:0"
    mlflow.log_params({
        "model_id": model_id,
        "has_system_prompt": True,
        "system_prompt_name": "teaching-system-prompt",
    })

    # Store both prompt parts as artifacts of the run
    mlflow.log_text(system_prompt, "system_prompt.txt")
    mlflow.log_text(user_message, "user_message.txt")

    user_turn = {"role": "user", "content": [{"text": user_message}]}
    sampling = {"maxTokens": 1500, "temperature": 0.7, "topP": 0.9}

    try:
        # The system prompt goes in the dedicated `system` field, not in `messages`
        response = bedrock_runtime.converse(
            modelId=model_id,
            messages=[user_turn],
            system=[{"text": system_prompt}],
            inferenceConfig=sampling,
        )

        response_text = response['output']['message']['content'][0]['text']
        usage = response.get('usage', {})

        mlflow.log_text(response_text, "model_response.txt")
        mlflow.log_metric("response_length", len(response_text))

        # Token counts are logged only when the response carries a usage section
        if usage:
            mlflow.log_metrics({
                "input_tokens": usage.get("inputTokens", 0),
                "output_tokens": usage.get("outputTokens", 0),
                "total_tokens": usage.get("totalTokens", 0),
            })

        print(f"System Prompt Response:\n{response_text}")

    except Exception as e:
        # Keep the failure with the run as an artifact instead of aborting the cell
        error_msg = f"Error with system prompt: {e}"
        print(error_msg)
        mlflow.log_text(error_msg, "error_log.txt")

## 5. Search and Manage Prompts

In [None]:
# List every prompt in the registry together with details of its newest version.
print("Searching for registered prompts...")
try:
    prompts = mlflow.genai.search_prompts()
    print(f"Found {len(prompts)} registered prompts:")

    for registered_prompt in prompts:
        print(f"- {registered_prompt.name}")

        # description / tags are optional, so read them defensively
        description = getattr(registered_prompt, 'description', None)
        if description:
            print(f"  Description: {description}")
        tags = getattr(registered_prompt, 'tags', None)
        if tags:
            print(f"  Tags: {tags}")

        # Search results do not carry version details, so load the prompt itself.
        # The "@latest" alias resolves to the newest version; loading a pinned
        # version=1 here would always report 1 as the "latest" version.
        try:
            loaded_prompt = mlflow.genai.load_prompt(f"prompts:/{registered_prompt.name}@latest")
            print(f"  Latest version: {loaded_prompt.version}")
            print(f"  Template preview: {loaded_prompt.template[:100]}...")
        except Exception as load_error:
            print(f"  Could not load version info: {load_error}")
        print()

except Exception as e:
    print(f"Error searching prompts: {e}")

## 6. View Results

In [None]:
# Summarise every run recorded in the experiment.
import pandas as pd  # needed for NaN-aware checks on the runs table

# Get experiment information
experiment = mlflow.get_experiment_by_name(experiment_name)
print(f"Experiment ID: {experiment.experiment_id}")
print(f"Experiment Name: {experiment.name}")

# Get all runs from the experiment (returned as a pandas DataFrame)
runs = mlflow.search_runs(experiment_ids=[experiment.experiment_id])
print(f"\nTotal runs in experiment: {len(runs)}")

# Display summary of runs
if len(runs) > 0:
    print("\nRun Summary:")
    for idx, run in runs.iterrows():
        print(f"- {run['tags.mlflow.runName']}: {run['status']}")

        # A run that never logged a metric (for example one whose Bedrock call
        # failed) holds NaN in that column, not None. int(NaN) raises ValueError,
        # so test with pd.notna(); .get() also covers a column that is absent.
        total_tokens = run.get('metrics.total_tokens')
        if pd.notna(total_tokens):
            print(f"  Total tokens: {int(total_tokens)}")

        response_length = run.get('metrics.response_length')
        if pd.notna(response_length):
            print(f"  Response length: {int(response_length)} chars")
        print()

## 7. Compare Versions

Storing multiple versions of a prompt lets us compare them against the same model and then choose which version to use in our model invocations.

In [None]:
# Register an improved customer support template under the existing prompt name
print("Creating version 2 of customer support prompt...")

# More structured, empathy-focused template. It is assembled line by line so
# that the whitespace-only second line of the template stays explicit.
enhanced_prompt_template = "\n".join([
    "You are an expert customer support specialist with 10+ years of experience.",
    "    ",
    "Customer Information:",
    "- Question: {{question}}",
    "- Product Context: {{product_info}}",
    "- Customer Tier: {{customer_tier}}",
    "- Urgency Level: {{urgency}}",
    "",
    "Instructions:",
    "1. Acknowledge the customer's concern with empathy",
    "2. Provide a clear, step-by-step solution",
    "3. Offer additional resources or escalation if needed",
    "4. End with a professional closing and next steps",
    "",
    "Please provide a comprehensive, helpful response.",
])

v2_tags = {
    "use_case": "customer_support",
    "version": "v2",
    "enhancement": "structured_empathetic",
}

try:
    # Registering under the same name adds a new version to the existing prompt
    customer_support_v2 = mlflow.genai.register_prompt(
        name="customer-support-prompt",
        template=enhanced_prompt_template,
        commit_message="Enhanced customer support prompt with structured approach and empathy",
        tags=v2_tags,
    )

    print(f"✅ Created version 2: {customer_support_v2.name} (version {customer_support_v2.version})")
    print(f"Commit message: {customer_support_v2.commit_message}")

except Exception as e:
    print(f"Error creating version 2: {e}")

In [None]:
# Print a side-by-side summary of prompt versions 1 and 2
print("Comparing prompt versions...\n")


def _describe_version(version, label):
    """Load one version of the customer support prompt, print its summary, and return it."""
    loaded = mlflow.genai.load_prompt("customer-support-prompt", version=version)
    print(f"📋 VERSION {version} ({label}):")
    print(f"Template length: {len(loaded.template)} characters")
    print(f"Template preview: {loaded.template[:150]}...\n")
    return loaded


try:
    prompt_v1 = _describe_version(1, "Original")
    prompt_v2 = _describe_version(2, "Enhanced")

    v1_template = prompt_v1.template
    v2_template = prompt_v2.template

    # Variables are counted by occurrences of the opening "{{" marker
    print("📊 COMPARISON:")
    print(f"Length difference: {len(v2_template) - len(v1_template)} characters")
    print(f"V1 Variables: {v1_template.count('{{')} variables")
    print(f"V2 Variables: {v2_template.count('{{')} variables")

except Exception as e:
    print(f"Error comparing versions: {e}")

In [None]:
# A/B test both prompt versions with the same input.
# Each version gets its own MLflow run so the two can be compared in the UI.
import time

# Test scenario
test_scenario = {
    "question": "My laptop arrived damaged and I need a replacement urgently for work",
    "product_info": "Dell XPS 13 laptop, Premium warranty, ordered 3 days ago",
    "customer_tier": "Premium",
    "urgency": "High"
}

print("🧪 A/B Testing Prompt Versions...\n")
print(f"Test Scenario: {test_scenario['question']}\n")

model_id = "us.anthropic.claude-3-5-sonnet-20241022-v2:0"
# One shared config, so the prompt text is the only thing that differs between
# the two runs. It is also logged on each run to record the sampling settings.
inference_config = {"maxTokens": 1500, "temperature": 0.7, "topP": 0.9}
results = {}

# Test both versions
for version in [1, 2]:
    with mlflow.start_run(run_name=f"customer_support_v{version}_ab_test"):
        try:
            # Load prompt version
            prompt = mlflow.genai.load_prompt("customer-support-prompt", version=version)

            # Format prompt (V1 only declares question and product_info)
            if version == 1:
                filled_prompt = prompt.format(
                    question=test_scenario["question"],
                    product_info=test_scenario["product_info"]
                )
            else:
                filled_prompt = prompt.format(**test_scenario)

            # Log parameters
            mlflow.log_param("prompt_version", f"v{version}")
            mlflow.log_param("model_id", model_id)
            mlflow.log_param("test_type", "ab_test")
            mlflow.log_param("max_tokens", inference_config["maxTokens"])
            mlflow.log_param("temperature", inference_config["temperature"])
            mlflow.log_param("top_p", inference_config["topP"])
            mlflow.log_text(filled_prompt, f"prompt_v{version}.txt")

            # Call Bedrock. perf_counter() is a monotonic clock intended for
            # measuring elapsed time; time.time() can jump if the system clock
            # is adjusted during the call.
            start_time = time.perf_counter()
            response = bedrock_runtime.converse(
                modelId=model_id,
                messages=[{"role": "user", "content": [{"text": filled_prompt}]}],
                inferenceConfig=inference_config
            )
            response_time = time.perf_counter() - start_time

            # Extract results
            response_text = response['output']['message']['content'][0]['text']
            usage = response.get('usage', {})

            # Log metrics
            mlflow.log_text(response_text, f"response_v{version}.txt")
            mlflow.log_metric("response_time_seconds", response_time)
            mlflow.log_metric("response_length", len(response_text))
            mlflow.log_metric("response_word_count", len(response_text.split()))
            mlflow.log_metric("prompt_length", len(filled_prompt))

            if usage:
                mlflow.log_metric("total_tokens", usage.get("totalTokens", 0))
                mlflow.log_metric("input_tokens", usage.get("inputTokens", 0))
                mlflow.log_metric("output_tokens", usage.get("outputTokens", 0))

            # Store results for comparison
            results[f"v{version}"] = {
                "response_text": response_text,
                "response_time": response_time,
                "response_length": len(response_text),
                "word_count": len(response_text.split()),
                "total_tokens": usage.get("totalTokens", 0)
            }

            print(f"✅ VERSION {version} COMPLETED:")
            print(f"   Response: {len(response_text)} chars, {len(response_text.split())} words")
            print(f"   Time: {response_time:.2f}s, Tokens: {usage.get('totalTokens', 0)}")
            print(f"   Preview: {response_text[:200]}...\n")

        except Exception as e:
            # Keep going with the other version; store the failure on this run
            print(f"❌ Error testing V{version}: {e}")
            mlflow.log_text(str(e), f"error_v{version}.txt")

# Side-by-side comparison table, shown only when both versions produced a result
if len(results) == 2:
    print("📊 A/B TEST COMPARISON:")
    print(f"{'Metric':<20} {'V1':<12} {'V2':<12} {'Difference':<15}")
    print("-" * 60)

    v1, v2 = results["v1"], results["v2"]

    # Display label -> key in the per-version result dict
    comparison_rows = {
        "Response Length": "response_length",
        "Word Count": "word_count",
        "Total Tokens": "total_tokens",
        "Response Time": "response_time",
    }

    for label, field in comparison_rows.items():
        baseline, candidate = v1[field], v2[field]
        delta = candidate - baseline
        # Report 0% when the V1 value is not positive, which avoids dividing by zero
        if baseline > 0:
            pct_change = delta / baseline * 100
        else:
            pct_change = 0
        print(f"{label:<20} {baseline:<12.2f} {candidate:<12.2f} {delta:+.2f} ({pct_change:+.1f}%)")

    print("\n💡 Key Insights:")
    insights = []
    if v2["response_length"] > v1["response_length"]:
        insights.append("• V2 generates more detailed responses")
    if v2["total_tokens"] > v1["total_tokens"]:
        insights.append("• V2 uses more tokens (higher cost, potentially better quality)")
    insights.append("• Compare response quality manually to determine the winner")
    for insight in insights:
        print(insight)

## 8. Compare Models

This section evaluates how two different models perform with the same prompt.

In [None]:
# Simple model evaluation setup: the models to compare and the shared test prompt
import time
import pandas as pd

print("🔬 Setting up simple model evaluation...")

# Models to compare. "name" is used for display and to build the MLflow run name,
# so it must describe the model that "model_id" actually points to: the
# claude-3-5-sonnet id is Claude 3.5 Sonnet, not Claude 3 Sonnet.
models_to_compare = [
    {
        "name": "Claude 3.5 Sonnet",
        "model_id": "us.anthropic.claude-3-5-sonnet-20241022-v2:0",
        "generation": "3.5"
    },
    {
        "name": "Claude 4 Sonnet",
        "model_id": "us.anthropic.claude-sonnet-4-20250514-v1:0",
        "generation": "4"
    }
]

# Simple test prompt, sent unchanged to every model
test_prompt = """You are a helpful customer support agent.

Customer question: I'm having trouble with my subscription renewal. It keeps failing when I try to update my payment method.

Please provide a helpful solution with clear steps."""

print(f"✅ Configured {len(models_to_compare)} models for evaluation")
print(f"✅ Test prompt ready ({len(test_prompt)} characters)")

In [None]:
# Evaluate both models with the same prompt.
# Every model gets its own MLflow run; a compact summary per model is also
# collected in evaluation_results (an error entry is stored when a call fails).
print("🤖 Evaluating models...\n")

# Approximate price per 1,000 tokens used for the cost estimate.
# NOTE(review): the same rates are applied to every model - confirm against
# current Bedrock pricing before relying on the estimate.
INPUT_COST_PER_1K_USD = 0.003
OUTPUT_COST_PER_1K_USD = 0.015

evaluation_results = []

for model in models_to_compare:
    print(f"Testing {model['name']}...")

    # Create MLflow run for this model evaluation
    with mlflow.start_run(run_name=f"eval_{model['name'].replace(' ', '_').lower()}"):
        try:
            # Log model information
            mlflow.log_param("model_name", model["name"])
            mlflow.log_param("model_id", model["model_id"])
            mlflow.log_param("model_generation", model["generation"])
            mlflow.log_param("evaluation_type", "simple_model_comparison")

            # Log the test prompt
            mlflow.log_text(test_prompt, "test_prompt.txt")

            # Call Bedrock with timing. perf_counter() is a monotonic clock meant
            # for elapsed-time measurement, unlike wall-clock time.time().
            start_time = time.perf_counter()

            response = bedrock_runtime.converse(
                modelId=model["model_id"],
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {
                                "text": test_prompt
                            }
                        ]
                    }
                ],
                inferenceConfig={
                    "maxTokens": 1000,
                    "temperature": 0.7,
                    "topP": 0.9
                }
            )

            response_time = time.perf_counter() - start_time

            # Extract response details
            response_text = response['output']['message']['content'][0]['text']
            usage = response.get('usage', {})
            stop_reason = response['stopReason']

            # Calculate simple metrics
            word_count = len(response_text.split())
            char_count = len(response_text)
            # Rough sentence estimate: non-empty fragments between periods
            sentence_count = len([s for s in response_text.split('.') if s.strip()])

            # Log metrics to MLflow
            mlflow.log_metric("response_time_seconds", response_time)
            mlflow.log_metric("response_word_count", word_count)
            mlflow.log_metric("response_char_count", char_count)
            mlflow.log_metric("sentence_count", sentence_count)
            mlflow.log_param("stop_reason", stop_reason)

            # Defaults for responses without usage data. Binding the names here on
            # every iteration means the summary below can never pick up values
            # left over from a previous model or hit an unbound name.
            total_tokens = 0
            estimated_cost = 0

            # Log token usage if available
            if usage:
                input_tokens = usage.get("inputTokens", 0)
                output_tokens = usage.get("outputTokens", 0)
                total_tokens = usage.get("totalTokens", 0)

                mlflow.log_metric("input_tokens", input_tokens)
                mlflow.log_metric("output_tokens", output_tokens)
                mlflow.log_metric("total_tokens", total_tokens)

                # Simple cost estimate (approximate)
                estimated_cost = (
                    input_tokens * INPUT_COST_PER_1K_USD
                    + output_tokens * OUTPUT_COST_PER_1K_USD
                ) / 1000
                mlflow.log_metric("estimated_cost_usd", estimated_cost)

            # Log the full response
            mlflow.log_text(response_text, "model_response.txt")

            # Store results for comparison
            result = {
                "model_name": model["name"],
                "generation": model["generation"],
                "response_time": response_time,
                "word_count": word_count,
                "char_count": char_count,
                "total_tokens": total_tokens,
                "estimated_cost": estimated_cost,
                "response_preview": response_text[:150] + "..."
            }
            evaluation_results.append(result)

            print(f"  ✅ Response: {word_count} words, {response_time:.2f}s, {total_tokens} tokens")
            print(f"  Preview: {response_text[:100]}...\n")

        except Exception as e:
            error_msg = f"Error evaluating {model['name']}: {e}"
            print(f"  ❌ {error_msg}")
            mlflow.log_text(error_msg, "evaluation_error.txt")

            # Log error result so the failed model still appears in the summary
            result = {
                "model_name": model["name"],
                "generation": model["generation"],
                "error": str(e)
            }
            evaluation_results.append(result)

print(f"✅ Evaluation completed with {len(evaluation_results)} results")

In [None]:
# Display the URL of the tracking server this notebook logged to.
# The server name is taken from the ARN already in use (expected form:
# arn:aws:sagemaker:<region>:<account>:mlflow-tracking-server/<name>). Listing
# servers and taking the first "Created" one fails when none is listed and can
# pick the wrong server when the account has several.
sagemaker_client = boto3.client("sagemaker")
tracking_server_name = tracking_server_arn.rsplit("/", 1)[-1]

tracking_server_details = sagemaker_client.describe_mlflow_tracking_server(
    TrackingServerName=tracking_server_name
)
tracking_server_url = tracking_server_details['TrackingServerUrl']

print(tracking_server_url)

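Open the SageMaker MLflow UI to review the results: the runs logged by this notebook (parameters, metrics, and prompt/response artifacts) appear under the `bedrock-prompt-management` experiment, alongside the prompt versions registered above.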