# Amazon Nova Models on SageMaker Inference End-to-End Runbook


## 0. Update Credential

**Note**:
Update this section with your own AWS account ID, region, and credentials before running the rest of the notebook.

In [None]:
import boto3

# AWS Configuration - Update these for your environment
REGION = "us-east-1"  # Change to your preferred region
AWS_ACCOUNT_ID = "YOUR_ACCOUNT_ID"  # Replace with your AWS account ID

# CREDENTIAL SETUP OPTIONS:
# Option 1. AWS CLI: Run 'aws configure' and enter your access key, secret key, and region
# Option 2. Credentials file: Create ~/.aws/credentials with:
#    [default]
#    aws_access_key_id = YOUR_ACCESS_KEY
#    aws_secret_access_key = YOUR_SECRET_KEY
# Option 3. Environment variables: Set AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY

# Initialize AWS clients using the default credential chain.
# Both clients are pinned to REGION so the credential check does not depend on
# a default region being configured in the local AWS profile.
sagemaker = boto3.client('sagemaker', region_name=REGION)
sts = boto3.client('sts', region_name=REGION)

# Verify credentials by asking STS who we are. Any failure is reported and the
# cell finishes normally so the user can fix their setup and re-run it.
try:
    identity = sts.get_caller_identity()
    print(f"Successfully authenticated to AWS Account: {identity['Account']}")

    # A mismatch is only a warning: the credentials are valid, but they belong
    # to a different account than the one configured above.
    if identity['Account'] != AWS_ACCOUNT_ID:
        print(f"Warning: Connected to account {identity['Account']}, expected {AWS_ACCOUNT_ID}")

except Exception as e:
    print(f"Failed to authenticate: {e}")
    print("Please run 'aws configure' or set up your credentials.")

2025/12/02 02:56:05 Successfully refreshed aws credentials for default
✓ Fresh credentials loaded: 618100645563


### Create SageMaker Execution Role

In [None]:
import json

# Create (or reuse) the SageMaker execution role.
# The role name embeds the account ID configured in the credentials cell.
role_name = f"SageMakerInference-ExecutionRole-{AWS_ACCOUNT_ID}"

# Trust policy: allow the SageMaker service to assume this role
trust_policy = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Principal": {"Service": "sagemaker.amazonaws.com"},
            "Action": "sts:AssumeRole"
        }
    ]
}

iam = boto3.client('iam', region_name=REGION)

# Create the role. Re-running this cell must not fail just because the role
# was created by an earlier run, so an existing role is looked up and reused.
try:
    role_response = iam.create_role(
        RoleName=role_name,
        AssumeRolePolicyDocument=json.dumps(trust_policy),
        Description='SageMaker execution role with S3 and SageMaker access'
    )
    role_action = "Created"
except iam.exceptions.EntityAlreadyExistsException:
    role_response = iam.get_role(RoleName=role_name)
    role_action = "Reusing existing"

# Attach required policies (the same two policies on both the create and
# the reuse path).
# NOTE(review): both managed policies are very broad; consider scoping S3
# access down to the model-artifact bucket for production use.
for policy_arn in (
    'arn:aws:iam::aws:policy/AmazonSageMakerFullAccess',
    'arn:aws:iam::aws:policy/AmazonS3FullAccess',
):
    iam.attach_role_policy(RoleName=role_name, PolicyArn=policy_arn)

# Both create_role and get_role return the role under the 'Role' key
SAGEMAKER_EXECUTION_ROLE_ARN = role_response['Role']['Arn']
print(f"{role_action} SageMaker execution role: {SAGEMAKER_EXECUTION_ROLE_ARN}")

## 1. Configure **Static** Parameters + Initialize SageMaker Client

Update these variables according to your AWS environment and requirements.

In [None]:
# AWS Configuration
REGION = "us-east-1"

# ECR Account mapping by region
ECR_ACCOUNT_MAP = {
    "us-east-1": "708977205387",
    "us-west-2": "176779409107",
    "eu-west-2": "470633809225",
    "ap-northeast-1": "878185805882"
}

# Fail early with an actionable message instead of a bare KeyError when the
# chosen region has no entry in the mapping above.
if REGION not in ECR_ACCOUNT_MAP:
    raise ValueError(
        f"Region {REGION!r} has no entry in ECR_ACCOUNT_MAP; "
        f"supported regions: {', '.join(sorted(ECR_ACCOUNT_MAP))}"
    )

# Container Image - Replace with the image URI provided by your AWS contact
# Two image tags are available (both point to the same image):
IMAGE_LATEST = f"{ECR_ACCOUNT_MAP[REGION]}.dkr.ecr.{REGION}.amazonaws.com/nova-inference-repo:SM-Inference-latest"
IMAGE_VERSIONED = f"{ECR_ACCOUNT_MAP[REGION]}.dkr.ecr.{REGION}.amazonaws.com/nova-inference-repo:v1.0.0"

# Use the versioned tag for production deployments (recommended)
IMAGE = IMAGE_VERSIONED
print(f"IMAGE = {IMAGE}")
print("Available tags:")
print(f"  Latest: {IMAGE_LATEST}")
print(f"  Versioned: {IMAGE_VERSIONED}")

# Model Parameters (passed to the container as strings)
CONTEXT_LENGTH = "8000"        # Maximum total context length
MAX_CONCURRENCY = "16"         # Maximum concurrent sequences

# Optional: default generation config.
# Set any of these to None to leave it out of the container environment.
# (Checking `globals()` is unreliable in a notebook: a name defined by an
# earlier run of the cell stays defined even after its line is commented out.)
DEFAULT_TEMPERATURE = "0.0"
DEFAULT_TOP_P = "1.0"
DEFAULT_TOP_K = "-1"
DEFAULT_MAX_NEW_TOKENS = "4096"
DEFAULT_LOGPROBS = "5"


def build_environment(required, optional):
    """Return the container environment as a new dict.

    Args:
        required (dict): Settings that are always included.
        optional (dict): Settings included only when their value is not None.

    Returns:
        dict: ``required`` followed by the non-None entries of ``optional``,
        in their original order.
    """
    merged = dict(required)
    merged.update({key: value for key, value in optional.items() if value is not None})
    return merged


# Build environment variables: required settings plus whichever optional
# defaults are set.
environment = build_environment(
    {
        'CONTEXT_LENGTH': CONTEXT_LENGTH,
        'MAX_CONCURRENCY': MAX_CONCURRENCY,
    },
    {
        'DEFAULT_TEMPERATURE': DEFAULT_TEMPERATURE,
        'DEFAULT_TOP_P': DEFAULT_TOP_P,
        'DEFAULT_TOP_K': DEFAULT_TOP_K,
        'DEFAULT_MAX_NEW_TOKENS': DEFAULT_MAX_NEW_TOKENS,
        'DEFAULT_LOGPROBS': DEFAULT_LOGPROBS,
    },
)

print("Environment configuration:")
for key, value in environment.items():
    print(f"  {key}: {value}")

# (Re)create the SageMaker client for REGION; the cells below use it to create
# the model, endpoint config, and endpoint.
sagemaker = boto3.client('sagemaker', region_name=REGION)

## 2. Configure **Dynamic** Parameters for Test Cases


In [None]:
# Prefix shared by the model, endpoint config, and endpoint names.
JOB_NAME = "Customer-Job-Name"

# S3 URI of the training output artifact - replace with your own.
MODEL_S3_LOCATION = "S3_URI_OF_YOUR_MODEL_ARTIFACTS" # must end with /

# Which model size to deploy and on which instance type.
TESTCASE = {"model": "micro", "instance": "ml.g5.12xlarge"}
INSTANCE_TYPE = TESTCASE["instance"]

# Resource names: <job>-<model>-<instance type, with dots turned into dashes>
MODEL_NAME = "-".join(
    [JOB_NAME, TESTCASE["model"], INSTANCE_TYPE.replace(".", "-")]
)
ENDPOINT_CONFIG_NAME = f"{MODEL_NAME}-Config"
ENDPOINT_NAME = f"{MODEL_NAME}-Endpoint"

print(f"Model Name: {MODEL_NAME}")
print(f"Endpoint Config: {ENDPOINT_CONFIG_NAME}")
print(f"Endpoint Name: {ENDPOINT_NAME}")

## 3. Create SageMaker Model + Endpoint

### 3.1 Create Model and Endpoint Config

In [None]:
# Step 1 - create_model: register the container image plus the S3 model
# artifacts under MODEL_NAME. Failures are printed, not raised.
try:
    # Artifacts are read as an uncompressed S3 prefix
    model_data_source = {
        'S3DataSource': {
            'S3Uri': MODEL_S3_LOCATION,
            'S3DataType': 'S3Prefix',
            'CompressionType': 'None',
        },
    }
    primary_container = {
        'Image': IMAGE,
        'ModelDataSource': model_data_source,
        'Environment': environment,
    }
    model_response = sagemaker.create_model(
        ModelName=MODEL_NAME,
        PrimaryContainer=primary_container,
        ExecutionRoleArn=SAGEMAKER_EXECUTION_ROLE_ARN,
        EnableNetworkIsolation=True,
    )
    print("Model created successfully!")
    print(f"Model ARN: {model_response['ModelArn']}")
except Exception as err:
    print(f"Error creating model: {err}")

# Step 2 - create_endpoint_config: a single 'primary' variant serving
# MODEL_NAME on one INSTANCE_TYPE instance. Failures are printed, not raised.
try:
    production_variant = dict(
        VariantName='primary',
        ModelName=MODEL_NAME,
        InitialInstanceCount=1,
        InstanceType=INSTANCE_TYPE,
    )
    config_response = sagemaker.create_endpoint_config(
        EndpointConfigName=ENDPOINT_CONFIG_NAME,
        ProductionVariants=[production_variant],
    )
    print("Endpoint configuration created successfully!")
    print(f"Config ARN: {config_response['EndpointConfigArn']}")
except Exception as err:
    print(f"Error creating endpoint config: {err}")

### 3.2 Create Endpoint


In [None]:
import time

# Polling limits: give up after MAX_WAIT_SECONDS so the cell cannot spin
# forever if the endpoint never reaches InService or Failed.
MAX_WAIT_SECONDS = 60 * 60
POLL_INTERVAL_SECONDS = 30  # Check every 30 seconds

# create_endpoint: start provisioning. Errors are printed, not raised, and
# polling below still runs.
try:
    endpoint_response = sagemaker.create_endpoint(
        EndpointName=ENDPOINT_NAME,
        EndpointConfigName=ENDPOINT_CONFIG_NAME
    )
    print("Endpoint creation initiated successfully!")
    print(f"Endpoint ARN: {endpoint_response['EndpointArn']}")
except Exception as e:
    print(f"Error creating endpoint: {e}")

# Keep polling for endpoint status, may take 15-30 minutes
print("Waiting for endpoint creation to complete...")
print("This typically takes 15-30 minutes...\n")

deadline = time.monotonic() + MAX_WAIT_SECONDS
while time.monotonic() < deadline:
    try:
        response = sagemaker.describe_endpoint(EndpointName=ENDPOINT_NAME)
        status = response['EndpointStatus']

        if status == 'Creating':
            print(f"⏳ Status: {status} - Provisioning infrastructure and loading model...")
        elif status == 'InService':
            print(f"✅ Status: {status}")
            print("\nEndpoint creation completed successfully!")
            print(f"Endpoint Name: {ENDPOINT_NAME}")
            print(f"Endpoint ARN: {response['EndpointArn']}")
            break
        elif status == 'Failed':
            print(f"❌ Status: {status}")
            print(f"Failure Reason: {response.get('FailureReason', 'Unknown')}")
            print("\nFull response:")
            print(response)
            break
        else:
            # Any other status: report it and keep polling until the deadline
            print(f"Status: {status}")

    except Exception as e:
        # describe_endpoint failed (e.g. the endpoint does not exist): stop polling
        print(f"Error checking endpoint status: {e}")
        break

    time.sleep(POLL_INTERVAL_SECONDS)
else:
    # Reached only when the deadline passes without a break above
    print(f"❌ Timed out after {MAX_WAIT_SECONDS // 60} minutes waiting for endpoint "
          f"{ENDPOINT_NAME}; last status: {status}")

## 4. Invoke Inference

In [None]:
import json
import botocore
from botocore.exceptions import ClientError

# Client-side limits for inference calls: 10 s to connect, 120 s to read a
# response, and a retry setting of max_attempts=3.
config = botocore.config.Config(
    connect_timeout=10,
    read_timeout=120,
    retries={'max_attempts': 3},
)

# Runtime client used by the invocation cells below
runtime_client = boto3.client(
    'sagemaker-runtime',
    region_name=REGION,
    config=config,
)

def invoke_nova_endpoint(request_body):
    """
    Invoke Nova endpoint with automatic streaming detection.

    The request is sent to ENDPOINT_NAME through the module-level
    ``runtime_client``. When ``request_body["stream"]`` is truthy the
    streaming API is used and every event/chunk is printed as it arrives;
    otherwise the full JSON response is read, printed, and returned.

    Errors are not raised to the caller: they are printed and the function
    returns None.

    Args:
        request_body (dict): Request payload containing prompt and parameters

    Returns:
        dict: Response from the model (for non-streaming requests)
        None: For streaming requests (prints output directly), or when the
            invocation failed
    """
    import codecs  # local import: only needed for streaming chunk decoding

    body = json.dumps(request_body)
    is_streaming = request_body.get("stream", False)

    try:
        print(f"Invoking endpoint ({'streaming' if is_streaming else 'non-streaming'})...")

        if is_streaming:
            response = runtime_client.invoke_endpoint_with_response_stream(
                EndpointName=ENDPOINT_NAME,
                ContentType='application/json',
                Body=body
            )

            # Chunk boundaries need not fall on character boundaries, so a
            # multi-byte UTF-8 character can be split across two chunks.
            # An incremental decoder buffers the partial bytes instead of
            # raising UnicodeDecodeError as a per-chunk .decode() would.
            decoder = codecs.getincrementaldecoder('utf-8')()

            event_stream = response['Body']
            for event in event_stream:
                print("Event:", event)
                if 'PayloadPart' in event:
                    chunk = event['PayloadPart']
                    if 'Bytes' in chunk:
                        data = decoder.decode(chunk['Bytes'])
                        print("Chunk: ", data)

            # Flush: raises if the stream ended in the middle of a character
            # (reported by the generic handler below).
            tail = decoder.decode(b'', final=True)
            if tail:
                print("Chunk: ", tail)
            return None

        response = runtime_client.invoke_endpoint(
            EndpointName=ENDPOINT_NAME,
            ContentType='application/json',
            Accept='application/json',
            Body=body
        )

        response_body = response['Body'].read().decode('utf-8')
        result = json.loads(response_body)
        print("Response: ", result)
        return result

    except ClientError as e:
        print(f"❌ Error: {e.response['Error']['Code']} - {e.response['Error']['Message']}")
    except Exception as e:
        print(f"❌ Unexpected error: {e}")
    return None

### Test both Streaming & Non-Streaming

In [None]:
# Four smoke-test payloads (chat/completion x non-streaming/streaming) are
# defined first, then sent to the endpoint in order with a divider in between.

# Test 1 payload: non-streaming chat
chat_request = {
    "messages": [
        {"role": "user", "content": "Hello! How are you?"}
    ],
    "max_tokens": 100,
    "max_completion_tokens": 100,
    "stream": False,
    "temperature": 0.7,
    "top_p": 0.9,
    "top_k": 50,
    "logprobs": True,
    "top_logprobs": 3,
    # "reasoning_effort": "low",  # Options: "low", "high"
    "allowed_token_ids": None,  # List of allowed token IDs
    "truncate_prompt_tokens": None,  # Truncate prompt to this many tokens
    "stream_options": None
}

# Test 2 payload: non-streaming completion
completion_request = {
    "prompt": "The capital of France is",
    "max_tokens": 50,
    "stream": False,
    "temperature": 0.0,
    "top_p": 1.0,
    "top_k": -1,  # -1 means no limit
    "logprobs": 3,  # Number of log probabilities to return
    "allowed_token_ids": None,  # List of allowed token IDs
    "truncate_prompt_tokens": None,  # Truncate prompt to this many tokens
    "stream_options": None
}

# Test 3 payload: streaming chat
streaming_chat_request = {
    "messages": [
        {"role": "user", "content": "Tell me a short story about a robot"}
    ],
    "max_tokens": 200,
    "stream": True,
    "temperature": 0.7,
    "top_p": 0.95,
    "top_k": 40,
    "logprobs": True,
    "top_logprobs": 2,
    # "reasoning_effort": "high",  # Options: "low", "high"
    "stream_options": {"include_usage": True}
}

# Test 4 payload: streaming completion
streaming_completion_request = {
    "prompt": "The capital of France is",
    "max_tokens": 50,
    "stream": True,
    "temperature": 0.0,
    "top_p": 1.0,
    "top_k": -1,  # -1 means no limit
    "logprobs": 3,  # Number of log probabilities to return
    "allowed_token_ids": None,  # List of allowed token IDs
    "truncate_prompt_tokens": None,  # Truncate prompt to this many tokens
    "stream_options": {"include_usage": True}
}

# Visual divider printed between consecutive test runs
divider = "\n" + "="*50 + "\n"

# Non-streaming runs return the parsed response; keep it in `response`
response = invoke_nova_endpoint(chat_request)
print(divider)

response = invoke_nova_endpoint(completion_request)
print(divider)

# Streaming runs print their output as it arrives and return nothing useful
invoke_nova_endpoint(streaming_chat_request)
print(divider)

invoke_nova_endpoint(streaming_completion_request)

## List ALL existing resources

In [None]:
# List every model, endpoint config, and endpoint in the region.
# The list_* calls return a single page of results, so paginators are used
# here to walk through all pages instead of printing only the first one.
for resource_label, list_operation, result_key in (
    ("Models", "list_models", "Models"),
    ("Endpoint configs", "list_endpoint_configs", "EndpointConfigs"),
    ("Endpoints", "list_endpoints", "Endpoints"),
):
    resource_pages = sagemaker.get_paginator(list_operation).paginate()
    summaries = [
        summary
        for resource_page in resource_pages
        for summary in resource_page[result_key]
    ]
    print(f"{resource_label} ({len(summaries)}):")
    for summary in summaries:
        print(f"  {summary}")