# Autoscaling a SageMaker Endpoint

In [1]:
import boto3
import sagemaker
import pandas as pd

sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

sm = boto3.Session().client(service_name="sagemaker", region_name=region)
autoscale = boto3.Session().client(service_name="application-autoscaling", region_name=region)

In [2]:
%store -r tensorflow_endpoint_name

In [3]:
try:
    tensorflow_endpoint_name
    print("[OK]")
except NameError:
    print("+++++++++++++++++++++++++++++++")
    print("[ERROR] Please run the notebooks in the previous notebook before you continue.")
    print("+++++++++++++++++++++++++++++++")

[OK]


In [6]:
print(tensorflow_endpoint_name)

tensorflow-training-2024-02-18-22-26-04-112-pt-1708379512


# Copy the Model to the Notebook

In [7]:
autoscale.register_scalable_target(
    ServiceNamespace="sagemaker",
    ResourceId="endpoint/" + tensorflow_endpoint_name + "/variant/AllTraffic",
    ScalableDimension="sagemaker:variant:DesiredInstanceCount",
    MinCapacity=1,
    MaxCapacity=2,
    RoleARN=role,
    SuspendedState={
        "DynamicScalingInSuspended": False,
        "DynamicScalingOutSuspended": False,
        "ScheduledScalingSuspended": False,
    },
)

{'ScalableTargetARN': 'arn:aws:application-autoscaling:eu-west-2:123137613716:scalable-target/056m69254590eabb4de8963651868d2800c3',
 'ResponseMetadata': {'RequestId': '85cd7155-e935-4a83-b9dd-544cab6c9ac1',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '85cd7155-e935-4a83-b9dd-544cab6c9ac1',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '131',
   'date': 'Mon, 19 Feb 2024 23:03:04 GMT'},
  'RetryAttempts': 0}}

In [8]:
# check the target is available
autoscale.describe_scalable_targets(
    ServiceNamespace="sagemaker",
    MaxResults=100,
)

{'ScalableTargets': [{'ServiceNamespace': 'sagemaker',
   'ResourceId': 'endpoint/tensorflow-training-2024-02-18-22-26-04-112-pt-1708379512/variant/AllTraffic',
   'ScalableDimension': 'sagemaker:variant:DesiredInstanceCount',
   'MinCapacity': 1,
   'MaxCapacity': 2,
   'RoleARN': 'arn:aws:iam::123137613716:role/aws-service-role/sagemaker.application-autoscaling.amazonaws.com/AWSServiceRoleForApplicationAutoScaling_SageMakerEndpoint',
   'CreationTime': datetime.datetime(2024, 2, 19, 23, 3, 5, 598000, tzinfo=tzlocal()),
   'SuspendedState': {'DynamicScalingInSuspended': False,
    'DynamicScalingOutSuspended': False,
    'ScheduledScalingSuspended': False},
   'ScalableTargetARN': 'arn:aws:application-autoscaling:eu-west-2:123137613716:scalable-target/056m69254590eabb4de8963651868d2800c3'}],
 'ResponseMetadata': {'RequestId': '4245e3bc-c0ca-4431-bb5d-d45ab16080bb',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '4245e3bc-c0ca-4431-bb5d-d45ab16080bb',
   'content-type':

In [9]:
autoscale.put_scaling_policy(
    PolicyName="bert-reviews-autoscale-policy",
    ServiceNamespace="sagemaker",
    ResourceId="endpoint/" + tensorflow_endpoint_name + "/variant/AllTraffic",
    ScalableDimension="sagemaker:variant:DesiredInstanceCount",
    PolicyType="TargetTrackingScaling",
    TargetTrackingScalingPolicyConfiguration={
        "TargetValue": 2.0,
        "PredefinedMetricSpecification": {
            "PredefinedMetricType": "SageMakerVariantInvocationsPerInstance",
        },
        "ScaleOutCooldown": 60,
        "ScaleInCooldown": 300,
    },
)

{'PolicyARN': 'arn:aws:autoscaling:eu-west-2:123137613716:scalingPolicy:69254590-eabb-4de8-9636-51868d2800c3:resource/sagemaker/endpoint/tensorflow-training-2024-02-18-22-26-04-112-pt-1708379512/variant/AllTraffic:policyName/bert-reviews-autoscale-policy',
 'Alarms': [{'AlarmName': 'TargetTracking-endpoint/tensorflow-training-2024-02-18-22-26-04-112-pt-1708379512/variant/AllTraffic-AlarmHigh-6811cbd8-50e1-4c67-ac46-c5ed40a3c399',
   'AlarmARN': 'arn:aws:cloudwatch:eu-west-2:123137613716:alarm:TargetTracking-endpoint/tensorflow-training-2024-02-18-22-26-04-112-pt-1708379512/variant/AllTraffic-AlarmHigh-6811cbd8-50e1-4c67-ac46-c5ed40a3c399'},
  {'AlarmName': 'TargetTracking-endpoint/tensorflow-training-2024-02-18-22-26-04-112-pt-1708379512/variant/AllTraffic-AlarmLow-460c6f3b-01d1-4fab-a6ec-0c4ae07d528a',
   'AlarmARN': 'arn:aws:cloudwatch:eu-west-2:123137613716:alarm:TargetTracking-endpoint/tensorflow-training-2024-02-18-22-26-04-112-pt-1708379512/variant/AllTraffic-AlarmLow-460c6f3b-01

In [10]:
from IPython.core.display import display, HTML

display(
    HTML(
        '<b>Review <a target="blank" href="https://console.aws.amazon.com/sagemaker/home?region={}#/endpoints/{}">SageMaker REST Endpoint</a></b>'.format(
            region, tensorflow_endpoint_name
        )
    )
)

In [11]:
%%time

waiter = sm.get_waiter("endpoint_in_service")
waiter.wait(EndpointName=tensorflow_endpoint_name)

CPU times: user 24.6 ms, sys: 2.17 ms, total: 26.7 ms
Wall time: 171 ms


# Test the Deployed Model

In [12]:
import json
from sagemaker.tensorflow.model import TensorFlowPredictor
from sagemaker.serializers import JSONLinesSerializer
from sagemaker.deserializers import JSONLinesDeserializer

predictor = TensorFlowPredictor(
    endpoint_name=tensorflow_endpoint_name,
    sagemaker_session=sess,
    model_name="saved_model",
    model_version=0,
    content_type="application/jsonlines",
    accept_type="application/jsonlines",
    serializer=JSONLinesSerializer(),
    deserializer=JSONLinesDeserializer(),
)

content_type is a no-op in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


### Waiting for the Endpoint to be ready to Serve Predictions

In [13]:
import time

time.sleep(30)

# Run a Lot of Predictions and Watch the SageMaker Endpoint Scale Out

In [14]:
from IPython.core.display import display, HTML

display(
    HTML(
        '<b>Review <a target="blank" href="https://console.aws.amazon.com/sagemaker/home?region={}#/endpoints/{}">SageMaker REST Endpoint</a></b>'.format(
            region, tensorflow_endpoint_name
        )
    )
)

In [16]:
inputs = [{"features": ["This is great!"]}, {"features": ["This is bad."]}]
# reduced it from 100000 to 1000 to save compute cost here
for i in range(0, 1000):
    predicted_classes = predictor.predict(inputs)

    for predicted_class in predicted_classes:
        print("Predicted star_rating: {}".format(predicted_class))

Predicted star_rating: {'predicted_label': 1}
Predicted star_rating: {'predicted_label': 1}
Predicted star_rating: {'predicted_label': 1}
Predicted star_rating: {'predicted_label': 1}
Predicted star_rating: {'predicted_label': 1}
Predicted star_rating: {'predicted_label': 1}
Predicted star_rating: {'predicted_label': 1}
Predicted star_rating: {'predicted_label': 1}
Predicted star_rating: {'predicted_label': 1}
Predicted star_rating: {'predicted_label': 1}
Predicted star_rating: {'predicted_label': 1}
Predicted star_rating: {'predicted_label': 1}
Predicted star_rating: {'predicted_label': 1}
Predicted star_rating: {'predicted_label': 1}
Predicted star_rating: {'predicted_label': 1}
Predicted star_rating: {'predicted_label': 1}
Predicted star_rating: {'predicted_label': 1}
Predicted star_rating: {'predicted_label': 1}
Predicted star_rating: {'predicted_label': 1}
Predicted star_rating: {'predicted_label': 1}
Predicted star_rating: {'predicted_label': 1}
Predicted star_rating: {'predicted

In [17]:
autoscale.describe_scaling_activities(
    ServiceNamespace="sagemaker",
    ResourceId="endpoint/" + tensorflow_endpoint_name + "/variant/AllTraffic",
    ScalableDimension="sagemaker:variant:DesiredInstanceCount",
    MaxResults=100
)

{'ScalingActivities': [{'ActivityId': '7a5d8844-61c3-4f66-9c9b-178d691c1ad8',
   'ServiceNamespace': 'sagemaker',
   'ResourceId': 'endpoint/tensorflow-training-2024-02-18-22-26-04-112-pt-1708379512/variant/AllTraffic',
   'ScalableDimension': 'sagemaker:variant:DesiredInstanceCount',
   'Description': 'Setting desired instance count to 2.',
   'Cause': 'monitor alarm TargetTracking-endpoint/tensorflow-training-2024-02-18-22-26-04-112-pt-1708379512/variant/AllTraffic-AlarmHigh-6811cbd8-50e1-4c67-ac46-c5ed40a3c399 in state ALARM triggered policy bert-reviews-autoscale-policy',
   'StartTime': datetime.datetime(2024, 2, 19, 23, 11, 32, 636000, tzinfo=tzlocal()),
   'EndTime': datetime.datetime(2024, 2, 19, 23, 15, 32, 662000, tzinfo=tzlocal()),
   'StatusCode': 'Successful',
   'StatusMessage': 'Successfully set desired instance count to 2. Change successfully fulfilled by sagemaker.'}],
 'ResponseMetadata': {'RequestId': 'e26d1699-0b24-4d4b-8811-4060561c5d40',
  'HTTPStatusCode': 200,
 

# Delete Endpoint
To save cost, we should delete the endpoint.

In [None]:
# sm.delete_endpoint(
#      EndpointName=tensorflow_endpoint_name
# )

In [18]:
%%html

<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>
        
<script>
try {
    els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // NoOp
}    
</script>

In [19]:
%%javascript

try {
    Jupyter.notebook.save_checkpoint();
    Jupyter.notebook.session.delete();
}
catch(err) {
    // NoOp
}

<IPython.core.display.Javascript object>