# SageMaker Example

## 2. Build the container

The demo code is in `app/`.
Build and push the Docker image with the following commands:

In [None]:
!pip install  -U sagemaker boto3

In [None]:
# Run the local build_and_push_sglang.sh script with bash.
# Presumably it builds the serving image from `app/` and pushes it to ECR — confirm in the script.
!bash build_and_push_sglang.sh

## 3. Deploy on SageMaker

Define the model and deploy it on SageMaker.


### 3.1 Init SageMaker session

In [None]:
# !pip install boto3 sagemaker transformers
import re
import json
import os,dotenv
import boto3
import sagemaker
from sagemaker import Model


# Load key=value pairs from a local .env file (if one exists) into os.environ.
dotenv.load_dotenv()
# Deliberately do NOT print os.environ here: it can hold AWS credentials and other
# secrets, and anything printed is persisted in the saved notebook output.

# boto3 session pinned to us-east-1; the SageMaker session below is built on top of it.
boto_sess = boto3.Session(
    region_name='us-east-1'
)

sess = sagemaker.session.Session(boto_session=boto_sess)

# The execution role ARN is read from the `role` environment variable (e.g. set in .env).
# Alternative when an execution role is attached to the environment:
# role = sagemaker.get_execution_role()
role = os.environ.get('role')
if not role:
    # Fail here with a clear message instead of passing None to later API calls.
    raise RuntimeError(
        "Environment variable 'role' is not set; define it (for example in .env) "
        "with the ARN of the SageMaker execution role."
    )

In [None]:
# role

### 3.2 Prepare model file

#### Option 2: deploy vllm by model_id

In [None]:
# Pack the local model_tar/ directory into the gzip-compressed archive model.tar.gz
# (verbose output lists every file that is added).
!tar czvf model.tar.gz model_tar/

In [None]:


# Key prefix under which the archive is stored in the session's default bucket.
s3_code_prefix = "sagemaker_endpoint/sglang/"
bucket = sess.default_bucket()

# Upload the archive; the value returned by upload_data is kept for use as the
# model data location later in the notebook.
code_artifact = sess.upload_data(
    path="model.tar.gz",
    bucket=bucket,
    key_prefix=s3_code_prefix,
)
print(f"S3 Code or Model tar ball uploaded to --- > {code_artifact}")

### 3.3 Deploy model

### test deployment from s3

In [None]:
import sagemaker
import boto3
# Create the SageMaker control-plane client from the boto3 session behind `sess`, so it
# targets the region configured there (us-east-1) instead of the ambient default region.
sm_client = sess.boto_session.client("sagemaker")


### Create a SageMaker model



[llama3-8b-scale-to-zero-autoscaling](https://github.com/aws-samples/sagemaker-genai-hosting-examples/blob/main/scale-to-zero-endpoint/llama3-8b-scale-to-zero-autoscaling.ipynb)

In [None]:

# ECR image used for serving (SGLang build). A vLLM image is kept as a commented alternative.
CONTAINER = '434444145045.dkr.ecr.us-east-1.amazonaws.com/sagemaker_endpoint/sglang:v0.4.3.post2-cu124'
# CONTAINER='434444145045.dkr.ecr.us-east-1.amazonaws.com/sagemaker_endpoint/vllm:v0.7.2'

# S3 location of the merged fine-tuned weights and the Hugging Face id of the base model.
model_path = "s3://sagemaker-us-east-1-434444145045/Qwen2-1-5B-Instruct/6d0410c634ea438fa5018072e84c10a6/finetuned_model_merged/"
model_id = 'Qwen/Qwen2-1.5B-Instruct'

# One generated base name shared by the model, endpoint, component and endpoint config,
# so the four resources of a single run are easy to match up.
base_name = sagemaker.utils.name_from_base("sagemaker")
model_name = base_name + "-model"
endpoint_name = base_name + "-endpoint"
component_name = base_name + "-component"
endpoint_config_name = base_name + "-config"

# Build the control-plane client from the session behind `sess` so it uses the same
# region (us-east-1) as the image and bucket above, not the ambient default region.
sm_client = sess.boto_session.client("sagemaker")

# Environment variables handed to the container.
# Presumably the serving code in the image reads these to locate the model — confirm in `app/`.
env = {
    "HF_MODEL_ID": model_id,
    "S3_MODEL_PATH": model_path,
}

# Container definition passed as PrimaryContainer when the model is created.
container_config = {
    'Image': CONTAINER,
    'ModelDataUrl': code_artifact,
    'Environment': env
}

print(model_name)
print(endpoint_name)
print(component_name)
print(endpoint_config_name)

In [None]:
# endpoint_name = "sagemaker-2025-03-01-11-30-13-897-endpoint"
# component_name = "sagemaker-2025-03-01-11-30-13-897-component"

In [None]:

# Register the container definition as a SageMaker model that runs under `role`.
response = sm_client.create_model(
    PrimaryContainer=container_config,
    ExecutionRoleArn=role,
    ModelName=model_name,
)

model_arn = response['ModelArn']
print(f"Model created: {model_arn}")

### Create a SageMaker endpoint configuration

In [None]:
# Instance type for the endpoint and the two timeouts (in seconds) applied to the variant.
instance_type = "ml.g5.2xlarge"
model_data_download_timeout_in_seconds = 1200
container_startup_health_check_timeout_in_seconds = 1200

# Bounds for managed instance scaling.
min_instance_count = 0 # Minimum instance must be set to 0
max_instance_count = 2

managed_instance_scaling = {
    "Status": "ENABLED",
    "MinInstanceCount": min_instance_count,
    "MaxInstanceCount": max_instance_count,
}

# The single production variant of this endpoint; inference components attach to it by name.
all_traffic_variant = {
    "VariantName": "AllTraffic",
    "InstanceType": instance_type,
    "InitialInstanceCount": 1,
    "ModelDataDownloadTimeoutInSeconds": model_data_download_timeout_in_seconds,
    "ContainerStartupHealthCheckTimeoutInSeconds": container_startup_health_check_timeout_in_seconds,
    "ManagedInstanceScaling": managed_instance_scaling,
    "RoutingConfig": {"RoutingStrategy": "LEAST_OUTSTANDING_REQUESTS"},
}

sm_client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ExecutionRoleArn=role,
    ProductionVariants=[all_traffic_variant],
)

### Create the SageMaker endpoint



In [None]:
# Create the endpoint from the endpoint configuration named by endpoint_config_name.
# The next cell polls describe_endpoint until the endpoint leaves the "Creating" state.
sm_client.create_endpoint(
    EndpointName=endpoint_name,
    EndpointConfigName=endpoint_config_name,
)

In [None]:
import time
# Poll the endpoint once a minute until it is no longer in the "Creating" state.
resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
status = resp["EndpointStatus"]
print("Status: " + status)

while status == "Creating":
    time.sleep(60)
    resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
    status = resp["EndpointStatus"]
    print("Status: " + status)

print("Arn: " + resp["EndpointArn"])
print("Status: " + status)

# Leaving "Creating" does not mean success: stop here if the endpoint did not come up,
# rather than letting the following cells fail against a broken endpoint.
if status != "InService":
    failure_reason = resp.get("FailureReason", "no failure reason reported")
    raise RuntimeError(
        f"Endpoint {endpoint_name} ended in status {status}: {failure_reason}"
    )

In [None]:
# Start the timer that is used later to report how long the component took to deploy.
t1 = time.time()

# Resources requested for each copy of the model: one accelerator device and 8192 MB of memory.
compute_requirements = {
    "NumberOfAcceleratorDevicesRequired": 1,
    # "NumberOfCpuCoresRequired": 2,
    "MinMemoryRequiredInMb": 8 * 1024,
}

# Attach the model to the endpoint's "AllTraffic" variant as an inference component,
# starting with a single copy.
sm_client.create_inference_component(
    InferenceComponentName=component_name,
    EndpointName=endpoint_name,
    VariantName="AllTraffic",
    Specification={
        "ModelName": model_name,
        "ComputeResourceRequirements": compute_requirements,
    },
    RuntimeConfig={"CopyCount": 1},
)

## 4. Test

You can invoke your model with the SageMaker SDK.

### 4.1 Message api non-stream mode

In [None]:
def check_inference_component_status(inference_component_name, sagemaker_client=None):
    """Describe an inference component and report whether it is in service.

    Args:
        inference_component_name: Name of the inference component to describe.
        sagemaker_client: Optional client object exposing
            ``describe_inference_component``. When omitted, a client is created
            with ``boto3.client('sagemaker')``.

    Returns:
        True when the status is ``InService``; False while it is ``Creating``
        or ``Updating`` (the caller should keep polling).

    Raises:
        RuntimeError: If the component reports any other status. The message
            includes ``FailureReason`` when the response carries one. Failures
            are raised instead of returned as True so that a polling loop cannot
            mistake a failed deployment for a ready one.

    Errors raised by the describe call itself are not swallowed; they propagate
    to the caller.
    """
    if sagemaker_client is None:
        sagemaker_client = boto3.client('sagemaker')

    response = sagemaker_client.describe_inference_component(
        InferenceComponentName=inference_component_name
    )
    status = response['InferenceComponentStatus']
    print(f"Inference Component Status: {status}")

    if status == 'InService':
        print("Inference component has been successfully deployed")
        return True
    if status in ('Creating', 'Updating'):
        print("Inference component is still being deployed...")
        return False

    failure_reason = response.get('FailureReason', 'no failure reason reported')
    raise RuntimeError(
        f"Inference component deployment failed with status: {status} "
        f"(failure reason: {failure_reason})"
    )
    
# Poll every 30 seconds until the status check returns True, then report the time
# elapsed since t1 was set.
while not check_inference_component_status(component_name):
    time.sleep(30)
print(f"Inference Component is ready:{time.time()-t1:.1f} s")
    
from sagemaker.predictor import retrieve_default 
from sagemaker import Predictor
from sagemaker import serializers, deserializers
# predictor = retrieve_default(endpoint_name, sagemaker_session=sess) 

# Predictor bound to the inference component on the endpoint; requests are serialized as JSON.
predictor = Predictor(
    endpoint_name=endpoint_name,
    component_name=component_name,
    sagemaker_session=sess,
    serializer=serializers.JSONSerializer(),
)

# Single-turn chat request in non-streaming mode.
user_message = {"role": "user", "content": "who are you"}
payload = {
    "messages": [user_message],
    "model": "qwen",
    "max_tokens": 1024,
    "stream": False,
}

response = predictor.predict(payload)
print(response)

In [None]:
# component_name = "sagemaker-2025-03-01-00-39-49-809-component"
# endpoint_name = "sagemaker-2025-03-01-00-39-49-809-endpoint"
# SageMaker runtime client for us-east-1, used to invoke the endpoint directly through boto3.
runtime = boto3.client('runtime.sagemaker', region_name='us-east-1')

# Same single-turn, non-streaming chat request as in the Predictor example.
payload = {
    "messages": [{"role": "user", "content": "who are you"}],
    "model": "qwen",
    "max_tokens": 1024,
    "stream": False,
}

response = runtime.invoke_endpoint(
    EndpointName=endpoint_name,
    InferenceComponentName=component_name,
    ContentType='application/json',
    Body=json.dumps(payload),
)

# Read and parse the JSON response body, then show only the assistant's reply text.
completion = json.loads(response['Body'].read())
print(completion["choices"][0]["message"]["content"])

### 4.2 Message api stream mode

In [None]:
def print_stream_deltas(event_stream):
    """Print the text deltas of a streamed chat completion as they arrive.

    Args:
        event_stream: Iterable of events shaped like
            ``{"PayloadPart": {"Bytes": b"..."}}``, such as ``response['Body']``
            of the streaming invocation in this cell.

    The payload bytes are treated as server-sent events: ``data: <json>`` lines,
    each event terminated by a blank line. Every complete event in the buffer is
    consumed on each chunk, so several events arriving in one chunk are all
    printed and nothing is left behind when the stream ends.
    """
    # Buffer raw bytes and decode only complete events; a chunk boundary may fall in
    # the middle of a multi-byte UTF-8 character, so decoding each chunk could fail.
    pending = b""
    for event in event_stream:
        pending += event["PayloadPart"]["Bytes"]
        while b"\n\n" in pending:
            raw_event, pending = pending.split(b"\n\n", 1)
            for line in raw_event.decode().splitlines():
                if not line.startswith("data:"):
                    continue
                data_text = line[len("data:"):].strip()
                # Skip empty payloads and the "[DONE]" end-of-stream marker (not JSON).
                if not data_text or data_text == "[DONE]":
                    continue
                try:
                    data = json.loads(data_text)
                    content = data["choices"][0]["delta"]["content"]
                except (json.JSONDecodeError, KeyError, IndexError):
                    # Malformed events or events without a text delta are ignored.
                    continue
                # A delta may carry empty or null content; print real text only.
                if content:
                    print(content, end="")


# Single-turn chat request with streaming enabled.
payload = {
    "messages": [
    {
        "role": "user",
        "content": "Write a quick sort in python"
    }
    ],
    "model":"custome",
    "max_tokens": 4096,
    "stream": True
}

response = runtime.invoke_endpoint_with_response_stream(
    EndpointName=endpoint_name,
    InferenceComponentName=component_name,
    ContentType='application/json',
    Body=json.dumps(payload)
)

print_stream_deltas(response['Body'])

### 4.3 Register a new autoscaling target
After you create your SageMaker endpoint and inference components, you register a new auto scaling target for Application Auto Scaling. In the following code block, you set MinCapacity to 0, which is required for your endpoint to scale down to zero

https://aws.amazon.com/cn/blogs/machine-learning/unlock-cost-savings-with-the-new-scale-down-to-zero-feature-in-amazon-sagemaker-inference/

In [None]:
# Clients for Application Auto Scaling and CloudWatch, both created from the boto3
# session behind `sess` so they share its region and credentials.
aas_client = sess.boto_session.client("application-autoscaling")
cloudwatch_client = sess.boto_session.client("cloudwatch")

- This step is required: register the `resource_id` as a scalable target first.

In [None]:

# Autoscaling parameters: the scalable resource is the inference component, and the
# dimension being scaled is its desired copy count.
resource_id = f"inference-component/{component_name}"
service_namespace = "sagemaker"
scalable_dimension = "sagemaker:inference-component:DesiredCopyCount"

# Bounds for the number of model copies; a minimum of 0 is what allows scaling to zero.
min_copy_count = 0
max_copy_count = 4

# Register the inference component as a scalable target with those bounds.
aas_client.register_scalable_target(
    ServiceNamespace=service_namespace,
    ResourceId=resource_id,
    ScalableDimension=scalable_dimension,
    MinCapacity=min_copy_count,
    MaxCapacity=max_copy_count,
)

- After you have registered your new scalable target, the next step is to define your target tracking policy. In the following code example, we set the TargetValue to 5. This setting instructs the auto scaling system to increase capacity when the number of concurrent requests per model reaches or exceeds 5.

In [None]:
# Look up the scalable target for this inference component.
# NOTE(review): the result is neither assigned nor printed, and this is not the last
# expression of the cell, so nothing is displayed — confirm whether that is intended.
aas_client.describe_scalable_targets(
    ServiceNamespace=service_namespace,
    ResourceIds=[resource_id],
    ScalableDimension=scalable_dimension,
)

# The policy name for the target tracking policy
target_tracking_policy_name = f"Target-tracking-policy-{component_name}"

# Create (or update) a target tracking policy on the concurrent-requests-per-copy metric.
aas_client.put_scaling_policy(
    PolicyName=target_tracking_policy_name,
    PolicyType="TargetTrackingScaling",
    ServiceNamespace=service_namespace,
    ResourceId=resource_id,
    ScalableDimension=scalable_dimension,
    TargetTrackingScalingPolicyConfiguration={
        "PredefinedMetricSpecification": {
            "PredefinedMetricType": "SageMakerInferenceComponentConcurrentRequestsPerCopyHighResolution",
        },
        # Low TPS + load TPS
        "TargetValue": 5,  # you need to adjust this value based on your use case
        "ScaleInCooldown": 180,  # default 300
        "ScaleOutCooldown": 180,  # default 300
    },
)

### Scale out from zero policy (step scaling policy )
To enable your endpoint to scale out from zero instances, do the following:

Configure Step Scaling Policy
Create a step scaling policy that defines when and how to scale out from zero. This policy will add 1 model copy when triggered, enabling SageMaker to provision the instances required to handle incoming requests after being idle. The following shows you how to define a step scaling policy. Here we have configured to scale out from 0 to 1 model copy ("ScalingAdjustment": 1), depending on your use case you can adjust ScalingAdjustment as required.


In [None]:
# Autoscaling parameters (same values as in the target registration cell, repeated
# here so this cell can be run on its own once component_name is defined).
resource_id = f"inference-component/{component_name}"
service_namespace = "sagemaker"
scalable_dimension = "sagemaker:inference-component:DesiredCopyCount"
# The policy name for the step scaling policy
step_scaling_policy_name = f"Step-scaling-policy-{component_name}"

# Create (or update) a step scaling policy that adds one copy ("ScalingAdjustment": 1)
# when its alarm fires; the alarm itself is created in a later cell.
aas_client.put_scaling_policy(
    PolicyName=step_scaling_policy_name,
    PolicyType="StepScaling",
    ServiceNamespace=service_namespace,
    ResourceId=resource_id,
    ScalableDimension=scalable_dimension,
    StepScalingPolicyConfiguration={
        "AdjustmentType": "ChangeInCapacity",
        "MetricAggregationType": "Maximum",
        "Cooldown": 60,
        "StepAdjustments":
          [
             {
               "MetricIntervalLowerBound": 0,
               "ScalingAdjustment": 1
             }
          ]
    },
)

In [None]:
# Fetch both scaling policies that were attached to the inference component.
resp = aas_client.describe_scaling_policies(
    PolicyNames=[step_scaling_policy_name,target_tracking_policy_name],
    ServiceNamespace=service_namespace,
    ResourceId=resource_id,
    ScalableDimension=scalable_dimension,
)

# Select the step scaling policy by name rather than by position: the order of
# 'ScalingPolicies' is not tied to the order of PolicyNames, and the CloudWatch alarm
# must be wired to the step policy specifically. A missing policy raises KeyError here.
policies_by_name = {policy['PolicyName']: policy for policy in resp['ScalingPolicies']}
step_scaling_policy_arn = policies_by_name[step_scaling_policy_name]['PolicyARN']
print(f"step_scaling_policy_arn: {step_scaling_policy_arn}")
print(resp['ScalingPolicies'])

### Create the CloudWatch alarm that will trigger our policy
Finally, create a CloudWatch alarm with the metric NoCapacityInvocationFailures. When triggered, the alarm initiates the previously defined scaling policy. For more information about the NoCapacityInvocationFailures metric, see documentation.

We have also set the following:  

- EvaluationPeriods to 1  
- DatapointsToAlarm to 1  
- ComparisonOperator to GreaterThanOrEqualToThreshold  
- This results in 1 min waiting for the step scaling policy to trigger  



In [None]:
# The alarm name for the step scaling alarm
step_scaling_alarm_name = f"step-scaling-alarm-scale-to-zero-aas-{component_name}"

# Create the alarm on NoCapacityInvocationFailures for this inference component; when
# it goes into alarm it invokes the step scaling policy given in AlarmActions.
cloudwatch_client.put_metric_alarm(
    AlarmName=step_scaling_alarm_name,
    AlarmActions=[step_scaling_policy_arn],  # Replace with your actual ARN
    MetricName='NoCapacityInvocationFailures',
    Namespace='AWS/SageMaker',
    Statistic='Maximum',
    Dimensions=[
        {
            'Name': 'InferenceComponentName',
            'Value': component_name  # Replace with actual InferenceComponentName
        }
    ],
    # Interval over which CloudWatch collects and aggregates the metric data. The
    # smallest Period CloudWatch supports is typically 10 or 60 seconds, depending on
    # the metric type and monitoring level.
    Period=30,
    # Number of consecutive periods that are evaluated; 1 means only a single period
    # (30 seconds, as set by Period) is evaluated.
    EvaluationPeriods=1,
    # Number of data points within EvaluationPeriods that must breach the threshold;
    # 1 means a single breaching data point in the evaluated period triggers the alarm.
    DatapointsToAlarm=1,
    # The alarm fires when NoCapacityInvocationFailures is greater than or equal to 1.
    Threshold=1,
    ComparisonOperator='GreaterThanOrEqualToThreshold',
    # 'missing': missing data points do not cause any alarm state change.
    # Other options: 'notBreaching' treats missing points as good (within threshold),
    # 'breaching' treats them as breaching, and 'ignore' keeps the current alarm state
    # until new data points arrive.
    TreatMissingData='missing'
)

### check updating status

In [None]:
# Poll every 30 seconds until the status check returns True, then start the clock
# that the scale-in measurement below reads.
while not check_inference_component_status(component_name):
    time.sleep(30)
start_time = time.time()

### Test the solution
Notice the MinInstanceCount: 0 setting in the Endpoint configuration, which allows the endpoint to scale down to zero instances. With the scaling policy, CloudWatch alarm, and minimum instances set to zero, your SageMaker Inference Endpoint will now be able to automatically scale down to zero instances when not in use, helping you optimize your costs and resource utilization.

### IC copy count scales in to zero
We'll pause for a few minutes without making any invocations to our model. Based on our target tracking policy, when our SageMaker endpoint doesn't receive requests for about 10 to 15 minutes, it will automatically scale down to zero the number of model copies.

In [None]:
import sys
# time.sleep(900)
# Poll every 30 seconds while the component stays "InService" and leave the loop as
# soon as it reports a different status (for example when a scale-in update starts).
while True:
    desc = sm_client.describe_inference_component(InferenceComponentName=component_name)
    status = desc["InferenceComponentStatus"]
    print(status)
    sys.stdout.flush()
    if status == "Failed":
        # "Failed" is a terminal status, so continuing to poll would never end.
        failure_reason = desc.get("FailureReason", "no failure reason reported")
        raise RuntimeError(
            f"Inference component {component_name} is in status Failed: {failure_reason}"
        )
    if status != "InService":
        break
    time.sleep(30)

# Time elapsed since start_time was set, i.e. how long the component stayed in service.
total_time = time.time() - start_time
print(f"\nTotal time taken: {total_time:.2f} seconds ({total_time/60:.2f} minutes)")

# Show the full description of the component after the status change.
desc = sm_client.describe_inference_component(InferenceComponentName=component_name)
print(desc)

### Endpoint's instances scale in to zero
After 10 additional minutes of inactivity, SageMaker automatically terminates all underlying instances of the endpoint, eliminating all associated costs.

In [None]:
# sess.wait_for_endpoint(endpoint_name)


- After 10 additional minutes of inactivity, SageMaker automatically stops all underlying instances of the endpoint, eliminating all associated instance costs.

If we try to invoke our endpoint while instances are scaled down to zero, we get a validation error:  
`An error occurred (ValidationError) when calling the InvokeEndpoint operation: Inference Component has no capacity to process this request. ApplicationAutoScaling may be in-progress (if configured) or try to increase the capacity by invoking UpdateInferenceComponentRuntimeConfig API.`

In [None]:
from sagemaker.predictor import retrieve_default 
from sagemaker import Predictor
from sagemaker import serializers, deserializers
# predictor = retrieve_default(endpoint_name, sagemaker_session=sess) 

# Re-create the predictor for the inference component; requests are serialized as JSON.
predictor = Predictor(
    endpoint_name=endpoint_name,
    component_name=component_name,
    sagemaker_session=sess,
    serializer=serializers.JSONSerializer(),
)

# Single-turn, non-streaming chat request used to probe the endpoint.
chat_messages = [{"role": "user", "content": "who are you"}]
payload = {
    "messages": chat_messages,
    "model": "qwen",
    "max_tokens": 1024,
    "stream": False,
}

response = predictor.predict(payload)
print(response)

In [None]:
## test for scaling up
# Send requests back to back and report progress every 10 requests.
num_requests = 100000
t1 = time.time()
for i in range(1, num_requests + 1):
    # Keep the latest reply so the progress output shows the current response,
    # not the one left over from the previous cell.
    response = predictor.predict(payload)
    if i % 10 == 0:
        print(response)
        print(f"---{i}----time: {(time.time() - t1):.1f}")

### 4.4 Delete

In [None]:
# Delete the inference component through the AWS CLI; {component_name} is filled in
# from the Python variable of the same name.
# NOTE(review): no cell visible here removes the scalable target, the scaling policies or
# the CloudWatch alarm created in section 4.3 — confirm whether they need separate cleanup.
!aws sagemaker delete-inference-component --inference-component-name {component_name}


In [None]:
# Delete the endpoint through the AWS CLI; {endpoint_name} is filled in from the Python variable.
!aws sagemaker delete-endpoint --endpoint-name {endpoint_name}


In [None]:
# Delete the endpoint configuration through the AWS CLI; {endpoint_config_name} is
# filled in from the Python variable.
!aws sagemaker delete-endpoint-config --endpoint-config-name {endpoint_config_name}


In [None]:
# Delete the SageMaker model through the AWS CLI; {model_name} is filled in from the Python variable.
!aws sagemaker delete-model --model-name {model_name}