# Sagemaker inference component for DeepSeek
use SageMaker new inference Componet to deply mutiple DeepSeek R1 Distill Model
- use SM DLC lmi image
- use mutiple GPU seperation & division for copies of DeepSeek R1 models(32B + 1.5B)


In [1]:
import boto3
import sagemaker
from sagemaker import Model, image_uris, serializers, deserializers

role = sagemaker.get_execution_role()  # execution role for the endpoint
sess = sagemaker.session.Session()  # sagemaker session for interacting with different AWS APIs
region = sess._region_name  # region name of the current SageMaker Studio environment
account_id = sess.account_id()  # account_id of the current SageMaker Studio environment
bucket = sess.default_bucket()
s3_client = boto3.client("s3")
sm_client = boto3.client("sagemaker")
smr_client = boto3.client("sagemaker-runtime")

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


## endpoint setup & deploy

In [5]:
import boto3
import sagemaker

role = sagemaker.get_execution_role()
sm_client = boto3.client(service_name="sagemaker")
endpoint_config_name="Sagemaker-inference-componet3"
endpoint_name = "Sagemaker-inference-componet-mme3"

!aws sagemaker delete-endpoint-config --endpoint-config-name Sagemaker-inference-componet3


sm_client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ExecutionRoleArn=role,
    ProductionVariants=[{
        "VariantName": "AllTraffic",
        "InstanceType": "ml.g5.48xlarge",
        "InitialInstanceCount": 1,
		"RoutingConfig": {
            "RoutingStrategy": "LEAST_OUTSTANDING_REQUESTS"
        }
    }]
)

{'EndpointConfigArn': 'arn:aws:sagemaker:us-west-2:687912291502:endpoint-config/Sagemaker-inference-componet3',
 'ResponseMetadata': {'RequestId': '0850d666-ebb2-45e8-87d4-2556fb63d0ad',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '0850d666-ebb2-45e8-87d4-2556fb63d0ad',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '110',
   'date': 'Fri, 21 Feb 2025 13:41:15 GMT'},
  'RetryAttempts': 0}}

In [6]:
sm_client.create_endpoint(
    EndpointName=endpoint_name,
    EndpointConfigName=endpoint_config_name,
)

{'EndpointArn': 'arn:aws:sagemaker:us-west-2:687912291502:endpoint/Sagemaker-inference-componet-mme3',
 'ResponseMetadata': {'RequestId': '0166bfbd-0da5-4ef2-8719-aa9d7ed45ce5',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '0166bfbd-0da5-4ef2-8719-aa9d7ed45ce5',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '101',
   'date': 'Fri, 21 Feb 2025 13:41:15 GMT'},
  'RetryAttempts': 0}}

## dummy model data

In [5]:
%%sh
mkdir code
tar czvf source_code.tar.gz code/
rm -rf code

code/


In [6]:
s3_code_prefix = "deepseek/source_code"
source_data = sess.upload_data("source_code.tar.gz", bucket, s3_code_prefix)
print(f"S3 Code tar ball uploaded to --- > {source_data}")

S3 Code tar ball uploaded to --- > s3://sagemaker-us-west-2-687912291502/deepseek/source_code/source_code.tar.gz


In [7]:
%%sh
mkdir mymodel
tar czvf mymodel.tar.gz mymodel/
rm -rf mymodel

mymodel/


In [8]:
s3_code_prefix = "deepseek/model"
model_data = sess.upload_data("mymodel.tar.gz", bucket, s3_code_prefix)
print(f"S3 Model tar ball uploaded to --- > {model_data}")

S3 Model tar ball uploaded to --- > s3://sagemaker-us-west-2-687912291502/deepseek/model/mymodel.tar.gz


## ds qwen 1.5b model setup & deploy

In [None]:
import sagemaker
from sagemaker import Model, image_uris, serializers, deserializers
from sagemaker import get_execution_role
from sagemaker.pytorch import PyTorchModel
import time

hf_model_id="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
# get lim image
image_uri = "763104351884.dkr.ecr.us-west-2.amazonaws.com/djl-inference:0.31.0-lmi13.0.0-cu124"
print(f"Image going to be used is ---- > {image_uri}")


model_name= "deepseek-r1-distill-qwen-1-5b-"+time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())
endpoint_name = "Sagemaker-inference-componet-mme3"

vllm_config = {
    "HF_MODEL_ID": hf_model_id,
    "OPTION_TENSOR_PARALLEL_DEGREE": "max",
    "HF_TOKEN": "",
    "OPTION_ROLLING_BATCH": "vllm",
    "OPTION_OUTPUT_FORMATTER": "json",
    "OPTION_MAX_ROLLING_BATCH_SIZE": "16",
    "OPTION_MODEL_LOADING_TIMEOUT": "1600",
}

container_config = {
    'Image': image_uri,
    'ModelDataUrl': source_data,
    'Environment': vllm_config
}

response = sm_client.create_model(
    ModelName=model_name,
    ExecutionRoleArn=role,
    PrimaryContainer=container_config
)

print(f"Model created: {response['ModelArn']}")

Image going to be used is ---- > 763104351884.dkr.ecr.us-west-2.amazonaws.com/djl-inference:0.31.0-lmi13.0.0-cu124
Model created: arn:aws:sagemaker:us-west-2:687912291502:model/deepseek-r1-distill-qwen-1-5b-2025-02-22-03-56-13


In [21]:
sm_client.create_inference_component(
    InferenceComponentName="IC-deepseek-r1-distill-qwen-1-5b-"+time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime()),
    EndpointName=endpoint_name,
    VariantName="AllTraffic",
    Specification={
        "ModelName": model_name,
        "ComputeResourceRequirements": {
		    "NumberOfAcceleratorDevicesRequired": 1, 
			#"NumberOfCpuCoresRequired": 2, 
			"MinMemoryRequiredInMb": 4096
	    }
    },
    RuntimeConfig={"CopyCount": 1},
)

{'InferenceComponentArn': 'arn:aws:sagemaker:us-west-2:687912291502:inference-component/IC-deepseek-r1-distill-qwen-1-5b-2025-02-22-03-56-14',
 'ResponseMetadata': {'RequestId': '094ba7a1-4b7a-4d0a-9536-c81d896e8445',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '094ba7a1-4b7a-4d0a-9536-c81d896e8445',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '141',
   'date': 'Sat, 22 Feb 2025 03:56:14 GMT'},
  'RetryAttempts': 0}}

## ds qwen 32b model setup & deploy

#### boto3 api

In [None]:
import sagemaker
from sagemaker import Model, image_uris, serializers, deserializers
from sagemaker import get_execution_role
from sagemaker.pytorch import PyTorchModel
import time

hf_model_id="deepseek-ai/DeepSeek-R1-Distill-Qwen-32B"
# 获取 lim 推理容器
image_uri = "763104351884.dkr.ecr.us-west-2.amazonaws.com/djl-inference:0.31.0-lmi13.0.0-cu124"
print(f"Image going to be used is ---- > {image_uri}")


model_name= "deepseek-r1-distill-qwen-32b-"+time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())
endpoint_name = "Sagemaker-inference-componet-mme3"

vllm_config = {
    "HF_MODEL_ID": hf_model_id,
    "OPTION_TENSOR_PARALLEL_DEGREE": "4",
    "HF_TOKEN": "",
    "OPTION_ROLLING_BATCH": "vllm",
    "OPTION_OUTPUT_FORMATTER": "json",
    "OPTION_MAX_ROLLING_BATCH_SIZE": "10",
    "OPTION_MAX_MODEL_LEN":"8092",
    "OPTION_MODEL_LOADING_TIMEOUT": "3600",
}
container_config = {
    'Image': image_uri,
    'ModelDataUrl': source_data,
    'Environment': vllm_config
}

response = sm_client.create_model(
    ModelName=model_name,
    ExecutionRoleArn=role,
    PrimaryContainer=container_config
)
print(f"Model created: {response['ModelArn']}")





Image going to be used is ---- > 763104351884.dkr.ecr.us-west-2.amazonaws.com/djl-inference:0.31.0-lmi13.0.0-cu124
Model created: arn:aws:sagemaker:us-west-2:687912291502:model/deepseek-r1-distill-qwen-32b-2025-02-22-02-57-08


In [10]:
sm_client.create_inference_component(
    InferenceComponentName="IC-deepseek-r1-distill-qwen-32b-"+time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime()),
    EndpointName=endpoint_name,
    VariantName="AllTraffic",
    Specification={
        "ModelName": model_name,
        "ComputeResourceRequirements": {
		    "NumberOfAcceleratorDevicesRequired": 4, 
			#"NumberOfCpuCoresRequired": 2, 
			"MinMemoryRequiredInMb": 80024
	    }
    },
    RuntimeConfig={"CopyCount": 1},
)

{'InferenceComponentArn': 'arn:aws:sagemaker:us-west-2:687912291502:inference-component/IC-deepseek-r1-distill-qwen-32b-2025-02-22-02-57-09',
 'ResponseMetadata': {'RequestId': '99c16a61-bffc-4939-b763-dc75f86d05c0',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '99c16a61-bffc-4939-b763-dc75f86d05c0',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '140',
   'date': 'Sat, 22 Feb 2025 02:57:09 GMT'},
  'RetryAttempts': 0}}

## inference test

In [3]:
from joblib import Parallel, delayed
import json
import codecs
import re
import datetime
import random

recipe_food = """
How to make cake?
"""

prompt_template = f"""
<|begin_of_text|>
<|start_header_id|>system<|end_header_id|>
You are a helpful chef assistant who is an expert in screating recipes.
<|eot_id|>

<|start_header_id|>user<|end_header_id|>
Create a recipe here.

{recipe_food}


Provide the summary directly, without any introduction or preamble. Do not start the response with "Here is a...".<|eot_id|>

<|start_header_id|>assistant<|end_header_id|>
"""


endpoint_name = "Sagemaker-inference-componet-mme3"
IC_s=[]
InferenceComponents = sm_client.list_inference_components(
    EndpointNameEquals=endpoint_name
)['InferenceComponents']
for InferenceComponent in InferenceComponents:
    IC_name = InferenceComponent['InferenceComponentName']
    print(IC_name)
    IC_s.append(IC_name)

import json
payload = {
        "inputs": prompt_template,
        "parameters": {
            "do_sample":True,
            "max_new_tokens":256,
            "top_p":0.9,
            "temperature":0.6,
        }
    }


def invoke_test():
    response = smr_client.invoke_endpoint(
        EndpointName=endpoint_name,
        #InferenceComponentName = "IC-deepseek-r1-distill-qwen-1-5b-2025-02-22-03-56-14",
        InferenceComponentName = 'IC-deepseek-r1-distill-qwen-32b-2025-02-22-02-57-09',
        ContentType="application/json",
        Accept="application/json",
        Body=json.dumps(payload),
    )
    
    result = json.loads(response['Body'].read().decode())
    print(result)


invoke_test()
    
## 10 并发
#results = Parallel(n_jobs=100, prefer='threads', verbose=1,)(
#    delayed(invoke_test)()
#    for index in range(1,10000)
#)

IC-deepseek-r1-distill-qwen-1-5b-2025-02-22-03-56-14
IC-deepseek-r1-distill-qwen-32b-2025-02-22-02-57-09
{'generated_text': "Sure! Here's a classic vanilla cake recipe:\n\n**Ingredients:**\n- 1 ½ cups (300g) all-purpose flour\n- 1 cup (200g) granulated sugar\n- 1 teaspoon baking powder\n- ½ teaspoon baking soda\n- ½ teaspoon salt\n- ½ cup (120ml) whole milk\n- ½ cup (120ml) vegetable oil\n- 2 large eggs\n- 2 teaspoons pure vanilla extract\n\n**Instructions:**\n1. Preheat your oven to 350°F (175°C) and grease two 9-inch round cake pans.\n2. In a large bowl, whisk together the flour, sugar, baking powder, baking soda, and salt.\n3. Add the milk, oil, eggs, and vanilla extract to the dry ingredients. Mix until just combined.\n4. Pour the batter evenly into the prepared pans.\n5. Bake for 25-30 minutes, or until a toothpick inserted into the center comes out clean.\n6. Let the cakes cool in the pans for 10 minutes, then transfer to a wire rack to cool completely before frosting.\n<|eot_id|

## inference componnet managment

In [2]:
endpoint_name = "Sagemaker-inference-componet-mme3"
sm_client.list_inference_components(
    EndpointNameEquals=endpoint_name
)


{'InferenceComponents': [{'CreationTime': datetime.datetime(2025, 2, 22, 3, 56, 14, 793000, tzinfo=tzlocal()),
   'InferenceComponentArn': 'arn:aws:sagemaker:us-west-2:687912291502:inference-component/IC-deepseek-r1-distill-qwen-1-5b-2025-02-22-03-56-14',
   'InferenceComponentName': 'IC-deepseek-r1-distill-qwen-1-5b-2025-02-22-03-56-14',
   'EndpointArn': 'arn:aws:sagemaker:us-west-2:687912291502:endpoint/sagemaker-inference-componet-mme3',
   'EndpointName': 'Sagemaker-inference-componet-mme3',
   'VariantName': 'AllTraffic',
   'InferenceComponentStatus': 'InService',
   'LastModifiedTime': datetime.datetime(2025, 2, 22, 3, 57, 37, 793000, tzinfo=tzlocal())},
  {'CreationTime': datetime.datetime(2025, 2, 22, 2, 57, 9, 771000, tzinfo=tzlocal()),
   'InferenceComponentArn': 'arn:aws:sagemaker:us-west-2:687912291502:inference-component/IC-deepseek-r1-distill-qwen-32b-2025-02-22-02-57-09',
   'InferenceComponentName': 'IC-deepseek-r1-distill-qwen-32b-2025-02-22-02-57-09',
   'EndpointAr

In [2]:
endpoint_name = "Sagemaker-inference-componet-mme3"
response = sm_client.update_inference_component(
    InferenceComponentName='IC-deepseek-r1-distill-qwen-32b-2025-02-22-02-57-09',
    Specification={
        "ModelName": "deepseek-r1-distill-qwen-32b-2025-02-22-02-57-08",
        'ComputeResourceRequirements': {
            'NumberOfAcceleratorDevicesRequired': 4,
            'MinMemoryRequiredInMb': 86024,
            'MaxMemoryRequiredInMb': 92400
        }
    },
    RuntimeConfig={
        'CopyCount': 1
    }
)

In [19]:
sm_client.delete_inference_component(InferenceComponentName='IC-deepseek-r1-distill-qwen-32b-****')

{'ResponseMetadata': {'RequestId': '0b326388-6144-483d-9dd1-5f826b4c2a03',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '0b326388-6144-483d-9dd1-5f826b4c2a03',
   'content-type': 'application/x-amz-json-1.1',
   'date': 'Fri, 21 Feb 2025 10:29:34 GMT',
   'content-length': '0'},
  'RetryAttempts': 0}}

In [3]:
##delete all IC componet
endpoint_name = "Sagemaker-inference-componet-mme3"
InferenceComponents = sm_client.list_inference_components(
    EndpointNameEquals=endpoint_name
)['InferenceComponents']
for InferenceComponent in InferenceComponents:
    IC_name = InferenceComponent['InferenceComponentName']
    print(IC_name)
    sm_client.delete_inference_component(InferenceComponentName=IC_name)
#print(response)

IC-deepseek-r1-distill-qwen-32b-2025-02-22-01-33-32
IC-deepseek-r1-distill-qwen-1-5b-2025-02-22-01-33-31


## clear

In [None]:
import time
sagemaker_client = boto3.client('sagemaker')
model_list = sagemaker_client.list_models()
for model in model_list['Models']:
    model_name = model['ModelName']
    model_arn = model['ModelArn']
    creation_time = model['CreationTime']
    print(f"model name: {model_name}, ARN: {model_arn}, created timestamp: {creation_time}")
    sagemaker_client.delete_model(ModelName=model_name)

    time.sleep(2)

In [None]:
sess.delete_endpoint(endpoint_name)
sess.delete_endpoint_config(endpoint_name)
model.delete_model()