## Build a Custom Inference Container

This notebook demonstrates how to build and use a custom Docker container for serving with Amazon SageMaker. The container leverages the <strong>sagemaker-inference-toolkit</strong> library to serve models through Amazon SageMaker endpoints.


Useful links:
- https://github.com/awslabs/multi-model-server/
- https://github.com/aws/sagemaker-inference-toolkit

- https://github.com/aws-samples/amazon-sagemaker-mask-r-cnn-pytorch/blob/master/MaskRCNN_bring_your_own.ipynb

We start by defining some variables like the current execution role, the ECR repository that we are going to use for pushing the custom Docker container and a default Amazon S3 bucket to be used by Amazon SageMaker.

In [7]:
import boto3
import sagemaker
from sagemaker import get_execution_role

# Name of the ECR repository that will hold the custom inference image.
ecr_namespace = 'monai-classification'
ecr_repository_name = ecr_namespace

# Execution role of this notebook; later passed to SageMaker as the model's role.
role = get_execution_role()
# Field 4 of the colon-separated role ARN is the AWS account id
# (arn:aws:iam::<account-id>:role/...).
account_id = role.split(':')[4]
region = boto3.Session().region_name

# Single SageMaker session shared by the notebook.
sess = sagemaker.session.Session()
# Fixed: this used to read `sagemaker_session.default_bucket()`, but no variable
# named `sagemaker_session` exists -- the session object is `sess`.
bucket = sess.default_bucket()
bucket_prefix = 'Inference_output'  # key prefix for the inference results

print(account_id)
print(region)
print(role)
print(bucket)

741261399688
us-east-1
arn:aws:iam::741261399688:role/service-role/AmazonSageMaker-ExecutionRole-20191114T174410
sagemaker-us-east-1-741261399688


In [3]:
# Print the Dockerfile with syntax highlighting (uses the `pygmentize` CLI).
! pygmentize Dockerfile

[34mFROM[39;49;00m [33mpython:3.7[39;49;00m
[34mARG[39;49;00m [31mNB_USER[39;49;00m=[33m"sagemaker-user"[39;49;00m
[34mARG[39;49;00m [31mNB_UID[39;49;00m=[33m"1000"[39;49;00m
[34mARG[39;49;00m [31mNB_GID[39;49;00m=[33m"100"[39;49;00m
[34mRUN[39;49;00m [33m\[39;49;00m
    apt-get update && [33m\[39;49;00m
    apt-get install -y sudo && [33m\[39;49;00m
    useradd -m -s /bin/bash -N -u [31m$NB_UID[39;49;00m [31m$NB_USER[39;49;00m && [33m\[39;49;00m
    chmod g+w /etc/passwd && [33m\[39;49;00m
    [36mecho[39;49;00m [33m"[39;49;00m[33m${[39;49;00m[31mNB_USER[39;49;00m[33m}[39;49;00m[33m    ALL=(ALL)    NOPASSWD:    ALL[39;49;00m[33m"[39;49;00m >> /etc/sudoers && [33m\[39;49;00m
    # Prevent apt-get cache from being persisted to this layer.
    rm -rf /var/lib/apt/lists/*
[34mRUN[39;49;00m pip install [33m\[39;49;00m
        [33m'boto3>=1,<2'[39;49;00m [33m\[39;49;00m
        [33m'sagemaker>=2,<3'[39;49;00m [33

## Build and push the container image to ECR

In [4]:
# Repository/image name passed to the build script and to the ECR commands.
prefix = "monai-classification"

In [3]:
%%capture
!build_and_push.sh $prefix 

In [5]:
!aws ecr list-images \
    --repository-name $prefix

{
    "imageIds": [
        {
            "imageDigest": "sha256:1c0c40b0755054a9e13ddb10124fe28f308aacda85ec2d1d4d6253197b7d943f",
            "imageTag": "latest"
        }
    ]
}


## Use the image for prediction 

Find the model artifact in S3.

In [13]:
# S3 location of the trained model archive (model.tar.gz).
# Replace this with the output path of your own SageMaker training job.
s3_model_path = "s3://sagemaker-us-east-1-741261399688/pytorch-training-2022-04-22-12-50-11-763/output/model.tar.gz"

Build the URI of the container image in ECR.

In [10]:
# ECR image URI layout: <account-id>.dkr.ecr.<region>.amazonaws.com/<repository>:<tag>
container_image_uri = f'{account_id}.dkr.ecr.{region}.amazonaws.com/{prefix}:latest'
container_image_uri

'741261399688.dkr.ecr.us-east-1.amazonaws.com/monai-classification:latest'

In [11]:
from time import gmtime, strftime
from sagemaker.model import Model

# The UTC timestamp suffix gives every run a distinct model name.
model_name = 'medical-image-classification-model-server' + strftime("%Y-%m-%d-%H-%M-%S", gmtime())

# SageMaker SDK model object: custom serving image + model archive in S3.
model = Model(
    image_uri=container_image_uri,
    model_data=s3_model_path,
    role=role,
    name=model_name,
    env={'SAGEMAKER_PROGRAM': 'predictor'},
    predictor_cls=sagemaker.predictor.Predictor,
    sagemaker_session=sess,  # comment this line for local mode.
)

In [14]:
# Low-level SageMaker client, bound to the notebook's region.
sagemaker_client = boto3.client('sagemaker', region_name=region)

# New UTC-timestamped name for the model registered by this cell.
model_name = 'medical-image-classification-model-server' + strftime("%Y-%m-%d-%H-%M-%S", gmtime())

# Register the model: the serving image plus the model archive in S3.
primary_container = {
    'Image': container_image_uri,
    'ModelDataUrl': s3_model_path,
}
create_model_response = sagemaker_client.create_model(
    ModelName=model_name,
    ExecutionRoleArn=role,
    PrimaryContainer=primary_container,
)

In [15]:
# Display the raw response returned by the create_model call.
create_model_response

{'ModelArn': 'arn:aws:sagemaker:us-east-1:741261399688:model/medical-image-classification-model-server2022-04-26-03-28-52',
 'ResponseMetadata': {'RequestId': '54522cc4-c6ce-4a04-8990-40a2cfc43f37',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '54522cc4-c6ce-4a04-8990-40a2cfc43f37',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '122',
   'date': 'Tue, 26 Apr 2022 03:28:51 GMT'},
  'RetryAttempts': 0}}

In [20]:
# Key prefix and bucket used for the S3 output path of the endpoint configuration.
bucket_prefix = 'Inference_output'
# Fixed: `sagemaker_session` is not defined anywhere in this notebook; the
# SageMaker session object is called `sess`.
bucket = sess.default_bucket()

In [None]:
## Create an endpoint configuration

In [42]:
import datetime
from time import gmtime, strftime

# Create an endpoint config name. Here we create one based on the date
# so that we can search endpoints based on creation time.
endpoint_config_name = f"MedicalImageEndpointConfig-{strftime('%Y-%m-%d-%H-%M-%S', gmtime())}"

# The model to host is `model_name`, i.e. the model registered by the
# create_model call earlier in this notebook. It is intentionally NOT reassigned
# here: the previous hard-coded value ('pytorch-inference-2022-01-27-08-48-43-106')
# named a model that this notebook never creates.

create_endpoint_config_response = sagemaker_client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,  # You will specify this name in a CreateEndpoint request.
    # List of ProductionVariant objects, one for each model that you want to host at this endpoint.
    ProductionVariants=[
        {
            "VariantName": "variant1",  # The name of the production variant.
            "ModelName": model_name,
            "InstanceType": "ml.m5.xlarge",  # Specify the compute instance type.
            "InitialInstanceCount": 1  # Number of instances to launch initially.
        }
    ],
    # Presence of AsyncInferenceConfig makes this an asynchronous-inference endpoint.
    AsyncInferenceConfig={
        "OutputConfig": {
            # Location to upload response outputs when no location is provided in the request.
            "S3OutputPath": f"s3://{bucket}/{bucket_prefix}/output",
            # (Optional) specify Amazon SNS topics
        },
        "ClientConfig": {
            # (Optional) Specify the max number of inflight invocations per instance
            # If no value is provided, Amazon SageMaker will choose an optimal value for you
            "MaxConcurrentInvocationsPerInstance": 4
        }
    }
)

print(f"Created EndpointConfig: {create_endpoint_config_response['EndpointConfigArn']}")

Created EndpointConfig: arn:aws:sagemaker:us-east-1:707754867495:endpoint-config/medicalimageendpointconfig-2022-02-23-08-19-43


In [43]:
endpoint_name = 'AsynchronousMedicalInference3'

# Create the endpoint from the endpoint configuration named by `endpoint_config_name`.
create_endpoint_response = sagemaker_client.create_endpoint(
    EndpointName=endpoint_name,
    EndpointConfigName=endpoint_config_name,
)

# create_endpoint returns while the endpoint is still being created. Block until
# it is in service so that invoking it right afterwards (e.g. with "Run All")
# does not fail against an endpoint that is not ready yet.
sagemaker_client.get_waiter('endpoint_in_service').wait(EndpointName=endpoint_name)

In [44]:
# Display the endpoint name.
endpoint_name

'AsynchronousMedicalInference3'

In [59]:
## invoke the endpoint
sagemaker_runtime = boto3.client("sagemaker-runtime", region_name='us-east-1')
input_location = 's3://sagemaker-us-east-1-707754867495/inference_input/test-2.json'
response = sagemaker_runtime.invoke_endpoint_async(
    EndpointName=endpoint_name,
    InputLocation=input_location
)

In [60]:
# Display the response of the asynchronous invocation.
response

{'ResponseMetadata': {'RequestId': '0b1e8cca-c5a0-4920-a093-2cefa7a50929',
  'HTTPStatusCode': 202,
  'HTTPHeaders': {'x-amzn-requestid': '0b1e8cca-c5a0-4920-a093-2cefa7a50929',
   'x-amzn-sagemaker-outputlocation': 's3://sagemaker-us-east-1-707754867495/Inference_output/output/f99f3083-c41a-4deb-8dce-d8c9b759138a.out',
   'date': 'Wed, 23 Feb 2022 09:34:25 GMT',
   'content-type': 'application/json',
   'content-length': '54'},
  'RetryAttempts': 0},
 'OutputLocation': 's3://sagemaker-us-east-1-707754867495/Inference_output/output/f99f3083-c41a-4deb-8dce-d8c9b759138a.out',
 'InferenceId': 'd2c2d7d9-1179-4d9c-b90f-26c6c8b063b4'}