# SageMaker Real-Time Dynamic Batching Inference with TorchServe

**Imports**

In [1]:
import base64
import json
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import os
import boto3, time, json
import sagemaker

**Initialize the session and retrieve region and account details**

In [2]:
# IAM execution role handed to the model later on, and the SageMaker SDK
# session used for the default bucket and for building predictors.
role = sagemaker.get_execution_role()
sm_sess = sagemaker.Session()

In [3]:
# Region comes from the default boto3 session configuration.
sess = boto3.Session()
region = sess.region_name

# Account id of the current caller, looked up through STS.
sts_client = boto3.client("sts")
caller_identity = sts_client.get_caller_identity()
account = caller_identity.get("Account")

**Prepare model**

In [4]:
bucket = sm_sess.default_bucket()
prefix = "ts-dynamic-batching"
model_file_name = "BERTSeqClassificationZip"

!wget https://awsbatchblog.s3.us-west-2.amazonaws.com/BERTSeqClassification.mar
!mkdir -p bert_model
!unzip BERTSeqClassification.mar -d ./bert_model/
!tar cvfz {model_file_name}.tar.gz  -C ./bert_model/ .

!aws s3 cp BERTSeqClassificationZip.tar.gz s3://{bucket}/{prefix}/models/

!rm -rf bert_model

f"s3://{bucket}/{prefix}/models/"

--2021-11-04 00:30:12--  https://awsbatchblog.s3.us-west-2.amazonaws.com/BERTSeqClassification.mar
Resolving awsbatchblog.s3.us-west-2.amazonaws.com (awsbatchblog.s3.us-west-2.amazonaws.com)... 52.218.136.145
Connecting to awsbatchblog.s3.us-west-2.amazonaws.com (awsbatchblog.s3.us-west-2.amazonaws.com)|52.218.136.145|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 405606799 (387M) [binary/octet-stream]
Saving to: ‘BERTSeqClassification.mar’


2021-11-04 00:30:19 (55.1 MB/s) - ‘BERTSeqClassification.mar’ saved [405606799/405606799]

Archive:  BERTSeqClassification.mar
  inflating: ./bert_model/model.py   
  inflating: ./bert_model/pytorch_model.bin  
  inflating: ./bert_model/index_to_name.json  
  inflating: ./bert_model/setup_config.json  
  inflating: ./bert_model/config.json  
  inflating: ./bert_model/Transformer_handler_generalized.py  
  inflating: ./bert_model/MAR-INF/MANIFEST.json  
./
./model.py
./pytorch_model.bin
./index_to_name.json
./MAR-INF/
./M

's3://sagemaker-us-west-2-850464037171/ts-dynamic-batching/models/'

In [5]:
# S3 URI of the model tarball. The file name is derived from `model_file_name`
# (the name the tarball is created under) instead of being repeated literally.
model_artifact = f"s3://{bucket}/{prefix}/models/{model_file_name}.tar.gz"

In [6]:
# Presumably the name meant for the SageMaker model/endpoint.
# NOTE(review): no visible cell references `model_name` — confirm whether it
# should be passed to PyTorchModel/deploy or removed.
model_name = "hf-dynamic-torchserve-sagemaker"

## Using AWS Deep Learning Container
`Note: See end of notebook for using a custom container`

In [7]:
# We'll use a pytorch inference DLC image that ships with sagemaker-pytorch-inference-toolkit v2.0.7. This version includes support for Torchserve environment variables used below.
# The registry host is built from the `region` resolved earlier so the image
# matches the region the notebook runs in; us-west-2 is kept as the fallback
# when no region is configured.
# NOTE(review): the registry account id may differ in some regions — confirm
# before running outside us-west-2.
image_region = region or "us-west-2"
image_uri = f"763104351884.dkr.ecr.{image_region}.amazonaws.com/pytorch-inference:1.9.0-gpu-py38-cu111-ubuntu20.04"

#### Create the SageMaker model, deploy it, and predict

In [8]:
from sagemaker.pytorch.model import PyTorchModel

# TorchServe settings handed to the container as environment variables.
# Presumably a batch size of 3 requests and a maximum batch delay expressed in
# milliseconds — TODO confirm units against the inference toolkit docs.
env_variables_dict = dict(
    SAGEMAKER_TS_BATCH_SIZE="3",
    SAGEMAKER_TS_MAX_BATCH_DELAY="100000",
)

# Model definition: uploaded artifact + DLC image + custom inference script
# taken from the local "code" directory.
pytorch_model = PyTorchModel(
    entry_point="inference.py",
    source_dir="code",
    model_data=model_artifact,
    image_uri=image_uri,
    framework_version="1.9",
    role=role,
    env=env_variables_dict,
)


In [9]:
# Change the instance type as necessary, or use 'local' for executing in Sagemaker local mode
instance_type = "ml.c5.18xlarge"

# Requests are serialized as JSON; responses come back as raw bytes.
json_serializer = sagemaker.serializers.JSONSerializer()
bytes_deserializer = sagemaker.deserializers.BytesDeserializer()

predictor = pytorch_model.deploy(
    instance_type=instance_type,
    initial_instance_count=1,
    serializer=json_serializer,
    deserializer=bytes_deserializer,
)

-------------!

## Predictions

#### The following prediction call could time out on certain instance types (SageMaker 60-second limit)

In [10]:
# Time one synchronous request against the endpoint (`time` is imported in the
# notebook's first cell).
payload = "{Bloomberg has decided to publish a new report on global economic situation.}"

start = time.time()
result = predictor.predict(payload)
elapsed = time.time() - start

print("TIME:", elapsed)
print("ENDPOINT RESULT:", result)

TIME: 0.2232494354248047
ENDPOINT RESULT: b'["Not Accepted"]'


#### The following prediction calls could time out, since on certain instance types the first call to the predictor blocks while waiting for a response, although some may succeed

In [11]:
# Send the same request three times, one after another, and time the whole
# sequence (`time` is imported in the notebook's first cell).
payload = "{Bloomberg has decided to publish a new report on global economic situation.}"

start = time.time()
result1, result2, result3 = [predictor.predict(payload) for _ in range(3)]
elapsed = time.time() - start

print("TIME:", elapsed)
labelled_results = (
    ("ENDPOINT RESULT 1:", result1),
    ("ENDPOINT RESULT 2:", result2),
    ("ENDPOINT RESULT 3:", result3),
)
for label, value in labelled_results:
    print(label, value)

TIME: 0.220505952835083
ENDPOINT RESULT 1: b'["Not Accepted"]'
ENDPOINT RESULT 2: b'["Not Accepted"]'
ENDPOINT RESULT 3: b'["Not Accepted"]'


#### By spawning a pool of 3 processes we're able to return the results successfully

In [12]:
import multiprocessing


def invoke(endpoint_name):
    """Send one prediction request to ``endpoint_name`` and return the response.

    A new ``Predictor`` is built on every call from the notebook-level
    ``sm_sess`` session, using a JSON serializer and a bytes deserializer.

    Args:
        endpoint_name: Name of the deployed SageMaker endpoint to call.

    Returns:
        The raw response bytes produced by the endpoint.
    """
    worker_predictor = sagemaker.predictor.Predictor(
        endpoint_name,
        sm_sess,
        serializer=sagemaker.serializers.JSONSerializer(),
        deserializer=sagemaker.deserializers.BytesDeserializer(),
    )
    return worker_predictor.predict(
        "{Bloomberg has decided to publish a new report on global economic situation.}"
    )


endpoint_name = predictor.endpoint_name

# Three worker processes issue three requests concurrently. The context
# manager guarantees the pool is torn down even if one of the requests raises;
# map() has already collected every result by the time the block exits.
with multiprocessing.Pool(3) as pool:
    results = pool.map(invoke, 3 * [endpoint_name])
print(results)

[b'["Not Accepted"]', b'["Not Accepted"]', b'["Not Accepted"]']


In [13]:
# Delete the endpoint bound to `predictor`. delete_endpoint() does not take an
# endpoint name: its only parameter is the `delete_endpoint_config` flag, which
# defaults to True, so the endpoint configuration is removed as well.
predictor.delete_endpoint()

## Using a custom container

#### Details (Also see ./docker/)
* Prebaked config.properties file included
  * 1.66 Minute Batch Delay (Longer than SageMaker 60s Timeout)
  * Batch size of 3
  * Note: the image needs to be built and pushed only once.

In [14]:
%%sh
# Build the custom TorchServe image from ./docker and push it to ECR.
# Stop at the first failing command instead of continuing with a broken state.
set -e

container_name=custom-dynamic-torchserve
account=$(aws sts get-caller-identity --query Account --output text)

# Get the region defined in the current configuration (default to us-west-2 if none defined).
# `|| true` keeps `set -e` from aborting when no region is configured.
region=$(aws configure get region || true)
region=${region:-us-west-2}

registry="${account}.dkr.ecr.${region}.amazonaws.com"
fullname="${registry}/${container_name}"

# If the repository doesn't exist in ECR, create it.
if ! aws ecr describe-repositories --region "${region}" --repository-names "${container_name}" > /dev/null 2>&1
then
    aws ecr create-repository --region "${region}" --repository-name "${container_name}" > /dev/null
fi

# Log docker in to ECR. `aws ecr get-login` is not available in AWS CLI v2;
# `get-login-password` piped into `docker login` is the supported replacement.
aws ecr get-login-password --region "${region}" \
    | docker login --username AWS --password-stdin "${registry}"

# Build the docker image locally with the image name and then push it to ECR
# with the full name.
docker build -t "${container_name}" docker/
docker tag "${container_name}" "${fullname}"

docker push "${fullname}"

Process is interrupted.
