In [None]:
# Install dependencies
!apt-get update
!apt-get install ffmpeg libsm6 libxext6  -y
!pip install -q smdebug
!pip install -q seaborn
!pip install -q plotly
!pip install -q opencv-python
!pip install -q shap
!pip install -q bokeh
!pip install -q imageio

# Using SageMaker Neo to Compile a TensorFlow ResNet-V2 Model

[SageMaker Neo](https://aws.amazon.com/sagemaker/neo/) makes it easy to compile pre-trained TensorFlow models and build an inference optimized container without the need for any custom model serving or inference code.

<img src="https://production-media.paperswithcode.com/methods/Screen_Shot_2020-09-25_at_10.26.40_AM_SAB79fQ.png" align="center" style="padding: 8px;width:500px;">

Residual Networks ([ResNet](https://paperswithcode.com/method/resnet)) is a classic neural network architecture used as a backbone for many computer vision tasks. This model was the winner of the ImageNet challenge in 2015.

In this example, we will show how to deploy a pre-trained ResNet model to a SageMaker Endpoint with Neo compilation using the [SageMaker Python SDK](https://github.com/aws/sagemaker-python-sdk), and then use the deployed models to perform inference requests. We also provide a performance comparison so you can see the benefits of model compilation.

## Setup

First, we need to ensure we have SageMaker Python SDK 2.x, TensorFlow 2.4.x and the latest TensorFlow Hub. Then, we import the necessary Python packages.

In [None]:
!pip install -U --quiet --upgrade "sagemaker"
!pip install -U --quiet "tensorflow==2.4.2"
!pip install -U --quiet tensorflow_hub

In [None]:
import tarfile
import numpy as np
import sagemaker
import time
from sagemaker.utils import name_from_base

Next, we'll get the IAM execution role and a few other SageMaker specific variables from our notebook environment, so that SageMaker can access resources in your AWS account later in the example.

In [None]:
from sagemaker import get_execution_role
from sagemaker.session import Session

# One SageMaker session for the notebook; its region and default bucket are
# read from it and reused by later cells.
sess = Session()
bucket = sess.default_bucket()
region = sess.boto_region_name

# IAM role that SageMaker will assume when creating models and endpoints.
role = get_execution_role()

SageMaker [Neo supports Tensorflow 2.4.x](https://docs.amazonaws.cn/en_us/sagemaker/latest/dg/neo-supported-cloud.html). Check your version of Tensorflow to prevent downstream framework errors.

In [None]:
import tensorflow as tf

# This notebook runs on TensorFlow 2.4.x or earlier; print the installed
# version so a mismatch is visible before any model is exported.
tf_version = tf.__version__
print(tf_version)

## Download ResNet Model

The SageMaker Neo TensorFlow Serving Container works with any model stored in TensorFlow's [SavedModel format](https://www.tensorflow.org/guide/saved_model). This could be the output of your own training job or a model trained elsewhere. For this example, we will use a pre-trained version of the ResNet model from TensorFlow Hub, available [here](https://tfhub.dev/google/imagenet/resnet_v2_50/classification/5).

In [None]:
import tensorflow as tf
import tensorflow_hub as hub

# Load the pre-trained ResNet-V2-50 classifier from TensorFlow Hub.
imported_model = hub.load("https://tfhub.dev/google/imagenet/resnet_v2_50/classification/5")

# Re-export it as a SavedModel under a numbered version directory, exposing
# only the model's own "serving_default" signature.
signature_serving_default = imported_model.signatures["serving_default"]
tf.saved_model.save(
    obj=imported_model,
    export_dir="./imagenet/resnet_v2_50/00000001/",
    signatures=signature_serving_default,
)

After downloading the model, we can inspect it using TensorFlow's ``saved_model_cli`` command. In the command output, you should see something like the following:

```
MetaGraphDef with tag-set: 'serve' contains the following SignatureDefs:

signature_def['serving_default']:
...
```

The command output should also show details of the model inputs and outputs.

In [None]:
!saved_model_cli show --all --dir {"./imagenet/resnet_v2_50/00000001/"} | grep "serving_default" -A 6

Next we need to create a model archive file containing the exported model.

In [None]:
# Pack the exported imagenet/ directory (relative to the current working directory)
# into the gzip-compressed archive imagenet.tar.gz.
!tar -C "$PWD" -czf imagenet.tar.gz imagenet/

## Upload the model archive file to S3

We now have a suitable model archive ready in our notebook. We need to upload it to S3 before we can create a SageMaker Model. We'll use the SageMaker Python SDK to handle the upload.

In [None]:
# Reuse the session created during setup instead of constructing a second
# Session, so the upload goes through the same session (and therefore the same
# default bucket) as the rest of the notebook.
model_data = sess.upload_data(path="imagenet.tar.gz", key_prefix="model")
print("model uploaded to: {}".format(model_data))

## Create a SageMaker Model and Endpoint

Now that the model archive is in S3, we can create an unoptimized Model and deploy it to an 
Endpoint.

In [None]:
from sagemaker.tensorflow import TensorFlowModel

# Settings shared by the uncompiled deployment, the Neo compilation job and
# the compiled deployment further down.
framework = "TENSORFLOW"
framework_version = "2.4"
instance_type = "ml.c5.xlarge"

In [None]:
# Wrap the uploaded archive in a SageMaker TensorFlow model ...
uncompiled_model = TensorFlowModel(
    model_data=model_data,
    role=role,
    framework_version=framework_version,
)

# ... and host it, as-is, on a single-instance endpoint.
unoptimized_predictor = uncompiled_model.deploy(
    instance_type=instance_type,
    initial_instance_count=1,
)

## Make predictions using the endpoint

The endpoint is now up and running, and ready to handle inference requests. The `deploy` call above returned a `predictor` object. The `predict` method of this object handles sending requests to the endpoint. It also automatically handles JSON serialization of our input arguments, and JSON deserialization of the prediction results.

We'll use this sample image:

<img src="kitten.jpg" align="left" style="padding: 8px;">

In [None]:
# read the image file into a tensor (numpy array)
import cv2

image_path = "./kitten.jpg"
image = cv2.imread(image_path)
# cv2.imread reports a missing or unreadable file by returning None instead of
# raising, which would otherwise surface as an obscure error inside cv2.resize.
if image is None:
    raise FileNotFoundError("Could not read image file: {}".format(image_path))

# resize to the 224x224 input size used throughout this notebook
image = cv2.resize(image, dsize=(224, 224), interpolation=cv2.INTER_CUBIC)
# OpenCV loads images as BGR; convert to RGB channel order
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
# min-max scale the pixel values into [0, 1] as floats
# NOTE(review): this rescales by the image's own min/max rather than dividing by 255 -- confirm intended
image = cv2.normalize(image.astype("float"), None, 0, 1, cv2.NORM_MINMAX)
# add a leading batch dimension -> shape (1, 224, 224, 3)
image = np.expand_dims(image, axis=0)

In [None]:
# Time a single request against the uncompiled endpoint.
# The predictor converts the image input to a JSON request automatically and
# hands back the endpoint's JSON response as a python dict.
start_time = time.time()
result = unoptimized_predictor.predict(image)
elapsed = time.time() - start_time
print("Prediction took %.2f seconds" % elapsed)

In [None]:
# load the class names: one label per line, surrounding whitespace removed
with open("labels.txt", "r") as label_file:
    labels = [line.strip() for line in label_file]

# find the highest score (and its position) in the first prediction
scores = result["predictions"][0]
max_value = max(scores)
max_index = scores.index(max_value)

# show prediction result
# NOTE(review): the index is shifted down by one before the lookup -- presumably
# the model emits one extra leading class that labels.txt does not list; confirm.
print("Prediction Result: ", labels[max_index - 1])

## Uncompiled Predictor Performance

In [None]:
# Random payload with the same shape as the preprocessed image: (1, 224, 224, 3).
shape_input = np.random.rand(1, 224, 224, 3)
unoptimized_results = []

# Warmup inference, mirroring the compiled-model benchmark so that both
# latency distributions are measured under the same conditions.
unoptimized_predictor.predict(shape_input)
# Inferencing 100 times; each latency is recorded in milliseconds.
for _ in range(100):
    start = time.time()
    unoptimized_predictor.predict(shape_input)
    unoptimized_results.append((time.time() - start) * 1000)

print("\nPredictions for un-compiled model: \n")
print("\nP95: " + str(np.percentile(unoptimized_results, 95)) + " ms\n")
print("P90: " + str(np.percentile(unoptimized_results, 90)) + " ms\n")
print("P50: " + str(np.percentile(unoptimized_results, 50)) + " ms\n")
print("Average: " + str(np.average(unoptimized_results)) + " ms\n")

## Compile model using SageMaker Neo

In [None]:
# name_from_base adds a time stamp suffix to the provided name
compilation_job_name = name_from_base("tf2-resnet-compile")

# S3 location where the compiled model artifact will be written
compiled_model_path = f"s3://{bucket}/{compilation_job_name}/output"

# Hardware family the model is compiled for
instance_family = "ml_c5"

# Name and shape of the model's expected inputs, given as JSON-style data.
# Replace this with the values for your own trained model; the -1 batch-size
# placeholder is replaced with 1.
data_shape = {"inputs": [1, 224, 224, 3]}

In [None]:
# Arguments for the Neo compilation job, gathered in one place.
compile_settings = {
    "target_instance_family": instance_family,
    "input_shape": data_shape,
    "job_name": compilation_job_name,
    "role": role,
    # the framework name is passed in lower case
    "framework": framework.lower(),
    "framework_version": framework_version,
    "output_path": compiled_model_path,
}

# Run the compilation job on the uncompiled model and keep the returned model.
compiled_model = uncompiled_model.compile(**compile_settings)

## Create Optimized Endpoint

In [None]:
# Show the container image URI currently set on the compiled model.
print(compiled_model.image_uri)

In [None]:
# The pinned inference image below lives in the us-west-2 ECR registry, so the
# override is only applied when the notebook itself runs in us-west-2.
# NOTE(review): SageMaker presumably cannot serve an image from another region's
# registry -- confirm. In other regions the image URI already set on the
# compiled model is left untouched.
PINNED_IMAGE_REGION = "us-west-2"
if region == PINNED_IMAGE_REGION:
    compiled_model.image_uri = (
        "301217895009.dkr.ecr.us-west-2.amazonaws.com/sagemaker-inference-tensorflow:2.4.2-cpu-py3"
    )
else:
    print(
        "Region {} is not {}; keeping image URI: {}".format(
            region, PINNED_IMAGE_REGION, compiled_model.image_uri
        )
    )

In [None]:
# Print the image URI again to confirm the value the endpoint deployment below will use.
print(compiled_model.image_uri)

In [None]:
# Host the compiled model on a single instance of the same instance type used
# for the uncompiled endpoint.
optimized_predictor = compiled_model.deploy(
    instance_type=instance_type,
    initial_instance_count=1,
)

In [None]:
# Time a single request against the compiled endpoint.
# The predictor converts the image input to a JSON request automatically and
# hands back the endpoint's JSON response as a python dict.
start_time = time.time()
result = optimized_predictor.predict(image)
elapsed = time.time() - start_time
print("Prediction took %.2f seconds" % elapsed)

## Compiled Predictor Performance

In [None]:
optimized_results = []
# Benchmark with the same `shape_input` payload that the uncompiled endpoint
# was benchmarked with, so the two latency distributions are directly comparable.
# Warmup inference.
optimized_predictor.predict(shape_input)
# Inferencing 100 times; each latency is recorded in milliseconds.
for _ in range(100):
    start = time.time()
    optimized_predictor.predict(shape_input)
    optimized_results.append((time.time() - start) * 1000)

print("\nPredictions for compiled model: \n")
print("\nP95: " + str(np.percentile(optimized_results, 95)) + " ms\n")
print("P90: " + str(np.percentile(optimized_results, 90)) + " ms\n")
print("P50: " + str(np.percentile(optimized_results, 50)) + " ms\n")
print("Average: " + str(np.average(optimized_results)) + " ms\n")

## Performance Comparison

Here we compare the inference speedup provided by SageMaker Neo. P90 is the 90th-percentile latency. We include it because it represents the tail of the latency distribution (close to the worst case). More information on latency percentiles can be found [here](https://blog.bramp.net/post/2018/01/16/measuring-percentile-latency/).

In [None]:
# Speedup = uncompiled latency / compiled latency, taken at the 90th and 50th
# percentiles and for the mean.
percentile_ratios = np.percentile(unoptimized_results, [90, 50]) / np.percentile(
    optimized_results, [90, 50]
)
p90, p50 = percentile_ratios
avg = np.average(unoptimized_results) / np.average(optimized_results)

print(f"P90 Speedup: {p90:.2f}")
print(f"P50 Speedup: {p50:.2f}")
print(f"Average Speedup: {avg:.2f}")

## Additional Information

## Cleaning up

To avoid incurring charges to your AWS account for the resources used in this tutorial, you need to delete both SageMaker Endpoints created above (the uncompiled one and the compiled one).

In [None]:
# Tear down the endpoint that serves the uncompiled model.
unoptimized_predictor.delete_endpoint()

In [None]:
# Tear down the endpoint that serves the Neo-compiled model.
optimized_predictor.delete_endpoint()