# Optimizing PyTorch Model for Inference

### Install and Import the Dependencies

In [7]:
!pip install --upgrade --no-cache-dir torch-neuron neuron-cc[tensorflow] torchvision torch --extra-index-url=https://pip.repos.neuron.amazonaws.com
!pip install --upgrade --no-cache-dir 'transformers==4.6.0'
!pip install accelerate==0.20.3'
!pip install transformers

In [8]:
import torch
import torch_neuron
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig

### Compile and Save the Model into TorchScript

In [9]:
# Build tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("gchhablani/bert-base-cased-finetuned-mrpc")
model = AutoModelForSequenceClassification.from_pretrained("gchhablani/bert-base-cased-finetuned-mrpc", return_dict=False)

# Setup some example inputs
sequence_0 = "The company HuggingFace is based in New York City"
sequence_1 = "Apples are especially bad for your health"
sequence_2 = "HuggingFace's headquarters are situated in Manhattan"

max_length=128
paraphrase = tokenizer.encode_plus(sequence_0, sequence_2, max_length=max_length, padding='max_length', truncation=True, return_tensors="pt")
not_paraphrase = tokenizer.encode_plus(sequence_0, sequence_1, max_length=max_length, padding='max_length', truncation=True, return_tensors="pt")

# Run the original PyTorch model on compilation exaple
paraphrase_classification_logits = model(**paraphrase)[0]

# Convert example inputs to a format that is compatible with TorchScript tracing
example_inputs_paraphrase = paraphrase['input_ids'], paraphrase['attention_mask'], paraphrase['token_type_ids']
example_inputs_not_paraphrase = not_paraphrase['input_ids'], not_paraphrase['attention_mask'], not_paraphrase['token_type_ids']

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

In [None]:
%%time
# Run torch.neuron.trace to generate a TorchScript that is optimized by AWS Neuron
model_neuron = torch.neuron.trace(model, example_inputs_paraphrase, verbose=1, compiler_workdir='./compilation_artifacts')

In [12]:
# Save the TorchScript for later use
model_neuron.save('neuron_compiled_model.pt')

### Upload the Compiled Model to S3

In [13]:
# Create a model.tar.gz file to be used by SageMaker endpoint
!tar -czvf model.tar.gz neuron_compiled_model.pt

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
neuron_compiled_model.pt


In [14]:
import boto3
import time
from sagemaker.utils import name_from_base
import sagemaker

In [15]:
# upload model to S3
role = sagemaker.get_execution_role()
sess=sagemaker.Session()
region=sess.boto_region_name
bucket=sess.default_bucket()
sm_client=boto3.client('sagemaker')

In [16]:
model_key = '{}/model/model.tar.gz'.format('inf1_compiled_model')
model_path = 's3://{}/{}'.format(bucket, model_key)
boto3.resource('s3').Bucket(bucket).upload_file('model.tar.gz', model_key)
print("Uploaded model to S3:")
print(model_path)

Uploaded model to S3:
s3://sagemaker-ap-south-1-128015641074/inf1_compiled_model/model/model.tar.gz


### Install Custom Libraries in the PyTorch Container and Push to ECR
We use Sagemaker Prebuilt Container here and install the transformers libraries. You can follow the similar approach to install any custom libraries required by your inference code. 

In [23]:
!cat container/Dockerfile

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
FROM 763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-inference-neuron:1.7.1-neuron-py36-ubuntu18.04

# Install packages
RUN pip install "transformers==4.7.0"

In [24]:
%%sh

# The name of our algorithm
algorithm_name=neuron-py36-inference
cd container
account=$(aws sts get-caller-identity --query Account --output text)

# Get the region defined in the current configuration (default to us-west-2 if none defined)
region=$(aws configure get region)
region=${region:-ap-south-1}
fullname="${account}.dkr.ecr.${region}.amazonaws.com/${algorithm_name}:latest"

# If the repository doesn't exist in ECR, create it.
aws ecr describe-repositories --repository-names "${algorithm_name}" > /dev/null 2>&1

if [ $? -ne 0 ]
then
    aws ecr create-repository --repository-name "${algorithm_name}" > /dev/null
fi

# Get the login command from ECR in order to pull down the SageMaker PyTorch image
aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-east-1.amazonaws.com

# Build the docker image locally with the image name and then push it to ECR with the full name.
docker build  -t ${algorithm_name} . --build-arg REGION=${region}
docker tag ${algorithm_name} ${fullname}

# Get the login command from ECR and execute it directly
aws ecr get-login-password --region ${region} | docker login --username AWS --password-stdin ${account}.dkr.ecr.${region}.amazonaws.com
docker push ${fullname}

https://docs.docker.com/engine/reference/commandline/login/#credentials-store



Login Succeeded
Sending build context to Docker daemon  2.048kB
Step 1/2 : FROM 763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-inference-neuron:1.7.1-neuron-py36-ubuntu18.04
1.7.1-neuron-py36-ubuntu18.04: Pulling from pytorch-inference-neuron
4bbfd2c87b75: Pulling fs layer
d2e110be24e1: Pulling fs layer
889a7173dcfe: Pulling fs layer
3575d41c0835: Pulling fs layer
bc6f168604e2: Pulling fs layer
eba9221f33a4: Pulling fs layer
dc17b7738357: Pulling fs layer
edfa9cb4c511: Pulling fs layer
a3ae7b373871: Pulling fs layer
72c61eda0de1: Pulling fs layer
81938fffccdc: Pulling fs layer
d15411504276: Pulling fs layer
2e5382e0d0b4: Pulling fs layer
e636fe876326: Pulling fs layer
28a1c6f31ec5: Pulling fs layer
bc6f168604e2: Waiting
eba9221f33a4: Waiting
af520a0bb6c1: Pulling fs layer
177f01cd03de: Pulling fs layer
3575d41c0835: Waiting
edfa9cb4c511: Waiting
c4f0810c7324: Pulling fs layer
120debb0c6f1: Pulling fs layer
81938fffccdc: Waiting
a3ae7b373871: Waiting
2e5382e0d0b4: Waiting
607d5919

https://docs.docker.com/engine/reference/commandline/login/#credentials-store



Login Succeeded
The push refers to repository [128015641074.dkr.ecr.ap-south-1.amazonaws.com/neuron-py36-inference]
19994d4b34ab: Preparing
5d32c72ad027: Preparing
f4d9427d752b: Preparing
62b8cb6215cb: Preparing
877b36f2f41c: Preparing
4beb4e23ce0b: Preparing
629f76e0ebfa: Preparing
55aafc4b4134: Preparing
5b505b65f8e8: Preparing
4383d9750962: Preparing
120ef6a75dae: Preparing
d1a63e051735: Preparing
1353ff378dc3: Preparing
3d7573db3c3f: Preparing
2858c813c4e4: Preparing
4beb4e23ce0b: Waiting
e8b427e8fb51: Preparing
6b9a1856b2e9: Preparing
5b505b65f8e8: Waiting
629f76e0ebfa: Waiting
79ec63999885: Preparing
5dcdeb94f6a5: Preparing
b8b74f1e44f0: Preparing
4383d9750962: Waiting
bf9a431aeda6: Preparing
5f08512fd434: Preparing
120ef6a75dae: Waiting
55aafc4b4134: Waiting
c7bb31fc0e08: Preparing
3d7573db3c3f: Waiting
d1a63e051735: Waiting
50858308da3d: Preparing
2858c813c4e4: Waiting
5dcdeb94f6a5: Waiting
b8b74f1e44f0: Waiting
6b9a1856b2e9: Waiting
bf9a431aeda6: Waiting
5f08512fd434: Waiting


### Deploying the Model using the Extended ECR Container Image 

In [27]:
import sys
#!{sys.executable} -m pip install Transformers

In [28]:
# Retrieve the container image that we pushed to ECR in the above step
import os
import boto3
import sagemaker

role = sagemaker.get_execution_role()
sess = sagemaker.Session()

bucket = sess.default_bucket()
prefix = "inf1_compiled_model/model"

# Get container name in ECR
client=boto3.client('sts')
account=client.get_caller_identity()['Account']

my_session=boto3.session.Session()
region=my_session.region_name

algorithm_name="neuron-py36-inference"
ecr_image='{}.dkr.ecr.{}.amazonaws.com/{}:latest'.format(account, region, algorithm_name)
print(ecr_image)

128015641074.dkr.ecr.ap-south-1.amazonaws.com/neuron-py36-inference:latest


In [31]:
key = os.path.join(prefix, "model.tar.gz")
pretrained_model_data = "s3://{}/{}".format(bucket, key)
print(pretrained_model_data)

s3://sagemaker-ap-south-1-128015641074/inf1_compiled_model/model/model.tar.gz


In [33]:
# Create a PyTorch Estimator using the inference code and the ECR image
from sagemaker.pytorch.model import PyTorchModel

pytorch_model = PyTorchModel(
    model_data=pretrained_model_data,
    role=role,
    source_dir="code",
    framework_version="1.7.1",
    entry_point="inference.py",
    image_uri=ecr_image
)

# Inform SageMaker that the model has been compiled using neuron-cc. 
pytorch_model._is_compiled_model = True

In [34]:
predictor = pytorch_model.deploy(initial_instance_count=1, instance_type="ml.inf1.2xlarge")

--------------!

#### Note
*In the input_fn, we specified JSON encoding for incoming requests, so we must employ a JSON serializer to encode the data. Additionally, since we set the return content type as a JSON string, a JSON deserializer is required to parse the response.*

In [36]:
predictor.serializer = sagemaker.serializers.JSONSerializer()
predictor.deserializer = sagemaker.deserializers.JSONDeserializer()

In [37]:
%%time
result = predictor.predict(
    [
        "Never allow the same bug to bite you twice.",
        "The best part of Amazon SageMaker is that it makes machine learning easy.",
    ]
)
print(result)

BERT says that "Never allow the same bug to bite you twice." and "The best part of Amazon SageMaker is that it makes machine learning easy." are not paraphrase
CPU times: user 11.8 ms, sys: 6.6 ms, total: 18.4 ms
Wall time: 146 ms


In [65]:
%%time
result = predictor.predict(
    [
        "The company HuggingFace is based in New York City",
        "HuggingFace's headquarters are situated in Manhattan",
    ]
)
print(result)

BERT says that "The company HuggingFace is based in New York City" and "HuggingFace's headquarters are situated in Manhattan" are paraphrase
CPU times: user 18.3 ms, sys: 96 µs, total: 18.4 ms
Wall time: 64.3 ms


### (Optional) Benchmarking Endpoints Performance

In [41]:
import numpy as np
import datetime
import math
import time
import boto3
import matplotlib.pyplot as plt
from joblib import Parallel, delayed
import numpy as np
from tqdm import tqdm
import random

Matplotlib is building the font cache; this may take a moment.


In [42]:
def inference_latency(model,*inputs):
    """
    infetence_time is a simple method to return the latency of a model inference.

        Parameters:
            model: torch model onbject loaded using torch.jit.load
            inputs: model() args

        Returns:
            latency in seconds
    """
    error = False
    start = time.time()
    try:
        results = model(*inputs)
    except:
        error = True
        results = []
    return {'latency':time.time() - start, 'error': error, 'result': results}

In [43]:
def random_sentence():

    s_nouns = ["A dude", "My mom", "The king", "Some guy", "A cat with rabies", "A sloth", "Your homie", "This cool guy my gardener met yesterday", "Superman"]
    p_nouns = ["These dudes", "Both of my moms", "All the kings of the world", "Some guys", "All of a cattery's cats", "The multitude of sloths living under your bed", "Your homies", "Like, these, like, all these people", "Supermen"]
    s_verbs = ["eats", "kicks", "gives", "treats", "meets with", "creates", "hacks", "configures", "spies on", "retards", "meows on", "flees from", "tries to automate", "explodes"]
    p_verbs = ["eat", "kick", "give", "treat", "meet with", "create", "hack", "configure", "spy on", "retard", "meow on", "flee from", "try to automate", "explode"]
    infinitives = ["to make a pie.", "for no apparent reason.", "because the sky is green.", "for a disease.", "to be able to make toast explode.", "to know more about archeology."]

    return (random.choice(s_nouns) + ' ' + random.choice(s_verbs) + ' ' + random.choice(s_nouns).lower() or random.choice(p_nouns).lower() + ' ' + random.choice(infinitives))

print([random_sentence(), random_sentence()])

['The king retards a sloth', 'A sloth configures superman']


In [56]:
%%time
# Defining Auxiliary variables
number_of_clients = 2
number_of_runs = 1000
t = tqdm(range(number_of_runs),position=0, leave=True)

# Starting parallel clients
cw_start = datetime.datetime.utcnow()

results = Parallel(n_jobs=number_of_clients,prefer="threads")(delayed(inference_latency)(predictor.predict,[random_sentence(), random_sentence()]) for mod in t)
avg_throughput = t.total/t.format_dict['elapsed']

cw_end = datetime.datetime.utcnow()

100%|██████████| 1000/1000 [00:09<00:00, 107.50it/s]

CPU times: user 1.36 s, sys: 102 ms, total: 1.46 s
Wall time: 9.33 s





In [57]:
# Computing metrics and print
latencies = [res['latency'] for res in results]
errors = [res['error'] for res in results]
error_p = sum(errors)/len(errors) *100
p50 = np.quantile(latencies[-1000:],0.50) * 1000
p90 = np.quantile(latencies[-1000:],0.95) * 1000
p95 = np.quantile(latencies[-1000:],0.99) * 1000

print(f'Avg Throughput: :{avg_throughput:.1f}\n')
print(f'50th Percentile Latency:{p50:.1f} ms')
print(f'90th Percentile Latency:{p90:.1f} ms')
print(f'95th Percentile Latency:{p95:.1f} ms\n')
print(f'Errors percentage: {error_p:.1f} %\n')

Avg Throughput: :107.2

50th Percentile Latency:18.4 ms
90th Percentile Latency:19.3 ms
95th Percentile Latency:21.9 ms

Errors percentage: 0.0 %



In [46]:
# Querying CloudWatch
print('Getting Cloudwatch:')
cloudwatch = boto3.client('cloudwatch')
statistics=['SampleCount', 'Average', 'Minimum', 'Maximum']
extended=['p50', 'p90', 'p95', 'p100']

# Give 5 minute buffer to end
cw_end += datetime.timedelta(minutes=5)

# Period must be 1, 5, 10, 30, or multiple of 60
# Calculate closest multiple of 60 to the total elapsed time
factor = math.ceil((cw_end - cw_start).total_seconds() / 60)
period = factor * 60
print('Time elapsed: {} seconds'.format((cw_end - cw_start).total_seconds()))
print('Using period of {} seconds\n'.format(period))

Getting Cloudwatch:
Time elapsed: 309.535067 seconds
Using period of 360 seconds



In [47]:
cloudwatch_ready = False
# Keep polling CloudWatch metrics until datapoints are available
while not cloudwatch_ready:
  time.sleep(30)
  print('Waiting 30 seconds ...')
  # Must use default units of microseconds
  model_latency_metrics = cloudwatch.get_metric_statistics(MetricName='ModelLatency',
                                             Dimensions=[{'Name': 'EndpointName',
                                                          'Value': predictor.endpoint_name},
                                                         {'Name': 'VariantName',
                                                          'Value': "AllTraffic"}],
                                             Namespace="AWS/SageMaker",
                                             StartTime=cw_start,
                                             EndTime=cw_end,
                                             Period=period,
                                             Statistics=statistics,
                                             ExtendedStatistics=extended
                                             )
  # Should be 1000
  if len(model_latency_metrics['Datapoints']) > 0:
    print('{} latency datapoints ready'.format(model_latency_metrics['Datapoints'][0]['SampleCount']))
    side_avg = model_latency_metrics['Datapoints'][0]['Average'] / number_of_runs
    side_p50 = model_latency_metrics['Datapoints'][0]['ExtendedStatistics']['p50'] / number_of_runs
    side_p90 = model_latency_metrics['Datapoints'][0]['ExtendedStatistics']['p90'] / number_of_runs
    side_p95 = model_latency_metrics['Datapoints'][0]['ExtendedStatistics']['p95'] / number_of_runs
    side_p100 = model_latency_metrics['Datapoints'][0]['ExtendedStatistics']['p100'] / number_of_runs

    print(f'50th Percentile Latency:{side_p50:.1f} ms')
    print(f'90th Percentile Latency:{side_p90:.1f} ms')
    print(f'95th Percentile Latency:{side_p95:.1f} ms\n')

    cloudwatch_ready = True


Waiting 30 seconds ...
1000.0 latency datapoints ready
50th Percentile Latency:14.2 ms
90th Percentile Latency:15.0 ms
95th Percentile Latency:15.1 ms



#### Delete the endpoints and resources to avoid charges

For additional readings, you may refer to the Neuron SDK Documentation page: https://awsdocs-neuron.readthedocs-hosted.com/en/latest/frameworks/torch/index.html