# Sequence Classification using HuggingFace DistilBERT Model compiled using Neo on ML_G4DN

In [1]:
!pip3 install transformers sagemaker sagemaker_inference --upgrade


Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/pytorch_latest_p37/bin/python -m pip install --upgrade pip' command.[0m


In [2]:
import transformers

# Display the installed transformers version so the run can be reproduced later.
transformers.__version__

'4.15.0'

# 1. Inference without Installing HF transformers in the current Neo GPU inference container

# Tokenization using HF

In [3]:
import torch
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DistilBertForSequenceClassification,
)

MODEL = 'distilbert-base-uncased-finetuned-sst-2-english'

# return_dict=False makes the model return plain tuples instead of a ModelOutput object.
model = DistilBertForSequenceClassification.from_pretrained(MODEL, return_dict=False)
model.eval()

tokenizer = AutoTokenizer.from_pretrained(MODEL)

# Setup some example inputs
# NOTE(review): the checkpoint name indicates SST-2 sentiment fine-tuning, so the
# "paraphrase" naming below looks inherited from another example — confirm.
sequence_0 = "The company HuggingFace is based in New York City"
sequence_1 = "Apples are especially bad for your health"
sequence_2 = "HuggingFace's headquarters are situated in Manhattan"

max_length = 128

# Every pair is padded/truncated to exactly max_length tokens and returned as PyTorch tensors.
tokenize_kwargs = dict(max_length=max_length, padding='max_length', truncation=True, return_tensors="pt")

# Calling the tokenizer directly replaces the deprecated encode_plus() and yields the same encoding.
paraphrase = tokenizer(sequence_0, sequence_2, **tokenize_kwargs)
not_paraphrase = tokenizer(sequence_0, sequence_1, **tokenize_kwargs)


In [4]:
# TorchScript tracing takes positional tensors, so pull (input_ids, attention_mask)
# out of each tokenizer encoding as a tuple.
_TRACE_KEYS = ('input_ids', 'attention_mask')
example_inputs_paraphrase = tuple(paraphrase[key] for key in _TRACE_KEYS)
example_inputs_not_paraphrase = tuple(not_paraphrase[key] for key in _TRACE_KEYS)

# Convert HF Model to Neo Compatible Format for compilation

In [5]:
import torch
# Trace the model with one example (input_ids, attention_mask) pair to obtain a TorchScript module.
# NOTE(review): strict=False is forwarded to torch.jit.trace — presumably to relax the
# tracer's checks on the model's container outputs; confirm it is still required.
model_trace = torch.jit.trace(model, example_inputs_paraphrase, strict=False)

In [6]:
# Serialize the traced TorchScript module to a local file.
model_trace.save('hf-model-distillbert-traced.pth')


In [7]:
# Package the traced model file as a gzipped tarball for upload to S3.
! tar zcvf hf-model-distillbert-traced.tar.gz hf-model-distillbert-traced.pth

hf-model-distillbert-traced.pth


In [8]:
import sagemaker
from sagemaker.s3 import S3Uploader, s3_path_join

# Session, execution role and default bucket of the current SageMaker environment
sess = sagemaker.Session()
role = sagemaker.get_execution_role()
sagemaker_session_bucket = sess.default_bucket()

# Upload the traced-model tarball under the "hf_distillbert_traced" prefix of that bucket
upload_path = s3_path_join("s3://", sagemaker_session_bucket, "hf_distillbert_traced")
model_uri = S3Uploader.upload(
    'hf-model-distillbert-traced.tar.gz',
    upload_path,
)

# Compile the HF model using Neo

In [9]:
import time

from sagemaker.pytorch.model import PyTorchModel

# Describe the uploaded traced model as a SageMaker PyTorch model; distillbert.py is its entry point.
pytorch_model_settings = dict(
    model_data=model_uri,
    role=role,
    entry_point="distillbert.py",
    framework_version="1.8.1",
    py_version="py3",
)
pytorch_model = PyTorchModel(**pytorch_model_settings)

In [10]:
# Timestamp suffix keeps the compilation job name unique across re-runs.
compilation_job_name = "distillbert-traced-{}".format(int(time.time()))

pytorch_model.compile(
    target_instance_family="ml_g4dn",
    # Must match the traced example inputs: batch of 1, padded to max_length tokens.
    # Using max_length (instead of a literal 128) keeps the two in sync.
    input_shape={"input_ids": [1, max_length], "attention_mask": [1, max_length]},
    compiler_options={"dtype": "int64"},
    output_path=upload_path,
    framework="pytorch",
    role=role,
    job_name=compilation_job_name,
    # NOTE(review): compile_max_run is presumably the compilation timeout in seconds — confirm against the SDK docs.
    compile_max_run=1500,
)

???????????????????????????????.........................................................!

<sagemaker.pytorch.model.PyTorchModel at 0x7f8877621ed0>

# Host on SageMaker

In [11]:
# Deploy the compiled model to a real-time endpoint backed by a single ml.g4dn.2xlarge instance.
# Remember to delete the endpoint once you are done with the notebook.
pytorch_model.deploy(instance_type="ml.g4dn.2xlarge", initial_instance_count=1)

----------!

<sagemaker.pytorch.model.PyTorchPredictor at 0x7f88761bc390>

# Run Inference

In [12]:
import numpy as np
from sagemaker_inference import encoder, decoder

# Serialize the example (input_ids, attention_mask) tensors into a JSON request body.
inp = encoder.encode(example_inputs_paraphrase, "application/json")

In [13]:
import json
import time

import boto3
import numpy as np
from sagemaker_inference import content_types, decoder, encoder

# Low-level runtime client used to invoke the endpoints directly
sm_runtime = boto3.Session().client("sagemaker-runtime")

# Requests and responses are both exchanged as JSON
request_type = response_type = content_types.JSON


In [14]:
# Send the JSON-encoded example to the compiled-model endpoint and print the raw response body
response = sm_runtime.invoke_endpoint(
    EndpointName=pytorch_model.endpoint_name,
    ContentType=request_type,
    Accept=response_type,
    Body=inp,
)
print(response["Body"].read())

b'[[[-1.8017159700393677, 1.8369837999343872]]]'


In [45]:
# Rough client-side throughput: time sequential invocations of the endpoint.
# The measured interval wraps the whole invoke_endpoint call, so it includes the
# request round trip, not only model execution.
num_of_trials = 100
total_time = 0.0
trial_count = 0
for trial_count in range(1, num_of_trials + 1):
    # perf_counter is monotonic and better suited to interval timing than time.time()
    start_time = time.perf_counter()
    response = sm_runtime.invoke_endpoint(
        EndpointName=pytorch_model.endpoint_name,
        ContentType=request_type,
        Accept=response_type,
        Body=inp,
    )
    end_time = time.perf_counter()
    total_time += end_time - start_time

# Divide by the actual trial count rather than a hard-coded 100
num_of_trials / total_time  # inferences per second

48.616173801931154

# 2. Install HF transformers in the GPU inference container

In [16]:
# Goal of this section: add a requirements.txt listing HuggingFace transformers to the
# compiled model artifact, so that transformers is installed when the container starts.
# First, show where the compiled artifact currently lives in S3.

print(pytorch_model.model_data)


s3://sagemaker-us-west-2-086613482928/hf_distillbert_traced/hf-model-distillbert-traced-ml_g4dn.tar.gz


In [17]:
# Change S3 URI below
! aws s3 cp s3://sagemaker-us-west-2-086613482928/hf_distillbert_traced/hf-model-distillbert-traced-ml_g4dn.tar.gz  .

download: s3://sagemaker-us-west-2-086613482928/hf_distillbert_traced/hf-model-distillbert-traced-ml_g4dn.tar.gz to ./hf-model-distillbert-traced-ml_g4dn.tar.gz


In [18]:
# Start from a clean working directory, then unpack the compiled artifact into it.
! rm -rf distillbert_compiled
! mkdir -p distillbert_compiled
! tar -xzvf hf-model-distillbert-traced-ml_g4dn.tar.gz -C distillbert_compiled

sample_input.pkl
10951053_0_Neo.meta
compiled.pt
10951053_0_Neo.so
10951053_0_Neo.json
dlr.h
libdlr.so
10951053_0_Neo.params
IOC-INF/
IOC-INF/metadata.json
manifest


In [19]:
# Add a code/ directory to the unpacked artifact containing requirements.txt and the
# distillbertreq.py entry-point script (both are expected next to this notebook).
! mkdir -p distillbert_compiled/code
! cp requirements.txt distillbert_compiled/code/
! cp distillbertreq.py distillbert_compiled/code/

In [20]:
! cd distillbert_compiled ; tar -czvf hf-model-distillbert-traced-ml_g4dn-req.tar.gz * ; mv hf-model-distillbert-traced-ml_g4dn-req.tar.gz ../


10951053_0_Neo.json
10951053_0_Neo.meta
10951053_0_Neo.params
10951053_0_Neo.so
code/
code/requirements.txt
code/distillbertreq.py
compiled.pt
dlr.h
IOC-INF/
IOC-INF/metadata.json
libdlr.so
manifest
sample_input.pkl


In [21]:
# Upload the repacked artifact (compiled model + code/) to the same S3 prefix as before.
model_uri_req = S3Uploader.upload('hf-model-distillbert-traced-ml_g4dn-req.tar.gz',upload_path)

# Deploy on SageMaker


In [22]:
# NOTE(review): presumably the Neo GPU inference container image for PyTorch 1.8.1;
# the account id and region (us-west-2) are hard-coded in this URI — adjust for other regions.
neo_inference_image_uri = "301217895009.dkr.ecr.us-west-2.amazonaws.com/sagemaker-inference-pytorch:1.8.1-gpu-py3"

# Model backed by the repacked artifact; distillbertreq.py is the entry point.
pytorch_model_req = PyTorchModel(
    model_data=model_uri_req,
    role=role,
    entry_point="distillbertreq.py",
    framework_version="1.8.1",
    py_version="py3",
    image_uri=neo_inference_image_uri,
)

In [23]:
# Deploy the repacked model to its own real-time endpoint on a single ml.g4dn.2xlarge instance.
# Remember to delete the endpoint once you are done with the notebook.
pytorch_model_req.deploy(instance_type="ml.g4dn.2xlarge", initial_instance_count=1)

----------!

<sagemaker.pytorch.model.PyTorchPredictor at 0x7f8875d5f3d0>

In [24]:
# Raw-text request payloads: each one pairs the same first sentence with a different second sentence.
_first_sentence = "The company HuggingFace is based in New York City"

datapos = {"inputs": [_first_sentence, "HuggingFace's headquarters are situated in Manhattan"]}
dataneg = {"inputs": [_first_sentence, "Apples are especially bad for your health"]}

# Run Inference

In [25]:
# Invoke the endpoint of the repacked model with the `datapos` payload (raw text as JSON)
# and display the raw response body.

response = sm_runtime.invoke_endpoint(
        EndpointName=pytorch_model_req.endpoint_name, ContentType=request_type, Accept=response_type, Body=json.dumps(datapos)
    )
response["Body"].read()

b'[{"label": "POSITIVE", "score": 0.9743868112564087}]'

In [26]:
# Invoke the same endpoint with the `dataneg` payload and display the raw response body.
response = sm_runtime.invoke_endpoint(
        EndpointName=pytorch_model_req.endpoint_name, ContentType=request_type, Accept=response_type, Body=json.dumps(dataneg)
    )
response["Body"].read()

b'[{"label": "NEGATIVE", "score": 0.9997486472129822}]'

In [38]:
# Rough client-side throughput for the endpoint that takes raw text.
# The measured interval wraps the whole invoke_endpoint call (including json.dumps of
# the payload), so it includes the request round trip, not only model execution.
num_of_trials = 100
total_time = 0.0
trial_count = 0
for trial_count in range(1, num_of_trials + 1):
    # perf_counter is monotonic and better suited to interval timing than time.time()
    start_time = time.perf_counter()
    response = sm_runtime.invoke_endpoint(
        EndpointName=pytorch_model_req.endpoint_name,
        ContentType=request_type,
        Accept=response_type,
        Body=json.dumps(datapos),
    )
    end_time = time.perf_counter()
    total_time += end_time - start_time

# Divide by the actual trial count rather than a hard-coded 100
num_of_trials / total_time  # inferences per second

49.803152878660576