# Load and serve a DistilBERT model from Mxnet on the fly

In [1]:
!pip uninstall -y ezsmdeploy

Found existing installation: ezsmdeploy 0.1.1
Uninstalling ezsmdeploy-0.1.1:
  Successfully uninstalled ezsmdeploy-0.1.1


In [2]:
!pip install mxnet gluonnlp pixiedust



## Train a model locally or remote

In [3]:
import gluonnlp as nlp; import mxnet as mx;

# load distilbert
model, vocab = nlp.model.get_model('distilbert_6_768_12', dataset_name='distilbert_book_corpus_wiki_en_uncased');

# tokenize then transform
tokenizer = nlp.data.BERTTokenizer(vocab, lower=True);
transform = nlp.data.BERTSentenceTransform(tokenizer, max_seq_length=512, pair=False, pad=False);

sample = transform(['Hello world!']);
words, valid_len = mx.nd.array([sample[0]]), mx.nd.array([sample[1]])
model(words, valid_len) #if you want to save the model files, hybridize first

# If you want to save this model and upload as a file to S3 you will have to hybridize() it first before serializing.
# If you want to load the model internally using some hub, don't pass in a model or use Model = None

# model.hybridize()
# model(words, valid_len)
# !mkdir mxnetmodel
# model.export(path='./mxnetmodel/')


[[[-0.29900676 -0.26228657  0.19592251 ... -0.2950903   0.07993776
    0.6249021 ]
  [-0.52049935  0.0308087   0.8929488  ... -0.3392077   0.6287769
    0.5026896 ]
  [-0.346172   -0.09102157  0.70326906 ... -0.10685936  0.26489177
    0.14643385]
  [-0.37253296 -0.4664145   0.31325984 ... -0.21355502  0.03972806
    0.32598272]
  [ 0.8439531  -0.33282137 -0.4697281  ...  0.2638155  -0.8451076
   -0.05704509]]]
<NDArray 1x5x768 @cpu(0)>

## Step 1 : Write a model transform script

#### Make sure you have a ...

- "load_model" function
    - input args are model path
    - returns loaded model object
    - model name is the same as what you saved the model file as (see above step)
<br><br>
- "predict" function
    - input args are the loaded model object and a payload
    - returns the result of model.predict
    - make sure you format it as a single (or multiple) string return inside a list for real time (for mini batch)
    - from a client, a list  or string or np.array that is sent for prediction is interpreted as bytes. Do what you have to for converting back to list or string or np.array
    - return the error for debugging


In [4]:
%%writefile modelscript_mxnet.py
import gluonnlp as nlp; import mxnet as mx;
from joblib import load
import numpy as np
import os
import json

#Return loaded model
def load_model(modelpath):
    model, vocab = nlp.model.get_model('distilbert_6_768_12', dataset_name='distilbert_book_corpus_wiki_en_uncased');
    print("loaded")
    return {'model':model,'vocab':vocab}

# return prediction based on loaded model (from the step above) and an input payload
def predict(modeldict, payload):
    
    #set_trace()
    
    model = modeldict['model']
    vocab = modeldict['vocab']
    
    tokenizer = nlp.data.BERTTokenizer(vocab, lower=True);
    transform = nlp.data.BERTSentenceTransform(tokenizer, max_seq_length=512, pair=False, pad=False);
    
    try:
        # Local
        if type(payload) == str:
            sample = transform(payload);
        elif type(payload) == bytes :
            sample = transform(str(payload.decode()));
        # Remote, standard payload comes in as a list of json strings with 'body' key
        elif type(payload)==list:
            sample = transform(payload[0]['body'].decode());
        else:
            return [json.dumps({'response':"Provide string or bytes string",
                    'payload':str(payload),
                    'type':str(type(payload))})]
        
        words, valid_len = mx.nd.array([sample[0]]), mx.nd.array([sample[1]])
        out = model(words, valid_len)  
        out = json.dumps({'output':out.asnumpy().tolist()})
    except Exception as e:
        out = str(e) #useful for debugging!
    return [out]

Overwriting modelscript_mxnet.py


## Does this work locally? (not "_in a container locally_", but _actually_ in local)

In [5]:
from modelscript_mxnet import *
model = load_model('') # path doesn't matter here since we're loading the model directly in the script

loaded


In [6]:
predict(model,'Hello World!')[0]

'{"output": [[[-0.07445868849754333, -0.0991184413433075, 0.13384896516799927, -0.05646568536758423, 0.08494120836257935, 0.11290202289819717, 0.6132094264030457, -0.5018954277038574, 0.2793712019920349, -0.7371849417686462, -0.17871789634227753, -0.1073930561542511, -0.2754637598991394, 0.2049282342195511, 0.5614697933197021, 0.26408329606056213, 0.43186742067337036, 0.3533274829387665, -0.09774183481931686, -0.08646772801876068, 0.34320127964019775, 0.30926668643951416, -0.023535851389169693, -0.5332310199737549, 0.255035936832428, -0.07873189449310303, 0.11670864373445511, -0.48287102580070496, 0.16832000017166138, 0.37750616669654846, 0.3227921724319458, 0.2094670683145523, -0.2776387333869934, -0.07202951610088348, -0.17642739415168762, 0.25404036045074463, 0.3166342079639435, 0.10013242810964584, -0.2376513034105301, -0.18394093215465546, -0.49095213413238525, -0.7295344471931458, -0.21525487303733826, 0.6681957244873047, -0.2989739775657654, -0.18033066391944885, -1.98974609375,

### ok great! Now let's install ezsmdeploy

_[To Do]_: currently local; replace with pip version!

In [7]:
!pip install -e ./ --quiet 

In [8]:
import ezsmdeploy

#### If you have been running other inference containers in local mode, stop existing containers to avoid conflict

In [9]:
!docker container stop $(docker container ls -aq) >/dev/null

## Deploy locally

In [10]:
ez = ezsmdeploy.Deploy(model = None, #loading distilbert model in script from hub
                  script = 'modelscript_mxnet.py',
                  requirements = ['pyarrow','mxnet', 'gluonnlp','numpy','joblib'], #or pass in the path to requirements.txt
                  instance_type = 'local',
                  wait = True)


[K0:00:00.002975 | No model was passed. Assuming you are downloading a model in the script or in the container
[K0:00:00.133182 | uploaded model tarball(s) ; check returned modelpath
[K0:00:00.134234 | added requirements file
[K0:00:00.136067 | added source file
[K0:00:00.137555 | added Dockerfile
[K0:00:00.139512 | added model_handler and docker utils
[K0:00:00.139605 | building docker container
[K0:00:33.769931 | built docker container
[K0:00:33.886715 | created model(s). Now deploying on local
[32m●∙∙[0m [KAttaching to tmpawbygeat_algo-1-157np_1
[36malgo-1-157np_1  |[0m Starting the inference server with 32 workers.
[32m∙●∙[0m [K[36malgo-1-157np_1  |[0m [2020-04-22 19:22:04 +0000] [9] [INFO] Starting gunicorn 20.0.4
[36malgo-1-157np_1  |[0m [2020-04-22 19:22:04 +0000] [9] [INFO] Listening at: unix:/tmp/gunicorn.sock (9)
[36malgo-1-157np_1  |[0m [2020-04-22 19:22:04 +0000] [9] [INFO] Using worker: gevent
[36malgo-1-157np_1  |[0m [2020-04-22 19:22:04 +0000] [1

## Test containerized version locally

Since you are downloading this model from a hub, the first time you invoke it will be slow, so invoke again to get an inference without all of the container logs

In [17]:
out = ez.predictor.predict('Hello World').decode()
out

[36malgo-1-157np_1  |[0m received input data
[36malgo-1-157np_1  |[0m b'Hello World'
[36malgo-1-157np_1  |[0m loaded
[36malgo-1-157np_1  |[0m predictions from model
[36malgo-1-157np_1  |[0m ['{"output": [[[-0.07445868849754333, -0.0991184413433075, 0.13384896516799927, -0.05646568536758423, 0.08494120836257935, 0.11290202289819717, 0.6132094264030457, -0.5018954277038574, 0.2793712019920349, -0.7371849417686462, -0.17871789634227753, -0.1073930561542511, -0.2754637598991394, 0.2049282342195511, 0.5614697933197021, 0.26408329606056213, 0.43186742067337036, 0.3533274829387665, -0.09774183481931686, -0.08646772801876068, 0.34320127964019775, 0.30926668643951416, -0.023535851389169693, -0.5332310199737549, 0.255035936832428, -0.07873189449310303, 0.11670864373445511, -0.48287102580070496, 0.16832000017166138, 0.37750616669654846, 0.3227921724319458, 0.2094670683145523, -0.2776387333869934, -0.07202951610088348, -0.17642739415168762, 0.25404036045074463, 0.3166342079639435, 0.1001

'{"output": [[[-0.07445868849754333, -0.0991184413433075, 0.13384896516799927, -0.05646568536758423, 0.08494120836257935, 0.11290202289819717, 0.6132094264030457, -0.5018954277038574, 0.2793712019920349, -0.7371849417686462, -0.17871789634227753, -0.1073930561542511, -0.2754637598991394, 0.2049282342195511, 0.5614697933197021, 0.26408329606056213, 0.43186742067337036, 0.3533274829387665, -0.09774183481931686, -0.08646772801876068, 0.34320127964019775, 0.30926668643951416, -0.023535851389169693, -0.5332310199737549, 0.255035936832428, -0.07873189449310303, 0.11670864373445511, -0.48287102580070496, 0.16832000017166138, 0.37750616669654846, 0.3227921724319458, 0.2094670683145523, -0.2776387333869934, -0.07202951610088348, -0.17642739415168762, 0.25404036045074463, 0.3166342079639435, 0.10013242810964584, -0.2376513034105301, -0.18394093215465546, -0.49095213413238525, -0.7295344471931458, -0.21525487303733826, 0.6681957244873047, -0.2989739775657654, -0.18033066391944885, -1.98974609375,

## Deploy on SageMaker

In [15]:
!./src/build-docker.sh

Building container ezsmdeploy-image-

An error occurred (InvalidParameterException) when calling the CreateRepository operation: Invalid parameter at 'repositoryName' failed to satisfy constraint: 'must satisfy regular expression '(?:[a-z0-9]+(?:[._-][a-z0-9]+)*/)*[a-z0-9]+(?:[._-][a-z0-9]+)*''
https://docs.docker.com/engine/reference/commandline/login/#credentials-store

Login Succeeded
invalid argument "ezsmdeploy-image-" for "-t, --tag" flag: invalid reference format
See 'docker build --help'.
Error parsing reference: "ezsmdeploy-image-" is not a valid repository/tag: invalid reference format
invalid reference format
497456752804.dkr.ecr.us-east-1.amazonaws.com/ezsmdeploy-image-
SUCCESS


In [20]:
ezonsm = ezsmdeploy.Deploy(model = None, #loading distilbert model in script from hub
                  script = 'modelscript_mxnet.py',
                  requirements = ['pyarrow','mxnet', 'gluonnlp','numpy','joblib'], #or pass in the path to requirements.txt
                  instance_type = 'ml.m4.xlarge',
                  wait = True)

[K0:00:00.002812 | No model was passed. Assuming you are downloading a model in the script or in the container
[K0:00:00.074029 | uploaded model tarball(s) ; check returned modelpath
[K0:00:00.074677 | added requirements file
[K0:00:00.076766 | added source file
[K0:00:00.078415 | added Dockerfile
[K0:00:00.080521 | added model_handler and docker utils
[K0:00:00.080612 | building docker container
[K0:00:33.568171 | built docker container
[K0:00:33.678219 | created model(s). Now deploying on ml.m4.xlarge
[K0:09:06.979438 | deployed model
[K0:09:06.980192 | estimated cost is $0.31 per hour
[K[32m0:09:06.980406 | Done! ✔[0m 


In [22]:
out = ezonsm.predictor.predict('Hello World').decode() 
out

'{"output": [[[-0.07445868849754333, -0.0991184413433075, 0.13384896516799927, -0.05646568536758423, 0.08494120836257935, 0.11290202289819717, 0.6132094264030457, -0.5018954277038574, 0.2793712019920349, -0.7371849417686462, -0.17871789634227753, -0.1073930561542511, -0.2754637598991394, 0.2049282342195511, 0.5614697933197021, 0.26408329606056213, 0.43186742067337036, 0.3533274829387665, -0.09774183481931686, -0.08646772801876068, 0.34320127964019775, 0.30926668643951416, -0.023535851389169693, -0.5332310199737549, 0.255035936832428, -0.07873189449310303, 0.11670864373445511, -0.48287102580070496, 0.16832000017166138, 0.37750616669654846, 0.3227921724319458, 0.2094670683145523, -0.2776387333869934, -0.07202951610088348, -0.17642739415168762, 0.25404036045074463, 0.3166342079639435, 0.10013242810964584, -0.2376513034105301, -0.18394093215465546, -0.49095213413238525, -0.7295344471931458, -0.21525487303733826, 0.6681957244873047, -0.2989739775657654, -0.18033066391944885, -1.98974609375,

In [23]:
ezonsm.predictor.delete_endpoint()