# Initialization

In [6]:
import sagemaker as sm
import boto3
import json
from datetime import datetime
from time import strftime, gmtime

In [7]:
# Local folder holding the downloaded MUSE model (it is packaged into model.tar.gz further below).
local_model_path = "../../../models/MUSE/large/000003"
# S3 location of the packaged model. It is built from the session's default bucket so that it
# always matches the upload target of the packaging cell (s3://{bucket}/MUSE/model.tar.gz)
# instead of hard-coding an account-specific bucket name.
model_s3_path = f"s3://{sm.session.Session().default_bucket()}/MUSE/model.tar.gz"

In [8]:
bucket = sm.session.Session().default_bucket()
print(f"Default bucket: {bucket}")

Default bucket: sagemaker-eu-west-1-113147044314


# Retrieving and packaging the model for SageMaker

We already downloaded the model when we first tried to deploy it using the SageMaker SDK support for Tensorflow. Now we just need to copy it to the proper location.

In [None]:
# Package the contents of the local model folder (-C switches into it, "." archives everything inside)
# into a gzipped tarball.
!tar -czf /tmp/model.tar.gz -C {local_model_path} .
# List the archive to confirm it was created.
!ls -la /tmp/*.tar.gz
# Upload the archive under the MUSE/ prefix of the default bucket.
!aws s3 cp /tmp/model.tar.gz s3://{bucket}/MUSE/model.tar.gz

# Common script used by local, local SM and Endpoint

In [None]:
%%writefile modelscript_tensorflow.py
import tensorflow as tf
import numpy as np
import tensorflow_hub as hub
import tensorflow_text
import json

# Load the model stored at `modelpath` and hand it back to the caller
def load_model(modelpath):
    """Load the TensorFlow Hub model found at `modelpath`.

    Args:
        modelpath: Location passed straight to `hub.load`.

    Returns:
        Whatever `hub.load` returns for that location.
    """
    return hub.load(modelpath)

# Return a prediction based on the loaded model (from the step above) and an input payload
def predict(model, payload):
    """Run `model` on `payload` and return the result serialized as JSON.

    Accepted payloads (str, or any object with a `.decode()` method such as bytes):
      * a JSON object -> its 'instances' field is used; without that field the whole
        payload is treated as a single input string;
      * a JSON array  -> used as the list of inputs;
      * anything else (plain text, or JSON scalars such as `123` or `"text"`) -> the raw
        payload is treated as a single input string.

    Args:
        model: Callable invoked with the list of inputs; its result is converted with
            `np.asarray(...).tolist()`.
        payload: The request body.

    Returns:
        A JSON string of the form {"output": ...}. If anything fails while parsing or
        running the model, "output" holds the exception message instead of raising.
    """
    if not isinstance(payload, str):
        payload = payload.decode()
    try:
        # Parse the payload once; text that is not valid JSON is simply a raw string input.
        try:
            parsed = json.loads(payload)
        except json.JSONDecodeError:
            parsed = None

        if isinstance(parsed, dict):
            # If it has no instances field, assume the payload is a string
            data = parsed.get('instances', [payload])
        elif isinstance(parsed, list):
            data = parsed
        else:
            # Not JSON, or a JSON scalar (number, string, bool, null): previously `data` was left
            # unbound in the scalar case and the caller received an UnboundLocalError message.
            data = [payload]

        result = np.asarray(model(data))
        out = result.tolist()
    except Exception as e:
        # Best effort: report the failure in the response body rather than crashing the worker.
        out = str(e)
    return json.dumps({'output': out})

# Testing local inference

The first step to check that we got the correct model is to test it locally. To do that, we need to install the libraries the model depends on, at the same versions that were used to train it. As can be seen on [Tensorflow Hub](https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3), those are:
- Tensorflow 2: we'll use version 2.2.0
- Tensorflow Text: we'll use version 2.2.0, under the assumption that it's the one compatible with Tensorflow 2.2
- We'll also install Tensorflow Hub, because it provides the function to load the model.

In [6]:
#!pip uninstall -y tensorflow-gpu
!pip install --force-reinstall tensorflow>=2.2.0 tensorflow-hub>=0.8.0 tensorflow-text==2.2.0 protobuf

[31mERROR: tensorflow-serving-api 1.15.0 has requirement tensorflow~=1.15.0, but you'll have tensorflow 2.2.0 which is incompatible.[0m
[31mERROR: tensorflow-gpu 1.15.2 has requirement gast==0.2.2, but you'll have gast 0.3.3 which is incompatible.[0m
[31mERROR: tensorflow-gpu 1.15.2 has requirement tensorboard<1.16.0,>=1.15.0, but you'll have tensorboard 2.2.2 which is incompatible.[0m
[31mERROR: tensorflow-gpu 1.15.2 has requirement tensorflow-estimator==1.15.1, but you'll have tensorflow-estimator 2.2.0 which is incompatible.[0m
[31mERROR: awscli 1.18.39 has requirement rsa<=3.5.0,>=3.1.2, but you'll have rsa 4.6 which is incompatible.[0m
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [2]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text
import numpy as np
from sagemaker.tensorflow.serving import Model

# my_devices = tf.config.experimental.list_physical_devices(device_type='CPU')
# tf.config.experimental.set_visible_devices(devices= my_devices, device_type='CPU')

# print(f"Tensorflow version: {tf.__version__}")
# print(f"Tensorflow text does not provide a version object")
# print(f"Tensorflow hub version: {hub.__version__}")

In [3]:
tf.config.list_physical_devices('CPU')

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]

In [5]:
tf.debugging.set_log_device_placement(True)

In [4]:
tf.config.set_visible_devices([], 'GPU')

In [12]:
# Attempt to convert the SavedModel at `local_model_path` into a TFLite model, asking for
# size optimization with float16 as the supported type.
# NOTE(review): as the recorded output below shows, this cell currently fails with a
# ConverterError ("Failed to find function '__inference_pruned_106594'"), so
# `Tflite_quantized_model` is never produced -- fix or drop this cell before relying on Run All.
converter = tf.lite.TFLiteConverter.from_saved_model(local_model_path)
converter.optimizations = [tf.lite.Optimize.OPTIMIZE_FOR_SIZE]
converter.target_spec.supported_types = [tf.float16]
Tflite_quantized_model = converter.convert()

ConverterError: See console for info.
Traceback (most recent call last):
  File "/home/ec2-user/anaconda3/envs/tensorflow_p36/bin/toco_from_protos", line 8, in <module>
    sys.exit(main())
  File "/home/ec2-user/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/tensorflow/lite/toco/python/toco_from_protos.py", line 93, in main
    app.run(main=execute, argv=[sys.argv[0]] + unparsed)
  File "/home/ec2-user/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/tensorflow/python/platform/app.py", line 40, in run
    _run(main=main, argv=argv, flags_parser=_parse_flags_tolerate_undef)
  File "/home/ec2-user/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/absl/app.py", line 299, in run
    _run_main(main, args)
  File "/home/ec2-user/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/absl/app.py", line 250, in _run_main
    sys.exit(main(argv))
  File "/home/ec2-user/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/tensorflow/lite/toco/python/toco_from_protos.py", line 56, in execute
    enable_mlir_converter)
Exception: Failed to find function '__inference_pruned_106594'. The imported TensorFlow GraphDef is ill-formed.




In [None]:
# Import only the two entry points defined in modelscript_tensorflow.py instead of `*`,
# so it is clear where `load_model` and `predict` come from and the script's own imports
# do not silently overwrite names in the notebook namespace.
from modelscript_tensorflow import load_model, predict

# Load the model from the local folder configured at the top of the notebook.
model = load_model(local_model_path)

The model expects its input as a JSON object in one of the following formats:
```javascript
{
    "instances": ["example 1", "example 2", ...]
}
["example 1", "example 2", ....]
```
and will return the embeddings in the following format:
```javascript
{
    "output": [[<embeddings for example 1>], [<embeddings for example 2>], ...]
}
```

We'll try the two calls to test that the model itself is working.

In [None]:
inputs = ['The quick brown fox jumped over the lazy dog.', 'This is a test']
inputs_json = json.dumps({'instances': inputs})
inputs_json_list = json.dumps(inputs)

In [None]:
print(f"Input: {inputs_json}\n")
print(f"Result:\n{json.loads(predict(model, inputs_json))}")

In [None]:
print(f"Input: {inputs_json_list}\n")
print(f"Result:\n{json.loads(predict(model, inputs_json_list))}")

The model can also be called with a simple string as input. From the example below, you can see that the result format is always the same:

In [None]:
json.loads(predict(model, inputs[0]))

**You may have to restart the Kernel and run the initialization and setting of model paths before continuing.** The locally loaded model cannot be released from GPU otherwise, and the local SM won't have enough memory to proceed.

# Deploying on SageMaker and a Custom Container Based on EZSMDeploy

[EZSMDeploy](https://pypi.org/project/ezsmdeploy/) got us started, but it's too limited to deploy an optimized configuration. On the other hand, SageMaker's [Tensorflow Serving image](https://github.com/aws/sagemaker-tensorflow-serving-container) also doesn't work, so we'll need to create our own container. We have copied the `src` folder created by EZSMDeploy and edited the files. Let's take a look at our changes:

## Dockerfile

We made the following changes to the Dockerfile:
- Based it on Nvidia's `cuda:10.1-base-ubuntu18.04`. That should make the CUDA libraries available.
- We also need to install several additional packages for them to work: `cuda-command-line-tools-10-1`, `cuda-cufft-10-1`, `cuda-curand-10-1`, 
  `cuda-cusolver-10-1`, `cuda-cusparse-10-1`, `libcublas10=10.1.0.105-1`, `libcublas-dev=10.1.0.105-1`, `libcudnn7`, `libnccl2`, `libgomp1`.

In [None]:
!pygmentize src/Dockerfile

## Build Script

We made the following changes to the `build-docker.sh` script:
- Changed `algorithm-name` to `"muse-large-000003"`
- Added a `latest` tag to the image
- Removed the creation of `done.txt`, since we're calling the script synchronously from the notebook
- If you are running the script from inside a SageMaker notebook instance on a GPU-enabled AWS instance, no additional configuration is needed - 
  the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-docker/wiki) is already installed. If you are running from some other environment, please check their instructions.

In [None]:
!pygmentize src/build-docker.sh

In [None]:
!src/build-docker.sh

## Deploying Locally

In [None]:
from sagemaker import get_execution_role

role = get_execution_role()

In [None]:
from  sagemaker.model import Model
from  sagemaker.predictor import RealTimePredictor

model = Model(model_data=model_s3_path, image='113147044314.dkr.ecr.eu-west-1.amazonaws.com/muse-large-000003', role=role, 
              predictor_cls= RealTimePredictor, name='muse-large-000003')

In [None]:
local_predictor = model.deploy(initial_instance_count=1, instance_type='local',  endpoint_name='muse-large-000003-local', wait=True)

In [None]:
inputs = ['The quick brown fox jumped over the lazy dog.', 'This is a test']
inputs_json = json.dumps({'instances': inputs})
inputs_json_list = json.dumps(inputs)

In [None]:
out = local_predictor.predict(inputs_json_list.encode()).decode()

In [None]:
local_predictor.delete_endpoint()

## Deploying to a SageMaker Endpoint

In [None]:
# Build the URI of the custom image pushed by build-docker.sh from the current account and region.
# The previous expression referenced names that are not defined anywhere in this notebook
# (get_ecr_image_uri_prefix, account, region, ecr_repo, tag) and raised a NameError.
account = boto3.client('sts').get_caller_identity()['Account']
region = boto3.session.Session().region_name
image_uri = f"{account}.dkr.ecr.{region}.amazonaws.com/muse-large-000003:latest"
image_uri

In [None]:
from  sagemaker.model import Model
from  sagemaker.predictor import RealTimePredictor

model = Model(model_data=model_s3_path, image='113147044314.dkr.ecr.eu-west-1.amazonaws.com/muse-large-000003', role=role, 
              predictor_cls= RealTimePredictor, env={'MODEL_SERVER_WORKERS': '1'}, name='muse-large-000003')

In [None]:
# Deploy the model to a real-time SageMaker endpoint and wait until it is in service.
# NOTE(review): the endpoint name ends in "g4dn" but the instance type is ml.p3dn.24xlarge --
# confirm which of the two is intended and align them.
predictor = model.deploy(initial_instance_count=1, instance_type='ml.p3dn.24xlarge',  endpoint_name='muse-large-000003-g4dn', wait=True)

In [None]:
out = predictor.predict(inputs_json_list.encode()).decode()
# Tear the endpoint down once we have the result, as the other deployments in this notebook do;
# otherwise the GPU instance keeps running (and billing) after `predictor` is reassigned later on.
predictor.delete_endpoint()

# EZSMDeploy - Remove afterwards

Then we create a local deployment (for quick testing purposes), passing it:
- the location of the model we downloaded
- the script we defined above with the `load_model` and `predict` functions
- the dependencies we'll need to run the model
- A model name that SageMaker will use to create metadata and track the model creation.

We also tell it to deploy on local mode. Local mode (requested by specifying `local` as the instance type) deploys the Docker container in the machine where the call to deploy was made. It's a convenience for testing ideas fast, disconnected from the SageMaker service. It should not be used for real inference, just small tests.

In [None]:
# ezsmdeploy is not part of the notebook's import cell; import it here so that this cell
# also runs on a freshly restarted kernel.
import ezsmdeploy

# Build the container and deploy it in SageMaker local mode (instance_type='local').
ez = ezsmdeploy.Deploy(
    model = local_model_path,
    script = 'modelscript_tensorflow.py',
    requirements = ['numpy','tensorflow-gpu==2.2.0','tensorflow_hub', 'tensorflow-text==2.2.0'], #or pass in the path to requirements.txt
    instance_type = 'local',
    monitor=False,
    name='muse-large-000003',
    wait = True
)

From the log we can see we had some problems with GPU. This is because EZSMDeploy doesn't start from an image that has the required GPU drivers. In fact, we can check the Dockerfile used by EZSMDeploy and see it starts from standard Ubuntu 16.04:

In [None]:
!pygmentize src/Dockerfile

All the code generated by EZSMDeploy to create and serve the model is under the `src` folder. The Dockerfile is doing some interesting things:
- It installs all the requirements from a requirements file generated by EZSMDeploy based on the parameter passed by us
- It copies the entire contents of the folder into the image.

Besides the `Dockerfile` above, you may also want to check:
- `transformscript.py`: That's a copy of the script created by us and passed as a parameter.
- `serve`: The base script run by the container (default SageMaker call when serving and no other entrypoint was provided). It just starts the web services:
    - nginx
    - gunicorn
- `wsgi.py`: Used by gunicorn to start the actual workers. As you can see, it's just a simple wrapper around a flask application defined in `predictor.py`.
- `predictor.py`: The most interesting function here is called `transformation`. Interesting things happening here:
    - It imports `transformscript`, effectively having the functions to load and generate inference from the model.
    - It adds several `print` statements that generate useful logs. While useful, they could have performance and security impacts, and we recommend that these are reviewed and removed later.
    
In general, EZSMDeploy is a quick way to generate a deployment template to get started faster when creating new models, but it has its limitations. Let's see how well it works.

In [None]:
inputs = ['The quick brown fox jumped over the lazy dog.', 'This is a test']
inputs_json = json.dumps({'instances': inputs})
inputs_json_list = json.dumps(inputs)

In [None]:
out = ez.predictor.predict(inputs_json_list.encode()).decode()

You can see the actual input and output in the logs above. And here's the result:

In [None]:
json.loads(out)['output']

So, we have generated an embedding from a deployed endpoint, and it seems to work locally. In the next section, we'll see if it also works for production deployment. But first let's remove the local endpoint and release the resources.

In [None]:
ez.predictor.delete_endpoint()

# Deploying to a SageMaker Endpoint

## Deploying through EZSMDeploy Interface

EZSMDeploy always rebuilds the image when rerun - but Docker will be smart about its caching, so the building and push should be faster. Most of the time spent here should be on starting and configuring an EC2 instance to deploy the model to.

In [None]:
# Same script and requirements as the local-mode deployment, but targeting an
# ml.p3.2xlarge instance instead of 'local'.
ezonsm = ezsmdeploy.Deploy(
    model = local_model_path, # Path to the locally downloaded model
    script = 'modelscript_tensorflow.py',
    requirements = ['numpy','tensorflow-gpu==2.2.0','tensorflow_hub', 'tensorflow-text==2.2.0'],
    wait = True,
    instance_type = 'ml.p3.2xlarge',
    monitor=False,
    name='muse-large-000003'
)

We copied a few examples from the book depository dataset to try our endpoint on.

In [None]:
messages = json.dumps({'instances':[
    "Brian Cosgrove's classic introduction to the world of microlight flying has endeared itself to several generations of pilots.",
    "BECAUSE NOT ALL KRAV MAGA IS THE SAME(R) This book is designed for krav maga trainees, security-conscious civilians, law enforcement officers, security professionals, and military personnel alike who wish to refine their essential krav maga combatives, improve their chances of surviving a hostile attack and prevail without serious injury. Combatives are the foundation of krav maga counter-attacks. These are the combatives of the original Israeli Krav Maga Association (Grandmaster Gidon). It is irrefutable that you need only learn a few core combatives to be an effective fighter. Simple is easy. Easy is effective. Effective is what is required to end a violent encounter quickly, decisively, and on your terms. This book stresses doing the right things and doing them in the right way. Right technique + Correct execution = Maximum Effect. Contents include Key strategies for achieving maximum combative effects Krav maga's 12 most effective combatives Developing power and balance Combatives for the upper and lower body Combative combinations and retzev (continuous combat motion) Combatives for takedowns and throws Combatives for armbars, leglocks, and chokes Whatever your martial arts or defensive tactics background or if you have no self-defense background at all, this book can add defensive combatives and combinations to your defensive repertoire. Our aim is to build a strong self-defense foundation through the ability to optimally counter-attack.",
    """-AWESOME FACTS ABOUT THE RUGBY WORLD CUP: I have intentionally selected a specific range of "Rugby World Cup" facts that I feel will not only help children to learn new information but more importantly, remember it. -FUN LEARNING TOOL FOR ALL AGES: This book is designed to capture the imagination of everyone through the use of "WoW" trivia, cool photos and memory recall quiz. -COOL & COLORFUL PICTURES: Each page contains a quality image relating to the subject in question. This helps the reader to match and recall the content. -SHORT QUIZ GAME - POSITIVE REINFORCEMENT: No matter what the score is, everyone's a WINNER! The purpose of the short quiz at the end is to help check understanding, to cement the information and to provide a positive conclusion, regardless of the outcome. Your search for the best "Rugby Union" book is finally over. When you purchase from me today, here are just some of the things you can look forward to..... Amazing and extraordinary "Rugby World Cup" facts. This kind of trivia seems to be one of the few things my memory can actually recall. I'm not sure if it's to do with the shock or the "WoW" factor but for some reason my brain seems to store at least some of it for a later date. A fun way of learning. I've always been a great believer in that whatever the subject, if a good teacher can inspire you and hold your attention, then you'll learn! Now I'm not a teacher but the system I've used in previous publications on Kindle seems to work well, particularly with children. A specific selection of those "WoW" facts combined with some pretty awesome pictures, if I say so myself! Words and images combined to stimulate the brain and absorb the reader using an interactive formula. At the end there is a short "True or False" quiz to check memory recall. Don't worry though, it's a bit of fun but at the same time, it helps to check understanding. Remember, "Everyone's a Winner!" Enjoy ......... Matt."""
]})
out = ezonsm.predictor.predict(messages.encode()).decode()
#x = np.array(out['output'])

We can see below that the result was a list of lists, with each sublist containing 512 elements. Then we check that these elements are indeed values for the vector embedding.

In [None]:
# Length of each returned embedding. The response is parsed once and iterated directly,
# instead of being re-parsed for every element.
[len(embedding) for embedding in json.loads(out)['output']]

In [None]:
print(json.loads(out)['output'][0])

Let's delete the model to save resources.

In [None]:
ezonsm.predictor.delete_endpoint()

## Deploying from the SageMaker SDK Model Object created by EZSMDeploy

EZSMDeploy also gives us the SageMaker SDK Model object it creates to deploy the model. We can use that to deploy the model as well.

In [None]:
model = ezonsm.sagemakermodel
model_name = ezonsm.sagemakermodel.name

In [None]:
predictor = model.deploy(initial_instance_count=1, instance_type='ml.p3.2xlarge', endpoint_name=model_name)

In [None]:
messages = json.dumps({'instances':[
    "Brian Cosgrove's classic introduction to the world of microlight flying has endeared itself to several generations of pilots.",
    "BECAUSE NOT ALL KRAV MAGA IS THE SAME(R) This book is designed for krav maga trainees, security-conscious civilians, law enforcement officers, security professionals, and military personnel alike who wish to refine their essential krav maga combatives, improve their chances of surviving a hostile attack and prevail without serious injury. Combatives are the foundation of krav maga counter-attacks. These are the combatives of the original Israeli Krav Maga Association (Grandmaster Gidon). It is irrefutable that you need only learn a few core combatives to be an effective fighter. Simple is easy. Easy is effective. Effective is what is required to end a violent encounter quickly, decisively, and on your terms. This book stresses doing the right things and doing them in the right way. Right technique + Correct execution = Maximum Effect. Contents include Key strategies for achieving maximum combative effects Krav maga's 12 most effective combatives Developing power and balance Combatives for the upper and lower body Combative combinations and retzev (continuous combat motion) Combatives for takedowns and throws Combatives for armbars, leglocks, and chokes Whatever your martial arts or defensive tactics background or if you have no self-defense background at all, this book can add defensive combatives and combinations to your defensive repertoire. Our aim is to build a strong self-defense foundation through the ability to optimally counter-attack.",
    """-AWESOME FACTS ABOUT THE RUGBY WORLD CUP: I have intentionally selected a specific range of "Rugby World Cup" facts that I feel will not only help children to learn new information but more importantly, remember it. -FUN LEARNING TOOL FOR ALL AGES: This book is designed to capture the imagination of everyone through the use of "WoW" trivia, cool photos and memory recall quiz. -COOL & COLORFUL PICTURES: Each page contains a quality image relating to the subject in question. This helps the reader to match and recall the content. -SHORT QUIZ GAME - POSITIVE REINFORCEMENT: No matter what the score is, everyone's a WINNER! The purpose of the short quiz at the end is to help check understanding, to cement the information and to provide a positive conclusion, regardless of the outcome. Your search for the best "Rugby Union" book is finally over. When you purchase from me today, here are just some of the things you can look forward to..... Amazing and extraordinary "Rugby World Cup" facts. This kind of trivia seems to be one of the few things my memory can actually recall. I'm not sure if it's to do with the shock or the "WoW" factor but for some reason my brain seems to store at least some of it for a later date. A fun way of learning. I've always been a great believer in that whatever the subject, if a good teacher can inspire you and hold your attention, then you'll learn! Now I'm not a teacher but the system I've used in previous publications on Kindle seems to work well, particularly with children. A specific selection of those "WoW" facts combined with some pretty awesome pictures, if I say so myself! Words and images combined to stimulate the brain and absorb the reader using an interactive formula. At the end there is a short "True or False" quiz to check memory recall. Don't worry though, it's a bit of fun but at the same time, it helps to check understanding. Remember, "Everyone's a Winner!" Enjoy ......... Matt."""
]})
out = predictor.predict(messages.encode()).decode()
#x = np.array(out['output'])

In [None]:
# Length of each returned embedding. The response is parsed once and iterated directly,
# instead of being re-parsed for every element.
[len(embedding) for embedding in json.loads(out)['output']]

In [None]:
print(json.loads(out)['output'][0])

We have the same results as the deployment through EZSMDeploy. That is good, but not perfect. If we check the logs, we see that we are still not leveraging GPU, so the P3 instance is not being used to its fullest. The message that shows the problem is this:
```
tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
tensorflow/stream_executor/cuda/cuda_driver.cc:313] failed call to cuInit: UNKNOWN ERROR (303)
tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: model.aws.local
tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: model.aws.local
tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: Not found: was unable to find libcuda.so DSO loaded into this program
```
That is because the image created by EZSMDeploy is not created with GPU support. 

In [None]:
predictor.delete_endpoint()

# Creating a Batch Transform

Besides starting an endpoint and using it on request, we can also tell SageMaker to apply a batch transformation to an entire dataset. Let's get the latest processed data and use that.

In [None]:
smclient = boto3.client('sagemaker')

In [None]:
# Find the most recent completed processing job whose name contains 'muse-dask-processing'.
completed_jobs = smclient.list_processing_jobs(
    # Pass a timezone-aware timestamp: botocore treats naive datetimes as UTC, so a naive
    # local-time now() could exclude recent jobs on machines whose clock is behind UTC.
    CreationTimeBefore=datetime.now().astimezone(),
    NameContains='muse-dask-processing',
    StatusEquals='Completed',
    SortBy='CreationTime',
    SortOrder='Descending',
    MaxResults=1
)['ProcessingJobSummaries']
if not completed_jobs:
    # Fail with a clear message instead of an IndexError on the empty list.
    raise RuntimeError("No completed processing job with 'muse-dask-processing' in its name was found")
latest_job = completed_jobs[0]['ProcessingJobName']
print(f"Latest processing job: {latest_job}")

# Look up where that job wrote its 'processed-dataset' output.
job_description = smclient.describe_processing_job(ProcessingJobName=latest_job)
processed_outputs = [
    output['S3Output']['S3Uri']
    for output in job_description['ProcessingOutputConfig']['Outputs']
    if output['OutputName'] == 'processed-dataset'
]
if not processed_outputs:
    # Fail with a clear message instead of a bare StopIteration.
    raise RuntimeError(f"Processing job {latest_job} has no output named 'processed-dataset'")
s3_processed_data = processed_outputs[0]
print(f"Location of latest processed data: {s3_processed_data}")

Now that we have the location of the latest processed dataset, let's feed it into the transformer. First, we need to create a [Transformer](https://sagemaker.readthedocs.io/en/stable/api/inference/transformer.html#sagemaker.transformer.Transformer) based on the model we used before (more info on batch transformation [here](https://sagemaker.readthedocs.io/en/stable/overview.html#sagemaker-batch-transform)).

Since we know that our container is not correctly set up for using GPU, let's use a cheaper instance for this one and leverage some parallelism.

In [None]:
timestamp_prefix = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
s3_inference_output = f"s3://{bucket}/sagemaker/muse-inference/output/{timestamp_prefix}"
print(f"Inference results will be saved at {s3_inference_output}")

In [None]:
# Create a batch Transformer from `model`: five ml.m5.xlarge instances, one concurrent
# request per instance, results written under `s3_inference_output`.
# NOTE(review): `model` is whichever object was assigned to that name last in this kernel
# (it is rebound several times above) -- confirm it is the intended SageMaker Model.
muse_transformer = model.transformer(
    instance_count=5,
    instance_type='ml.m5.xlarge', 
    output_path=s3_inference_output,
    accept="application/json",  # Needs to be specified when using output filter
    assemble_with="Line",       # Needs to be specified when using output filter
    max_concurrent_transforms=1)

In [None]:
# Run the batch transform over the latest processed dataset, one CSV line per request,
# and block (streaming logs) until the job finishes.
muse_transformer.transform(
    data=s3_processed_data,
    data_type='S3Prefix',
    content_type="text/csv",    # Needs to be specified to use input filter
    compression_type=None,
    split_type="Line",          # Needs to be specified to use input filter
    job_name=f"muse-inference-transform-{timestamp_prefix}",
    input_filter="$[2]",        # Keep only element [2] of each CSV row (the description). NOTE(review): the index is presumably zero-based, i.e. the third column -- confirm
    output_filter="$.output",   # Return the "output" field of the returned object
    join_source=None,
    wait=True,
    logs=True
)