In [None]:
%%sh
# Build the inference image using container/ as the docker build context
cd container/

# tag it as example-serve:latest
docker build -t example-serve:latest .

In [None]:
from subprocess import Popen
import subprocess

# Launch the inference container as a child process and capture its output.
# NOTE(review): `serve` presumably keeps running until the container is stopped,
# in which case this cell blocks until then — confirm before running it in a notebook.
cmd = "docker run -p 8080:8080 --rm example-serve:latest serve"

process = subprocess.Popen(
    cmd,
    shell=True,
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    start_new_session=True,  # run the child in its own session
)

# communicate() reads both pipes while waiting for the process to terminate.
# Do not call process.wait() first: with stdout/stderr=PIPE the child can block on a
# full pipe buffer that nobody is draining, and wait() would then never return.
out, err = process.communicate()
errcode = process.returncode

print(out.decode('utf-8'))
print(process.pid)

In [None]:
import os

os.system('ls .')

In [None]:
import asyncio

proc = await asyncio.create_subprocess_exec(
    'ls','.',
    stdout=asyncio.subprocess.PIPE,
    stderr=asyncio.subprocess.PIPE)


# if proc takes very long to complete, the CPUs are free to use cycles for 
# other processes
#stdout, stderr = await proc.communicate()
stdout, stderr = proc.communicate()
proc.returncode
# 0

# must call decode because stdout is a bytes object
stdout.decode()
# total 24K
# drwxrwxr-x  3 felipe felipe 4,0K Nov  4 17:52 .
# drwxrwxr-x 39 felipe felipe 4,0K Nov  3 18:31 ..
# drwxrwxr-x  2 felipe felipe 4,0K Nov  3 19:32 .ipynb_checkpoints
# -rw-rw-r--  1 felipe felipe  11K Nov  4 17:52 main.ipynb

stderr.decode()
# ''  empty string   

In [None]:
#help(Popen)

In [None]:
%%bash
# run the inference container
docker run -p 8080:8080 --rm example-serve:latest serve 

In [None]:
%%bash
curl -H "Content-Type: text/csv" -v http://localhost:8080/ping

In [None]:
%%bash
curl --data-binary @container/local_test/payload.csv -H "Content-Type: text/csv" -v http://localhost:8080/invocations

In [None]:
import boto3
import datetime
import pprint
import os
import time

pp = pprint.PrettyPrinter(indent=1)
iam = boto3.client('iam')
ecr = boto3.client('ecr')

# Name of the ECR repository that will hold the inference image
image_name="example-inference"

try:
    cr_res = ecr.create_repository(
        repositoryName=image_name)
    pp.pprint(cr_res)
except ecr.exceptions.RepositoryAlreadyExistsException as e:
    # The repository might already exist in your ECR; that is the only failure
    # that is safe to ignore. Anything else (credentials, permissions, throttling)
    # is allowed to propagate so it is not mistaken for success.
    print(e)

In [None]:
%%bash
# Stop at the first failing command so a failed login or tag is not followed by a push
set -euo pipefail

# --output text returns the bare account id (no surrounding JSON quotes to strip)
account=$(aws sts get-caller-identity --query Account --output text)
region=$(aws configure get region)
ecr_account="${account}.dkr.ecr.${region}.amazonaws.com"

# Give docker your ECR login password
aws ecr get-login-password --region "$region" | docker login --username AWS --password-stdin "$ecr_account"

# Fullname of the repo
fullname="$ecr_account/example-inference:latest"

#echo $fullname
# Tag the locally built image (built as example-serve:latest) with the fullname
docker tag example-serve:latest "$fullname"

# Push to ECR
docker push "$fullname"

In [None]:
# Inspect the ECR repository
# Reuse `image_name` (the repository name used when the repository was created)
# instead of repeating the literal, so the two cannot drift apart.
repo_res = ecr.describe_images(
    repositoryName=image_name)
pp.pprint(repo_res)

In [None]:
# Build the image URI and role ARN from the active AWS identity and configured region
# instead of hard-coding one account id / region, so the notebook runs in any account.
account_id = boto3.client('sts').get_caller_identity()['Account']
region = boto3.session.Session().region_name

# <account>.dkr.ecr.<region>.amazonaws.com/<repository>:<tag>
image_uri = f"{account_id}.dkr.ecr.{region}.amazonaws.com/example-inference:latest"
# Execution role named `sm` in the same account
role_arn = f"arn:aws:iam::{account_id}:role/sm"

In [None]:
sm_boto3 = boto3.client('sagemaker')

# Model definition: one container running the image at `image_uri`,
# executed under `role_arn`, with network isolation turned off.
create_model_args = {
    'ModelName': 'example-inference',
    'Containers': [{'Image': image_uri}],
    'ExecutionRoleArn': role_arn,
    'EnableNetworkIsolation': False,
}
cm_res = sm_boto3.create_model(**create_model_args)

pp.pprint(cm_res)


In [None]:
# Create an endpoint config: it names the model to host and the instances to host it on.
# https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/sagemaker.html#SageMaker.Client.create_endpoint_config
model_name='example-inference'  # same value as the ModelName passed to create_model
initial_instance_count=1
instance_type='ml.t2.medium'
variant_name = "AMeaningfulProdVarName" # must match ^[a-zA-Z0-9](-*[a-zA-Z0-9]){0,62}

# ProductionVariants is a list; this notebook defines a single variant.
# Open question left by the author: why is a variant name needed?
# Think about a use case where we need many variants.
production_variants = [
    {
        "VariantName": variant_name,
        "ModelName": model_name,
        "InitialInstanceCount": initial_instance_count,
        "InstanceType": instance_type
    }
]

endpoint_config_name = "ExampleInferenceConfig" # must match ^[a-zA-Z0-9](-*[a-zA-Z0-9]){0,62}

# Keyword arguments for create_endpoint_config
endpoint_config = {
    "EndpointConfigName": endpoint_config_name,
    "ProductionVariants": production_variants,
}

ep_conf_res = sm_boto3.create_endpoint_config(**endpoint_config)


In [None]:
pp.pprint(ep_conf_res)

In [None]:
# create endpoint
# https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/sagemaker.html#SageMaker.Client.create_endpoint

# `endpoint_name` is reused by the describe / invoke / delete cells, so the
# endpoint is always addressed through this variable.
endpoint_name='example-endpoint'
ep_res = sm_boto3.create_endpoint(
    EndpointName=endpoint_name,
    EndpointConfigName=endpoint_config_name
    )

In [None]:
pp.pprint(ep_res)

In [None]:
# Look up the endpoint's current description and pretty-print the raw response.
# API reference:
# https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/sagemaker.html#SageMaker.Client.describe_endpoint
ep_des_res = sm_boto3.describe_endpoint(EndpointName=endpoint_name)
pp.pprint(ep_des_res)

Invoke endpoint 

```sh
grep -rwn .  -e predict
```

```python
def predict(
        self, data, initial_args=None, target_model=None, target_variant=None, inference_id=None
    ):
        """Return the inference from the specified endpoint.
        Args:
            data (object): Input data for which you want the model to provide
                inference. If a serializer was specified when creating the
                Predictor, the result of the serializer is sent as input
                data. Otherwise the data must be sequence of bytes, and the
                predict method then sends the bytes in the request body as is.
            initial_args (dict[str,str]): Optional. Default arguments for boto3
                ``invoke_endpoint`` call. Default is None (no default
                arguments).
            target_model (str): S3 model artifact path to run an inference request on,
                in case of a multi model endpoint. Does not apply to endpoints hosting
                single model (Default: None)
            target_variant (str): The name of the production variant to run an inference
                request on (Default: None). Note that the ProductionVariant identifies the
                model you want to host and the resources you want to deploy for hosting it.
            inference_id (str): If you provide a value, it is added to the captured data
                when you enable data capture on the endpoint (Default: None).
        Returns:
            object: Inference for the given input. If a deserializer was specified when creating
                the Predictor, the result of the deserializer is
                returned. Otherwise the response returns the sequence of bytes
                as is.
        """

        request_args = self._create_request_args(
            data, initial_args, target_model, target_variant, inference_id
        )
        response = self.sagemaker_session.sagemaker_runtime_client.invoke_endpoint(**request_args)
        return self._handle_response(response)
```

so `sagemaker_session.sagemaker_runtime_client` invokes the endpoint

I searched `sagemaker_runtime` in session.py

```python
def _initialize(
        self,
        boto_session,
        sagemaker_client,
        sagemaker_runtime_client,
        sagemaker_featurestore_runtime_client,
    ):
        """Initialize this SageMaker Session.
        Creates or uses a boto_session, sagemaker_client and sagemaker_runtime_client.
        Sets the region_name.
        """
        self.boto_session = boto_session or boto3.DEFAULT_SESSION or boto3.Session()

        self._region_name = self.boto_session.region_name
        if self._region_name is None:
            raise ValueError(
                "Must setup local AWS configuration with a region supported by SageMaker."
            )

        self.sagemaker_client = sagemaker_client or self.boto_session.client("sagemaker")
        prepend_user_agent(self.sagemaker_client)

        if sagemaker_runtime_client is not None:
            self.sagemaker_runtime_client = sagemaker_runtime_client
        else:
            config = botocore.config.Config(read_timeout=80)
            self.sagemaker_runtime_client = self.boto_session.client(
                "runtime.sagemaker", config=config
            )

        prepend_user_agent(self.sagemaker_runtime_client)

        if sagemaker_featurestore_runtime_client:
            self.sagemaker_featurestore_runtime_client = sagemaker_featurestore_runtime_client
        else:
            self.sagemaker_featurestore_runtime_client = self.boto_session.client(
                "sagemaker-featurestore-runtime"
            )

        self.local_mode = False
 
```

This led me to investigate the [`SageMakerRuntime` client in boto3](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/sagemaker-runtime.html)


```
client = boto3.client('sagemaker-runtime')
response = client.invoke_endpoint(
    EndpointName='string',
    Body=b'bytes'|file,
    ContentType='string',
    Accept='string',
    CustomAttributes='string',
    TargetModel='string',
    TargetVariant='string',
    TargetContainerHostname='string',
    InferenceId='string'
)
```

To figure out what the content type looks like, I looked up [`serializers.py`](https://github.com/aws/sagemaker-python-sdk/blob/master/src/sagemaker/serializers.py) and [`deserializers.py`](https://github.com/aws/sagemaker-python-sdk/blob/master/src/sagemaker/deserializers.py)


Why is it called the SageMaker runtime?

In [None]:
# how to serialize csv

import csv
import io

def _serialize_row(data):
    """Serialize data as a CSV-formatted row.
    Args:
        data (sting): Data to be serialized in a row.
    Returns:
        str: The data serialized as a CSV-formatted row.
    """
    if isinstance(data, str):
        return data

    if isinstance(data, np.ndarray):
        data = np.ndarray.flatten(data)

    if hasattr(data, "__len__"):
        if len(data) == 0:
            raise ValueError("Cannot serialize empty array")
        csv_buffer = io.StringIO()
        csv_writer = csv.writer(csv_buffer, delimiter=",")
        csv_writer.writerow(data)
        return csv_buffer.getvalue().rstrip("\r\n")
        
# Demo: writerow() iterates over its argument, so a plain string is written
# one character per field.
csv_buffer = io.StringIO()
csv_writer = csv.writer(csv_buffer, delimiter=',')
csv_writer.writerow('xyze')
serialized = csv_buffer.getvalue()
v = serialized.rstrip('\r\n')  # drop the line terminator added by writerow()

print(v)

In [None]:
# invoke endpoint
import json

sm_runtime = boto3.client('sagemaker-runtime')

# the model only supports csv data (look at how the model is defined in container/predictor.py)
content_type='text/csv'

# Serialize the payload as a single CSV row; this relies on `io` and `csv`
# having been imported by the CSV serialization cell.
csv_buffer = io.StringIO()
csv_writer = csv.writer(csv_buffer, delimiter=',')
csv_writer.writerow('xyze')
body = csv_buffer.getvalue().rstrip('\r\n')

# response type is also text/csv (look at decision_tree/predictor.py)
accept='text/csv'

res=sm_runtime.invoke_endpoint(
    EndpointName=endpoint_name,
    Body=body,                # encoded input data
    ContentType=content_type, # tells the endpoint how the request body is encoded
    Accept=accept             # tells the endpoint which encoding is wanted for the response
)

print(res)

In [None]:
# how to decode json string

import codecs

class SimpleBaseDeserializer:
    """Minimal base class for the deserializers defined in this cell.

    The subclasses call ``super().__init__(accept=accept)``, so the base class
    has to accept and store that argument; without an ``__init__`` here the call
    falls through to ``object.__init__`` and raises ``TypeError``.
    """

    def __init__(self, accept="*/*"):
        """Initialize a ``SimpleBaseDeserializer`` instance.
        Args:
            accept (union[str, tuple[str]]): The MIME type (or tuple of allowable MIME types) that
                is expected from the inference endpoint (default: "*/*").
        """
        self.accept = accept

class JSONDeserializer(SimpleBaseDeserializer):
    """Turn a JSON response from an inference endpoint into a Python object."""

    def __init__(self, accept="application/json"):
        """Create a ``JSONDeserializer``.
        Args:
            accept (union[str, tuple[str]]): MIME type (or tuple of allowable MIME types)
                expected from the inference endpoint (default: "application/json").
        """
        super().__init__(accept=accept)

    def deserialize(self, stream, content_type):
        """Parse the JSON payload carried by ``stream``.
        Args:
            stream (botocore.response.StreamingBody): Data to be deserialized.
            content_type (str): The MIME type of the data (not used by this method).
        Returns:
            object: The Python object decoded from the JSON payload.
        """
        try:
            # Wrap the byte stream in a UTF-8 text reader so json.load can consume it
            utf8_reader = codecs.getreader("utf-8")
            parsed = json.load(utf8_reader(stream))
        finally:
            # The stream is closed whether or not parsing succeeded
            stream.close()
        return parsed
            
            

class CSVDeserializer(SimpleBaseDeserializer):
    """Turn a stream of CSV bytes into a list of rows (each row a list of strings).
    Consider using :class:~`sagemaker.deserializers.NumpyDeserializer` or
    :class:~`sagemaker.deserializers.PandasDeserializer` instead, if you'd like to convert text/csv
    responses directly into other data types.
    """

    def __init__(self, encoding="utf-8", accept="text/csv"):
        """Create a ``CSVDeserializer``.
        Args:
            encoding (str): Text encoding used to decode the stream (default: "utf-8").
            accept (union[str, tuple[str]]): MIME type (or tuple of allowable MIME types)
                expected from the inference endpoint (default: "text/csv").
        """
        super().__init__(accept=accept)
        self.encoding = encoding

    def deserialize(self, stream, content_type):
        """Read ``stream`` completely and parse it as CSV.
        Args:
            stream (botocore.response.StreamingBody): Data to be deserialized.
            content_type (str): The MIME type of the data (not used by this method).
        Returns:
            list: One list per CSV line, holding that line's fields.
        """
        try:
            text = stream.read().decode(self.encoding)
            rows = [row for row in csv.reader(text.splitlines())]
        finally:
            # The stream is closed whether or not decoding/parsing succeeded
            stream.close()
        return rows
        

In [None]:
# decode response
# `res` is the invoke_endpoint response; its 'Body' entry is read in full
# and the bytes are decoded as UTF-8 text (displayed as the cell output).
res_body = res['Body']
res_body.read().decode('utf-8')

In [None]:
# SageMaker Pricing

In [None]:
# Delete the endpoint named by `endpoint_name` and show the service response
del_res = sm_boto3.delete_endpoint(EndpointName=endpoint_name)
pp.pprint(del_res)

# Create Endpoint 

In this notebook, you will learn the basics of hosting your trained model on Amazon SageMaker for inference. There are two ways you can use Amazon SageMaker for inference:
1. Set up persistent endpoint for real-time online inference
2. Gather data to be predicted in batch and use SageMaker batch transform for offline inference. 

In this notebook, we focus on the first option and we will discuss batch transform in another notebook. 

You are highly recommended to go through [the section on model deployment](https://docs.aws.amazon.com/sagemaker/latest/dg/how-it-works-deployment.html) in the official docs before moving on.


The pricing for setting up an endpoint can be found [here](https://aws.amazon.com/sagemaker/pricing/)

As with [CreateTrainingJob](https://github.com/hsl89/amazon-sagemaker-examples/blob/sagemaker-fundamentals/sagemaker-fundamentals/create-training-job/create-training-job.ipynb), Amazon SageMaker interacts with your inference logic via a containerized environment. 

The following APIs are relevant:
* [`CreateModel`](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/sagemaker.html#SageMaker.Client.create_model)
* [`CreateEndpointConfig`](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/sagemaker.html#SageMaker.Client.create_endpoint_config)
* [`CreateEndpoint`](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/sagemaker.html#SageMaker.Client.create_endpoint)

You are highly recommended to go through them. It's okay if you don't understand everything, we will go through them in detail in this notebook. 

The outline of this notebook is:
* Create an IAM role for SageMaker
* Build an inference image
* Test the inference image / container locally and push it to ECR
* Use the ECR address of the inference container to define a model by calling `CreateModel`
* Specify configuration of an endpoint by calling `CreateEndpointConfig`
* Use the model definition and the endpoint configuration from the previous two steps to create an endpoint by calling `CreateEndpoint`
* Invoke the endpoint by using SageMaker runtime client 

In [None]:
# setups
import boto3
import datetime
import pprint
import os
import time

pp = pprint.PrettyPrinter(indent=1)

## Create an IAM service role

Review [notebook on execution role](https://github.com/hsl89/amazon-sagemaker-examples/blob/execution-role/sagemaker-fundamentals/execution-role/execution-role.ipynb) for step-by-step instructions on how to create an IAM Role.

The service role is intended to be assumed by the SageMaker service. For simplicity, we will give it `AmazonSageMakerFullAccess` permission. However, in order to do what we need in this notebook, we do not need such a comprehensive permission. You are highly encouraged to play with the helper functions we provide in `iam_helpers.py` to figure out what are the minimum permissions needed to run this notebook. 

First, get some useful functions we created there to help us create an execution role. 

In [None]:
%%bash
# Fetch a fresh copy of iam_helpers.py into the current directory.
# Remove any existing copy by its exact name first: otherwise wget would keep the
# old file and save the download under a different name (iam_helpers.py.1), and
# the notebook would keep importing the stale module. `rm -f` is a no-op when
# the file does not exist.
rm -f iam_helpers.py

wget https://raw.githubusercontent.com/hsl89/amazon-sagemaker-examples/sagemaker-fundamentals/sagemaker-fundamentals/execution-role/iam_helpers.py


In [None]:
# create a role
# Helpers come from the iam_helpers.py module downloaded into the working directory.
from iam_helpers import create_execution_role, attach_permission

role_name='sm' 
# Only the 'Role' entry of the helper's return value is kept; later cells read
# role['RoleName'] and role['Arn'] from it.
role = create_execution_role(role_name=role_name)['Role']
print(role)

In [None]:
# attach AmazonSageMakerFullAccess
# Attaches the AWS-managed policy to the role created above, using an IAM client
iam = boto3.client('iam')
res = iam.attach_role_policy(
    RoleName=role['RoleName'],
    PolicyArn='arn:aws:iam::aws:policy/AmazonSageMakerFullAccess',
)

# Show the raw API response
pp.pprint(res)

## Build an inference image

Your inference image must be a self-contained web server. When you run your inference container locally, it should listen on port 8080 and accept POST requests to the `/invocations` endpoint. The payload of the POST requests is the content of the data that you want your model to predict. Since the inference container is essentially a web server, you should expect it to look different from the container we used for [`CreateTrainingJob`](https://github.com/hsl89/amazon-sagemaker-examples/blob/sagemaker-fundamentals/sagemaker-fundamentals/create-training-job/create-training-job.ipynb). 

In this notebook, we use a minimal python stack to build our web server:
![Request serving stack](stack.png)

### Further readings on the serving stack

* [Overview of the stack](https://flask.palletsprojects.com/en/1.1.x/deploying/uwsgi/)
* [Nginx homepage](https://www.nginx.com/resources/wiki/start/) 
* [Gunicorn (WSGI server) homepage](https://gunicorn.org/)
* [Flask homepage](https://flask.palletsprojects.com/en/1.1.x/)

### How SageMaker runs your container

SageMaker runs your container like

```sh
docker run <image> serve
```

This means you need to have an executable called `serve` in the `PATH`. In this notebook, we will create a python script as an **executable** and put it in the working directory of the docker image. 
        
The folder `container/src` contains the configs and entry point of the web server

In [None]:
!ls  container/src

#### Entrypoint for the Nginx server

`serve` is a python executable that is intended to be used as the entrypoint for the inference image.

In [None]:
!cat container/src/serve

#### Config file for the Nginx server
`nginx.conf` is the config file for the nginx server.

In [None]:
!cat container/src/nginx.conf

#### WSGI config

In [None]:
!cat container/src/wsgi.py

#### Inference logic

The most important file in `container/src` is `predictor.py`. It contains the inference logic. The other files in `container/src` can be used **as is**, but you will need to customize `predictor.py` to implement your own inference logic. 

In [None]:
!pygmentize container/src/predictor.py

## Build the container

We build the container from `container/Dockerfile`, and we call this image `example-serve`. 

In [None]:
!cat container/Dockerfile

In [None]:
%%sh
# Build the inference image using container/ as the docker build context
cd container/

# tag it as example-serve:latest
docker build -t example-serve:latest .

## Test your image

Like in the [notebook for CreateTrainingJob](https://github.com/hsl89/amazon-sagemaker-examples/blob/sagemaker-fundamentals/sagemaker-fundamentals/create-training-job/create-training-job.ipynb), we replicate the Amazon SageMaker hosting environment and test your image locally before serving in production. You are encouraged to read through the section on [Use Your Own Inference Code with Hosting Services](https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms-inference-code.html) and think about how would you replicate SageMaker hosting environment before moving on. 

As with `CreateTrainingJob`, SageMaker reserves the `/opt/ml` directory in your image to inject ML-related info for `CreateEndpoint`. In particular, it downloads your trained model artifact and injects it into the directory `/opt/ml/model`. When calling `CreateEndpoint` you will need to tell SageMaker the S3 URI of your model artifact. SageMaker will then pull the artifact and inject it into `/opt/ml/model`. This means that when defining your own inference logic, you should load your trained model from `/opt/ml/model`. 

We will use docker python client to run your image and we will mount `container/local_test/ml` to `/opt/ml` as docker volume. 

In [None]:
# look at what's inside `container/local_test/ml`
!ls container/local_test/ml

The inference logic we implemented in `container/src/predictor.py` under `def inference():` does not require a real ML model. Therefore we do not need to inject anything for the purpose of local test. We will discuss how to load a real model in a more advanced notebook. 

<span style="color:red"> TODO for Dev:  add link to the advanced notebook when it is ready</span>.

#### Run the container

To run the container `example-serve`, open a terminal in the current directory and go to `container/local_test`

```sh
cd container/local_test
```

Then run the following command

```sh
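docker run -v "$(pwd)/ml":/opt/ml -p 8080:8080 --rm example-serve:latest serve 
```

`-v "$(pwd)/ml":/opt/ml` bind-mounts the directory `ml` (in `container/local_test`) to `/opt/ml` in the container. The host side must be an absolute path: a bare name such as `ml` is interpreted by Docker as a *named volume*, not as the local directory.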

`-p 8080:8080` exposes port 8080 inside the container as port 8080 on the host.

`--rm` removes the container from daemon when it is stopped. 

We suggest running the image from a shell instead of within the notebook because, when you are debugging your own container, you can more easily see the container's stdout when a shell process is running it. 

#### Ping your container
Once your container is up, you can ping it at `http://localhost:8080`. 

To trigger the logic under `def ping():` in `container/src/predictor.py`, run the following cell

In [None]:
%%sh
curl localhost:8080/ping

To trigger the logic under `def inference():` in `container/src/predictor.py` with a json string, run the following cell

In [None]:
%%sh
curl --header "Content-Type: application/json" \
  --request POST \
  --data '{"key":"value"}' \
  http://localhost:8080/invocations

In [None]:
%%sh
# POST to /invocations with the request declared as text/csv.
# NOTE(review): no --data/--data-binary option is given, so the request appears to
# carry no CSV payload — confirm whether container/local_test/payload.csv should be sent.
curl --header "Content-Type: text/csv" \
  --request POST \
  http://localhost:8080/invocations

To stop the container, go to the terminal that runs your container and press `Control + C`. Alternatively, you can find its container id by grepping for a docker process that binds port 8080 on the host and remove it manually.

```sh
docker rm -f $(docker ps | grep -e "0.0.0.0:8080->8080/tcp" | awk '{print $1}')
```

## Push the image to ECR
Now you have tested your image, the next thing to do is to push it to your ECR so that SageMaker can download it. We have discussed this in the [previous notebook on `CreateTrainingJob`](https://github.com/hsl89/amazon-sagemaker-examples/blob/sagemaker-fundamentals/sagemaker-fundamentals/create-training-job/create-training-job.ipynb) in the section where we push the training image to ECR. 

In the notebook for `CreateTrainingJob`, we created the ECR repo and pushed the training image there via the IAM user (you). In this notebook, let's do something different: we will create an ECR repo and push the image there using the IAM role `sm` you created at the beginning. For this purpose, you will need to make sure the IAM user you are using now has permission to assume the role, i.e. `sts:AssumeRole`.

In [None]:
# Verify that you can assume role
# Look up the ARN of the identity behind the current credentials, then ask IAM to
# simulate that identity's policies for the sts:AssumeRole action.
user_arn = boto3.client('sts').get_caller_identity()['Arn'] # you

user_prp = iam.simulate_principal_policy(
    PolicySourceArn=user_arn,
    ActionNames=['sts:AssumeRole']
)
print("== User's Permission to Assume Role ==")
# Only the per-action evaluation results are shown, not the whole response
pp.pprint(user_prp['EvaluationResults'])


In [None]:
role['Arn']

In [None]:
# Create a boto session with the role

# Current Unix time in whole seconds, used as the role session name
now = str(int(time.time()))

sts_client = boto3.client('sts')
obj = sts_client.assume_role(RoleArn=role['Arn'], RoleSessionName=now)

# Temporary credentials returned for the assumed role
cred = obj['Credentials']

sess = boto3.session.Session(
    aws_access_key_id=cred['AccessKeyId'],
    aws_secret_access_key=cred['SecretAccessKey'],
    aws_session_token=cred['SessionToken'],
)

In [None]:
# inspect the profile of the session
assumed_role=sess.client('sts').get_caller_identity()
pp.pprint(assumed_role)

In [None]:
# Verify the assumed role has the privilege to create an ECR repo
# Simulates the role's policies for the two ECR actions listed in ActionNames
role_prp = iam.simulate_principal_policy(
    PolicySourceArn=role['Arn'],
    ActionNames=['ecr:GetAuthorizationToken', 'ecr:CreateRepository']
)

# Unlike the user check, the whole response is printed here
pp.pprint(role_prp)

### Create a repo

In [None]:
# ECR client bound to the assumed-role session `sess`
ecr = sess.client('ecr')

try:
    cr_res = ecr.create_repository(
        repositoryName='example-serve')
    pp.pprint(cr_res)
except ecr.exceptions.RepositoryAlreadyExistsException as e:
    # The repository might already exist in your ECR; that is the only failure
    # that is safe to ignore. Anything else (expired credentials, missing
    # permissions, throttling) is allowed to propagate so it is not mistaken for success.
    print(e)

### Push the image to ECR

In [None]:
%%bash
# Stop at the first failing command so a failed login or tag is not followed by a push
set -euo pipefail

# --output text returns the bare account id (no surrounding JSON quotes to strip)
account=$(aws sts get-caller-identity --query Account --output text)
region=$(aws configure get region)
ecr_account="${account}.dkr.ecr.${region}.amazonaws.com"

# Give docker your ECR login password
aws ecr get-login-password --region "$region" | docker login --username AWS --password-stdin "$ecr_account"

# Fullname of the repo
fullname="$ecr_account/example-serve:latest"

#echo $fullname
# Tag the image with the fullname
docker tag example-serve:latest "$fullname"

# Push to ECR
docker push "$fullname"

## Create model


