# Deploy Hugging Face transformer models with multi-model endpoints 

This notebook is a step-by-step tutorial on deploying multiple pre-trained PyTorch Hugging Face models behind a single multi-model endpoint on Amazon SageMaker. 

We will describe the steps for deploying a multi-model endpoint on Amazon SageMaker with the TorchServe serving stack. An additional step compared to a single-model deployment is the requirement to create a manifest file for each model prior to deployment. For training Hugging Face models on SageMaker, refer to the examples [here](https://github.com/huggingface/notebooks/tree/master/sagemaker).

In [None]:
import sagemaker
from sagemaker import get_execution_role
from sagemaker.utils import name_from_base
from sagemaker.pytorch import PyTorchModel
import boto3
import torch
import time
import pandas as pd
from pathlib import Path
import tarfile
import shutil 
import datetime
import json

# Execution role and region used for every SageMaker call in this notebook.
role = get_execution_role()
region = boto3.Session().region_name

# SageMaker SDK session and the default bucket it manages.
sm_session = sagemaker.Session()
bucket = sm_session.default_bucket()

# Low-level boto3 clients: control plane, runtime (invocations) and S3.
sm_client = boto3.client("sagemaker", region_name=region)
sm_runtime = boto3.client("sagemaker-runtime")
s3_client = boto3.client("s3")

# Key prefix under which this example keeps its artifacts in the bucket.
prefix = "sagemaker/huggingface-pytorch-sentiment-analysis"

In [None]:
# List the variables held in IPython's %store, then restore them into this
# notebook's namespace.
# NOTE(review): names used below without a definition in this notebook
# (model_package_group_name, model_roberta_uri, roberta_model_package_arn)
# are presumably restored here from an earlier notebook -- confirm.
%store
%store -r

In [None]:
# Check that the model package group is reachable before registering a new
# model version in it.
try:
    describe_model_package_group_response = sm_client.describe_model_package_group(
        ModelPackageGroupName=model_package_group_name
    )
except sm_client.exceptions.ClientError as error:
    # Only errors returned by the SageMaker API are handled here; anything
    # else (e.g. an undefined variable or a keyboard interrupt) is allowed to
    # surface instead of being reported as a missing group.
    print(f"model package group {model_package_group_name} does not exist")
    print(f"reason: {error}")
else:
    print(describe_model_package_group_response)

### Register a new model version for Hugging Face roberta model with entry point script helper function
We first download the roberta model archive and repackage it together with the inference script so that it can be served from the endpoint.

In [None]:
# Local staging directory in which the model archive is unpacked and repackaged.
local_artifact_path = Path("model_artifacts")
local_artifact_path.mkdir(parents=True, exist_ok=True)

# Name of the repackaged archive that will be uploaded for the endpoint.
model_tar_name = "model_roberta_MME.tar.gz"

# File name of the original archive: the last path segment of its S3 URI.
org_model_tar_name = model_roberta_uri.rsplit("/", 1)[-1]

In [None]:
# model_roberta_uri has the form s3://<bucket>/<key>. Take both the bucket and
# the key from the URI so the download also works when the archive does not
# live in the session's default bucket.
_, _, source_bucket, source_key = model_roberta_uri.split("/", 3)
s3_client.download_file(source_bucket, source_key, org_model_tar_name)

In [None]:
# Unpack the downloaded archive into the staging directory. The directory is
# passed as-is (not its `.stem`) so this keeps working if the staging path is
# ever nested or given a suffix.
# NOTE(review): extractall() trusts member paths; only use archives you produced.
with tarfile.open(org_model_tar_name) as tar:
    tar.extractall(path=local_artifact_path)

In [None]:
# Place the inference code next to the model files, under `code/`.
code_destination = local_artifact_path / 'code'
shutil.copytree('../code', code_destination)

In [None]:
def create_tar(tarfile_name: str, local_path: Path) -> float:
    """
    Create a tar.gz archive with the content of `local_path`.

    Every regular file below `local_path` is stored under its path relative
    to `local_path` (so the archive has no leading directory). Files and
    directories whose name starts with a dot are skipped at any depth.

    Args:
        tarfile_name: Path of the archive to write.
        local_path: Directory whose content is archived.

    Returns:
        Size of the written archive in megabytes (10**6 bytes).
    """
    local_path = Path(local_path)

    # Collect the members before opening the archive so that an archive
    # written inside `local_path` cannot pick itself up. Only files are
    # listed: adding a directory would recurse into it and duplicate members.
    members = sorted(
        (entry, entry.relative_to(local_path))
        for entry in local_path.rglob("*")
        if entry.is_file()
    )

    with tarfile.open(tarfile_name, mode="w:gz") as archive:
        for entry, relative in members:
            # Skip hidden files and anything inside a hidden directory.
            if any(part.startswith(".") for part in relative.parts):
                continue
            archive.add(entry, arcname=relative.as_posix())

    return Path(tarfile_name).stat().st_size / 10**6

In [None]:
# Repackage the staging directory (model files plus code/) into the archive
# that the endpoint will load, and report its size.
tar_size = create_tar(model_tar_name, local_artifact_path)
print(f"Created {model_tar_name}, size {tar_size:.2f} MB")

In [None]:
from sagemaker.s3 import S3Uploader,s3_path_join
model_data_path = s3_path_join("s3://",bucket,prefix+"/models")
model_roberta_mme_uri =S3Uploader.upload(model_tar_name, model_data_path)
print(f"Uploaded roberta model to {model_roberta_mme_uri}")
%store model_roberta_mme_uri

In [None]:
# Show the model versions currently registered in the model package group.
list_model_packages_response = sm_client.list_model_packages(
    ModelPackageGroupName=model_package_group_name
)
list_model_packages_response

In [None]:
# Fetch the details of the existing roberta model package; its settings are
# reused below when registering the multi-model-endpoint version.
describe_model_package_response = sm_client.describe_model_package(
    ModelPackageName=roberta_model_package_arn
)
describe_model_package_response

In [None]:
# Settings copied from the existing roberta model package.
_source_spec = describe_model_package_response['InferenceSpecification']
_source_container = _source_spec['Containers'][0]
_source_env = _source_container['Environment']

# Container definition for the new version: same image, framework and task,
# but pointing at the repackaged archive that bundles inference.py.
_mme_container = {
    "ContainerHostname": "huggingface-pytorch-roberta-update",
    "Image": _source_container['Image'],
    "ModelDataUrl": model_roberta_mme_uri,
    "Framework": _source_container['Framework'],
    "NearestModelName": _source_container['NearestModelName'],
    "Environment": {
        "SAGEMAKER_CONTAINER_LOG_LEVEL": _source_env['SAGEMAKER_CONTAINER_LOG_LEVEL'],
        "SAGEMAKER_PROGRAM": "inference.py",
        "SAGEMAKER_REGION": region,
        "SAGEMAKER_SUBMIT_DIRECTORY": model_roberta_mme_uri,
        "HF_TASK": _source_env['HF_TASK'],
    },
}

# Register the repackaged model as a new version in the same group.
model_package_response = sm_client.create_model_package(
    ModelPackageGroupName=str(model_package_group_name),
    ModelPackageDescription="Hugging Face Roberta Model MME - sentiment analysis",
    Domain=describe_model_package_response['Domain'],
    Task=describe_model_package_response['Task'],
    InferenceSpecification={
        "Containers": [_mme_container],
        "SupportedRealtimeInferenceInstanceTypes": _source_spec['SupportedRealtimeInferenceInstanceTypes'],
        "SupportedContentTypes": ["application/json"],
        "SupportedResponseMIMETypes": ["application/json"],
    },
)

In [None]:
# List the group's model versions again, now including the one just registered.
list_model_packages_response = sm_client.list_model_packages(
    ModelPackageGroupName=model_package_group_name
)
list_model_packages_response

In [None]:
# NOTE(review): this relies on the listing order -- entry 0 is taken to be the
# roberta MME version and entry 1 the distilbert version. Confirm against the
# output of the previous cell before continuing.
_package_summaries = list_model_packages_response["ModelPackageSummaryList"]

roberta_mme_model_version_arn = _package_summaries[0]["ModelPackageArn"]
print("roberta MME model: {}".format(roberta_mme_model_version_arn))

distilbert_model_version_arn = _package_summaries[1]["ModelPackageArn"]
print("distilbert model: {}".format(distilbert_model_version_arn))

In [None]:
# Set the approval status of the new roberta MME model version to "Approved".
model_package_update_input_dict = {
    "ModelPackageArn": roberta_mme_model_version_arn,
    "ModelApprovalStatus": "Approved",
}
model_package_update_response = sm_client.update_model_package(**model_package_update_input_dict)
model_package_update_response

In [None]:
now_roberta_mme = f'{datetime.datetime.now():%Y-%m-%d-%H-%M-%S}'
roberta_mme_model_name = f"hf-pytorch-model-roberta-mme-{now_roberta_mme}"
print("Model name : {}".format(roberta_mme_model_name))
%store roberta_mme_model_name

In [None]:
# Create a SageMaker model whose primary container is defined entirely by the
# registered model package version.
primary_container_roberta = {
    "ModelPackageName": roberta_mme_model_version_arn,
}

create_model_roberta_respose = sm_client.create_model(
    ModelName=roberta_mme_model_name, 
    ExecutionRoleArn=role, 
    PrimaryContainer=primary_container_roberta
)
# The model name is stored again here (it was already stored when it was built).
%store roberta_mme_model_name
print("Model arn : {}".format(create_model_roberta_respose["ModelArn"]))

In [None]:
# Serving image of the existing roberta model package; reused for the
# multi-model container below.
image_uri = describe_model_package_response['InferenceSpecification']['Containers'][0]['Image']
image_uri 

In [None]:
# Use the second entry (index 1) of the realtime instance types recorded on
# the existing model package as the endpoint's instance type.
# NOTE(review): this assumes the package lists at least two instance types.
deploy_instance_type = describe_model_package_response['InferenceSpecification']['SupportedRealtimeInferenceInstanceTypes'][1]

### Create the model metadata
Here we use `boto3` to establish the model metadata. Instead of describing a single model, this metadata will indicate the use of multi-model semantics and will identify the source location of all specific model artifacts.

In [None]:
# establish the place in S3 from which the endpoint will pull individual models
multi_model_now = f'{datetime.datetime.now():%Y-%m-%d-%H-%M-%S}'
multi_model_name = f'pytorch-multi-model-senti-{multi_model_now}'
_container = {
    'Image':        image_uri,
    'ModelDataUrl': model_data_path,
    'Mode':         'MultiModel'
}
create_model_response = sm_client.create_model(
    ModelName = multi_model_name,
    ExecutionRoleArn = role,
    Containers = [_container])
%store multi_model_name
print(f'Multi Model name {multi_model_name}')

### Create the multi-model endpoint
There is nothing special about the SageMaker endpoint config metadata for a multi-model endpoint. You need to consider the appropriate instance type and number of instances for the projected prediction workload. The number and size of the individual models will drive memory requirements.
The endpoint config is created under the name held in `endpoint_config_name`.
Once the endpoint config is in place, the endpoint creation is straightforward.

In [None]:
endpoint_config_name = f'pytorch-multi-model-config-{multi_model_now}'
print('Endpoint config name: ' + endpoint_config_name)

# A single variant that serves all traffic from one instance of the
# multi-model model.
_production_variant = {
    'InstanceType': deploy_instance_type,
    'InitialInstanceCount': 1,
    'InitialVariantWeight': 1,
    'ModelName': multi_model_name,
    'VariantName': 'AllTraffic',
}

create_endpoint_config_response = sm_client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[_production_variant],
)

In [None]:
endpoint_name = f'pytorch-multi-model-endpoint-{multi_model_now}'
print('Endpoint name: ' + endpoint_name)

# Start creating the endpoint from the config defined above; the call returns
# before the endpoint is ready.
create_endpoint_response = sm_client.create_endpoint(
    EndpointName=endpoint_name,
    EndpointConfigName=endpoint_config_name,
)
print('Endpoint Arn: ' + create_endpoint_response['EndpointArn'])

In [None]:
# Poll the endpoint until it leaves the "Creating" state.
describe_endpoint_response = sm_client.describe_endpoint(EndpointName=endpoint_name)

while describe_endpoint_response["EndpointStatus"] == "Creating":
    # Wait before asking again, so the loop ends as soon as the status
    # changes instead of sleeping once more after the final poll.
    time.sleep(15)
    describe_endpoint_response = sm_client.describe_endpoint(EndpointName=endpoint_name)
    print(describe_endpoint_response["EndpointStatus"])

# Make a failed creation visible instead of silently falling through.
if describe_endpoint_response["EndpointStatus"] != "InService":
    print(
        "Endpoint is not in service: "
        f"{describe_endpoint_response.get('FailureReason', 'no failure reason reported')}"
    )

describe_endpoint_response

### Invoke multi-model endpoint

In [None]:
# Sample sentences: kept as a DataFrame for the CSV requests, and wrapped in an
# {"inputs": [...]} dictionary built from the first column for the JSON requests.
test_data = pd.read_csv("../sample_payload/test_data.csv", header=None)
json_data = {'inputs': test_data.iloc[:, 0].to_list()}
test_data

In [None]:
def invoke_multi_model_endpoint(model_archive=None, content_type="JSON", test_data=None):
    """
    Send `test_data` to one model hosted on the multi-model endpoint.

    The request goes to the notebook-level `endpoint_name` through the
    notebook-level `sm_runtime` client.

    Args:
        model_archive: Archive to invoke, passed as `TargetModel`.
        content_type: "JSON" or "CSV" (case-sensitive). Selects how
            `test_data` is serialized and which content type is sent.
        test_data: For "JSON", any object accepted by `json.dumps`; for
            "CSV", an object with a pandas-style `to_csv` method.

    Returns:
        The raw response body read from the endpoint.

    Raises:
        ValueError: If `content_type` is neither "JSON" nor "CSV". This is
            raised before any request is sent.
    """
    if content_type == "JSON":
        body = json.dumps(test_data)
        mime_type = "application/json"
    elif content_type == "CSV":
        body = test_data.to_csv(header=False, index=False)
        mime_type = "text/csv"
    else:
        # Fail with a clear message instead of continuing to a `response`
        # variable that was never assigned.
        raise ValueError(
            f"input content type {content_type} is not supported, please select CSV or JSON."
        )

    response = sm_runtime.invoke_endpoint(
        EndpointName=endpoint_name,
        Body=body,
        ContentType=mime_type,
        TargetModel=model_archive,
    )
    return response["Body"].read()

In [None]:
%%time
# Invoke the roberta archive with the JSON payload.
# NOTE(review): the leading "/" presumably joins the archive name onto the
# ModelDataUrl prefix, which is built without a trailing slash -- confirm.
model_archive = '/model_roberta_MME.tar.gz'
content_type = "JSON"  # alternative: "CSV"
payload = json_data  # alternative for CSV: test_data
results = invoke_multi_model_endpoint(model_archive, content_type, payload)
print(results)

In [None]:
%%time
# Invoke the roberta archive again, this time sending the DataFrame as CSV.
model_archive = '/model_roberta_MME.tar.gz'
content_type = "CSV"
payload = test_data
results = invoke_multi_model_endpoint(model_archive, content_type, payload)
print(results)

In [None]:
%%time
# Invoke the distilbert archive with the JSON payload.
# NOTE(review): this archive is not uploaded in this notebook; it is presumably
# already present under the same S3 models prefix -- confirm.
model_archive = '/model_distilbert.tar.gz'
content_type = "JSON"  # alternative: "CSV"
payload = json_data  # alternative for CSV: test_data
results = invoke_multi_model_endpoint(model_archive, content_type, payload)
print(results)

In [None]:
%%time
# Invoke the distilbert archive again, this time sending the DataFrame as CSV.
model_archive = '/model_distilbert.tar.gz'
content_type = "CSV"
payload = test_data
results = invoke_multi_model_endpoint(model_archive, content_type, payload)
print(results)

## Delete the endpoint

If you do not plan to use this endpoint further, you should delete the endpoint to avoid incurring additional charges.

In [None]:
# Delete the endpoint created above. This cell only removes the endpoint; the
# endpoint config and the models created earlier are not deleted here.
sm_session.delete_endpoint(endpoint_name)