# Create a Multi-model Endpoint on SageMaker for Hugging Face Text Generation Models

### Create and test a generic endpoint

In [None]:
from sagemaker.huggingface import HuggingFace

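# Optional: uncomment to attach to your own training job and use its model artifact.
# Note: the multi-model cells further down read `s3_model_data`, so it must be defined before running them.
# huggingface_estimator = HuggingFace.attach('<point to your training job here>')
# s3_model_data = huggingface_estimator.model_data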

In [2]:
from sagemaker.huggingface import HuggingFaceModel
import sagemaker

# Role returned by sagemaker.get_execution_role(); passed to the model below.
role = sagemaker.get_execution_role()

# Hub model configuration (model id + task), handed to the model as `env`.
# Browse available models at https://huggingface.co/models
hub = dict(
    HF_MODEL_ID="gpt2",
    HF_TASK="text-generation",
)

# Hugging Face model definition.
# The two commented-out arguments are the hooks for serving your own artifact
# with a custom inference script instead of a Hub model.
huggingface_model = HuggingFaceModel(
    role=role,
    env=hub,
    transformers_version="4.6.1",
    pytorch_version="1.7.1",
    py_version="py36",
    # entry_point="inference.py",
    # model_data=s3_model_data,
)

# Deploy the model to SageMaker Inference.
predictor = huggingface_model.deploy(
    instance_type="ml.m5.xlarge",  # ec2 instance type
    initial_instance_count=1,  # number of instances
)


-----!

In [None]:
# Send a text prompt through the predictor returned by deploy().
predictor.predict({"inputs": "A rose by any other name"})

### Create a multi-model endpoint

In [None]:
# Take everything before the first '/huggingface' in the artifact URI and
# append a 'multi-model/' folder to it.
multi_model_input_s3 = f"{s3_model_data.partition('/huggingface')[0]}/multi-model/"

# Bare expression so the notebook displays the resulting prefix.
multi_model_input_s3

In [None]:
from datetime import datetime
from sagemaker.multidatamodel import MultiDataModel
import sagemaker

# Session object passed to the multi-model definition below.
sess = sagemaker.Session()

# Multi-model wrapper around the Hugging Face model defined earlier.
# The name gets a timestamp suffix (year-month-day-hour-minute-second).
mme = MultiDataModel(
    name=f"hf-multi-gpt2-{datetime.now():%Y-%m-%d-%H-%M-%S}",
    # This is the bucket source for ALL models; do NOT point it at an
    # existing model artifact.
    model_data_prefix=multi_model_input_s3,
    model=huggingface_model,
    role=role,
    sagemaker_session=sess,
)

In [None]:
# Deploy the multi-model definition; `predictor` is rebound to the result.
predictor = mme.deploy(
    instance_type="ml.c5.xlarge",
    initial_instance_count=1,
)

# Exchange JSON with the endpoint in both directions.
predictor.deserializer = sagemaker.deserializers.JSONDeserializer()
predictor.serializer = sagemaker.serializers.JSONSerializer()

In [None]:
# Materialise whatever mme.list_models() yields so the notebook displays it.
[*mme.list_models()]

In [None]:
# Add the artifact at `s3_model_data` to the multi-model object, using
# 'My-Finetuned-Model' as its path.
mme.add_model(
    model_data_path="My-Finetuned-Model",
    model_data_source=s3_model_data,
)

### Create and deploy a GPT-J 6B pretrained model
Note that due to the size of GPT-J 6B we won't be able to deploy it to the multi-model endpoint. However, we can deploy it to a generic SageMaker endpoint and test it there. Notice the increase in the quality of the generated text! This is due to the size of the model and its number of parameters - a larger model tends to produce more "intelligent" output.

In [None]:
from sagemaker.huggingface import HuggingFaceModel
import sagemaker

# IAM role with permissions to create endpoint
# IAM role with permissions to create the endpoint.
role = sagemaker.get_execution_role()

# Public S3 URI of the GPT-J artifact.
model_uri = "s3://huggingface-sagemaker-models/transformers/4.12.3/pytorch/1.9.1/gpt-j/model.tar.gz"

# Hugging Face model definition backed by the artifact above. This rebinds
# `huggingface_model`, replacing the model object from the earlier cells.
huggingface_model = HuggingFaceModel(
    role=role,
    model_data=model_uri,
    transformers_version="4.12.3",
    pytorch_version="1.9.1",
    py_version="py38",
)

# Deploy the model to SageMaker Inference.
predictor = huggingface_model.deploy(
    instance_type="ml.g4dn.xlarge",  # ec2 instance type; alternative: 'ml.p3.2xlarge'
    initial_instance_count=1,  # number of instances
)

# Same prompt as used with the earlier endpoint.
predictor.predict({"inputs": "A rose by any other name"})