## Serverless Inference

Reference notebook: https://github.com/huggingface/notebooks/blob/main/sagemaker/19_serverless_inference/sagemaker-notebook.ipynb

In [None]:
# !pip install sagemaker --upgrade

In [33]:
import sagemaker
import boto3
import json

In [34]:
# help(sagemaker.Session)

In [35]:
# Create the SageMaker session with an explicitly named default S3 bucket
# instead of letting the SDK choose one.
bucket_name = "sagemaker-contract-audit"
sess = sagemaker.Session(default_bucket=bucket_name)

In [36]:
# Resolve the ARN of the IAM role that SageMaker will assume for this work.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    # get_execution_role() raised ValueError (presumably because the notebook
    # is running outside a SageMaker-managed environment -- confirm), so fall
    # back to looking the role up by its fixed name through the IAM API.
    iam = boto3.client("iam")
    role_lookup = iam.get_role(RoleName="sagemaker_execution_role")
    role = role_lookup["Role"]["Arn"]

In [37]:
# Echo the resolved role, bucket and region so a misconfigured session is
# obvious before anything is deployed. One print call, one line per value.
print(
    f"sagemaker role arn: {role}",
    f"sagemaker bucket: {sess.default_bucket()}",
    f"sagemaker session region: {sess.boto_region_name}",
    sep="\n",
)

sagemaker role arn: arn:aws:iam::418467925232:role/AWS_Sagemaker_role
sagemaker bucket: sagemaker-contract-audit
sagemaker session region: us-east-1


In [46]:
from sagemaker.huggingface.model import HuggingFaceModel
from sagemaker.serverless import ServerlessInferenceConfig

# Hub Model configuration. <https://huggingface.co/models>
# hub = {
#     'HF_MODEL_ID':'distilbert-base-uncased-finetuned-sst-2-english',
#     'HF_TASK':'text-classification'
# }

# Environment passed to the Hugging Face inference container: which Hub model
# to download and which pipeline task to serve it with.
hub = {
    'HF_MODEL_ID': 'distilgpt2',
    # Pin the task explicitly instead of relying on the container to infer it;
    # 'text-generation' matches the `generated_text` responses this endpoint
    # returns for distilgpt2.
    'HF_TASK': 'text-generation',
    # NOTE(review): SM_NUM_GPUS is normally consumed by the Hugging Face LLM
    # (TGI) container; it is probably a no-op for this image -- confirm before
    # removing. Kept so the container environment is unchanged.
    'SM_NUM_GPUS': json.dumps(0),
}

# Fixed endpoint name. NOTE(review): presumably deploy() fails if an endpoint
# with this name already exists -- run the cleanup cell before re-deploying.
endpoint_name = 'sm-contract-audit'

# Create the Hugging Face Model object. No model artifact is supplied; the
# model comes from the Hub via the `env` settings above.
huggingface_model = HuggingFaceModel(
    env=hub,                      # configuration for loading model from Hub
    role=role,                    # iam role with permissions to create an Endpoint
    transformers_version="4.26",  # transformers version used
    pytorch_version="1.13",       # pytorch version used
    py_version='py39',            # python version used
    sagemaker_session=sess,       # reuse the session (and default bucket) configured above
)

# Specify MemorySizeInMB and MaxConcurrency in the serverless config object.
serverless_config = ServerlessInferenceConfig(
    memory_size_in_mb=3072,  # memory allocated to the serverless endpoint
    max_concurrency=10,      # maximum concurrent invocations
)

# Deploy the model as a serverless endpoint. Because a serverless config is
# passed, no instance type or instance count is given.
predictor = huggingface_model.deploy(
    serverless_inference_config=serverless_config,
    endpoint_name=endpoint_name,
)

----!

In [47]:
# Request payload: the prompt goes under the "inputs" key.
data = {"inputs": "this year its very hot"}

# Invoke the deployed endpoint and show the response.
res = predictor.predict(data=data)
print(res)

[{'generated_text': 'this year its very hot, not surprisingly, though the other day it has been a little cooler, but it only lasted about three hours and it has been very nice.'}]


In [48]:
# Clean up: remove the SageMaker model and then the endpoint created above.
# NOTE(review): the order looks deliberate -- delete_model() presumably needs
# the live endpoint to look up which model(s) back it, so keep it first; confirm
# against the SDK docs before reordering.
predictor.delete_model()
predictor.delete_endpoint()

In [25]:
# help(HuggingFaceModel)