In [None]:
!pip install -U sagemaker==2.232.2

In [1]:
import sagemaker

# SageMaker session object used by the rest of the notebook.
sess = sagemaker.Session()

# IAM execution role passed to the model further down.
role = sagemaker.get_execution_role()

# Name of the default S3 bucket associated with the session.
session_bucket = sess.default_bucket()

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [39]:
# Imports for this cell: standard-library helpers for reading the access token
# without hardcoding it, plus the Hugging Face model wrapper from the SageMaker SDK.
import os
from getpass import getpass

from sagemaker.huggingface import HuggingFaceModel

# Define the instance type that will be used for inference
# ml.inf2.24xlarge is based on AWS Inferentia2 hardware, optimized for high-performance machine learning inference
instance_type = "ml.inf2.24xlarge"

# Set the health check timeout and volume size for the SageMaker model endpoint
health_check_timeout = 2400  # The maximum time (in seconds) SageMaker waits for the model to be ready
volume_size = 128  # Storage size in GB allocated to the model

# Never paste the Hugging Face token into the notebook: a saved or shared notebook would leak it.
# Read it from the HF_TOKEN environment variable, and fall back to an interactive prompt
# (the typed value is not echoed and is not stored in the notebook source).
hf_token = (os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")).strip()
if not hf_token:
    raise ValueError("A Hugging Face access token is required: set HF_TOKEN or enter it at the prompt.")

# Define the environment configuration for the Hugging Face model
config = {
    "HF_MODEL_ID": "meta-llama/Meta-Llama-3.1-8B",  # Hugging Face model ID
    "HF_NUM_CORES": "8",  # Number of Neuron cores to use for inference
    "HF_AUTO_CAST_TYPE": "bf16",  # Enable automatic casting to bf16 (half precision for faster inference)
    "MAX_BATCH_SIZE": "4",  # Maximum batch size to process in one forward pass
    "MAX_INPUT_LENGTH": "4095",  # Maximum input sequence length (tokens) allowed for inference
    "MAX_TOTAL_TOKENS": "4096",  # Maximum total number of tokens (input + output)
    "HF_TOKEN": hf_token,  # Token to authenticate with Hugging Face Hub (supplied at run time, see above)
}

# Set the URI for the Hugging Face TGI (Text Generation Inference) image
# This image is designed for optimized inference using AWS Neuron SDK (for Inferentia)
# NOTE(review): the URI is pinned to us-east-1 -- confirm it matches the region of the session before deploying.
tgi_image = "763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.2-optimum0.0.25-neuronx-py310-ubuntu22.04"

# Create the HuggingFaceModel object with the specified role, image, and environment configuration
model = HuggingFaceModel(
    role=role,  # IAM role that grants SageMaker permissions
    image_uri=tgi_image,  # URI for the Hugging Face inference image
    env=config,  # Pass the environment variables defined in the config
)

# In this case, we are deploying a precompiled model, stored at https://huggingface.co/aws-neuron/optimum-neuron-cache
# If the model you need to deploy is not precompiled, you can export your own Neuron model
# as explained in https://huggingface.co/docs/optimum-neuron/main/en/guides/export_model#exporting-neuron-models-using-neuronx-tgi

# Mark the model as precompiled
model._is_compiled_model = True


In [19]:
# Create the endpoint for the configured model and keep the returned predictor
# so later cells can send requests to it and tear it down.
predictor = model.deploy(
    initial_instance_count=1,  # a single instance backs the endpoint
    instance_type=instance_type,  # instance type chosen in the configuration cell
    container_startup_health_check_timeout=health_check_timeout,  # seconds allowed for the container to become ready
    volume_size=volume_size,  # storage size in GB
)

------------------------!

In [38]:
# Request payload for the TGI container.
# Generation settings must be nested under "parameters"; the original payload put
# them at the top level, where they are not part of the generate request schema.
# The length limit is called "max_new_tokens" (not "max_tokens"), and there is no "n"
# option in this schema, so it is omitted (a single sequence is returned).
data = {
    "inputs": "What are the pros and cons of different energy sources?",
    "parameters": {
        "do_sample": True,  # explicitly request sampling so temperature / top_p are used
        "temperature": 0.7,
        "top_p": 0.9,
        "max_new_tokens": 100,  # upper bound on the number of generated tokens
    },
}

predictor.predict(data)

[{'generated_text': 'What are the pros and cons of different energy sources? Is there a link between electricity usage and climate change? How can we tackle energy poverty, the issue of clean air at home, or the challenge of providing electricity access in refugee camps? Why should we care about these issues? And how can we better communicate these issues to diverse audiences?\nThese are key issues for the energy sector – both at home and abroad. This degree will equip you to address them from the perspective of economics, innovation and policy – and prepare you for an exciting career.\nOur innovative'}]

In [40]:
# Clean-up: remove every resource created by the deployment so nothing is left behind.
# Delete the model first, while the endpoint still exists and its attached model can be
# looked up, then delete the endpoint.
predictor.delete_model()
predictor.delete_endpoint()