# Deploy a model on AWS Inferentia
In this notebook we will download the Llama-3-8B model from the Hugging Face Hub and deploy it on AWS Inferentia using two methods:
1. Just-In-Time (JIT) compilation
2. Ahead-Of-Time (AOT) compilation using an optimization job (part of the Inference Optimization Toolkit)

In [None]:
# Upgrade the SageMaker Python SDK in the kernel's environment (%pip targets the running kernel).
# --quiet and --no-warn-conflicts keep the cell output short.
%pip install sagemaker --upgrade --quiet --no-warn-conflicts

In [None]:
import json
import boto3
import sagemaker
import huggingface_hub
from pathlib import Path

In [None]:
# Resolve the IAM role, SageMaker session, default bucket and region used by the cells below.
role = sagemaker.get_execution_role()  # execution role for the endpoint
sess = sagemaker.session.Session()  # sagemaker session for interacting with different AWS APIs
bucket = sess.default_bucket()  # bucket to house artifacts
# Read the region through the public `boto_region_name` attribute instead of the
# private `_region_name`, which is an implementation detail of the SDK.
region = sess.boto_region_name  # region name of the current SageMaker Studio environment

# Pin both clients to the session's region so every API call targets the same region.
sm_client = boto3.client("sagemaker", region_name=region)  # client to interact with SageMaker
smr_client = boto3.client("sagemaker-runtime", region_name=region)  # client to interact with SageMaker Endpoints

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {bucket}")
print(f"sagemaker session region: {region}")
print(f"boto3 version: {boto3.__version__}")
print(f"sagemaker version: {sagemaker.__version__}")

## 2. Deploy model on AWS Inferentia

### 2.1 Download the model from Hugging Face and upload it to S3

Deploying a model using JIT does not require the model weights to be on S3 (you can use the HF model_id), but AOT does.
We will download the model from the HF repository and upload it to S3.
If you already have your model in an S3 bucket, you can skip the model download step.

In [None]:
model_id = "meta-llama/Meta-Llama-3-8B"

# Local staging directory for the snapshot before it is uploaded to S3.
hf_local_download_dir = Path.cwd() / "model_repo"
hf_local_download_dir.mkdir(exist_ok=True)

# Skip the `original/` folder and `.gitattributes` at download time: they are
# removed before the upload anyway, so fetching them only wastes time and disk.
huggingface_hub.snapshot_download(
    repo_id=model_id,
    revision="main",
    local_dir=hf_local_download_dir,
    ignore_patterns=["original/*", ".gitattributes"],
)

In [None]:
# Remove everything that should not be uploaded to S3 with the model:
# notebook checkpoints, the download cache, git metadata and the `original` folder.
# `rm -rf` does not fail when a path is missing, so this cell is safe to re-run.
!rm -rf model_repo/.ipynb_checkpoints
!rm -rf model_repo/.cache
!rm -rf model_repo/.gitattributes
!rm -rf model_repo/original

In [None]:
# Upload the local model folder to S3 under the "inference-model" prefix.
# A trailing slash is appended to the returned URI because it needs to point
# towards the uncompressed model artifacts.
model_uri = (
    sess.upload_data(
        path=hf_local_download_dir.as_posix(),
        bucket=bucket,
        key_prefix="inference-model",
    )
    + "/"
)
model_uri

In [None]:
# List the uploaded objects; IPython substitutes {model_uri} before the shell command runs.
!aws s3 ls {model_uri} #verify model artifacts

Helper function to check latency of the endpoint

In [None]:
import time
import numpy as np

def run_perf_test(llm, num_iterations, payload):
    """Measure request latency of a SageMaker endpoint and print summary statistics.

    Sends `payload` to the endpoint `num_iterations` times, one request after
    another, through the notebook-level `smr_client`, then prints the P95, P90
    and average latency in milliseconds.

    Args:
        llm: Name of the endpoint to invoke.
        num_iterations: Number of requests to send; must be at least 1.
        payload: JSON-serializable request body sent with every request.

    Raises:
        ValueError: If `num_iterations` is less than 1, because no statistics
            can be computed without at least one measurement.
    """
    if num_iterations < 1:
        raise ValueError("num_iterations must be at least 1")

    # The payload is identical for every request, so serialize it once up front.
    body = json.dumps(payload)

    results = []
    for _ in range(num_iterations):
        # perf_counter is a monotonic clock, so the measured interval cannot be
        # skewed by system clock adjustments the way time.time() can.
        start = time.perf_counter()
        response = smr_client.invoke_endpoint(
            EndpointName=llm,
            Body=body,
            ContentType="application/json",
        )
        # Read the body so the measurement covers receiving the complete response.
        response["Body"].read()
        results.append((time.perf_counter() - start) * 1000)

    print("\nPrediction latency: \n")
    print("P95: " + str(np.percentile(results, 95)) + " ms")
    print("P90: " + str(np.percentile(results, 90)) + " ms")
    print("Average: " + str(np.average(results)) + " ms")

In [None]:
# Framework/version pair used to look up the inference container image.
LMI_VERSION = "0.29.0"
LMI_FRAMEWORK = "djl-neuronx"

# Resolve the image URI for the current region.
serving_image = sagemaker.image_uris.retrieve(
    framework=LMI_FRAMEWORK,
    version=LMI_VERSION,
    region=region,
)

print(f"Inference Image: {serving_image}")

### 2.2 Deploy with JIT on AWS Inferentia

In [None]:
neuronx_instance_type = "ml.inf2.8xlarge"

# Container options shared by the JIT deployment and the AOT compilation job.
neuronx_config = dict(
    OPTION_ROLLING_BATCH="auto",
    OPTION_TENSOR_PARALLEL_DEGREE="2",  # 2 Neuron cores per accelerator
    OPTION_ATTENTION_LAYOUT="BSH",
    OPTION_COLLECTIVES_LAYOUT="BSH",
    OPTION_CONTEXT_LENGTH_ESTIMATE="256",
    OPTION_N_POSITIONS="2048",
    OPTION_DTYPE="fp16",
    OPTION_NEURON_OPTIMIZE_LEVEL="2",
)

# JIT deployment: the container loads the uncompiled weights straight from `model_uri`.
jit_config = {"HF_MODEL_ID": model_uri, **neuronx_config}

In [None]:
# Settings for the JIT deployment.
instance_type = neuronx_instance_type
container = serving_image
config = jit_config
# Use a unique, timestamped model name: a fixed name makes `create_model` fail on a
# re-run whenever a model from an earlier (e.g. interrupted) run still exists.
model_name = sagemaker.utils.name_from_base("llama3-8b-neuron")
# The endpoint shares the model's unique name.
endpoint_name = model_name

In [None]:
# Register the serving container and its environment as a SageMaker model.
create_model_response = sm_client.create_model(
    ModelName=model_name,
    ExecutionRoleArn=role,
    PrimaryContainer={"Image": container, "Environment": config},
)

model_arn = create_model_response["ModelArn"]
print(f"Created Model: {model_arn}")

In [None]:
endpoint_config_name = endpoint_name
health_check_timeout = 900  # seconds the container gets to pass its startup health check

# Single-variant endpoint configuration with one instance of `instance_type`.
endpoint_config_response = sm_client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[
        dict(
            VariantName="variant1",
            ModelName=model_name,
            InstanceType=instance_type,
            InitialInstanceCount=1,
            ContainerStartupHealthCheckTimeoutInSeconds=health_check_timeout,
            RoutingConfig={"RoutingStrategy": "LEAST_OUTSTANDING_REQUESTS"},
        ),
    ],
)
endpoint_config_response

In [None]:
# Start the endpoint deployment, then wait on it through the SageMaker session.
create_endpoint_response = sm_client.create_endpoint(
    EndpointName=endpoint_name,
    EndpointConfigName=endpoint_config_name,
)
sess.wait_for_endpoint(endpoint_name)

In [None]:
# Send one sample prompt to the endpoint and print the generated text.
payload = {
    "inputs": "Can you tell me something about Amazon SageMaker?",
    "parameters": {"max_new_tokens": 256, "temperature": 0.1},
}

response_model = smr_client.invoke_endpoint(
    EndpointName=endpoint_name,
    Body=json.dumps(payload),
    ContentType="application/json",
)

# Decode the response body, then pull the generated text out of the JSON document.
raw_body = response_model["Body"].read().decode("utf8")
assistant = json.loads(raw_body)["generated_text"]
print(assistant)

In [None]:
#
# Calculate runtime performance: send the sample payload 10 times and
# print the P95/P90/average latency of the JIT-compiled endpoint.
#
run_perf_test(llm = endpoint_name, num_iterations = 10, payload = payload)

In [None]:
# Tear down the JIT deployment: delete the endpoint, then its configuration,
# then the model, using the names defined in the cells above.
sess.delete_endpoint(endpoint_name)
sess.delete_endpoint_config(endpoint_config_name)
sess.delete_model(model_name)

### 2.3 Run Ahead-Of-Time compilation using Inference Optimization Toolkit

In [None]:
# Compile for the same instance type that will later host the endpoint.
instance_type = neuronx_instance_type

# `prefix` names both the S3 output folder and the base of the unique resource name.
prefix = "llama3-8b-neuron-aot"
output_location = f"s3://{bucket}/{prefix}/"
model_name = sagemaker.utils.name_from_base(prefix)

In [None]:
job_name = model_name
job_timeout = 7200  # seconds; the same limit is applied to all three StoppingCondition fields below

# Submit a compilation job for the model stored at `model_uri`, targeting `instance_type`.
# The compiled artifacts are written to `output_location`.
response = sm_client.create_optimization_job(
    OptimizationJobName=job_name,
    RoleArn=role,
    ModelSource={
        'S3': {
            'S3Uri': model_uri,
            'ModelAccessConfig': {
                'AcceptEula': True  # already True; keep it only if you accept the model's EULA
            }
        }
    },
    DeploymentInstanceType=instance_type,
    OptimizationEnvironment={},
    OptimizationConfigs=[
        {
            'ModelCompilationConfig': {
                'Image': serving_image,
                'OverrideEnvironment': neuronx_config  # same options as the JIT deployment
            }
        },
    ],
    OutputConfig={
        'S3OutputLocation': output_location
    },
    StoppingCondition={
        'MaxRuntimeInSeconds': job_timeout,
        'MaxWaitTimeInSeconds': job_timeout,
        'MaxPendingTimeInSeconds': job_timeout
    },
)
response

In [None]:
# Wait on the optimization job submitted above before using its output.
sess.wait_for_optimization_job(job_name)

In [None]:
# List the job output; IPython substitutes {output_location} before the shell command runs.
!aws s3 ls {output_location} #verify compiled model artifacts

### 2.4 Deploy compiled model to the endpoint

For AOT deployment, the only difference is the location of the model weights; the rest of the steps are exactly the same as for the JIT deployment.

In [None]:
# Same container options as the JIT deployment, but the weights come from the compiled output.
aot_config = {"HF_MODEL_ID": output_location, **neuronx_config}

In [None]:
# Register the serving container with the AOT environment as a SageMaker model.
create_model_response = sm_client.create_model(
    ModelName=model_name,
    ExecutionRoleArn=role,
    PrimaryContainer={"Image": serving_image, "Environment": aot_config},
)

model_arn = create_model_response["ModelArn"]
print(f"Created Model: {model_arn}")

Start-up of LLM inference containers can take longer than for smaller models, mainly due to longer model download and loading times. Timeout values need to be increased accordingly from their default values. Each endpoint deployment takes a few minutes.

In [None]:
endpoint_config_name = model_name
health_check_timeout = 900  # seconds the container gets to pass its startup health check

# Single-variant endpoint configuration with one instance of `instance_type`.
endpoint_config_response = sm_client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[
        dict(
            VariantName="variant1",
            ModelName=model_name,
            InstanceType=instance_type,
            InitialInstanceCount=1,
            ContainerStartupHealthCheckTimeoutInSeconds=health_check_timeout,
            RoutingConfig={"RoutingStrategy": "LEAST_OUTSTANDING_REQUESTS"},
        ),
    ],
)
endpoint_config_response

In [None]:
# Create the endpoint under the model's unique name, then wait on it
# through the SageMaker session.
endpoint_name = model_name

create_endpoint_response = sm_client.create_endpoint(
    EndpointName=endpoint_name,
    EndpointConfigName=endpoint_config_name,
)

sess.wait_for_endpoint(endpoint_name)

Let's invoke our endpoint and get a sample response.

In [None]:
# Same sample prompt as used for the JIT endpoint, so the outputs are comparable.
payload = {
    "inputs": "Can you tell me something about Amazon SageMaker?",
    "parameters": {"max_new_tokens": 256, "temperature": 0.1},
}

response_model = smr_client.invoke_endpoint(
    EndpointName=endpoint_name,
    Body=json.dumps(payload),
    ContentType="application/json",
)

# Decode the response body, then pull the generated text out of the JSON document.
raw_body = response_model["Body"].read().decode("utf8")
assistant = json.loads(raw_body)["generated_text"]
print(assistant)

In [None]:
#
# Calculate runtime performance: send the sample payload 10 times and
# print the P95/P90/average latency of the AOT-compiled endpoint.
#
run_perf_test(llm = endpoint_name, num_iterations = 10, payload = payload)

### 2.5 Clean Up Endpoint

In [None]:
# Tear down the AOT deployment: delete the endpoint, then its configuration,
# then the model, using the names defined in the cells above.
sess.delete_endpoint(endpoint_name)
sess.delete_endpoint_config(endpoint_config_name)
sess.delete_model(model_name)