In [None]:
import json
import os
from pathlib import Path

import boto3
import sagemaker
from sagemaker import get_execution_role

# A single boto3 session backs every client created in this cell, so they all
# resolve credentials and region the same way.
sess = boto3.Session()
region = sess.region_name
sagemaker_session = sagemaker.Session(boto_session=sess)
bucket = sagemaker_session.default_bucket()

# Control-plane client: models, endpoint configs, endpoints.
sm_client = sess.client("sagemaker")
# Data-plane client: invoke_endpoint / invoke_endpoint_async. Created from
# `sess` like the other clients instead of boto3's implicit default session.
runtime_sm_client = sess.client("sagemaker-runtime")

# Account ID of the caller; used to build the ECR registry URI.
sts_client = sess.client("sts")
account_id = sts_client.get_caller_identity()["Account"]

In [None]:
# NGC API key. Read from the environment so the secret never has to be saved
# inside the notebook; falls back to an empty string when NGC_API_KEY is unset.
NGC_API_KEY = os.environ.get("NGC_API_KEY", "")

# Upstream NIM image on NGC.
SRC_IMAGE_PATH = "nvcr.io/nvidia/nim/medical_imaging_vista3d:24.03"
# Image name; also used as the image tag when referencing the ECR image.
SRC_IMAGE_NAME = "medical_imaging_vista3d"
# Destination ECR repository and its full registry path.
DST_REPO_NAME = "nim-shim"
DST_REGISTRY = f"{account_id}.dkr.ecr.{region}.amazonaws.com/{DST_REPO_NAME}"

## Build customized container and push to ECR

In [None]:
# Log in to the NGC registry (nvcr.io) so Docker can download the NIM container.
# IPython substitutes $NGC_API_KEY from the Python namespace and pipes it to
# docker via stdin. The single-quoted '$oauthtoken' is passed to docker as the
# literal username (assuming no Python variable named `oauthtoken` exists).
!echo $NGC_API_KEY | docker login nvcr.io --username '$oauthtoken' --password-stdin

In [None]:
%%bash

# Builds the Docker image from the current directory and pushes it to ECR as
# <account>.dkr.ecr.<region>.amazonaws.com/nim-shim:medical_imaging_vista3d.

# NOTE(review): presumably needed so `docker build .` can read the whole build
# context on a SageMaker notebook instance - confirm.
sudo chown ec2-user /home/ec2-user/SageMaker/lost+found

account=$(aws sts get-caller-identity --query Account --output text)
region=$(aws configure get region)
repo_name="nim-shim"
image_name="medical_imaging_vista3d"
registry="${account}.dkr.ecr.${region}.amazonaws.com"
fullname="${registry}/${repo_name}:${image_name}"
echo "ECR image fullname is: $fullname"

# If the repository doesn't exist in ECR, create it.
if ! aws ecr describe-repositories --repository-names "${repo_name}" > /dev/null 2>&1
then
    aws ecr create-repository --repository-name "${repo_name}" > /dev/null
fi

# Authenticate Docker against the ECR registry. `aws ecr get-login` does not
# exist in AWS CLI v2; `get-login-password` piped into `docker login` is the
# supported replacement and keeps the password off the command line.
aws ecr get-login-password --region "${region}" \
    | docker login --username AWS --password-stdin "${registry}" || exit 1

echo "Building container"
# Stop here if the build fails so a stale or missing image is never pushed.
docker build --quiet -t "${fullname}" . || exit 1

echo "Container is built, pushing container to ECR"
docker push --quiet "${fullname}"

In [None]:
%%bash

# Download the NGC CLI (v3.41.3), unpack it into ~/ngc and make it executable.
wget -q --content-disposition https://api.ngc.nvidia.com/v2/resources/nvidia/ngc-apps/ngc_cli/versions/3.41.3/files/ngccli_linux.zip -O ~/ngccli_linux.zip && \
unzip -qq -o ~/ngccli_linux.zip -d ~/ngc && \
chmod u+x ~/ngc/ngc-cli/ngc || exit 1

# Put the CLI on PATH for future login shells.
# - $HOME is used instead of ~ because a tilde inside double quotes is not
#   expanded and would be stored literally in PATH.
# - The line is appended only if it is not already present, so re-running this
#   cell does not keep growing ~/.bash_profile.
path_line='export PATH="$PATH:$HOME/ngc/ngc-cli"'
if ! grep -qxF "$path_line" ~/.bash_profile 2>/dev/null
then
    echo "$path_line" >> ~/.bash_profile
fi


Run the following commands in a terminal to configure the NGC CLI:
```bash
source ~/.bash_profile
ngc config set

```
Then use the NGC CLI to download the model files and weights.

## Create SageMaker inference endpoint

In [None]:
# Settings shared by the real-time and async endpoints.
SG_CONTAINER_STARTUP_TIMEOUT = 850  # passed as ContainerStartupHealthCheckTimeoutInSeconds
instance_type = "ml.g5.2xlarge"
model_name = "nim-llm-medical-image-vista3d"

# Image URI in ECR: "<registry path>:<tag>".
container_image = f"{DST_REGISTRY}:{SRC_IMAGE_NAME}"

# Execution role passed to create_model as ExecutionRoleArn.
role = get_execution_role(sagemaker_session=sagemaker_session)

In [None]:
# Container definition handed to create_model: the ECR image plus the
# environment variables set inside the container.
container = dict(
    Image=container_image,
    Mode="SingleModel",
    Environment=dict(
        NGC_API_KEY=NGC_API_KEY,
        MODEL_PATH="/opt/ml/model",
    ),
)

In [None]:
# Register the SageMaker model backed by the single container defined above.
create_model_response = sm_client.create_model(
    Containers=[container],
    ExecutionRoleArn=role,
    ModelName=model_name,
)

In [None]:
# Real-time endpoint configuration: one variant that receives all traffic.
endpoint_config_name = f"{model_name}-realtime-config"
endpoint_config_response = sm_client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[
        dict(
            VariantName="AllTraffic",
            ModelName=model_name,
            InstanceType=instance_type,
            InitialInstanceCount=1,
            InitialVariantWeight=1.0,
            ContainerStartupHealthCheckTimeoutInSeconds=SG_CONTAINER_STARTUP_TIMEOUT,
        ),
    ],
)

In [None]:
# Create the real-time endpoint from the configuration defined above.
endpoint_name = f"{model_name}-realtime-endpoint"
create_endpoint_response = sm_client.create_endpoint(
    EndpointConfigName=endpoint_config_name,
    EndpointName=endpoint_name,
)

In [None]:
# Block until the real-time endpoint reaches the in-service state.
print(f"Waiting for {endpoint_name} endpoint to be in service...")
waiter = sm_client.get_waiter("endpoint_in_service")
waiter.wait(EndpointName=endpoint_name)

In [None]:
# Invoke the real-time endpoint with a JSON request.
#
# Alternative values for "image":
#   "s3://amgenfileshare2024/example-1.nii.gz"  ## data in s3
#   "https://dicom-medical-imaging.us-east-1.amazonaws.com/datastore/04607de734384f9a87dc9e8cb0013f12/studies/1.3.6.1.4.1.14519.5.2.1.7085.2626.822645453932810382886582736291/series/1.3.6.1.4.1.14519.5.2.1.7085.2626.119403521930927333027265674239/instances/1.3.6.1.4.1.14519.5.2.1.7085.2626.126237574085706868619012301210?imageSetId=a762adc9861764a64d4118336586f520"  ## AWS HealthImaging getDICOMinstance
#   "ahi://04607de734384f9a87dc9e8cb0013f12/a762adc9861764a64d4118336586f520"  ## AWS HealthImaging get imageset
payload = dict(
    image="https://assets.ngc.nvidia.com/products/api-catalog/vista3d/example-1.nii.gz",
    prompts=dict(classes=["liver", "spleen"]),
)
response = runtime_sm_client.invoke_endpoint(
    EndpointName=endpoint_name,
    ContentType="application/json",
    Accept="application/json",
    Body=json.dumps(payload),
)
response

In [None]:
# Async endpoint configuration: same single variant as the real-time config,
# plus an S3 location where inference results are written.
async_endpoint_config_name = f"{model_name}-async-config"
endpoint_config_response = sm_client.create_endpoint_config(
    EndpointConfigName=async_endpoint_config_name,
    ProductionVariants=[
        dict(
            VariantName="AllTraffic",
            ModelName=model_name,
            InstanceType=instance_type,
            InitialInstanceCount=1,
            InitialVariantWeight=1.0,
            ContainerStartupHealthCheckTimeoutInSeconds=SG_CONTAINER_STARTUP_TIMEOUT,
        ),
    ],
    AsyncInferenceConfig=dict(
        OutputConfig=dict(S3OutputPath=f"s3://{bucket}/nim/vista3d/output"),
    ),
)

In [None]:
# Create the async endpoint from the async configuration.
async_endpoint_name = f"{model_name}-async-endpoint"
create_endpoint_response = sm_client.create_endpoint(
    EndpointConfigName=async_endpoint_config_name,
    EndpointName=async_endpoint_name,
)

In [None]:
# Block until the async endpoint reaches the in-service state.
# The message names async_endpoint_name, the endpoint the waiter polls;
# it previously printed the real-time endpoint's name.
print('Waiting for {} endpoint to be in service...'.format(async_endpoint_name))
waiter = sm_client.get_waiter('endpoint_in_service')
waiter.wait(EndpointName=async_endpoint_name)

In [None]:
# Invoke the async endpoint.
# The request is read from S3 rather than sent inline, so upload
# sg-invoke-payload.json to this location before running the cell.
input_location = f"s3://{bucket}/nim/vista3d/input/sg-invoke-payload.json"

# Submit the request to the async endpoint; the call's response is displayed
# as the cell output.
response = runtime_sm_client.invoke_endpoint_async(
    InputLocation=input_location,
    EndpointName=async_endpoint_name,
    InvocationTimeoutSeconds=3600,
)
response

### (Optional) Set up an autoscaling policy that scales the async endpoint down to 0 instances

In [None]:
# Scale-to-zero setup, see:
# https://docs.aws.amazon.com/sagemaker/latest/dg/async-inference-autoscale.html

autoscale_client = boto3.client("application-autoscaling")

# Application Auto Scaling identifies the endpoint variant with this resource ID.
resource_id = f"endpoint/{async_endpoint_name}/variant/AllTraffic"

# Register the variant's instance count as a scalable target that may range
# from 0 to 2 instances.
response = autoscale_client.register_scalable_target(
    ResourceId=resource_id,
    ServiceNamespace="sagemaker",
    ScalableDimension="sagemaker:variant:DesiredInstanceCount",
    MinCapacity=0,
    MaxCapacity=2,
)
response

In [None]:
# Step-scaling policy that adds one instance when its alarm is breached.
put_policy_response = autoscale_client.put_scaling_policy(
    PolicyName="HasBacklogWithoutCapacity-ScalingPolicy",
    PolicyType="StepScaling",
    ServiceNamespace="sagemaker",
    ResourceId=resource_id,
    ScalableDimension="sagemaker:variant:DesiredInstanceCount",
    StepScalingPolicyConfiguration=dict(
        # ScalingAdjustment below is an absolute change in instance count.
        AdjustmentType="ChangeInCapacity",
        # Aggregation applied to the CloudWatch metric.
        MetricAggregationType="Average",
        # Seconds to wait for a previous scaling activity to take effect.
        Cooldown=60,
        # Any breach at or above the alarm threshold adds one instance.
        StepAdjustments=[
            dict(MetricIntervalLowerBound=0, ScalingAdjustment=1),
        ],
    ),
)
put_policy_response

In [None]:
cw_client = boto3.client("cloudwatch")

# Alarm on the endpoint's HasBacklogWithoutCapacity metric: when the average is
# >= 1 for 2 out of 2 consecutive 60-second periods, trigger the step-scaling
# policy created above.
response = cw_client.put_metric_alarm(
    AlarmName="sagemaker_async_inference_endpoint_step_scaling_policy_alarm",
    Namespace="AWS/SageMaker",
    MetricName="HasBacklogWithoutCapacity",
    Dimensions=[dict(Name="EndpointName", Value=async_endpoint_name)],
    Statistic="Average",
    Period=60,
    EvaluationPeriods=2,
    DatapointsToAlarm=2,
    Threshold=1,
    ComparisonOperator="GreaterThanOrEqualToThreshold",
    TreatMissingData="missing",
    AlarmActions=[put_policy_response["PolicyARN"]],
)
response

## Clean Up

In [None]:
# Tear down the resources created by this notebook.
#
# NOTE: the first three calls use cw_client, autoscale_client and resource_id,
# which are only defined in the optional autoscaling section. If that section
# was not run, skip them and start at the delete_endpoint calls.
cw_client.delete_alarms(AlarmNames=[
    'sagemaker_async_inference_endpoint_step_scaling_policy_alarm',
])
autoscale_client.delete_scaling_policy(
    PolicyName='HasBacklogWithoutCapacity-ScalingPolicy',
    ResourceId=resource_id,
    ScalableDimension="sagemaker:variant:DesiredInstanceCount",
    ServiceNamespace="sagemaker",
)
# Deregister the scalable target registered for the async endpoint variant so
# it is not left behind after the endpoint itself is deleted.
autoscale_client.deregister_scalable_target(
    ServiceNamespace="sagemaker",
    ResourceId=resource_id,
    ScalableDimension="sagemaker:variant:DesiredInstanceCount",
)

# Endpoints first, then their configurations, then the model they reference.
sm_client.delete_endpoint(EndpointName=endpoint_name)
sm_client.delete_endpoint(EndpointName=async_endpoint_name)
sm_client.delete_endpoint_config(EndpointConfigName=endpoint_config_name)
sm_client.delete_endpoint_config(EndpointConfigName=async_endpoint_config_name)
sm_client.delete_model(ModelName=model_name)