# [모듈 3.2] 2개의 NCF 모델을 SageMaker Endpoint Triton 서빙

# 1. 환경 셋업

## 1.1. 기본 세팅
사용하는 패키지는 import 시점에 자동으로 다시 로딩합니다.

In [None]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('./src')

이전 노트북에서 훈련 후 저장한 아티팩트를 가져옵니다.

In [None]:
%store -r model_serving_folder
%store -r food_model_name
%store -r fashion_model_name
%store -r bucket

In [None]:
import sagemaker

# SageMaker session (passed to upload_tar_s3 below) and the execution role
# (passed to create_model below).
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()

import boto3, json, sagemaker, time
import numpy as np
# Low-level SageMaker client used for the create_model / create_endpoint_config /
# create_endpoint / describe_endpoint calls later in the notebook.
sm_client = boto3.client(service_name="sagemaker")

### 변수 설정

In [None]:
# Common prefix for resource names and for the S3 location of the model archives.
prefix = "triton-ncf"

# UTC timestamp (second resolution) appended to every resource name.
ts = time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())
# endpoint variables
sm_model_name = f"{prefix}-mdl-{ts}"
endpoint_config_name = f"{prefix}-epc-{ts}"
endpoint_name = f"{prefix}-ep-{ts}"
# S3 prefix that is handed to the container definition as ModelDataUrl.
model_data_url = f"s3://{bucket}/{prefix}/"
# NOTE(review): instance_type is not referenced anywhere in the visible part of
# this notebook (the endpoint config hard-codes its own instance type) — confirm
# whether it is still needed.
instance_type = "local_gpu"

In [None]:
# Show the generated resource names.
print("sm_model_name: \n", sm_model_name)
print("endpoint_config_name: \n", endpoint_config_name)
print("endpoint_name: \n", endpoint_name)

## Triton Docker Image 결정

In [None]:
from triton_util import account_id_map

# Region of the default boto3 session; it selects both the registry account id
# and the DNS suffix of the image URI.
region = boto3.Session().region_name

# Regions whose name starts with "cn-" use a different domain suffix.
base = "amazonaws.com"
if region.startswith("cn-"):
    base = "amazonaws.com.cn"

# Triton server image (tag 22.07-py3) hosted in the per-region registry account.
mme_triton_image_uri = (
    f"{account_id_map[region]}.dkr.ecr.{region}.{base}/sagemaker-tritonserver:22.07-py3"
)
print("mme_triton_image_uri: \n", mme_triton_image_uri)

# 2. 모델 패키징 (model.tar.gz) 및 S3 업로딩



In [None]:
import os
from triton_util import tar_artifact, upload_tar_s3

## 2.1. Food ncf model

In [None]:
# Package the food model and upload the archive.
# NOTE(review): presumably tar_artifact builds a tar.gz from model_serving_folder
# and upload_tar_s3 returns the S3 URI under `prefix` — verify in triton_util.
food_model_tar_file = tar_artifact(model_serving_folder, food_model_name)    
print("food_model_tar_file: ", food_model_tar_file)
food_model_uri_pt = upload_tar_s3(sagemaker_session, food_model_tar_file, prefix)
print("food_model_uri_pt: ", food_model_uri_pt)

## 2.2. Fashion ncf model

In [None]:
# Package the fashion model and upload the archive, using the same helpers as
# for the food model.
# NOTE(review): presumably tar_artifact builds a tar.gz from model_serving_folder
# and upload_tar_s3 returns the S3 URI under `prefix` — verify in triton_util.
fashion_model_tar_file = tar_artifact(model_serving_folder, fashion_model_name)    
print("fashion_model_tar_file: ", fashion_model_tar_file)
fashion_model_uri_pt = upload_tar_s3(sagemaker_session, fashion_model_tar_file, prefix)
print("fashion_model_uri_pt: ", fashion_model_uri_pt)

# 3. 클라우드 배포

## 3.1. 변수 및 컨테이너 설정

In [None]:
# Names used for the cloud deployment. They are built from the same prefix and
# ts as the earlier sm_model_name / endpoint_config_name / endpoint_name, so they
# evaluate to the same strings unless prefix or ts was reassigned in between.
sm_model_name = f"{prefix}-mdl-{ts}"
real_endpoint_config_name = f"{prefix}-epc-{ts}"
real_endpoint_name = f"{prefix}-ep-{ts}"

In [None]:
# Container definition passed to create_model as PrimaryContainer: the Triton
# image, the S3 prefix holding the model archives, multi-model mode, and the
# Triton log-level environment variables.
container = dict(
    Image=mme_triton_image_uri,
    ModelDataUrl=model_data_url,
    Mode="MultiModel",
    Environment=dict(
        SAGEMAKER_TRITON_LOG_VERBOSE="3",
        SAGEMAKER_TRITON_LOG_INFO="1",
        SAGEMAKER_TRITON_LOG_WARNING="1",
        SAGEMAKER_TRITON_LOG_ERROR="1",
    ),
)


In [None]:
# Show the container definition and the model name before creating the model.
print("container: ", container)
print("sm_model_name: ", sm_model_name)

## 3.2. 세이지메이커 모델, 엔드포인트 컨피그, 엔드포인트 생성

In [None]:
# Register the SageMaker model from the container definition and execution role.
create_model_response = sm_client.create_model(
    ModelName=sm_model_name, ExecutionRoleArn=role, PrimaryContainer=container
)

print("Model Arn: " + create_model_response["ModelArn"])

In [None]:
# Single production variant: one ml.g4dn.4xlarge instance serving the model
# registered above, receiving all traffic.
production_variant = {
    "VariantName": "AllTraffic",
    "ModelName": sm_model_name,
    "InstanceType": "ml.g4dn.4xlarge",
    "InitialInstanceCount": 1,
    "InitialVariantWeight": 1,
}

create_endpoint_config_response = sm_client.create_endpoint_config(
    EndpointConfigName=real_endpoint_config_name,
    ProductionVariants=[production_variant],
)

print("Endpoint Config Arn: " + create_endpoint_config_response["EndpointConfigArn"])

In [None]:
# Start creating the endpoint from the endpoint config; the following cell polls
# describe_endpoint until the status leaves "Creating".
create_endpoint_response = sm_client.create_endpoint(
    EndpointName=real_endpoint_name, EndpointConfigName= real_endpoint_config_name
)

print("Endpoint Arn: " + create_endpoint_response["EndpointArn"])

In [None]:
%%time 

# Poll the endpoint once a minute until it leaves the "Creating" state.
resp = sm_client.describe_endpoint(EndpointName=real_endpoint_name)
status = resp["EndpointStatus"]
print("Status: " + status)

while status == "Creating":
    time.sleep(60)
    resp = sm_client.describe_endpoint(EndpointName=real_endpoint_name)
    status = resp["EndpointStatus"]
    print("Status: " + status)

print("Arn: " + resp["EndpointArn"])
print("Status: " + status)

# The loop also exits on a failed creation; stop here with the reported reason
# instead of letting the following cells fail against an unusable endpoint.
if status != "InService":
    failure_reason = resp.get("FailureReason", "no failure reason reported")
    raise RuntimeError(
        f"Endpoint {real_endpoint_name} ended in status {status}: {failure_reason}"
    )

## 3.3. GPU 다중 모델 엔드포인트에 대한 자동 확장 정책 설정

Amazon SageMaker 다중 모델 엔드포인트는 호스팅된 모델에 대해 자동 조정(Auto Scaling)을 지원합니다. Auto Scaling은 워크로드의 변화에 따라 모델에 대해 프로비저닝된 인스턴스 수를 동적으로 조정합니다. 워크로드가 증가하면 Auto Scaling이 더 많은 인스턴스를 온라인 상태로 만듭니다. 워크로드가 감소하면 Auto Scaling이 불필요한 인스턴스를 제거하므로 사용하지 않는 프로비저닝된 인스턴스에 대해 비용을 지불하지 않아도 됩니다.

아래 조정 정책에서 TargetTrackingScalingPolicyConfiguration 구성의 사용자 지정 지표 GPUUtilization을 사용하고 해당 지표의 대상 값에 대해 TargetValue를 60.0으로 설정합니다. 이 자동 확장 정책은 GPU 사용률이 60% 이상일 때 MaxCapacity까지 추가 인스턴스를 프로비저닝합니다.

In [None]:
# Target-tracking auto scaling of the endpoint variant based on GPU utilization
# (the GPUUtilization metric, not GPU memory utilization).
auto_scaling_client = boto3.client("application-autoscaling")

# Application Auto Scaling references the variant in this format:
# "endpoint/<endpoint name>/variant/<variant name>". Use real_endpoint_name, the
# name the endpoint was created with.
resource_id = "endpoint/" + real_endpoint_name + "/variant/" + "AllTraffic"
response = auto_scaling_client.register_scalable_target(
    ServiceNamespace="sagemaker",
    ResourceId=resource_id,
    ScalableDimension="sagemaker:variant:DesiredInstanceCount",
    MinCapacity=1,
    MaxCapacity=5,
)


# Scaling policy tracking the GPUUtilization metric of the variant.
response = auto_scaling_client.put_scaling_policy(
    PolicyName="GPUUtil-ScalingPolicy",
    ServiceNamespace="sagemaker",
    ResourceId=resource_id,
    ScalableDimension="sagemaker:variant:DesiredInstanceCount",  # SageMaker supports only Instance Count
    PolicyType="TargetTrackingScaling",  # 'StepScaling'|'TargetTrackingScaling'
    TargetTrackingScalingPolicyConfiguration={
        # Scale out when GPU utilization hits GPUUtilization target value.
        "TargetValue": 60.0,
        "CustomizedMetricSpecification": {
            "MetricName": "GPUUtilization",
            "Namespace": "/aws/sagemaker/Endpoints",
            "Dimensions": [
                {"Name": "EndpointName", "Value": real_endpoint_name},
                {"Name": "VariantName", "Value": "AllTraffic"},
            ],
            "Statistic": "Average",  # Possible - 'Statistic': 'Average'|'Minimum'|'Maximum'|'SampleCount'|'Sum'
            "Unit": "Percent",
        },
        # Cooldown periods in seconds between scaling activities.
        "ScaleInCooldown": 600,
        "ScaleOutCooldown": 200,
    },
)

# 4. SageMaker Endpoint 에 추론

## 4.1. 샘플 입력 생성

In [None]:
def create_sample_payload():
    """Build a sample inference request holding one user tensor and one item tensor.

    Returns:
        dict: ``{"inputs": [...]}`` with two INT32 entries of shape [1, 100].
        "INPUT__0" (user) is all zeros; "INPUT__1" (item) holds random integers
        drawn from [1, 1000).
    """
    tensor_shape = (1, 100)

    # user
    user_ids = np.zeros(tensor_shape).astype(np.int32)
    # item
    item_ids = np.random.randint(low=1, high=1000, size=tensor_shape).astype(np.int32)

    def as_input(name, values):
        # One request entry: tensor name, shape, datatype and nested-list data.
        return {
            "name": name,
            "shape": list(tensor_shape),
            "datatype": "INT32",
            "data": values.tolist(),
        }

    return {
        "inputs": [
            as_input("INPUT__0", user_ids),
            as_input("INPUT__1", item_ids),
        ]
    }

# Build one sample request; the same payload is reused for both model invocations below.
payload = create_sample_payload()
print("payload: ", payload)

## 4.2. NCF Food 모델에 추론

In [None]:
def multiple_model_invoke_endpoint(client, endpoint_name, payload, TargetModel):
    """Invoke one model of a multi-model endpoint and return the decoded response.

    Args:
        client: object exposing ``invoke_endpoint`` (a sagemaker-runtime client
            in this notebook).
        endpoint_name: name of the endpoint to call.
        payload: JSON-serializable request; sent as its ``json.dumps`` string.
        TargetModel: value forwarded as the ``TargetModel`` argument to select
            the model that serves the request.

    Returns:
        The response body, read, decoded as UTF-8 and parsed as JSON.
    """
    print("Model: ", TargetModel)

    request_body = json.dumps(payload)
    raw_response = client.invoke_endpoint(
        EndpointName=endpoint_name,
        ContentType="application/octet-stream",
        Body=request_body,
        TargetModel=TargetModel,
    )

    response_text = raw_response["Body"].read().decode("utf8")
    return json.loads(response_text)



In [None]:
# Show the two archive names; they are passed as TargetModel in the invocations below.
print(food_model_tar_file)
print(fashion_model_tar_file)

In [None]:
# Invoke the food model on the deployed endpoint and report the wall-clock latency.
# real_endpoint_name is the name the endpoint was created (and is later deleted) with.
runtime_client = boto3.Session().client('sagemaker-runtime')
start_time = time.time()
result = multiple_model_invoke_endpoint(runtime_client, real_endpoint_name, payload, food_model_tar_file)
print("--- %s seconds ---" % (time.time() - start_time))
print('result: ', result)

SageMaker Endpoint 의 로그를 CloudWatch 를 통해서 확인합니다.
Input, Output 의 입력 구조 및 어떠한 "모델"이 사용되었는지, 그리고 Memory 할당, 해제를 확인할 수 있습니다.
- 아래 메세지 "http_server.cc:1088] HTTP: unable to provide 'OUTPUT__0' in GPU, will use CPU" 는 에러가 아닙니다. 아래 내용 참조 하세요.
    - this log comes from the HTTP / gRPC server. It's not an error. Since the output is returned via HTTP/GRPC the buffer resides on CPU not GPU even though your model outputs may have been on GPU.
        - https://github.com/triton-inference-server/server/issues/2090

![cloud_watch_food_log.png](img/cloud_watch_food_log.png)

## 4.3. NCF Fashion 모델에 추론

In [None]:
import time
runtime_client = boto3.Session().client('sagemaker-runtime')

# Invoke the fashion model on the same endpoint and report the wall-clock latency.
# real_endpoint_name is the name the endpoint was created (and is later deleted) with.
start_time = time.time()
result = multiple_model_invoke_endpoint(runtime_client, real_endpoint_name, payload, fashion_model_tar_file)
print("--- %s seconds ---" % (time.time() - start_time))
print('result: ', result)

# 5. 엔드포인트 삭제

In [None]:
from inference_utils import delete_endpoint

# Delete the endpoint created in section 3 (real_endpoint_name).
# NOTE(review): whether delete_endpoint also removes the endpoint config and the
# model is not visible here — check inference_utils.
client = boto3.Session().client('sagemaker')
delete_endpoint(client, real_endpoint_name)

- https://github.com/triton-inference-server/server/issues/2090

this log comes from the HTTP / gRPC server. It's not an error. Since the output is returned via HTTP/GRPC the buffer resides on CPU not GPU even though your model outputs may have been on GPU.