# SageMaker Inference Component 모델 배포 및 추론

아래 모델 배포는 다음 인스턴스에서 테스트를 완료했습니다.
- ml.g5.xlarge (us-east-1 리전의 SageMaker On-Demand Endpoint 기준 시간당 $1.4084). 참조: [Pricing Link](https://aws.amazon.com/sagemaker/pricing/)
  

## 1. 환경 구성 

### 상위 폴더의 Python 경로 추가

In [1]:
%load_ext autoreload
%autoreload 2

import sys, os

def add_python_path(module_path):
    """Append the absolute form of *module_path* to ``sys.path`` if it is missing.

    Prints whether the path was added or was already present, and then
    prints the complete ``sys.path`` list.
    """
    resolved = os.path.abspath(module_path)
    already_present = resolved in sys.path
    if not already_present:
        sys.path.append(resolved)
    outcome = "already exists" if already_present else "is added"
    print(f"python path: {resolved} {outcome}")
    print("sys.path: ", sys.path)

# Directory two levels above this notebook's working directory. Adding it to
# sys.path is presumably what lets the later `from scripts.inference_util import ...`
# cells resolve the `scripts` package (assumes `scripts/` lives there).
module_path = "../.."
add_python_path(module_path)

python path: /home/ec2-user/SageMaker/aws-ai-ml-workshop-kr/genai/aws-gen-ai-kr/30_fine_tune/03-fine-tune-llama3 is added
sys.path:  ['/home/ec2-user/SageMaker/aws-ai-ml-workshop-kr/genai/aws-gen-ai-kr/30_fine_tune/03-fine-tune-llama3/notebook/02-naver-news-llama3-inference', '/home/ec2-user/SageMaker/.cs/conda/envs/llama3_puy310/lib/python310.zip', '/home/ec2-user/SageMaker/.cs/conda/envs/llama3_puy310/lib/python3.10', '/home/ec2-user/SageMaker/.cs/conda/envs/llama3_puy310/lib/python3.10/lib-dynload', '', '/home/ec2-user/SageMaker/.cs/conda/envs/llama3_puy310/lib/python3.10/site-packages', '/home/ec2-user/SageMaker/aws-ai-ml-workshop-kr/genai/aws-gen-ai-kr/30_fine_tune/03-fine-tune-llama3']


### 이전 노트북에서 훈련된 모델 경로 가져오기
- ../01-naver-news-fsdp-QLoRA/03-SageMaker-Training.ipynb 가 실행 되었으면, model_s3_path 로 세팅
- ../01-naver-news-fsdp-QLoRA/03-1-Option-SageMaker-Training.ipynb 가 실행 되었으면, optimized_model_s3_path 로 세팅

In [2]:
try:
    %store -r optimized_model_s3_path
    optimized_model_s3_path
    print("optimized_model_s3_path: \n", optimized_model_s3_path)
    model_s3_path = optimized_model_s3_path
except:
    try:
        %store -r model_s3_path
        model_s3_path
        print("model_s3_path: ", model_s3_path)
    except:
        print("optimized_model_s3_path and model_s3_path not found")
        model_s3_path = None

optimized_model_s3_path: 
 {'S3DataSource': {'S3Uri': 's3://sagemaker-us-east-1-057716757052/llama3-8b-naver-news-2024-07-29-07-13-1-2024-07-29-07-13-12-264/output/model/', 'S3DataType': 'S3Prefix', 'CompressionType': 'None'}}


## 2. 추론 이미지 가져오기



In [3]:
import sagemaker
import boto3
# A single SageMaker session is created here and reused by the later cells
# (the original created it twice; the second construction was redundant).
sess = sagemaker.Session()
# Execution role; passed to the endpoint config and the model definition below.
role = sagemaker.get_execution_role()

# boto3 clients exposed by the session: control plane and runtime.
sagemaker_client = sess.sagemaker_client
sagemaker_runtime_client = sess.sagemaker_runtime_client


print(f"sagemaker role arn: {role}")
print(f"sagemaker session region: {sess.boto_region_name}")


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/SageMaker/.xdg/config/sagemaker/config.yaml
sagemaker role arn: arn:aws:iam::057716757052:role/gen_ai_gsmoon
sagemaker session region: us-east-1


In [4]:
from sagemaker.huggingface import get_huggingface_llm_image_uri

# Resolve the Hugging Face LLM serving image URI for the given backend and version,
# using the notebook's SageMaker session.
llm_image = get_huggingface_llm_image_uri(
    backend="huggingface",
    session=sess,
    version="2.0.2",
)

# Show the resolved ECR image URI.
print(f"llm image uri: {llm_image}")

llm image uri: 763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.3.0-tgi2.0.2-gpu-py310-cu121-ubuntu22.04


## 3. SageMaker Endpoint 생성
- EndpointConfiguration 생성
- Endpoint 생성

### 모델을 배포할 인스턴스 정의

In [5]:
# Instance type that will host the endpoint.
# Other options kept by the author: "ml.g5.12xlarge", "ml.g5.4xlarge"
instance_type = "ml.g5.xlarge"

# GPU count per known instance type; any type not listed falls back to 1 GPU.
num_GPUSs = {
    "ml.p4d.24xlarge": 8,
    "ml.g5.12xlarge": 4,
    "ml.g5.4xlarge": 1,
}.get(instance_type, 1)

print(f"{instance_type} and # of GPU {num_GPUSs} is set")

ml.g5.xlarge and # of GPU 1 is set


### Endpoint config name 및 설정 값 기술

In [6]:
from datetime import datetime
# Timestamp (naive local time, one-second resolution) used as a suffix so that
# the resource names created by this notebook are unique per run.
currentDateAndTime = datetime.now()
currentTime = f"{currentDateAndTime:%Y-%m-%d-%H-%M-%S}"
print("The current time is", currentTime)

# Endpoint config name, made unique by the timestamp suffix.
endpoint_config_name = "llama3-endpoint-config-" + currentTime
print(f"Endpoint config name: {endpoint_config_name}")


# Production variant name and the hosting timeouts (both in seconds).
variant_name = "AllTraffic"
container_startup_health_check_timeout_in_seconds = 600
model_data_download_timeout_in_seconds = 600

# Instance-count bounds used for managed instance scaling.
initial_instance_count, max_instance_count = 1, 2
print(f"Initial instance count: {initial_instance_count}")
print(f"Max instance count: {max_instance_count}")


The current time is 2024-07-29-12-25-04
Endpoint config name: llama3-endpoint-config-2024-07-29-12-25-04
Initial instance count: 1
Max instance count: 2


### SageMaker Endpoint Configuration 만들기

In [7]:
# Create the endpoint configuration. The production variant names no model:
# the model is attached to the endpoint later through an inference component.
epc_response = sagemaker_client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ExecutionRoleArn=role,
    ProductionVariants=[
        {
            "VariantName": variant_name,
            "InstanceType": instance_type,
            # Use the same variable as MinInstanceCount below (instead of a
            # hard-coded 1) so the two values cannot drift apart when the
            # configured instance count is changed.
            "InitialInstanceCount": initial_instance_count,
            "ModelDataDownloadTimeoutInSeconds": model_data_download_timeout_in_seconds,
            "ContainerStartupHealthCheckTimeoutInSeconds": container_startup_health_check_timeout_in_seconds,
            # Managed scaling of the instance count between the min and max bounds.
            "ManagedInstanceScaling": {
                "Status": "ENABLED",
                "MinInstanceCount": initial_instance_count,
                "MaxInstanceCount": max_instance_count,
            },
            "RoutingConfig": {"RoutingStrategy": "LEAST_OUTSTANDING_REQUESTS"},
        }
    ],
)

In [8]:
from scripts.inference_util import print_json
# Display the create_endpoint_config response with the project's print_json helper.
# print(epc_response)
print_json(epc_response)

{
    "EndpointConfigArn": "arn:aws:sagemaker:us-east-1:057716757052:endpoint-config/llama3-endpoint-config-2024-07-29-12-25-04",
    "ResponseMetadata": {
        "RequestId": "9b7c30c6-b5be-4f6f-b64d-ecd078e64ae7",
        "HTTPStatusCode": 200,
        "HTTPHeaders": {
            "x-amzn-requestid": "9b7c30c6-b5be-4f6f-b64d-ecd078e64ae7",
            "content-type": "application/x-amz-json-1.1",
            "content-length": "123",
            "date": "Mon, 29 Jul 2024 12:25:05 GMT"
        },
        "RetryAttempts": 0
    }
}


  from .autonotebook import tqdm as notebook_tqdm


### Endpoint 생성

In [9]:
%%time
# `%%time` is a cell magic and has to remain the first line of the cell;
# it reports how long the whole cell took.

# Set a unique endpoint name (same timestamp suffix as the endpoint config)
endpoint_name = f"llama3-endpoint-{currentTime}"
print(f"endpoint_name: {endpoint_name}")

# Request creation of the endpoint from the endpoint config created earlier.
ep_response = sagemaker_client.create_endpoint(
    EndpointName=endpoint_name,
    EndpointConfigName=endpoint_config_name,
)
# print(ep_response)
print(f"Creating endpoint: {endpoint_name}")
# Wait for the endpoint. As the last expression of the cell, its return value
# (the endpoint description) is what the notebook shows as the cell output.
sess.wait_for_endpoint(endpoint_name)

endpoint_name: llama3-endpoint-2024-07-29-12-25-04
Creating endpoint: llama3-endpoint-2024-07-29-12-25-04
-----!CPU times: user 18.3 ms, sys: 5.41 ms, total: 23.7 ms
Wall time: 3min


{'EndpointName': 'llama3-endpoint-2024-07-29-12-25-04',
 'EndpointArn': 'arn:aws:sagemaker:us-east-1:057716757052:endpoint/llama3-endpoint-2024-07-29-12-25-04',
 'EndpointConfigName': 'llama3-endpoint-config-2024-07-29-12-25-04',
 'ProductionVariants': [{'VariantName': 'AllTraffic',
   'CurrentInstanceCount': 1,
   'DesiredInstanceCount': 1,
   'ManagedInstanceScaling': {'Status': 'ENABLED',
    'MinInstanceCount': 1,
    'MaxInstanceCount': 2},
   'RoutingConfig': {'RoutingStrategy': 'LEAST_OUTSTANDING_REQUESTS'}}],
 'EndpointStatus': 'InService',
 'CreationTime': datetime.datetime(2024, 7, 29, 12, 25, 6, 162000, tzinfo=tzlocal()),
 'LastModifiedTime': datetime.datetime(2024, 7, 29, 12, 27, 43, 138000, tzinfo=tzlocal()),
 'ResponseMetadata': {'RequestId': 'bf989fe2-0589-4944-b6e6-e1913c59cb41',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'bf989fe2-0589-4944-b6e6-e1913c59cb41',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '574',
   'date': 'M

## 4. SageMaker Model 생성

### SageMaker Model 정의
- 추론 이미지 기술 
- 모델 아티팩트 경로 기술 

In [10]:
from huggingface_hub import HfFolder
from sagemaker.huggingface import HuggingFaceModel

# Model-level settings

# 600 seconds = 10 minutes. NOTE(review): this value is not referenced in this cell.
health_check_timeout = 600
model_name = "llama3-model-" + currentTime

import time

# Environment variables handed to the serving container.
config = {
    "HF_MODEL_ID": "/opt/ml/model",  # Path to the model in the container
    "SM_NUM_GPUS": str(num_GPUSs),  # Number of GPU used per replica
    "MAX_INPUT_LENGTH": "2048",  # Max length of input text
    "MAX_TOTAL_TOKENS": "4096",  # Max length of the generation (including input text)
    # Limits the number of tokens that can be processed in parallel during the generation
    # (alternative value left by the author: "16182")
    "MAX_BATCH_PREFILL_TOKENS": "4096",
    "MESSAGES_API_ENABLED": "true",  # Enable the OpenAI Messages API
}

# Build the HuggingFaceModel around the serving image URI resolved earlier.
llm_model = HuggingFaceModel(
    role=role,
    name=model_name,
    model_data=model_s3_path,  # S3 location of the model; the model is not compressed
    image_uri=llm_image,
    env=config,
)

In [11]:
# Create the SageMaker Model from the HuggingFaceModel definition; the inference
# component created afterwards refers to it by `model_name`.
llm_model.create()

### inference component 생성

In [12]:
# Attach the model to the endpoint as an Amazon SageMaker inference component.
inference_component_name_llama3b = "llama3b-IC-" + currentTime
print("inference_component_name_llama3b: ", inference_component_name_llama3b)
variant_name = "AllTraffic"

# Resources requested for the inference component: accelerators, CPU cores, memory.
compute_requirements = {
    "NumberOfAcceleratorDevicesRequired": num_GPUSs,
    "NumberOfCpuCoresRequired": 1,
    "MinMemoryRequiredInMb": 1024,
}

ic_response = sagemaker_client.create_inference_component(
    InferenceComponentName=inference_component_name_llama3b,
    EndpointName=endpoint_name,
    VariantName=variant_name,
    Specification={
        "ModelName": model_name,
        "ComputeResourceRequirements": compute_requirements,
    },
    RuntimeConfig={"CopyCount": 1},  # a single copy of the model
)

inference_component_name_llama3b:  llama3b-IC-2024-07-29-12-25-04


In [13]:
import time

# Poll every 30 seconds until the inference component is InService.
# A "Failed" status raises immediately with the reason reported by SageMaker,
# instead of ending the loop silently and letting a later cell fail with a
# less helpful error.
print(f"InferenceComponent: {inference_component_name_llama3b}")
while True:
    desc = sagemaker_client.describe_inference_component(
        InferenceComponentName=inference_component_name_llama3b
    )
    status = desc["InferenceComponentStatus"]
    # flush=True keeps the progress lines appearing as they are produced.
    print(status, flush=True)
    if status == "InService":
        break
    if status == "Failed":
        raise RuntimeError(
            f"Inference component {inference_component_name_llama3b} failed: "
            f"{desc.get('FailureReason', 'no FailureReason returned')}"
        )
    time.sleep(30)

InferenceComponent: llama3b-IC-2024-07-29-12-25-04
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
InService


## 5. 추론: 한국어 요약 

### Helper 함수

In [14]:
from scripts.inference_util import (
    print_json,
    create_messages_parameters,
)

In [15]:
# Restore `full_test_data_json` from the IPython store (presumably saved by an
# earlier notebook); it is passed below as the sample dataset JSON file.
%store -r full_test_data_json

In [16]:

from scripts.inference_util import (
    get_message_from_dataset,
    extract_system_user_prompt,
    # run_inference,
    generate_response_IC
)

In [17]:
# Build the chat messages for one sample of the test dataset
# (rand_idx is presumably the index of a randomly chosen sample).
messages, full_test_dataset, rand_idx = get_message_from_dataset(
    sample_dataset_json_file=full_test_data_json,
    verbose=False,
)
# Run inference for that sample against the endpoint's inference component.
generate_response_IC(
    messages,
    endpoint_name,
    full_test_dataset,
    rand_idx,
    inference_component_name_llama3b,
)

elapsed time: 3.418 second
**Query:**
{'messages': [{'role': 'system', 'content': 'You are an AI assistant specialized in news articles.Your role is to provide accurate summaries and insights in Korean. Please analyze the given text and provide concise, informative summaries that highlight the key goals and findings.'}, {'role': 'user', 'content': 'Please summarize the goals for journalist in this text:\n\n산업현장에서 세종텔레콤의 스마트 안전 플랫폼 솔루션 을 활용하고 있다. 세종텔레콤 제공 세종텔레콤은 태영건설에 중대재해처벌법 대응에 최적화된 스마트 안전 플랫폼 솔루션 납품 계약을 체결했다고 4일 밝혔다. 우선 태영건설 전국 산업현장에 스마트 안전 솔루션 500대 납품 계약을 체결했고 추가 구축을 협의 중이다. 지난 1월 본격 시행된 중대재해처벌법은 산업현장에서 인명사고 발생 시 경영진이나 법인에게 책임을 물을 수 있도록 규정한 법이다. 해당 법은 안전 보건 관련 관리상의 조치 구축을 의무화하고 있으나 기업의 한정적 자원과 부족한 인력 문제로 어려움을 겪고 있다. 세종텔레콤의 스마트 안전 플랫폼 솔루션은 출입관리부터 CCTV 가스탐지 각종 센서 등을 하나로 통합해 현장을 종합 관리할 수 있다. LBS 위치기반 IoT 사물인터넷 등 스마트 기술을 융합했다. 안전 관리 담당자는 각 현장마다 설치된 카메라 및 CCTV 개소별 센서와 통신 인프라를 통해 현장 정보를 실시간으로 확인하고 비상 상황 시에는 전체 현장 또는 해당 구역 상황실 시스템이나 모바일로 근로자에게 안전 조치사항을 지시할 수 있다. 이와 함께 타워크레인에 설치한 360도 카메라를 

## 6. 리소스 삭제
- 인퍼런스 컴포넌트 삭제
- SageMaker 모델 삭제
- 엔드포인트 삭제


In [18]:
from sagemaker.predictor import Predictor

# Predictor bound to the endpoint; the clean-up cells below call its delete methods.
predictor = Predictor(endpoint_name, sagemaker_session=sess)

In [19]:
# Best-effort clean-up: an error is printed rather than raised so that the
# remaining clean-up cells can still be run.
try:
    # The rich-console markup "[b magenta]" was removed from the message:
    # the built-in print() does not interpret it and emitted it literally.
    print(f"Deleting inference components: {inference_component_name_llama3b} ✅")
    # Delete inference component
    sagemaker_client.delete_inference_component(
        InferenceComponentName=inference_component_name_llama3b
    )
except Exception as e:
    print(f"{e}")


Deleting inference components: [b magenta]llama3b-IC-2024-07-29-12-25-04 ✅


In [20]:
# Best-effort clean-up of the SageMaker Model; an error is printed, not raised.
try:
    print(f"Deleting model: {model_name}")
    # Delete the model directly by name. The endpoint config created in this
    # notebook carries no ModelName (the model is attached through the
    # inference component), so this does not rely on the predictor being able
    # to discover the model from the endpoint.
    sagemaker_client.delete_model(ModelName=model_name)
except Exception as e:
    print(f"{e}")


Deleting model: llama3-model-2024-07-29-12-25-04


In [21]:

# Best-effort clean-up of the endpoint; an error is printed, not raised.
try:
    # The rich-console markup "[b magenta]" was removed from the message:
    # the built-in print() does not interpret it and emitted it literally.
    print(f"Deleting endpoint: {predictor.endpoint_name} ✅")
    predictor.delete_endpoint()
except Exception as e:
    print(f"{e}")

print("---" * 10)
print("Done")

Deleting endpoint: [b magenta]llama3-endpoint-2024-07-29-12-25-04 ✅
------------------------------
Done
