# Llama3 8B 모델을 SageMaker를 통해 g5 인스턴스에 배포하기

## 실험 환경
- 이 노트북은 SageMaker Studio Code Editor 및 커널 base (Python 3.10.13) 에서 테스트 되었습니다.

---

# 0. 사전 진행 내용
- Llama3 모델을 사용하기 위해서는 아래의 웹페이지에 가서 본인의 계정으로 로그인 후에 "동의" 를 먼저 해야 합니다.
    - [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct)
- 또한 본인의 HF Key 를 얻기 위해서는, [User access tokens](https://huggingface.co/docs/hub/en/security-tokens) 를 참고하세요.

# 1. 환경 셋업

상위 폴더의 경로를 추가하여 해당 유틸리티, 이미지 폴더를 참조 합니다.

In [12]:
import sys, os

def add_python_path(module_path):
    """Append the absolute form of ``module_path`` to ``sys.path`` if absent.

    Prints whether the path was added or was already present, then prints
    the full ``sys.path``.

    Args:
        module_path: Directory path (relative or absolute) to make importable.
    """
    resolved = os.path.abspath(module_path)
    already_present = resolved in sys.path
    if already_present:
        print(f"python path: {resolved} already exists")
    else:
        sys.path.append(resolved)
        print(f"python path: {resolved} is added")
    print("sys.path: ", sys.path)

# Make the parent directory importable so packages located there can be imported.
module_path = ".."
add_python_path(module_path)

python path: /home/sagemaker-user/aws-ai-ml-workshop-kr/genai/aws-gen-ai-kr/40_inference is added
sys.path:  ['/home/sagemaker-user/aws-ai-ml-workshop-kr/genai/aws-gen-ai-kr/40_inference/30-Llama-3-Inference', '/opt/conda/lib/python310.zip', '/opt/conda/lib/python3.10', '/opt/conda/lib/python3.10/lib-dynload', '', '/opt/conda/lib/python3.10/site-packages', '/home/sagemaker-user/aws-ai-ml-workshop-kr/genai/aws-gen-ai-kr/40_inference']


In [1]:
# Toggle: switch install_needed to True to upgrade the sagemaker package in this kernel.
# install_needed = True
install_needed = False

if install_needed:
    # Lines starting with "!" are IPython shell escapes: upgrade sagemaker,
    # then list the installed sagemaker packages.
    ! pip install sagemaker --upgrade  --quiet
    ! pip list | grep -E "sagemaker"

In [2]:
import boto3
import sagemaker
from sagemaker import Model, image_uris, serializers, deserializers
import json

role = sagemaker.get_execution_role()  # execution role for the endpoint
sess = sagemaker.session.Session()  # sagemaker session for interacting with different AWS APIs
# Read the region through the public ``boto_region_name`` attribute instead of
# the private ``_region_name``, which is not part of the documented API.
region = sess.boto_region_name  # region name of the current SageMaker Studio environment
account_id = sess.account_id()  # account_id of the current SageMaker Studio environment

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


## [중요] Hugging Face HF_Key 를 환경변수에 저장 
- 아래에 본인의 Key 를 입력하고, 주석을 제거 후에 사용하세요.

    ```
    key_val = "<Type Your HF Key>"
    # set_hf_key_env_vars(hf_key_name, key_val)
    ```


In [5]:
from sagemaker.huggingface import HuggingFaceModel, get_huggingface_llm_image_uri

import os

def set_hf_key_env_vars(hf_key_name, key_val):
    """Store ``key_val`` in the process environment under ``hf_key_name``.

    Args:
        hf_key_name: Name of the environment variable to set.
        key_val: String value to store under that name.
    """
    os.environ.update({hf_key_name: key_val})

def get_hf_key_env_vars(hf_key_name):
    """Look up ``hf_key_name`` in the process environment.

    Args:
        hf_key_name: Name of the environment variable to read.

    Returns:
        The variable's value, or ``None`` when it is not set.
    """
    return os.environ.get(hf_key_name)

# Name of the environment variable used to hold the Hugging Face access token.
hf_key_name = "HF_KEY"
# Placeholder: replace with your own token, then uncomment the call below to store it.
key_val = "<Type Your HF Key>"
# set_hf_key_env_vars(hf_key_name, key_val)


# Read the token back from the environment; this is None when the variable is not set.
HF_key_value = get_hf_key_env_vars(hf_key_name)
# print("HF_key_value: ", HF_key_value)



# 2. HF 파라미터 설정



## 모델 설정

In [25]:
# Hugging Face Hub model id of the model to deploy (in "organization/model" form)
hf_model_id = 'meta-llama/Meta-Llama-3-8B-Instruct'


## 환경 변수 설정

In [6]:
# instance type
instance_type = "ml.g5.2xlarge"

# Number of GPUs on each ml.g5 instance size. Values are strings because
# num_gpu is used as a container environment variable value.
G5_GPU_COUNT = {
    "ml.g5.xlarge": "1",
    "ml.g5.2xlarge": "1",
    "ml.g5.4xlarge": "1",
    "ml.g5.8xlarge": "1",
    "ml.g5.16xlarge": "1",
    "ml.g5.12xlarge": "4",
    "ml.g5.24xlarge": "4",
    "ml.g5.48xlarge": "8",
}

# Set GPU_NUM; stays None for instance types that are not in the table.
num_gpu = G5_GPU_COUNT.get(instance_type)

In [7]:
# Hub Model configuration. https://huggingface.co/models
# These entries become the environment variables of the inference container.
hub = {
    'HF_MODEL_ID': hf_model_id,
    'SM_NUM_GPUS': num_gpu,
    'HUGGING_FACE_HUB_TOKEN': HF_key_value,
}

# Fail fast with a clear message: container environment values must be
# strings, so a None entry (token not set in the environment, or an instance
# type without a known GPU count) would otherwise only surface as an error
# when the model is deployed.
_missing_env = [name for name, value in hub.items() if value is None]
if _missing_env:
    raise ValueError(
        "Missing value(s) for container environment variable(s): "
        f"{', '.join(_missing_env)}. Make sure the Hugging Face token is set "
        "in the environment and the instance type has a known GPU count."
    )

## 추론 도커 이미지 설정

In [8]:
# Look up the inference container image URI for the "huggingface" backend, version 2.0.2.
image_uri = get_huggingface_llm_image_uri("huggingface",version="2.0.2")
# Bare expression so the notebook displays the resolved URI as the cell output.
image_uri

'763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.3.0-tgi2.0.2-gpu-py310-cu121-ubuntu22.04'

## SageMaker Model 의 하위 클래스인 HuggingFaceModel 생성

In [9]:
# create Hugging Face Model Class
# Combines the container image, the container environment variables (hub)
# and the execution role into one model definition that is deployed later.
huggingface_model = HuggingFaceModel(
	image_uri= image_uri,
	env=hub,
	role=role, 
)



# 3. SageMaker Endpoint 에 배포 

## endpoint_name 생성

In [10]:
from datetime import datetime

def create_ennpoint_name(model_id, instance_type):
    """Build a timestamped SageMaker endpoint name for a model/instance pair.

    The name has the form ``<model>-<instance>-<YYYY-MM-DD-HH-MM-SS>``. The
    model part is the segment after the first ``/`` of ``model_id`` (or the
    whole id when it contains no ``/``). Characters other than letters,
    digits and hyphens are replaced with hyphens, and the model part is
    shortened when needed so the result stays within the 63-character limit
    that SageMaker places on endpoint names.

    Args:
        model_id: Hugging Face model id, e.g. ``"org/model-name"``.
        instance_type: SageMaker instance type, e.g. ``"ml.g5.2xlarge"``.

    Returns:
        The endpoint name as a string.
    """
    import re  # local import so the function does not depend on other cells

    max_length = 63  # upper bound on the length of an endpoint name

    def _to_name_chars(text):
        # Endpoint names may only contain alphanumerics and hyphens.
        return re.sub(r"[^a-zA-Z0-9-]", "-", text)

    segments = model_id.split('/')
    # Keep the segment after the organization prefix; fall back to the whole
    # id instead of raising IndexError when there is no '/'.
    model_part = segments[1] if len(segments) > 1 else segments[0]
    model_part = _to_name_chars(model_part)

    instance_part = _to_name_chars(instance_type)
    timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    tail = f"{instance_part}-{timestamp}"

    # Reserve room for the tail and the joining hyphen, then trim the model
    # part; a leading hyphen is dropped because a name must start with an
    # alphanumeric character.
    budget = max(max_length - len(tail) - 1, 0)
    model_part = model_part[:budget].lstrip("-")

    return f"{model_part}-{tail}" if model_part else tail

# Build the timestamped endpoint name for this model and instance type, then show it.
endpoint_name = create_ennpoint_name(hf_model_id, instance_type)
print("endpoint_name: ", endpoint_name)

    

endpoint_name:  Meta-Llama-3-8B-Instruct-ml-g5-2xlarge-2024-05-19-11-40-54


## SageMaker Endpoint 배포

In [11]:
# deploy model to SageMaker Inference
# The returned object is kept in `predictor`.
predictor = huggingface_model.deploy(
	endpoint_name = endpoint_name,
	initial_instance_count=1,
	instance_type= instance_type,
	# NOTE(review): presumably the time in seconds allowed for the container to
	# pass its startup health check — confirm against the SDK documentation.
	container_startup_health_check_timeout=300,
  )

----------!

# 4. 추론

## pay_load 생성

In [21]:
from inference_utils.inference_util import ( print_ww, 
                                            pretty_print_json,
                                       )
                                       
def create_payload_llama_8b(prompt, param):
    """Assemble the request body for a text-generation call.

    Args:
        prompt: Prompt to send; it is rendered to text through an f-string.
        param: Generation parameters, passed through unchanged.

    Returns:
        A dict holding the prompt text under ``"inputs"`` and ``param``
        under ``"parameters"``.
    """
    return {
        "inputs": f"{prompt}",
        "parameters": param,
    }

# Example prompt and generation parameters used to build the request body.
prompt = "My name is Clara and I am"
param = {"do_sample": True, "max_new_tokens": 256}
pay_load = create_payload_llama_8b(prompt, param)



# Print the request body before it is sent.
print("## payload: ") 
pretty_print_json(pay_load)




## payload: 
{
    "inputs": "My name is Clara and I am",
    "parameters": {
        "do_sample": true,
        "max_new_tokens": 256
    }
}


## 추론

In [22]:
import sagemaker
# Get a predictor for your endpoint
# This rebinds `predictor` to an object looked up by endpoint name, with
# JSON serialization for requests and JSON deserialization for responses.
predictor = sagemaker.Predictor(
    endpoint_name= endpoint_name,
    serializer=sagemaker.serializers.JSONSerializer(),
    deserializer=sagemaker.deserializers.JSONDeserializer(),
)

In [24]:
import time

from termcolor import colored

# Measure the wall-clock duration of the blocking predict call.
s = time.perf_counter()

response = predictor.predict(pay_load)

elapsed_async = time.perf_counter() - s

print("## inference response: ")
print_ww(colored(response, "green"))
# Report the measured request latency.
print(f"## elapsed time: {elapsed_async:.2f} seconds")


## inference esponse: 
[32m[{'generated_text': 'My name is Clara and I am a language enthusiast. I have a passion for
languages and I believe that language learning is a powerful tool for self-discovery, cultural
understanding, and global connections.\nI am a native English speaker from the United States, but I
have also studied Spanish, French, German, and Italian, and I am currently working on improving my
Mandarin Chinese skills. I have had the opportunity to travel to several countries where I have
immersed myself in the local language and culture, and I have found that language learning has
opened doors to new experiences, new friends, and a deeper understanding of the world.\nI believe
that language learning should be a fun and engaging experience, and I strive to make my lessons
enjoyable and interactive. I use a variety of teaching methods and resources to help my students
achieve their language learning goals, including conversation practice, grammar exercises, reading
and wr

# 5. 엔드포인트 삭제

In [17]:
def delete_endpoint_model(endpoint_name,llm_model ):
    """Delete the endpoint, its endpoint configuration, and the model.

    Uses the module-level ``sess`` session object for the endpoint and
    endpoint-configuration deletions.

    Args:
        endpoint_name: Name passed to both ``sess.delete_endpoint`` and
            ``sess.delete_endpoint_config``, i.e. the endpoint configuration
            is expected to share the endpoint's name.
        llm_model: Object whose ``delete_model()`` method is called last.
    """
    sess.delete_endpoint(endpoint_name)
    sess.delete_endpoint_config(endpoint_name)
    llm_model.delete_model()

# delete_endpoint_model(endpoint_name,huggingface_model)

