# Llama 2 7B 모델을 SageMaker를 통해 ml.g5.24xlarge 인스턴스에 배포하기

이 노트북은 Hugging Face에서 Llama 2 7B 모델을 가져와 SageMaker에서 ml.g5.24xlarge 인스턴스에 배포하는 방법을 보여줍니다.

## Step 1: Upgrade the SageMaker SDK and import dependencies

In [1]:
%pip install sagemaker --upgrade  --quiet

Note: you may need to restart the kernel to use updated packages.


In [2]:
import boto3
import sagemaker
from sagemaker import Model, image_uris, serializers, deserializers
import json

# Execution role for the endpoint.
role = sagemaker.get_execution_role()

# SageMaker session for interacting with the different AWS APIs.
sess = sagemaker.session.Session()

# Region of the current session. Read it through the public `boto_region_name`
# property instead of the private `_region_name` attribute, which is an
# implementation detail of the SDK and may change between releases.
region = sess.boto_region_name

# AWS account ID associated with the current session.
account_id = sess.account_id()

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


## Step 2: Retrieve the Hugging Face LLM inference container image

In [3]:
from sagemaker.huggingface import get_huggingface_llm_image_uri

# Look up the ECR URI of the Hugging Face LLM inference container
# (backend "huggingface", pinned to version 1.4.2).
llm_image = get_huggingface_llm_image_uri("huggingface", version="1.4.2")

# Show the resolved image URI so the exact container being deployed is visible.
print(f"llm image uri: {llm_image}")

llm image uri: 763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi1.4.2-gpu-py310-cu121-ubuntu22.04


## Step 3: Define the model and endpoint configuration

In [4]:
import json
from sagemaker.huggingface import HuggingFaceModel

# Deployment settings reused by the later deploy() call.
instance_type = "ml.g5.24xlarge"  # instance type hosting the endpoint
number_of_gpu = 4  # number of GPUs used per replica
health_check_timeout = 120  # passed to deploy() as the container startup health check timeout

# Environment variables handed to the inference container.
config = dict(
    # Model ID from hf.co/models.
    HF_MODEL_ID="meta-llama/Llama-2-7b-hf",
    # Environment values are strings, so the GPU count is serialized first.
    SM_NUM_GPUS=json.dumps(number_of_gpu),
    # Read access token of your Hugging Face profile:
    # https://huggingface.co/settings/tokens
    HUGGING_FACE_HUB_TOKEN="<자신의 HuggingFace read access token 입력>",
)

# Model object that ties together the container image, its environment and the execution role.
llm_model = HuggingFaceModel(image_uri=llm_image, env=config, role=role)

## Step 4: Create a SageMaker endpoint and deploy the model to it

In [5]:
from sagemaker.utils import name_from_base

# Short model name: the part of the model ID after the "org/" prefix,
# truncated at the first "." (if any).
model_short_name = config["HF_MODEL_ID"].split("/")[1].split(".")[0]

# name_from_base() builds the final endpoint name from this base string.
endpoint_name = name_from_base(f"{model_short_name}-imweb-poc")

# Display the generated name as the cell output.
endpoint_name

'Llama-2-7b-hf-imweb-poc-2024-04-22-09-46-49-774'

In [6]:
%%time
# https://sagemaker.readthedocs.io/en/stable/api/inference/model.html#sagemaker.model.Model.deploy

llm = llm_model.deploy(
    endpoint_name=endpoint_name,
    initial_instance_count=1,
    instance_type=instance_type,
    container_startup_health_check_timeout=health_check_timeout,
)

--------!CPU times: user 256 ms, sys: 3.93 ms, total: 260 ms
Wall time: 4min 32s


## Step 5: Test the inference

In [7]:
# Attach a predictor to the deployed endpoint by name. Requests are serialized
# to JSON and responses are parsed from JSON.
predictor = sagemaker.Predictor(
    endpoint_name=endpoint_name,
    serializer=serializers.JSONSerializer(),
    deserializer=deserializers.JSONDeserializer(),
)

In [8]:
# Request body: the prompt plus generation parameters
# (sampling enabled, at most 256 newly generated tokens).
payload = {
    "inputs": "The future of Gen-AI is",
    "parameters": {"do_sample": True, "max_new_tokens": 256},
}

# Send the request to the endpoint and keep the parsed JSON reply.
response = predictor.predict(payload)

# Display the reply as the cell output.
response

[{'generated_text': 'The future of Gen-AI is in your hands\nPrevious PostPrevious Secrets to becoming a great, kick-ass programmer\nNext PostNext GUI style components in JES Steven'}]

## Clean up the environment

In [None]:
# Tear down everything created by this notebook: the endpoint, its endpoint
# configuration, and the model.
# The endpoint configuration is deleted under the same name as the endpoint.
sess.delete_endpoint(endpoint_name)
sess.delete_endpoint_config(endpoint_name)
llm_model.delete_model()