# SageMaker Endpoint 추론 및 간단한 벤치마크

### 선수 사항
- 이 노트북은 [20-Fine-Tune-Llama-7B-INF2](../../20-Fine-Tune-Llama-7B-INF2/README.md) 에서 Llama-7B 모델을 파인 튜닝한 뒤, SageMaker Endpoint 를 배포한 이후에 실행한 결과입니다.
- 다른 Llama 2 계열의 SageMaker Endpoint 가 배포된 이후에 실행 하셔도 됩니다. 


실험 환경:  노트북은 SageMaker Studio Code Editor 에서 테스트 되었습니다.
- 사용 커널: base(Python 3.10.13)

---

# 0. 필요 패키지 설치

In [1]:
# Set to False to skip the installation step below.
install_needed = True
if install_needed:
    # Install the pinned transformers release quietly (-q), then list the
    # installed version so it shows up in the cell output.
    ! pip install -q transformers==4.31.0
    ! pip list | grep transformers

transformers                          4.31.0


# 1. 환경 설정

In [2]:
# Load the IPython autoreload extension and switch it to mode 2, so that
# imported modules are reloaded before each cell is executed.
%load_ext autoreload
%autoreload 2

In [3]:
import sys, os

def add_python_path(module_path):
    """Make sure the absolute form of ``module_path`` is on ``sys.path``.

    The path is appended only when it is not already listed. A message
    reporting which of the two cases applied is printed, followed by the
    complete current ``sys.path``.
    """
    resolved = os.path.abspath(module_path)
    if resolved in sys.path:
        print(f"python path: {resolved} already exists")
    else:
        sys.path.append(resolved)
        print(f"python path: {resolved} is added")
    print("sys.path: ", sys.path)

# Put the parent directory on sys.path so that packages located one level up
# can be imported (presumably where benchmark_utils lives - verify against the repo layout).
module_path = ".."
add_python_path(module_path)


python path: /home/sagemaker-user/aws-ai-ml-workshop-kr/genai/aws-gen-ai-kr/40_inference/90_benchmark is added
sys.path:  ['/home/sagemaker-user/aws-ai-ml-workshop-kr/genai/aws-gen-ai-kr/40_inference/90_benchmark/10-Getting-Started', '/opt/conda/lib/python310.zip', '/opt/conda/lib/python3.10', '/opt/conda/lib/python3.10/lib-dynload', '', '/opt/conda/lib/python3.10/site-packages', '/home/sagemaker-user/aws-ai-ml-workshop-kr/genai/aws-gen-ai-kr/40_inference/90_benchmark']


In [4]:
from benchmark_utils.benchmark import (print_ww, 
                                       pretty_print_json,
                                       invoke_endpoint_sagemaker
                                       )

# 2. SageMaker Endpoint 설정 & pay_load 생성
### [중요] 아래 endpoint_name 을 입력하세요.
그림의 예시처럼, SageMaker endpoint 의 name 을 복사해서 아래에 붙여넣기 하세요.
- ![sagemaker_ep_console.png](img/sagemaker_ep_console.png)


In [13]:
def create_payload_llama_7b_fine_tuned_model(prompt, param):
    """Build the request payload for the fine-tuned Llama-7B endpoint.

    The user prompt is wrapped in ``[INST]`` / ``<<SYS>>`` markers together
    with the fixed system message "As a data scientist".

    Args:
        prompt: user prompt text, e.g. "What is a machine learning?".
        param: generation parameters, stored unchanged under "parameters".

    Returns:
        A dict with the wrapped prompt under "inputs" and ``param`` under
        "parameters".
    """
    wrapped_prompt = f"<s>[INST] <<SYS>>\nAs a data scientist\n<</SYS>>\n{prompt} [/INST]"
    return {"inputs": wrapped_prompt, "parameters": param}

def create_payload_mistral_7B(prompt, param):
    """Build the request payload for the Mistral-7B endpoint.

    The prompt is sent as-is (no template is applied).

    Args:
        prompt: prompt text, stored unchanged under "inputs".
        param: generation parameters, stored unchanged under "parameters".

    Returns:
        A dict with the keys "inputs" and "parameters".
    """
    return dict(inputs=prompt, parameters=param)

# Choose the model served by the endpoint. Only the substring matters below:
# an id containing "llama" or "mistral" selects the matching payload builder.
# model_id = "llama_7b_fine_tuned_model"
model_id = "mistralai/Mistral-7B-v0.1"

# Name of the deployed SageMaker endpoint; replace it with your own.
endpoint_name = '<Type Your SageMaker Endpoint Name>' 
# endpoint_name = 'lmi-model-2024-04-13-14-53-53-788' # Llama
endpoint_name = 'Mistral-7B-v0-imweb-poc-2024-04-23-13-36-19-527' # Mistral


if "llama" in model_id:
    # prompt = "What happened to the dinosaurs? "
    prompt = "The future of Gen-AI is"
    # do_sample has to be a real boolean: the string "False" is non-empty and
    # therefore truthy, so it would not switch sampling off.
    param = {"max_new_tokens": 512, "temperature": 0.1, "do_sample": False, "stop": ["</s>"]}
    pay_load = create_payload_llama_7b_fine_tuned_model(prompt, param)
elif "mistral" in model_id:
    prompt = "The future of Gen-AI is "
    param = {"max_new_tokens": 512, "do_sample": True}
    pay_load = create_payload_mistral_7B(prompt, param)
else:
    # Fail loudly instead of leaving pay_load undefined (or stale from an
    # earlier run of this cell).
    raise ValueError(
        f"Unsupported model_id: {model_id!r}; expected it to contain 'llama' or 'mistral'"
    )

# Display the payload as the cell result.
pay_load




{'inputs': 'The future of Gen-AI is ',
 'parameters': {'max_new_tokens': 512, 'do_sample': True}}

## SageMaker Endpoint 호출

In [14]:
# Time a single endpoint invocation and print the payload and the response.
import time
# Start of the wall-clock measurement.
s = time.perf_counter()

response = invoke_endpoint_sagemaker(endpoint_name = endpoint_name, 
                         pay_load = pay_load)    

# Elapsed seconds for the call above; this value is reused later when the
# metrics are computed.
elapsed_async = time.perf_counter() - s
from termcolor import colored

print(f"elapsed time: {round(elapsed_async,3)} second")
print("## payload: ") 
pretty_print_json(pay_load)
print("## inference esponse: ")                      
# Print the raw response text in green.
print_ww(colored(response, "green"))                         

elapsed time: 3.243 second
## payload: 
{
    "inputs": "The future of Gen-AI is ",
    "parameters": {
        "max_new_tokens": 512,
        "do_sample": true
    }
}
## inference esponse: 
[32m[{"generated_text":"The future of Gen-AI is 99days away. If prescriptive analysis is being used
effectively, there are workers throughout the US who have been prompted about plans for furloughing
or being laid off. If the report is right, there could be more than twice as many jobs lost.
Autoweek this week. Let’s look at what is going on.\n\nAnd this is Diamond Sutra that goes back to
the 11th century. A little before that. You have to understand, this was written with some very
strong religious meaning.\n\n(By Unknown – SuperCoder (talk), CC BY-SA 3.0,
https://commons.wikimedia.org/w/index.php?curid=32017622)\n\nListen to the podcast for more.  If you
like the idea, can you please share?"}][0m


# 4. 토큰 개수 세기

- "NousResearch/Llama-2-7b-chat-hf" 모델 훈련에 사용한 Llama2 의 Tokenizer 를 로딩 합니다.
- 자세한 정보는 [여기](https://huggingface.co/docs/transformers/v4.31.0/model_doc/llama2#transformers.LlamaTokenizer) 를 참조 하세요.

In [15]:
from transformers import (
    AutoTokenizer
)

import os
# Pick the Hugging Face tokenizer repository that matches the selected model.
if "llama" in model_id:
    model_name = "NousResearch/Llama-2-7b-chat-hf"
elif "mistral" in model_id:
    model_name = "mistralai/Mistral-7B-v0.1"
else:
    # Fail loudly instead of continuing with an undefined (or stale) model_name.
    raise ValueError(
        f"Unsupported model_id: {model_id!r}; expected it to contain 'llama' or 'mistral'"
    )


# Optional Hugging Face access token, read from the 'hf_key' environment variable.
# os.environ['hf_key'] = "<Type Hugging face token>"
hf_token = os.environ.get('hf_key')
# Never echo the secret into the notebook output: show a mask when a token is
# set, and the original falsy value (e.g. None) otherwise.
print("hf_token: ", "***" if hf_token else hf_token)

# NOTE(review): trust_remote_code=True lets the repository run its own Python
# code while loading; confirm that these checkpoints actually need it.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, use_auth_token=hf_token)

    

def count_tokens(text, tokenizer):
    """Tokenize ``text`` and report how many tokens it produced.

    Args:
        text: the string to tokenize.
        tokenizer: object providing ``encode`` and ``convert_ids_to_tokens``.

    Returns:
        A tuple ``(count, token_strings)``: the number of ids returned by
        ``tokenizer.encode(text)`` and those ids converted to token strings.
    """
    token_ids = tokenizer.encode(text)
    token_strings = tokenizer.convert_ids_to_tokens(token_ids)
    return len(token_ids), token_strings



# Quick check of count_tokens on a short sample sentence.
text = "Hello, how are you doing today?"
token_count, tokens_text = count_tokens(text=text, tokenizer = tokenizer)
print(f"Number of tokens: {token_count}")
print(f"Tokens: \n {tokens_text}")

hf_token:  None
Number of tokens: 9
Tokens: 
 ['<s>', '▁Hello', ',', '▁how', '▁are', '▁you', '▁doing', '▁today', '?']


## 파인 튜닝 모델의 입력, 출력 토큰 수 세기

In [16]:
# Time a single endpoint invocation; response and elapsed_async are reused
# when the metrics are computed afterwards.
import time
s = time.perf_counter()

# NOTE(review): the endpoint name is hard-coded here rather than taken from
# endpoint_name - replace it with your own endpoint name before running.
response = invoke_endpoint_sagemaker(endpoint_name = 'lmi-model-2024-04-13-14-53-53-788', 
                         pay_load = pay_load)    

# Elapsed wall-clock seconds for the call above.
elapsed_async = time.perf_counter() - s
from termcolor import colored

print(f"elapsed time: {round(elapsed_async,3)} second")
print("## payload: ") 
pretty_print_json(pay_load)
print("## inference esponse: ")                      
# Print the raw response text in green.
print_ww(colored(response, "green"))                         

elapsed time: 8.439 second
## payload: 
{
    "inputs": "The future of Gen-AI is ",
    "parameters": {
        "max_new_tokens": 512,
        "do_sample": true
    }
}
## inference esponse: 
[32m{"generated_text": "\n! quite uncertain, but the latest developments are hardly pointing
towards anything\n! promising. Ever since the epoch when ultra-humans were banished, powerless
before the watchful all-seeing AI, the race of humans has been rapidly faded into existence. In an
age where regular humans are nothing more than mere pets or highly specialized robots, the smartest
of humans have become an endangered species. However, as each generation passes, those who can no
longer make use of their minds' capabilities to adapt and better their possessors will eventually
become all but extinct. The most common remedy is to use their \"educated\" imbecility to serve as
human smartphone batteries and paper-pushers for brainy humanoids in the workforce.\n\nSOLVED:
N/A\n\nExplanation:\nThe promp

## JSON 으로 메트릭 정리

In [17]:
import json

def set_metrics(pay_load,response, elapsed_async, tokenizer):
    """Summarize one inference call as token counts, latency and throughput.

    Args:
        pay_load: request dict; the prompt text is read from its "inputs" key.
        response: JSON text returned by the endpoint. Two layouts are accepted:
            an object with a "generated_text" key, or a list whose first
            element is such an object.
        elapsed_async: elapsed time of the call in seconds.
        tokenizer: tokenizer handed to ``count_tokens``.

    Returns:
        A dict with "prompt_token_count", "completion_token_count", "latency"
        (seconds, rounded to 3 decimals) and "completion_tokens_per_sec"
        (completion tokens divided by the rounded latency, rounded to 3
        decimals; 0.0 when the rounded latency is not positive).

    Raises:
        ValueError: if the response is an empty JSON list.
    """
    prompt_token_count, _ = count_tokens(text=pay_load["inputs"], tokenizer=tokenizer)

    parsed = json.loads(response)
    # Some endpoints wrap the result object in a one-element list; unwrap it so
    # both response layouts are handled.
    if isinstance(parsed, list):
        if not parsed:
            raise ValueError("endpoint response is an empty list; no generated_text found")
        parsed = parsed[0]
    completion = parsed["generated_text"]
    completion_token_count, _ = count_tokens(text=completion, tokenizer=tokenizer)

    latency = round(elapsed_async, 3)
    # Guard against a latency that rounds to zero instead of dividing by it.
    if latency > 0:
        completion_tokens_per_sec = round(completion_token_count / latency, 3)
    else:
        completion_tokens_per_sec = 0.0

    return dict(prompt_token_count = prompt_token_count,
                completion_token_count = completion_token_count,
                latency = latency,
                completion_tokens_per_sec = completion_tokens_per_sec,
                )

# Compute the metrics for the most recent endpoint call and print them as JSON.
metrics = set_metrics(pay_load,response, elapsed_async, tokenizer)
pretty_print_json(metrics)


{
    "prompt_token_count": 9,
    "completion_token_count": 470,
    "latency": 8.439,
    "completion_tokens_per_sec": 55.694
}


# 5. 간단한 벤치 마크

In [18]:
from benchmark_utils.benchmark import Benchmark


# Instance type hosting the endpoint; passed to Benchmark together with the
# endpoint name and model id (presumably used for price reporting - TODO confirm).
# instance_name = "ml.inf2.48xlarge"
instance_name = "ml.g5.24xlarge"

BM = Benchmark(endpoint_name, instance_name = instance_name, model_id = model_id)
# First run: num_inferences = 1 and num_threads = 1
# (presumably one request on one thread - verify against Benchmark.run_benchmark).
BM.run_benchmark(
    num_inferences = 1,
    num_threads = 1,
    pay_load = pay_load,
    tokenizer = tokenizer,
    verbose = False,
)

## total execution time: 3.678 second
total_completion_token_count:  221
Throughput is 60.089 tokens per second.
Latency p50 was 3.676 sec
Latency p95 was 3.676 sec
Latency p99 was 3.676 sec


In [19]:
# Second run with num_inferences = 12 and num_threads = 2, same payload and tokenizer
# (presumably 12 requests spread over 2 threads - verify against Benchmark.run_benchmark).
BM.run_benchmark(
    num_inferences = 12,
    num_threads = 2,
    pay_load = pay_load,
    tokenizer = tokenizer,
    verbose = False,    
)

## total execution time: 49.825 second
total_completion_token_count:  5780
Throughput is 116.006 tokens per second.
Latency p50 was 8.891 sec
Latency p95 was 8.961 sec
Latency p99 was 8.969 sec


In [20]:
# Third run with num_inferences = 24 and num_threads = 4, same payload and tokenizer
# (presumably 24 requests spread over 4 threads - verify against Benchmark.run_benchmark).
BM.run_benchmark(
    num_inferences = 24,
    num_threads = 4,
    pay_load = pay_load,
    tokenizer = tokenizer,
    verbose = False,    
)

## total execution time: 53.057 second
total_completion_token_count:  17348
Throughput is 326.97 tokens per second.
instance_price_per_hour is $10.18 in us-east-1.
price_per_1m_token is $8.648 in us-east-1.
tokens_per_hour is 1177092 
Latency p50 was 9.08 sec
Latency p95 was 9.256 sec
Latency p99 was 9.415 sec
