### 1. 安装HuggingFace 并下载模型到本地

In [None]:
!pip install huggingface-hub -Uqq 
!pip install -U sagemaker

In [None]:
!rm -rf ./LLM_qwen15_int4_model

In [7]:
from huggingface_hub import snapshot_download
from pathlib import Path
# Local directory that is passed to snapshot_download as the Hugging Face cache dir.
local_model_path = Path("./LLM_qwen15_14b_int4_model")


In [8]:
# Create the download directory if it does not exist yet.
local_model_path.mkdir(exist_ok=True)

# Hugging Face repo id and the exact commit to download.
model_name = "Qwen/Qwen1.5-14B-Chat-GPTQ-Int4"
commit_hash = "2303ef27e4d8f3bf668c3139d1653e09fa41c83d"

In [None]:
# Fetch the pinned revision of the repo into the local cache directory.
snapshot_download(
    repo_id=model_name,
    revision=commit_hash,
    cache_dir=local_model_path,
)

### 2. 把模型拷贝到S3为后续部署做准备

In [2]:
import sagemaker
from sagemaker import image_uris
import boto3
import os
import time
import json

role = sagemaker.get_execution_role()  # execution role for the endpoint
sess = sagemaker.session.Session()  # sagemaker session for interacting with different AWS APIs
bucket = sess.default_bucket()  # bucket to house artifacts

# Read the region through the public `boto_region_name` attribute rather than
# the private `_region_name`, which is not part of the SDK's documented API.
region = sess.boto_region_name
account_id = sess.account_id()

# Low-level clients: S3, SageMaker control plane, and SageMaker runtime (invocations).
s3_client = boto3.client("s3")
sm_client = boto3.client("sagemaker")
smr_client = boto3.client("sagemaker-runtime")

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [9]:
s3_model_prefix = f"aigc-llm-models/{model_name}"  # folder where model checkpoint will go
s3_code_prefix = f"aigc-llm-models/{model_name}_deploy_code"

# Select the snapshot directory of the pinned commit instead of whichever
# directory glob happens to list first, and fail with a clear message when the
# download has not been run yet (the original raised a bare IndexError).
snapshot_dirs = sorted(local_model_path.glob(f"**/snapshots/{commit_hash}"))
if not snapshot_dirs:
    raise FileNotFoundError(
        f"No snapshot for commit {commit_hash} found under {local_model_path}; "
        "run the snapshot_download cell first."
    )
model_snapshot_path = snapshot_dirs[0]

print(f"s3_code_prefix: {s3_code_prefix}")
print(f"model_snapshot_path: {model_snapshot_path}")

s3_code_prefix: aigc-llm-models/Qwen/Qwen1.5-14B-Chat-GPTQ-Int4_deploy_code
model_snapshot_path: LLM_qwen15_14b_int4_model/models--Qwen--Qwen1.5-14B-Chat-GPTQ-Int4/snapshots/2303ef27e4d8f3bf668c3139d1653e09fa41c83d


In [None]:
# Recursively copy the local snapshot directory to the model prefix in the session's default bucket.
!aws s3 cp --recursive {model_snapshot_path} s3://{bucket}/{s3_model_prefix}

### 3. 模型部署准备（entrypoint脚本，容器镜像，服务配置）

In [10]:
# Newer container versions are listed at https://github.com/aws/deep-learning-containers/blob/master/available_images.md#large-model-inference-containers

# For the AWS China regions, use the following image_uri instead:
# inference_image_uri = (
#     f"727897471807.dkr.ecr.{region}.amazonaws.com.cn/djl-inference:0.26.0-deepspeed0.12.6-cu121"
# )

inference_image_uri = f"763104351884.dkr.ecr.{region}.amazonaws.com/djl-inference:0.26.0-deepspeed0.12.6-cu121"
print(f"Image going to be used is ---- > {inference_image_uri}")

Image going to be used is ---- > 763104351884.dkr.ecr.us-west-2.amazonaws.com/djl-inference:0.26.0-deepspeed0.12.6-cu121


In [11]:
local_code_dir = s3_code_prefix.split('/')[-1]
!mkdir -p {local_code_dir}

In [131]:
%%writefile {local_code_dir}/model.py
from djl_python import Input, Output
from djl_python.streaming_utils import StreamingUtils
from djl_python.properties_manager.hf_properties import HuggingFaceProperties
from djl_python.properties_manager.properties import StreamingEnum, is_rolling_batch_enabled, is_streaming_enabled
import os

import logging
from transformers import AutoModelForCausalLM, AutoTokenizer
model = None
tokenizer = None
hf_configs = None 

def get_model(properties):
    """Load the tokenizer and the causal-LM model described by the serving properties.

    Args:
        properties: Mapping of serving properties. ``model_id`` is used as the
            model location when present; otherwise ``model_dir`` is used.

    Returns:
        A ``(model, tokenizer)`` tuple; the model has been switched to eval mode.

    Raises:
        KeyError: If neither ``model_id`` nor ``model_dir`` is present.
    """
    # The original also read properties["tensor_parallel_degree"] into an unused
    # local, which only added a possible KeyError; that lookup is removed.
    # "model_dir" is now required only when "model_id" is absent.
    if "model_id" in properties:
        model_location = properties["model_id"]
    else:
        model_location = properties["model_dir"]
    logging.info(f"Loading model in {model_location}")

    tokenizer = AutoTokenizer.from_pretrained(model_location, use_fast=False)

    model = AutoModelForCausalLM.from_pretrained(
        model_location,
        device_map="auto",
        torch_dtype='auto'
    ).eval()

    return model, tokenizer

def inference(inputs):
    """Run one generation request and return the result as an ``Output``.

    The request JSON is read for these keys:
      * ``inputs``: raw prompt, used only by the streaming branch.
      * ``messages``: chat messages, used by the non-streaming branch through
        the tokenizer's chat template.
      * ``parameters``: generation kwargs; ``max_tokens`` (default 512) is
        renamed to ``max_new_tokens``.
      * ``stream``: when truthy and streaming is enabled in the serving
        configuration, the response is streamed as JSON lines.

    Args:
        inputs: The request object passed in by ``handle``.

    Returns:
        An ``Output`` holding either the streamed content, a JSON body of the
        form ``{"outputs": <text>}``, or an error payload if anything failed.
    """
    try:
        input_map = inputs.get_as_json()
        logging.info(f"(lyb)input_map")
        logging.info(input_map)

        input_properties = inputs.get_properties()
        logging.info(f"(lyb)input_properties")
        logging.info(input_properties)

        data = input_map.pop("inputs", '')
        messages = input_map.pop("messages", [])
        parameters = input_map.pop("parameters", {})
        stream = input_map.pop("stream", False)
        outputs = Output()

        # Clients send "max_tokens"; generate() expects "max_new_tokens".
        max_new_tokens = parameters.pop('max_tokens', 512)
        parameters['max_new_tokens'] = max_new_tokens

        if is_streaming_enabled(hf_configs.enable_streaming) and stream:
            # NOTE(review): the streaming branch feeds the raw `inputs` value to the
            # model and does not apply the chat template to `messages`, unlike the
            # non-streaming branch below -- confirm whether this is intended.
            outputs.add_property("content-type", "application/jsonlines")
            if hf_configs.enable_streaming.value == StreamingEnum.huggingface.value:
                outputs.add_stream_content(
                    StreamingUtils.use_hf_default_streamer(
                        model, tokenizer, data,
                        hf_configs.device, **parameters))
            else:
                stream_generator = StreamingUtils.get_stream_generator(
                    "Accelerate")
                outputs.add_stream_content(
                    stream_generator(model, tokenizer, data,
                                     hf_configs.device, **parameters))
            return outputs

        input_ids = tokenizer.apply_chat_template(conversation=messages, tokenize=True, add_generation_prompt=True, return_tensors='pt')
        output_ids = model.generate(input_ids.to('cuda'), **parameters)
        # Decode only the newly generated tokens, skipping the prompt prefix.
        response = tokenizer.decode(output_ids[0][input_ids.shape[1]:], skip_special_tokens=True)

        result = {"outputs": response}
        outputs.add_as_json(result)

        return outputs
    except Exception as e:
        logging.exception("Huggingface inference failed")
        # Return the error Output; the original built it but fell off the end of
        # the function, so a failed request produced None instead of an error.
        outputs = Output().error(str(e))
        return outputs


def handle(inputs: Input) -> "Output | None":
    """Entry point for a request: initialise module state lazily, then run inference.

    On the first call the module-level ``hf_configs``, ``model`` and
    ``tokenizer`` are populated from the request properties; later calls reuse
    them.

    Args:
        inputs: The incoming request.

    Returns:
        ``None`` when ``inputs`` is empty, otherwise the result of
        ``inference(inputs)``.
    """
    global model, tokenizer, hf_configs
    logging.info(f"global model, tokenizer")
    
    properties = inputs.get_properties()
    
    # Build the configuration object only once.
    if not hf_configs:
        hf_configs = HuggingFaceProperties(**properties)
    
    # Load the model and tokenizer only once.
    if not model:
        model, tokenizer = get_model(properties)

    if inputs.is_empty():
        # Model server makes an empty call to warmup the model on startup
        return None

    logging.info(f"(lyb)inference")
    return inference(inputs)

Overwriting Qwen1.5-14B-Chat-GPTQ-Int4_deploy_code/model.py


In [132]:
# S3 location of the uploaded model files; it is substituted into serving.properties further below.
s3_path = f"s3://{bucket}/{s3_model_prefix}/"
print(f"option.s3url ==> {s3_path}")

option.s3url ==> s3://sagemaker-us-west-2-106839800180/aigc-llm-models/Qwen/Qwen1.5-14B-Chat-GPTQ-Int4/


#### Note: option.s3url 需要按照自己的账号进行修改

In [133]:
%%writefile {local_code_dir}/serving.properties
# The s3url value below is a placeholder; a later notebook cell rewrites that line with sed.
engine=Python
option.tensor_parallel_degree=1
option.enable_streaming=True
option.predict_timeout=240
option.trust_remote_code=true
option.s3url = S3PATH

Overwriting Qwen1.5-14B-Chat-GPTQ-Int4_deploy_code/serving.properties


In [134]:
# Replace the S3PATH placeholder line in serving.properties with the value of s3_path.
!sed -i "s|option.s3url = S3PATH|option.s3url = {s3_path}|" {local_code_dir}/serving.properties

#### 注意: 必须把 transformers 升级到 4.37.0 及以上版本，否则会出现 [Issue34](https://github.com/QwenLM/Qwen1.5/issues/34) 中描述的问题

如果是中国区建议添加国内的pip镜像,如下代码所示
```text
%%writefile {local_code_dir}/requirements.txt
-i https://pypi.tuna.tsinghua.edu.cn/simple
transformers==4.37.0
```

In [135]:
%%writefile {local_code_dir}/requirements.txt
transformers==4.37.0
accelerate
tiktoken
einops
scipy
transformers_stream_generator==0.0.4
peft
deepspeed
auto-gptq
optimum

Overwriting Qwen1.5-14B-Chat-GPTQ-Int4_deploy_code/requirements.txt


In [None]:
# !pip install auto-gptq

In [136]:
!rm model.tar.gz
!cd {local_code_dir} && rm -rf ".ipynb_checkpoints"
!tar czvf model.tar.gz {local_code_dir}

Qwen1.5-14B-Chat-GPTQ-Int4_deploy_code/
Qwen1.5-14B-Chat-GPTQ-Int4_deploy_code/model.py
Qwen1.5-14B-Chat-GPTQ-Int4_deploy_code/requirements.txt
Qwen1.5-14B-Chat-GPTQ-Int4_deploy_code/serving.properties


In [137]:
# Upload the packaged code archive to the code prefix in the bucket and keep the returned value for create_model.
s3_code_artifact = sess.upload_data("model.tar.gz", bucket, s3_code_prefix)
print(f"S3 Code or Model tar ball uploaded to --- > {s3_code_artifact}")

S3 Code or Model tar ball uploaded to --- > s3://sagemaker-us-west-2-106839800180/aigc-llm-models/Qwen/Qwen1.5-14B-Chat-GPTQ-Int4_deploy_code/model.tar.gz


### 4. 创建模型 & 创建endpoint

In [138]:
from sagemaker.utils import name_from_base
import boto3

# NOTE: from this point on `model_name` holds the SageMaker model name and no
# longer the Hugging Face repo id assigned earlier in the notebook.
model_name = name_from_base("qwen15-14B-int4")  # Note: Need to specify model_name
print(model_name)
print(f"Image going to be used is ---- > {inference_image_uri}")

# The container pairs the inference image with the uploaded code archive.
primary_container = {
    "Image": inference_image_uri,
    "ModelDataUrl": s3_code_artifact,
}
create_model_response = sm_client.create_model(
    ModelName=model_name,
    ExecutionRoleArn=role,
    PrimaryContainer=primary_container,
)
model_arn = create_model_response["ModelArn"]

print(f"Created Model: {model_arn}")

qwen15-14B-int4-2024-03-19-05-41-40-301
Image going to be used is ---- > 763104351884.dkr.ecr.us-west-2.amazonaws.com/djl-inference:0.26.0-deepspeed0.12.6-cu121
Created Model: arn:aws:sagemaker:us-west-2:106839800180:model/qwen15-14B-int4-2024-03-19-05-41-40-301


In [139]:
# Derive the endpoint-config and endpoint names from the SageMaker model name.
endpoint_config_name = f"{model_name}-config"
endpoint_name = f"{model_name}-endpoint"

# Note: ml.g4dn.2xlarge can also be used
endpoint_config_response = sm_client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[
        {
            "VariantName": "variant1",
            "ModelName": model_name,
            "InstanceType": "ml.g5.2xlarge",
            "InitialInstanceCount": 1,
            # "VolumeSizeInGB" : 400,
            # "ModelDataDownloadTimeoutInSeconds": 2400,
            "ContainerStartupHealthCheckTimeoutInSeconds": 10*60,
        },
    ],
)
endpoint_config_response

{'EndpointConfigArn': 'arn:aws:sagemaker:us-west-2:106839800180:endpoint-config/qwen15-14B-int4-2024-03-19-05-41-40-301-config',
 'ResponseMetadata': {'RequestId': '616e1d96-a622-4d07-b72a-9d94aac28fc1',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '616e1d96-a622-4d07-b72a-9d94aac28fc1',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '127',
   'date': 'Tue, 19 Mar 2024 05:41:41 GMT'},
  'RetryAttempts': 0}}

In [140]:
# Start creating the endpoint from the configuration defined above.
create_endpoint_response = sm_client.create_endpoint(
    EndpointName=endpoint_name,
    EndpointConfigName=endpoint_config_name,
)
endpoint_arn = create_endpoint_response['EndpointArn']
print(f"Created Endpoint: {endpoint_arn}")

Created Endpoint: arn:aws:sagemaker:us-west-2:106839800180:endpoint/qwen15-14B-int4-2024-03-19-05-41-40-301-endpoint


#### 持续检测模型部署进度

In [141]:
import time
# Poll once a minute until the endpoint leaves the "Creating" state.
while True:
    resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
    status = resp["EndpointStatus"]
    print("Status: " + status)
    if status != "Creating":
        break
    time.sleep(60)

print("Arn: " + resp["EndpointArn"])
print("Status: " + status)

# Stop here when the deployment did not succeed, instead of letting the later
# invocation cells fail with a less helpful error.
if status != "InService":
    failure_reason = resp.get("FailureReason", "no failure reason reported")
    raise RuntimeError(
        f"Endpoint {endpoint_name} ended in status {status}: {failure_reason}"
    )

Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: InService
Arn: arn:aws:sagemaker:us-west-2:106839800180:endpoint/qwen15-14B-int4-2024-03-19-05-41-40-301-endpoint
Status: InService


### 5. 模型测试

In [122]:
%%time
import json
import boto3

# Runtime client used to invoke the endpoint.
smr_client = boto3.client("sagemaker-runtime")

# Generation parameters sent with each request. The model.py handler written
# earlier renames "max_tokens" to "max_new_tokens" and forwards the remaining
# keys as keyword arguments to the generation call.
parameters = {
  "max_tokens": 1024,
  "temperature": 0.1,
  "top_p":0.8
}

CPU times: user 4.57 ms, sys: 0 ns, total: 4.57 ms
Wall time: 3.91 ms


## No stream

In [123]:
# Example prompt (a question that includes its own context). It is overwritten
# by the next assignment, so only the second prompt is actually sent.
prompts1 = """AWS Clean Rooms 的FAQ文档有提到 Q: 是否发起者和数据贡献者都会被收费？A: 是单方收费，只有查询的接收方会收费。
请问AWS Clean Rooms是多方都会收费吗？
"""
prompts1 = """写一篇500字的科幻小说，背景关于宇宙战争"""

# Single-turn conversation in chat-message format.
messages = [{"role":"user", "content": prompts1}]

# Time the full request/response round trip.
start = time.time()
response_model = smr_client.invoke_endpoint(
            EndpointName=endpoint_name,
            Body=json.dumps(
            {
                "inputs" : prompts1,
                "messages": messages,
                "parameters": parameters
            }
            ),
            ContentType="application/json",
        )

# Read the whole response body, then print the elapsed time and the decoded text.
resp = response_model['Body'].read()
print (f"\ntime:{time.time()-start} s")
print(resp.decode('utf8'))
# print(resp)



time:14.543756246566772 s
{
  "outputs":"标题：星际烽火\n\n在遥远的未来，人类文明已经扩张到了银河系的边缘。在星域深处，一场名为\"暗影战争\"的宇宙战争正在悄然酝酿。地球联盟，作为宇宙中的和平守护者，与黑暗势力——暗影帝国的冲突一触即发。\n\n地球联盟的旗舰\"曙光号\"，由天才科学家艾伦驾驶，他的目标是阻止暗影帝国的邪恶计划——吞噬所有星系的生命能源。艾伦的队伍包括勇敢的战士莉娜和智谋过人的战术家罗伯特，他们一同踏上了这场未知的冒险。\n\n在一次深入敌后的任务中，他们发现暗影帝国的首领，黑暗领主，正准备启动一个黑洞引擎，将整个星系的生命力吸干。艾伦一行人决定冒险破坏这个装置，他们利用高科技武器和智慧，与黑暗领主展开了一场惊心动魄的太空对决。\n\n在最后关头，艾伦利用他的创新思维，设计出了一种可以干扰黑洞引擎的特殊波束。在生死存亡之际，他们成功地发射了波束，黑洞引擎的能量输出被削弱，暗影领主的计划破产。\n\n然而，胜利并未结束他们的战斗。艾伦明白，这只是战争的开始，他们必须继续保护宇宙的和平。\"曙光号\"再次启航，向着下一个战场，他们准备迎接更多的挑战。\n\n在浩渺的宇宙中，战争的烽火照亮了黑暗，而地球联盟，就像一颗永不熄灭的星辰，坚守着和平的信念，继续前行。"
}


## stream

In [142]:
import io


class StreamScanner:
    r"""
    A helper class for parsing the InvokeEndpointWithResponseStream event stream.

    The output of the model will be in the following format:
    ```
    b'{"outputs": [" a"]}\n'
    b'{"outputs": [" challenging"]}\n'
    b'{"outputs": [" problem"]}\n'
    ...
    ```

    While usually each PayloadPart event from the event stream will contain a byte array
    with a full json, this is not guaranteed and some of the json objects may be split across
    PayloadPart events. For example:
    ```
    {'PayloadPart': {'Bytes': b'{"outputs": '}}
    {'PayloadPart': {'Bytes': b'[" problem"]}\n'}}
    ```

    This class accounts for this by concatenating bytes written via the 'write' function
    and then exposing a method which will return lines (ending with a '\n' character) within
    the buffer via the 'readlines' function. It maintains the position of the last read
    position to ensure that previous bytes are not exposed again.
    """

    def __init__(self):
        # All bytes received so far, and the offset of the first unread byte.
        self.buff = io.BytesIO()
        self.read_pos = 0

    def write(self, content):
        """Append `content` (bytes) to the end of the buffer."""
        self.buff.seek(0, io.SEEK_END)
        self.buff.write(content)

    def readlines(self):
        """Yield each complete, not yet returned line without its trailing newline.

        A trailing fragment that does not end with a newline yet is left in the
        buffer and is returned by a later call once the rest has been written.
        """
        self.buff.seek(self.read_pos)
        for line in self.buff.readlines():
            # Indexing bytes gives an int in Python 3, so the original test
            # `line[-1] != b'\n'` was always true and incomplete lines were
            # emitted with their last byte cut off. Only accept lines that
            # really end with a newline.
            if line.endswith(b'\n'):
                self.read_pos += len(line)
                yield line[:-1]

    def reset(self):
        """Move the read position back to the start of the buffer."""
        self.read_pos = 0

In [146]:
prompts1 = """写一篇500字的科幻小说，背景关于宇宙战争"""

messages = [{"role":"user", "content": prompts1}]

start = time.time()

# Build the JSON request body first, then open the streaming invocation.
request_body = json.dumps(
    {
        "inputs" : [prompts1],
        "messages": messages,
        "parameters": parameters,
        "stream" : True
    }
)
response_model = smr_client.invoke_endpoint_with_response_stream(
    EndpointName=endpoint_name,
    Body=request_body,
    ContentType="application/json",
)

event_stream = response_model['Body']
scanner = StreamScanner()
for event in event_stream:
    # Feed every payload chunk to the scanner, then print the lines it returns.
    scanner.write(event['PayloadPart']['Bytes'])
    for line in scanner.readlines():
        try:
            resp = json.loads(line.decode('utf-8'))

            print(resp['outputs'][0], end='')
        except Exception:
            # Show the raw line when it cannot be parsed or lacks the expected shape.
            print(line)

。
标题：星际的决战

在遥远的未来，人类已经掌握了星际旅行的技术，建立了庞大的星际联盟。然而，宇宙的和平并未长久，一股名为“黑暗星云”的外星势力，他们的目标是吞噬所有的文明，宇宙战争一触即发。

我们的主角，年轻的星际战士艾伦，被选中参加这场决定人类命运的战争。他的飞船，名为“希望号”，装备了最先进的防御系统和武器。艾伦的使命是保护联盟的核心星球——地球。

战争的初期，黑暗星云的军队如潮水般涌来，艾伦和他的队伍在星空中奋力抵抗。每一次的战斗都像是在生死边缘徘徊，但艾伦从未退缩，他的决心和勇气激励着整个联盟。

在一次关键的战役中，艾伦发现黑暗星云的首领——黑暗领主，他的力量源自一颗名为“毁灭之心”的神秘星球。艾伦决定冒险深入黑暗星云的腹地，寻找并摧毁这颗星球。

经过一系列的冒险和挑战，艾伦成功摧毁了“毁灭之心”，黑暗星云的军队瞬间瓦解。艾伦的勇敢和智慧，成为了联盟的英雄，他的名字被刻在了宇宙的历史中。

宇宙战争的硝烟散去，和平再次降临。艾伦回到地球，看着星空，心中充满了对未来的期待。他知道，虽然战争结束了，但人类的探索和守护宇宙的使命，才刚刚开始。<|endoftext|><|im_start|>
<|im_start|><|im_start|>
<|im_start|>
<|im_start|>
<|im_start|>
<|im_start|>
<|im_start|>
<|im_start|>
<|im_start|>
<|im_start|>
<|im_start|>
<|im_start|>
<|im_start|>
<|im_start|>
<|im_start|>
<|im_start|>
<|im_start|>
<|im_start|>
<|im_start|>
<|im_start|>
<|im_start|>
<|im_start|>
<|im_start|>
<|im_start|>
<|im_start|>
<|im_start|>
<|im_start|>
<|im_start|>
<|im_start|>
<|im_start|>
<|im_start|>
<|im_start|>
<|im_start|>
<|im_start|>
<|im_start|>
<|im_start|>
<|im_start|>
<|im_star

#### 清除模型 Endpoint 和 config（仅限清除资源时使用）

In [128]:
# Delete the endpoint created by this notebook.
!aws sagemaker delete-endpoint --endpoint-name {endpoint_name}

In [129]:
# Delete the endpoint configuration created by this notebook.
!aws sagemaker delete-endpoint-config --endpoint-config-name {endpoint_config_name}

In [130]:
# Delete the SageMaker model; `model_name` must hold the SageMaker model name here, not the Hugging Face repo id.
!aws sagemaker delete-model --model-name {model_name}