### 1. 安装HuggingFace 并下载模型到本地

In [1]:
!pip install huggingface-hub -Uqq
!pip install -U sagemaker

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting sagemaker
  Downloading sagemaker-2.168.0.tar.gz (844 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m844.7/844.7 kB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: sagemaker
  Building wheel for sagemaker (setup.py) ... [?25ldone
[?25h  Created wheel for sagemaker: filename=sagemaker-2.168.0-py2.py3-none-any.whl size=1151019 sha256=b548c0d9f0519ac230e8495a9872543e532fa6c1a6d4363a099e6566a04ab61d
  Stored in directory: /home/ec2-user/.cache/pip/wheels/81/33/1d/c4fb556086a08d8e4ede95111dba93971c1caa14d095afdaa9
Successfully built sagemaker
Installing collected packages: sagemaker
  Attempting uninstall: sagemaker
    Found existing installation: sagemaker 2.159.0
    Uninstalling sagemaker-2.159.0:
      Successfully uninstalled sagemaker-2.159.0
Successfully installed

In [3]:
from huggingface_hub import snapshot_download
from pathlib import Path

# Hugging Face repository to download and the exact commit to fetch from it.
model_name = "THUDM/chatglm-6b"
commit_hash = "f83182484538e663a03d3f73647f10f89878f438"

# Local folder that receives the downloaded files (created if it does not exist yet).
local_model_path = Path("./LLM_chatglm_model")
local_model_path.mkdir(exist_ok=True)

In [4]:
# Fetch the pinned revision into the local folder; the returned snapshot path is shown as the cell output.
snapshot_download(
    repo_id=model_name,
    revision=commit_hash,
    cache_dir=local_model_path,
)

Fetching 21 files:   0%|          | 0/21 [00:00<?, ?it/s]

'LLM_chatglm_model/models--THUDM--chatglm-6b/snapshots/f83182484538e663a03d3f73647f10f89878f438'

### 2. 把模型拷贝到S3为后续部署做准备

In [4]:
# Upgrade the sagemaker package and pip itself; a kernel restart may be needed afterwards.
%pip install sagemaker pip --upgrade  --quiet

Note: you may need to restart the kernel to use updated packages.


In [5]:
# Force-reinstall botocore and boto3 from wheel files in the current working directory.
# The wheels are NOT downloaded by this notebook: files matching the two patterns below
# must already be present next to it, otherwise this cell fails.
# Note: the install may also report an error depending on which awscli is installed in
# your Jupyter kernel; that is acceptable.

%pip install botocore-*-py3-none-any.whl boto3-*-py3-none-any.whl --force

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Processing ./botocore-1.29.157-py3-none-any.whl
Processing ./boto3-1.26.157-py3-none-any.whl
Collecting jmespath<2.0.0,>=0.7.1 (from botocore==1.29.157)
  Using cached jmespath-1.0.1-py3-none-any.whl (20 kB)
Collecting python-dateutil<3.0.0,>=2.1 (from botocore==1.29.157)
  Using cached python_dateutil-2.8.2-py2.py3-none-any.whl (247 kB)
Collecting urllib3<1.27,>=1.25.4 (from botocore==1.29.157)
  Downloading urllib3-1.26.16-py2.py3-none-any.whl (143 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.1/143.1 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting s3transfer<0.7.0,>=0.6.0 (from boto3==1.26.157)
  Downloading s3transfer-0.6.1-py3-none-any.whl (79 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.8/79.8 kB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting six>=1.5 (from python-dateutil<3.0.0,>=2.1->botocore==1.29.157)
  Using cache

In [5]:
import sagemaker
from sagemaker import image_uris
import boto3
import os
import time
import json

role = sagemaker.get_execution_role()  # execution role for the endpoint
sess = sagemaker.session.Session()  # sagemaker session for interacting with different AWS APIs
bucket = sess.default_bucket()  # bucket to house artifacts

# Read the region through the public boto session rather than the private
# `_region_name` attribute, matching how the image-lookup cell obtains it.
region = sess.boto_session.region_name
account_id = sess.account_id()

# Low-level clients: S3, the SageMaker control plane, and the runtime (invoke) API.
s3_client = boto3.client("s3")
sm_client = boto3.client("sagemaker")
smr_client = boto3.client("sagemaker-runtime")

In [6]:
s3_model_prefix = "LLM-RAG/workshop/LLM_chatglm_model"  # folder where model checkpoint will go
s3_code_prefix = "LLM-RAG/workshop/LLM_chatglm_deploy_code"  # folder where the code tarball will go

# Locate the downloaded snapshot directory. Sorting makes the choice deterministic if
# several snapshots exist, and an explicit error replaces the bare IndexError that
# `list(...)[0]` would raise when nothing has been downloaded yet.
snapshot_dirs = sorted(local_model_path.glob("**/snapshots/*"))
if not snapshot_dirs:
    raise FileNotFoundError(
        f"No model snapshot found under {local_model_path}; run the snapshot_download cell first."
    )
model_snapshot_path = snapshot_dirs[0]

print(f"s3_code_prefix: {s3_code_prefix}")
print(f"model_snapshot_path: {model_snapshot_path}")

s3_code_prefix: LLM-RAG/workshop/LLM_chatglm_deploy_code
model_snapshot_path: LLM_chatglm_model/models--THUDM--chatglm-6b/snapshots/f83182484538e663a03d3f73647f10f89878f438


In [7]:
# Recursively copy every file of the local snapshot into the bucket under s3_model_prefix.
!aws s3 cp --recursive {model_snapshot_path} s3://{bucket}/{s3_model_prefix}

upload: LLM_chatglm_model/models--THUDM--chatglm-6b/snapshots/f83182484538e663a03d3f73647f10f89878f438/config.json to s3://sagemaker-us-west-2-946277762357/LLM-RAG/workshop/LLM_chatglm_model/config.json
upload: LLM_chatglm_model/models--THUDM--chatglm-6b/snapshots/f83182484538e663a03d3f73647f10f89878f438/.gitattributes to s3://sagemaker-us-west-2-946277762357/LLM-RAG/workshop/LLM_chatglm_model/.gitattributes
upload: LLM_chatglm_model/models--THUDM--chatglm-6b/snapshots/f83182484538e663a03d3f73647f10f89878f438/LICENSE to s3://sagemaker-us-west-2-946277762357/LLM-RAG/workshop/LLM_chatglm_model/LICENSE
upload: LLM_chatglm_model/models--THUDM--chatglm-6b/snapshots/f83182484538e663a03d3f73647f10f89878f438/README.md to s3://sagemaker-us-west-2-946277762357/LLM-RAG/workshop/LLM_chatglm_model/README.md
upload: LLM_chatglm_model/models--THUDM--chatglm-6b/snapshots/f83182484538e663a03d3f73647f10f89878f438/configuration_chatglm.py to s3://sagemaker-us-west-2-946277762357/LLM-RAG/workshop/LLM_chat

### 3. 模型部署准备（entrypoint脚本，容器镜像，服务配置）

In [7]:
# inference_image_uri = (
#     f"763104351884.dkr.ecr.{region}.amazonaws.com/djl-inference:0.21.0-deepspeed0.8.3-cu117"
# )

# print(f"Image going to be used is ---- > {inference_image_uri}")

In [8]:
# Resolve the DJL DeepSpeed inference image URI for the region of the current session.
djl_image_request = {
    "framework": "djl-deepspeed",
    "region": sess.boto_session.region_name,
    "version": "0.22.1",
}
inference_image_uri = image_uris.retrieve(**djl_image_request)
print(f"Image going to be used is ---- > {inference_image_uri}")

Image going to be used is ---- > 763104351884.dkr.ecr.us-west-2.amazonaws.com/djl-inference:0.22.1-deepspeed0.8.3-cu118


In [9]:
# Create the folder that the following cells fill with model.py, serving.properties and requirements.txt.
!mkdir -p LLM_chatglm_deploy_code

In [102]:
%%writefile LLM_chatglm_deploy_code/model.py
from djl_python import Input, Output
import torch
import logging
import math
import os

from transformers import pipeline, AutoModel, AutoTokenizer
# Module-level cache: both stay None until handle() loads them on the first request.
model = None
tokenizer = None
# Sentinel placed in the "outputs" field of the last streamed item.
STOP_flag = "[DONE]"

# Device string used by torch_gc(); evaluates to "cuda:0" with the values below
# (falls back to plain DEVICE when DEVICE_ID is empty).
DEVICE = "cuda"
DEVICE_ID = "0"
CUDA_DEVICE = f"{DEVICE}:{DEVICE_ID}" if DEVICE_ID else DEVICE
def torch_gc():
    """Release cached CUDA memory on CUDA_DEVICE; does nothing when CUDA is unavailable."""
    if not torch.cuda.is_available():
        return
    with torch.cuda.device(CUDA_DEVICE):
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()
            
def load_model(properties):
    """Load the tokenizer and the model, and store both in the module globals.

    Args:
        properties: mapping of serving properties. It must contain "model_dir";
            when "model_id" is also present, that value is used as the load
            location instead.

    Returns:
        The (model, tokenizer) pair that was loaded.
    """
    global tokenizer, model
    # "model_id" takes precedence over "model_dir" when both are given.
    model_location = properties['model_dir']
    if "model_id" in properties:
        model_location = properties['model_id']
    logging.info(f"Loading model in {model_location}")

    # NOTE(review): trust_remote_code=True executes Python code shipped with the
    # checkpoint; only point this at model files you trust.
    tokenizer = AutoTokenizer.from_pretrained(model_location, trust_remote_code=True)

    # Convert the weights to half precision and move the model to the GPU.
    model = AutoModel.from_pretrained(model_location, trust_remote_code=True).half().cuda()

    # Inference only: disable gradients and switch to eval mode.
    model.requires_grad_(False)
    model.eval()

    return model, tokenizer



def stream_items(prompt, history, max_length, top_p, temperature):
    """Yield the model's answer to `prompt` piece by piece.

    Each intermediate item is {"outputs": <newly generated text>, "finished": False}.
    After the model stops, one last item is yielded that carries the original
    query, the STOP_flag sentinel as "outputs", the full response, the updated
    history and "finished": True.
    """
    global model, tokenizer
    emitted_chars = 0
    full_text = ""
    chat_stream = model.stream_chat(
        tokenizer,
        prompt,
        history=history,
        max_length=max_length,
        top_p=top_p,
        temperature=temperature,
    )
    for full_text, turns in chat_stream:
        # stream_chat reports the whole answer so far; forward only the unseen tail.
        delta = full_text[emitted_chars:]
        history = [list(turn) for turn in turns]
        emitted_chars = len(full_text)
        yield {"outputs": delta, "finished": False}
    ## stop
    yield {"query": prompt, "outputs": STOP_flag, "response": full_text, "history": history, "finished": True}
    


def handle(inputs: Input):
    """Entry point for a request: load the model on first use and stream the answer.

    The JSON request body must contain "inputs" (the prompt). "parameters"
    (max_length / top_p / temperature) and "history" are optional; missing
    values fall back to defaults instead of raising KeyError.

    Returns:
        None for an empty request, otherwise an Output whose streamed content
        is produced by stream_items() with content-type application/jsonlines.
    """
    global model, tokenizer
    if not model:
        model, tokenizer = load_model(inputs.get_properties())

    if inputs.is_empty():
        return None
    data = inputs.get_as_json()

    input_sentences = data["inputs"]
    # Defaults chosen to mirror ChatGLM-6B's stream_chat defaults -- TODO confirm
    # against the pinned model revision. Caller-supplied values override them.
    params = {"max_length": 2048, "top_p": 0.7, "temperature": 0.95}
    params.update(data.get("parameters") or {})
    history = data.get("history") or []
    print(f'input prompt:{input_sentences}')
    outputs = Output()
    outputs.add_property("content-type", "application/jsonlines")
    outputs.add_stream_content(stream_items(input_sentences, history=history, **params))
    return outputs


Overwriting LLM_chatglm_deploy_code/model.py


#### Note: option.s3url 需要按照自己的账号进行修改：请将其替换为第 2 步中模型实际上传到的 S3 路径（即 `s3://{bucket}/{s3_model_prefix}/`）

In [103]:
%%writefile LLM_chatglm_deploy_code/serving.properties
# Serving configuration packaged together with model.py.
engine=Python
option.tensor_parallel_degree=1
option.enable_streaming=True
option.predict_timeout=240
# S3 location of the uploaded model files. The bucket below belongs to the original
# author's account: replace it with the bucket/prefix you uploaded the model to.
option.s3url = s3://sagemaker-us-west-2-946277762357/LLM-RAG/workshop/LLM_chatglm_model/

Overwriting LLM_chatglm_deploy_code/serving.properties


#### 注意: 必须把transformers升级到4.27.1以上，否则会出现 [Issue344](https://github.com/THUDM/ChatGLM-6B/issues/344)

In [104]:
%%writefile LLM_chatglm_deploy_code/requirements.txt
# transformers must be newer than 4.27.1 for ChatGLM-6B (see THUDM/ChatGLM-6B issue 344).
transformers==4.28.1

Overwriting LLM_chatglm_deploy_code/requirements.txt


In [105]:
!rm model.tar.gz
!cd LLM_chatglm_deploy_code && rm -rf ".ipynb_checkpoints"
!tar czvf model.tar.gz LLM_chatglm_deploy_code

LLM_chatglm_deploy_code/
LLM_chatglm_deploy_code/requirements.txt
LLM_chatglm_deploy_code/serving.properties
LLM_chatglm_deploy_code/model.py


In [106]:
# Upload the packaged code to the bucket under s3_code_prefix and keep the resulting S3 URI.
s3_code_artifact = sess.upload_data(
    path="model.tar.gz",
    bucket=bucket,
    key_prefix=s3_code_prefix,
)
print(f"S3 Code or Model tar ball uploaded to --- > {s3_code_artifact}")

S3 Code or Model tar ball uploaded to --- > s3://sagemaker-us-west-2-946277762357/LLM-RAG/workshop/LLM_chatglm_deploy_code/model.tar.gz


### 4. 创建模型 & 创建endpoint

In [107]:
from sagemaker.utils import name_from_base
import boto3

# name_from_base appends a timestamp to the provided string, giving a unique model name.
model_name = name_from_base(f"chatglm-stream")
print(model_name)
print(f"Image going to be used is ---- > {inference_image_uri}")

# The container pairs the inference image with the code tarball uploaded to S3.
primary_container = {
    "Image": inference_image_uri,
    "ModelDataUrl": s3_code_artifact,
}
create_model_response = sm_client.create_model(
    ModelName=model_name,
    ExecutionRoleArn=role,
    PrimaryContainer=primary_container,
)
model_arn = create_model_response["ModelArn"]

print(f"Created Model: {model_arn}")

chatglm-stream-2023-06-25-07-34-15-297
Image going to be used is ---- > 763104351884.dkr.ecr.us-west-2.amazonaws.com/djl-inference:0.22.1-deepspeed0.8.3-cu118
Created Model: arn:aws:sagemaker:us-west-2:946277762357:model/chatglm-stream-2023-06-25-07-34-15-297


In [108]:
# Both names are derived from the (timestamped) model name.
endpoint_config_name = f"{model_name}-config"
endpoint_name = f"{model_name}-endpoint"

# Note: ml.g4dn.2xlarge can also be chosen as the instance type
production_variant = {
    "VariantName": "variant1",
    "ModelName": model_name,
    "InstanceType": "ml.g5.2xlarge",
    "InitialInstanceCount": 1,
    # "VolumeSizeInGB" : 400,
    # "ModelDataDownloadTimeoutInSeconds": 2400,
    "ContainerStartupHealthCheckTimeoutInSeconds": 15*60,
}
endpoint_config_response = sm_client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[production_variant],
)
endpoint_config_response

{'EndpointConfigArn': 'arn:aws:sagemaker:us-west-2:946277762357:endpoint-config/chatglm-stream-2023-06-25-07-34-15-297-config',
 'ResponseMetadata': {'RequestId': 'b523f4dd-34c1-4f4f-8c0d-b6ca7cd96372',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'b523f4dd-34c1-4f4f-8c0d-b6ca7cd96372',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '126',
   'date': 'Sun, 25 Jun 2023 07:34:15 GMT'},
  'RetryAttempts': 0}}

In [109]:
# Request creation of the endpoint from the endpoint configuration and report its ARN.
create_endpoint_response = sm_client.create_endpoint(
    EndpointName=endpoint_name,
    EndpointConfigName=endpoint_config_name,
)
endpoint_arn = create_endpoint_response["EndpointArn"]
print(f"Created Endpoint: {endpoint_arn}")

Created Endpoint: arn:aws:sagemaker:us-west-2:946277762357:endpoint/chatglm-stream-2023-06-25-07-34-15-297-endpoint


#### 持续检测模型部署进度

In [110]:
import time

# Poll once a minute until the endpoint leaves the "Creating" state.
resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
status = resp["EndpointStatus"]
print("Status: " + status)

while status == "Creating":
    time.sleep(60)
    resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
    status = resp["EndpointStatus"]
    print("Status: " + status)

print("Arn: " + resp["EndpointArn"])
print("Status: " + status)

# Surface a failed deployment here, with the reason reported by SageMaker, instead of
# letting the test cells below fail later with a less obvious error.
if status != "InService":
    failure_reason = resp.get("FailureReason", "no failure reason reported")
    raise RuntimeError(f"Endpoint {endpoint_name} ended in status {status}: {failure_reason}")

Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: InService
Arn: arn:aws:sagemaker:us-west-2:946277762357:endpoint/chatglm-stream-2023-06-25-07-34-15-297-endpoint
Status: InService


### 5. 模型测试

In [111]:
%%time
import json
import boto3

# Runtime client used to invoke the deployed endpoint.
smr_client = boto3.client("sagemaker-runtime")

# Generation settings sent with every request under the "parameters" key.
parameters = dict(max_length=2048, temperature=0.01, top_p=1)

CPU times: user 6.65 ms, sys: 0 ns, total: 6.65 ms
Wall time: 13 ms


In [112]:
import io


class StreamScanner:
    r"""
    A helper class for parsing the InvokeEndpointWithResponseStream event stream.

    The model output is newline-delimited JSON, one object per line, e.g.:

        b'{"outputs": ...}\n'
        b'{"outputs": ...}\n'

    A PayloadPart event usually carries one or more complete lines, but this is
    not guaranteed: a JSON object may be split across events, for example:

        {'PayloadPart': {'Bytes': b'{"outputs": '}}
        {'PayloadPart': {'Bytes': b'[" problem"]}\n'}}

    This class therefore appends every chunk passed to `write` to an internal
    buffer, and `readlines` returns only the complete lines (those terminated by
    '\n') that have not been returned before. An unterminated tail stays in the
    buffer until the rest of the line has been written.
    """

    def __init__(self):
        # Accumulates every byte written so far.
        self.buff = io.BytesIO()
        # Offset of the first byte that readlines() has not yet handed out.
        self.read_pos = 0

    def write(self, content):
        """Append `content` (bytes) to the end of the buffer."""
        self.buff.seek(0, io.SEEK_END)
        self.buff.write(content)

    def readlines(self):
        """Yield each complete, not-yet-returned line without its trailing newline.

        A trailing fragment that does not end with a newline yet is neither
        yielded nor consumed, so it is returned in full once it is completed.
        """
        self.buff.seek(self.read_pos)
        for line in self.buff.readlines():
            # Indexing bytes gives an int, so comparing line[-1] with b'\n' never
            # matches; endswith() is the correct completeness check.
            if line.endswith(b'\n'):
                self.read_pos += len(line)
                yield line[:-1]

    def reset(self):
        """Move the read position back to the start; buffered data is kept."""
        self.read_pos = 0

In [None]:
prompts1 = """写一篇500字左右的文章，介绍中国的首都"""
# prompts1 = """write a 500 words story about scifiction"""
response_model = smr_client.invoke_endpoint_with_response_stream(
    EndpointName=endpoint_name,
    Body=json.dumps(
        {
            "inputs": prompts1,
            "parameters": parameters,
            "history": [],
        }
    ),
    ContentType="application/json",
)

# Feed every payload chunk into the scanner and print the text of each line as it arrives.
event_stream = response_model['Body']
scanner = StreamScanner()
for event in event_stream:
    scanner.write(event['PayloadPart']['Bytes'])
    for line in scanner.readlines():
        try:
            resp = json.loads(line)
            print(resp.get("outputs")['outputs'], end='')
        except (ValueError, KeyError, TypeError):
            # Skip lines that are not valid JSON or lack the expected
            # {"outputs": {"outputs": ...}} shape; any other error is a real
            # problem and is no longer silently swallowed.
            continue

中国的首都是北京，位于中国北方平原地区的中心地带，是中国的政治、文化、经济、科技和交通中心。

北京是中国的历史文化名城，有着悠久的历史和文化底蕴。作为中国的首都，北京有着众多的名胜古迹，如故宫、长城、颐和园、圆明园等，这些景点不仅吸引着众多游客前来观光，也是中国文化遗产的重要组成部分。

北京还是中国的科技创新中心，是中国重要的经济中心之一。北京拥有众多高等院校和科研机构，是中国重要的科技人才培养基地。近年来，北京更是成为了全球科技创新的重要聚集地之一，吸引了许多的科技企业前来发展。

北京还是中国的文化中心，是中国重要的文化旅游胜地。北京文化遗产和民俗文化，如相声、京剧、评剧、驴打滚等，这些文化产品不仅在国内广受欢迎，也在国际上享有很高的声誉。此外，北京还是中国重要的艺术旅游胜地，吸引了许多的艺术家前来创作和演出。

北京作为中国的首都，不仅有着和文化底蕴，还是中国的重要经济中心

In [87]:
prompts1 = """AWS Clean Rooms 的FAQ文档有提到 Q: 是否发起者和数据贡献者都会被收费？A: 是单方收费，只有查询的接收方会收费。
请问AWS Clean Rooms是多方都会收费吗？
"""
# Serialize the request first, then send it through the non-streaming invoke API.
request_body = json.dumps({"inputs": prompts1, "parameters": parameters, "history": []})
response_model = smr_client.invoke_endpoint(
    EndpointName=endpoint_name,
    Body=request_body,
    ContentType="application/json",
)

# Show the whole response body decoded as UTF-8 text.
response_model['Body'].read().decode('utf8')

'{"outputs": {"outputs": "AWS", "response": "AWS", "finished": false}}\n{"outputs": {"outputs": " Clean", "response": "AWS Clean", "finished": false}}\n{"outputs": {"outputs": " Room", "response": "AWS Clean Room", "finished": false}}\n{"outputs": {"outputs": "s", "response": "AWS Clean Rooms", "finished": false}}\n{"outputs": {"outputs": "\\u662f\\u4e00\\u79cd", "response": "AWS Clean Rooms\\u662f\\u4e00\\u79cd", "finished": false}}\n{"outputs": {"outputs": "\\u6570\\u636e", "response": "AWS Clean Rooms\\u662f\\u4e00\\u79cd\\u6570\\u636e", "finished": false}}\n{"outputs": {"outputs": "\\u9690\\u79c1", "response": "AWS Clean Rooms\\u662f\\u4e00\\u79cd\\u6570\\u636e\\u9690\\u79c1", "finished": false}}\n{"outputs": {"outputs": "\\u4fdd\\u62a4", "response": "AWS Clean Rooms\\u662f\\u4e00\\u79cd\\u6570\\u636e\\u9690\\u79c1\\u4fdd\\u62a4", "finished": false}}\n{"outputs": {"outputs": "\\u6280\\u672f", "response": "AWS Clean Rooms\\u662f\\u4e00\\u79cd\\u6570\\u636e\\u9690\\u79c1\\u4fdd\\u62a

In [48]:
prompts1 = """AWS Clean Rooms 的FAQ文档有提到:\nQuestion: AWS Clean Rooms的一个协作中可以有多少个参与方? \nAnswer: 每个协作AWS Clean Rooms最多支持五名参与方。\n\n
请问AWS Clean Rooms的一个协作中可以有多少个参与方?"""

# Serialize the request first, then send it through the non-streaming invoke API.
request_body = json.dumps({"inputs": prompts1, "parameters": parameters, "history": []})
response_model = smr_client.invoke_endpoint(
    EndpointName=endpoint_name,
    Body=request_body,
    ContentType="application/json",
)

# Show the whole response body decoded as UTF-8 text.
response_model['Body'].read().decode('utf8')

'{\n  "outputs":"根据AWS Clean Rooms的FAQ文档，每个协作AWS Clean Rooms最多支持五名参与方。因此，一个AWS Clean Rooms协作中可以有最多五名参与方。",\n  "history":[\n    [\n      "AWS Clean Rooms 的FAQ文档有提到:\\nQuestion: AWS Clean Rooms的一个协作中可以有多少个参与方? \\nAnswer: 每个协作AWS Clean Rooms最多支持五名参与方。\\n\\n\\n请问AWS Clean Rooms的一个协作中可以有多少个参与方?",\n      "根据AWS Clean Rooms的FAQ文档，每个协作AWS Clean Rooms最多支持五名参与方。因此，一个AWS Clean Rooms协作中可以有最多五名参与方。"\n    ]\n  ]\n}'

#### 清除模型Endpoint和config

In [None]:
# Uncomment to delete the endpoint created by this notebook:
# !aws sagemaker delete-endpoint --endpoint-name {endpoint_name}

In [None]:
# Uncomment to delete the endpoint configuration created by this notebook:
# !aws sagemaker delete-endpoint-config --endpoint-config-name {endpoint_config_name}