### 1. 下载模型到本地

In [1]:
# For notebook instances (Amazon Linux)
!sudo yum update -y
!sudo yum install amazon-linux-extras
!sudo amazon-linux-extras install epel -y
!sudo yum update -y
!sudo yum install git-lfs git -y

Loaded plugins: dkms-build-requires, extras_suggestions, kernel-livepatch,
              : langpacks, priorities, update-motd, versionlock
Existing lock /var/run/yum.pid: another copy is running as pid 30478.
Another app is currently holding the yum lock; waiting for it to exit...
  The other application is: yum
    Memory : 173 M RSS (570 MB VSZ)
    Started: Mon Dec 18 13:23:09 2023 - 00:03 ago
    State  : Running, pid: 30478
Existing lock /var/run/yum.pid: another copy is running as pid 30578.
Another app is currently holding the yum lock; waiting for it to exit...
  The other application is: yum
    Memory : 115 M RSS (582 MB VSZ)
    Started: Mon Dec 18 13:23:13 2023 - 00:01 ago
    State  : Running, pid: 30578
Another app is currently holding the yum lock; waiting for it to exit...
  The other application is: yum
    Memory : 195 M RSS (661 MB VSZ)
    Started: Mon Dec 18 13:23:13 2023 - 00:03 ago
    State  : Running, pid: 30578
https://download.docker.com/linux/centos/2/x86_64

In [5]:
#下载模型snapshot到本地，需要25G空间
#需大约15-30分钟时间，请耐心等待, 如果左侧大括号内还是[*]，就还在下载中，*变成任意数例如[3]就证明已完成

from pathlib import Path
local_model_path = Path("./chatglm3-6b")
local_model_path.mkdir(exist_ok=True)
model_name = "ZhipuAI/chatglm3-6b"
clone_path = f"https://www.wisemodel.cn/{model_name}.git"
print(clone_path)

!git lfs install
!git clone $clone_path
!cd ./chatglm3-6b && rm -rf .git

https://www.wisemodel.cn/ZhipuAI/chatglm3-6b.git
Updated git hooks.
Git LFS initialized.
Cloning into 'chatglm3-6b'...
remote: Enumerating objects: 71, done.[K
remote: Total 71 (delta 0), reused 0 (delta 0), pack-reused 71[K
Receiving objects: 100% (71/71), 37.17 KiB | 12.39 MiB/s, done.
Resolving deltas: 100% (28/28), done.
Filtering content: 100% (8/8), 11.63 GiB | 9.08 MiB/s, done.


### 2. 把模型拷贝到S3为后续部署做准备

In [6]:
import sagemaker
import boto3

s3_client = boto3.client("s3")
sm_client = boto3.client("sagemaker")
smr_client = boto3.client("sagemaker-runtime")

role = sagemaker.get_execution_role()
sagemaker_session = sagemaker.Session()
sagemaker_session_bucket = sagemaker_session.default_bucket()

region = sagemaker_session._region_name
account_id = sagemaker_session.account_id()
bucket = sagemaker_session.default_bucket()

s3_code_prefix = f"lmi_inference_code/{model_name.split('/')[-1]}"

s3_location = f"s3://{sagemaker_session_bucket}/llm_model/{model_name.split('/')[-1]}/"

#你也可以把local_model_path直接替换成你的模型路径，例"model_snapshot_path=./chatglm3-6b", 这个文件夹里需要包含config.json
model_snapshot_path = local_model_path

print(f"model_snapshot_path: {model_snapshot_path}")
print("s3_location:",s3_location)
print("s3_code_prefix:",s3_code_prefix)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
model_snapshot_path: chatglm3-6b
s3_location: s3://sagemaker-us-east-1-340636688520/llm_model/chatglm3-6b/
s3_code_prefix: lmi_inference_code/chatglm3-6b


In [7]:
#上传模型
!aws s3 sync $model_snapshot_path $s3_location

upload: chatglm3-6b/README.md to s3://sagemaker-us-east-1-340636688520/llm_model/chatglm3-6b/README.md
upload: chatglm3-6b/MODEL_LICENSE to s3://sagemaker-us-east-1-340636688520/llm_model/chatglm3-6b/MODEL_LICENSE
upload: chatglm3-6b/config.json to s3://sagemaker-us-east-1-340636688520/llm_model/chatglm3-6b/config.json
upload: chatglm3-6b/.gitattributes to s3://sagemaker-us-east-1-340636688520/llm_model/chatglm3-6b/.gitattributes
upload: chatglm3-6b/configuration_chatglm.py to s3://sagemaker-us-east-1-340636688520/llm_model/chatglm3-6b/configuration_chatglm.py
upload: chatglm3-6b/modeling_chatglm.py to s3://sagemaker-us-east-1-340636688520/llm_model/chatglm3-6b/modeling_chatglm.py
upload: chatglm3-6b/pytorch_model-00001-of-00007.bin to s3://sagemaker-us-east-1-340636688520/llm_model/chatglm3-6b/pytorch_model-00001-of-00007.bin
upload: chatglm3-6b/pytorch_model-00005-of-00007.bin to s3://sagemaker-us-east-1-340636688520/llm_model/chatglm3-6b/pytorch_model-00005-of-00007.bin
upload: chat

### 3. 模型部署准备（entrypoint脚本，容器镜像，服务配置）

In [9]:
inference_image_uri = (
    f"763104351884.dkr.ecr.{region}.amazonaws.com/djl-inference:0.24.0-deepspeed0.10.0-cu118"
)
if "cn-" in region:
    inference_image_uri = (
        f"727897471807.dkr.ecr.{region}.amazonaws.com.cn/djl-inference:0.24.0-deepspeed0.10.0-cu118"
    )


print(f"Image going to be used is ---- > {inference_image_uri}")

Image going to be used is ---- > 763104351884.dkr.ecr.us-east-1.amazonaws.com/djl-inference:0.24.0-deepspeed0.10.0-cu118


In [10]:
!mkdir -p LLM_chatglm3_6b_deploy_code

In [38]:
%%writefile LLM_chatglm3_6b_deploy_code/model.py
from djl_python import Input, Output
import torch
import logging
import math
import os
from transformers import pipeline, AutoModel, AutoTokenizer, AutoModelForCausalLM
import transformers
import torch
import deepspeed

from transformers.generation.utils import GenerationConfig


def load_model(properties):
    tensor_parallel_degree = properties["tensor_parallel_degree"]
    model_location = properties['model_dir']
    if "model_id" in properties:
        model_location = properties['model_id']
    logging.info(f"Loading model in {model_location}")
    
    
    print('============================tokenizer====================')
    
    tokenizer = AutoTokenizer.from_pretrained(model_location, trust_remote_code=True)
    
    print('============================model====================')
    
    model = AutoModel.from_pretrained(model_location, trust_remote_code=True).half().cuda()
    model = model.eval()
    
    print('============================model loaded====================')
    
    return model, tokenizer


model = None
tokenizer = None
generator = None

def stream_items(prompt,history, max_length, top_p, temperature):
    global model, tokenizer
    size = 0
    response = ""
    for response, history in model.stream_chat(tokenizer, prompt, history=history, max_length=max_length, top_p=top_p,
                                               temperature=temperature):
        this_response = response[size:]
        history = [list(h) for h in history]
        size = len(response)
        stream_buffer = { "outputs":this_response}
        yield stream_buffer

def handle(inputs: Input):
    global model, tokenizer
    if not model:
        model, tokenizer = load_model(inputs.get_properties())

    if inputs.is_empty():
        return None
    data = inputs.get_as_json()
    
    
        
    #zpf
    input_sentences = data["ask"]
    params={}
    if "parameters" in data:
        params = data["parameters"]
    elif "temperature" in data:
        params = {"temperature": data["temperature"]}
    else:
        params = {"temperature":0.01}
    history=[]
    if "history" in data:
        history = data["history"]
    stream=False
    if "stream" in data:
        stream = data.get('stream')
    print(f'input prompt:{input_sentences}')  
    outputs = Output()
    if stream:
        outputs.add_property("content-type", "application/jsonlines")
        outputs.add_stream_content(stream_items(prompt=input_sentences,history=history,**params))
    else:
        input_sentences = data["ask"]
        response, history = model.chat(tokenizer, input_sentences, history=history)
    
        result = {"answer": response}
        return outputs.add_as_json(result)    
    
    return outputs
    

Overwriting LLM_chatglm3_6b_deploy_code/model.py


In [39]:
%%writefile LLM_chatglm3_6b_deploy_code/serving.properties
engine=Python
option.tensor_parallel_degree=1
option.model_id=s3://sagemaker-us-west-2-687912291502/llm/models/chatglm3-6b/

Overwriting LLM_chatglm3_6b_deploy_code/serving.properties


In [40]:
#将模型的s3路径更新到inference.py中
!sed -i 's|option.model_id=.*|option.model_id={s3_location}|' LLM_chatglm3_6b_deploy_code/serving.properties

In [41]:
%%writefile LLM_chatglm3_6b_deploy_code/requirements.txt
-i https://pypi.tuna.tsinghua.edu.cn/simple
transformers==4.30.2
cpm_kernels
protobuf
mdtex2html
sentencepiece
accelerate>=0.17.1
einops

Overwriting LLM_chatglm3_6b_deploy_code/requirements.txt


In [42]:
!rm model.tar.gz
!cd LLM_chatglm3_6b_deploy_code && rm -rf ".ipynb_checkpoints"
!tar czvf model.tar.gz LLM_chatglm3_6b_deploy_code

LLM_chatglm3_6b_deploy_code/
LLM_chatglm3_6b_deploy_code/requirements.txt
LLM_chatglm3_6b_deploy_code/model.py
LLM_chatglm3_6b_deploy_code/serving.properties


In [43]:
s3_code_artifact = sagemaker_session.upload_data("model.tar.gz", bucket, s3_code_prefix)
print(f"S3 Code or Model tar ball uploaded to --- > {s3_code_artifact}")

S3 Code or Model tar ball uploaded to --- > s3://sagemaker-us-east-1-340636688520/lmi_inference_code/chatglm3-6b/model.tar.gz


### 4. 创建模型 & 创建endpoint

In [44]:
from sagemaker.utils import name_from_base
import boto3

model_name = 'dji-inference-chatglm3-6b'
print(model_name)
print(f"Image going to be used is ---- > {inference_image_uri}")

create_model_response = sm_client.create_model(
    ModelName=model_name,
    ExecutionRoleArn=role,
    PrimaryContainer={
        "Image": inference_image_uri,
        "ModelDataUrl": s3_code_artifact
    },
    
)
model_arn = create_model_response["ModelArn"]

print(f"Created Model: {model_arn}")

endpoint_config_name = model_name
endpoint_name = model_name
endpoint_config_response = sm_client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[
        {
            "VariantName": "variant1",
            "ModelName": model_name,
            "InstanceType": "ml.g4dn.4xlarge",
            "InitialInstanceCount": 1,
            # "VolumeSizeInGB" : 400,
            # "ModelDataDownloadTimeoutInSeconds": 2400,
            "ContainerStartupHealthCheckTimeoutInSeconds": 15*60,
        },
    ],
)

print(endpoint_config_response)

create_endpoint_response = sm_client.create_endpoint(
    EndpointName=f"{endpoint_name}", EndpointConfigName=endpoint_config_name
)
print(f"Created Endpoint: {create_endpoint_response['EndpointArn']}")

dji-inference-chatglm3-6b
Image going to be used is ---- > 763104351884.dkr.ecr.us-east-1.amazonaws.com/djl-inference:0.24.0-deepspeed0.10.0-cu118
Created Model: arn:aws:sagemaker:us-east-1:340636688520:model/dji-inference-chatglm3-6b
{'EndpointConfigArn': 'arn:aws:sagemaker:us-east-1:340636688520:endpoint-config/dji-inference-chatglm3-6b', 'ResponseMetadata': {'RequestId': '891d8b15-4803-44e0-bba9-61596733cae8', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '891d8b15-4803-44e0-bba9-61596733cae8', 'content-type': 'application/x-amz-json-1.1', 'content-length': '106', 'date': 'Mon, 18 Dec 2023 15:01:55 GMT'}, 'RetryAttempts': 0}}
Created Endpoint: arn:aws:sagemaker:us-east-1:340636688520:endpoint/dji-inference-chatglm3-6b


#### 持续检测模型部署进度

In [None]:
import time

resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
status = resp["EndpointStatus"]
print("Status: " + status)

while status == "Creating":
    time.sleep(60)
    resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
    status = resp["EndpointStatus"]
    print("Status: " + status)

print("Arn: " + resp["EndpointArn"])
print("Status: " + status)

Status: Creating
Status: Creating
Status: Creating


### 5. 测试模型

In [32]:
import json

runtime= boto3.client('runtime.sagemaker')
def chatglm(prompt):
  def query_endpoint_with_json_payload(encoded_json):
    response = runtime.invoke_endpoint(EndpointName=model_name, ContentType='application/json', Body=encoded_json)
    return response

  def parse_response_texts(query_response):
      model_predictions = json.loads(query_response['Body'].read())
      generated_text = model_predictions["answer"]
      return generated_text
  payload = {"ask": prompt,
             "parameters": {}}
  query_response = query_endpoint_with_json_payload(json.dumps(payload).encode('utf-8'))
  generated_texts = parse_response_texts(query_response)
  return generated_texts

chatglm('好累啊')

'疲倦是人体的一种自然反应,通常是由于身体需要休息和恢复所发出的信号。以下是一些可以帮助你缓解疲劳感的方法:\n\n1. 休息:找到一个安静的地方,尽可能地放松身心,让自己得到充足的睡眠。\n\n2. 运动:适当的运动可以促进血液循环,增强身体的免疫力,缓解疲劳感。\n\n3. 饮食:饮食健康,多吃蔬菜水果,少吃油腻食品,可以提供给身体足够的能量。\n\n4. 调整心态:保持乐观的心态,减轻压力,可以有效地缓解疲劳感。\n\n5. 娱乐活动:听音乐、看电影、读书等活动可以缓解疲劳,让你的身心得到放松。\n\n希望这些方法可以帮助你缓解疲劳感。'

In [None]:
import json

endpoint_name = model_name
prompts1 = """
你是MySQL的专家。给定一个输入问题，创建一个语法正确的MySQL查询语句。
除非用户在问题中指定了要获得的特定数量的示例，否则使用LIMIT子句查询最多3个结果。您可以对结果进行排序，以返回数据库中信息量最大的数据。您必须仅查询回答问题所需的列。将每个列名用反引号（`）括起来，表示为分隔的标识符。
请注意，仅可以使用在下面这些表中看到的列名，不要查询不存在的列。此外，还要注意哪个列在哪个表中。如果问题涉及”今天”，请注意使用CURDATE()函数获取当前日期.

使用如下格式:
Question: 具体的问题
SQLQuery: 运行的sql语句
SQLResult: SQLQuery运行的结果
Answer: 最终的回答


使用如下的表:
CREATE TABLE customer (
	c_customer_sk INTEGER NOT NULL, 
	c_customer_id CHAR(16) NOT NULL, 
	c_current_cdemo_sk INTEGER, 
	c_current_hdemo_sk INTEGER, 
	c_current_addr_sk INTEGER, 
	c_first_shipto_date_sk INTEGER, 
	c_first_sales_date_sk INTEGER, 
	c_salutation CHAR(10), 
	c_first_name CHAR(20), 
	c_last_name CHAR(30), 
	c_preferred_cust_flag CHAR(1), 
	c_birth_day INTEGER, 
	c_birth_month INTEGER, 
	c_birth_year INTEGER, 
	c_birth_country VARCHAR(20), 
	c_login CHAR(13), 
	c_email_address CHAR(50), 
	c_last_review_date CHAR(10), 
	PRIMARY KEY (c_customer_sk)
)ENGINE=InnoDB DEFAULT CHARSET=utf8


CREATE TABLE web_sales (
	ws_sold_date_sk INTEGER, 
	ws_sold_time_sk INTEGER, 
	ws_ship_date_sk INTEGER, 
	ws_item_sk INTEGER NOT NULL, 
	ws_bill_customer_sk INTEGER, 
	ws_bill_cdemo_sk INTEGER, 
	ws_bill_hdemo_sk INTEGER, 
	ws_bill_addr_sk INTEGER, 
	ws_ship_customer_sk INTEGER, 
	ws_ship_cdemo_sk INTEGER, 
	ws_ship_hdemo_sk INTEGER, 
	ws_ship_addr_sk INTEGER, 
	ws_web_page_sk INTEGER, 
	ws_web_site_sk INTEGER, 
	ws_ship_mode_sk INTEGER, 
	ws_warehouse_sk INTEGER, 
	ws_promo_sk INTEGER, 
	ws_order_number INTEGER NOT NULL, 
	ws_quantity INTEGER, 
	ws_wholesale_cost DECIMAL(7, 2), 
	ws_list_price DECIMAL(7, 2), 
	ws_sales_price DECIMAL(7, 2), 
	ws_ext_discount_amt DECIMAL(7, 2), 
	ws_ext_sales_price DECIMAL(7, 2), 
	ws_ext_wholesale_cost DECIMAL(7, 2), 
	ws_ext_list_price DECIMAL(7, 2), 
	ws_ext_tax DECIMAL(7, 2), 
	ws_coupon_amt DECIMAL(7, 2), 
	ws_ext_ship_cost DECIMAL(7, 2), 
	ws_net_paid DECIMAL(7, 2), 
	ws_net_paid_inc_tax DECIMAL(7, 2), 
	ws_net_paid_inc_ship DECIMAL(7, 2), 
	ws_net_paid_inc_ship_tax DECIMAL(7, 2), 
	ws_net_profit DECIMAL(7, 2), 
	PRIMARY KEY (ws_item_sk, ws_order_number)
)ENGINE=InnoDB DEFAULT CHARSET=utf8

Question: 我需要知道销售报表中，下单金额最大的客户email地址
"""

prompts2="给我一个青海和甘肃旅游的路线，8天7晚"
prompts3="好累啊"
parameters={
    "do_sample": False,
    "top_p": 0.9,
    "temperature": 1,
    "max_new_tokens": 300,
    "repetition_penalty": 1.03
}
response_model = smr_client.invoke_endpoint(
            EndpointName=endpoint_name,
            Body=json.dumps(
            {
                "ask": prompts1,
                "parameters": parameters,
            }
            ),
            ContentType="application/json",
        )

response_model['Body'].read().decode("utf-8")

### 流式输出测试

In [20]:
import io


class LineIterator:
    """
    A helper class for parsing the InvokeEndpointWithResponseStream event stream. 
    
    The output of the model will be in the following format:
    ```
    b'{"outputs": [" a"]}\n'
    b'{"outputs": [" challenging"]}\n'
    b'{"outputs": [" problem"]}\n'
    ...
    ```
    
    While usually each PayloadPart event from the event stream will contain a byte array 
    with a full json, this is not guaranteed and some of the json objects may be split across
    PayloadPart events. For example:
    ```
    {'PayloadPart': {'Bytes': b'{"outputs": '}}
    {'PayloadPart': {'Bytes': b'[" problem"]}\n'}}
    ```
    
    This class accounts for this by concatenating bytes written via the 'write' function
    and then exposing a method which will return lines (ending with a '\n' character) within
    the buffer via the 'readlines' function. It maintains the position of the last read 
    position to ensure that previous bytes are not exposed again. 
    """
    
    def __init__(self):
        self.buff = io.BytesIO()
        self.read_pos = 0
        
    def write(self, content):
        self.buff.seek(0, io.SEEK_END)
        self.buff.write(content)
        
    def readlines(self):
        self.buff.seek(self.read_pos)
        for line in self.buff.readlines():
            if line[-1] != b'\n':
                self.read_pos += len(line)
                yield line[:-1]
                
    def reset(self):
        self.read_pos = 0

In [46]:
parameters = {
  "max_length": 1024,
  "temperature": 0.5,
  "top_p":0.9
}
body = {"ask": "what is life", "parameters": parameters,"history" : [],"stream":True}
resp = smr_client.invoke_endpoint_with_response_stream(EndpointName=endpoint_name, Body=json.dumps(body), ContentType="application/json")
event_stream = resp['Body']

scanner = LineIterator()
for event in event_stream:
    scanner.write(event['PayloadPart']['Bytes'])
    for line in scanner.readlines():
        try:
            resp = json.loads(line)
            # print(resp)
            print(resp.get("outputs")['outputs'], end='')
        except Exception as e:
            # print(line)
            continue

Life is the characteristic that distinguishes physical entities with biological processes, such as growth, reproduction, and response to stimuli, from those without such processes. It is the condition that distinguishes living things from non-living things, such as inanimate matter. The exact definition of life is still a topic of debate among scientists and philosophers.

#### 清除模型Endpoint和config

In [35]:
!aws sagemaker delete-endpoint --endpoint-name $model_name
!aws sagemaker delete-endpoint-config --endpoint-config-name $model_name
!aws sagemaker delete-model --model-name $model_name

In [37]:
parameters = {
  "max_length": 1024,
  "temperature": 0.5,
  "top_p":0.9
}
def func(top_p,temperature,top_p,max_length):
    return
func(top=0.8,**parameters)

SyntaxError: duplicate argument 'top_p' in function definition (1471300743.py, line 6)