### 1. 安装HuggingFace 并下载模型到本地

In [None]:
# Install/upgrade the Hugging Face Hub client and the SageMaker Python SDK used by the cells below.
!pip install huggingface-hub -Uqq 
!pip install -U sagemaker

In [None]:
!rm -rf ./LLM_qwen15_int4_model

In [None]:
from huggingface_hub import snapshot_download
from pathlib import Path
local_model_path = Path("./LLM_qwen15_14b_int4_model")


In [None]:
# Create the local download directory if it does not exist yet.
local_model_path.mkdir(exist_ok=True)

# Hugging Face repo id and the exact revision (commit) that will be downloaded.
model_name = "Qwen/Qwen1.5-14B-Chat-GPTQ-Int4"
commit_hash = "2303ef27e4d8f3bf668c3139d1653e09fa41c83d"

In [None]:
# Download the pinned revision of the repo, using local_model_path as the Hugging Face cache directory.
snapshot_download(repo_id=model_name, revision=commit_hash, cache_dir=local_model_path)

### 2. 把模型拷贝到S3为后续部署做准备

In [None]:
import sagemaker
from sagemaker import image_uris
import boto3
import os
import time
import json

role = sagemaker.get_execution_role()  # execution role for the endpoint
sess = sagemaker.session.Session()  # sagemaker session for interacting with different AWS APIs
bucket = sess.default_bucket()  # bucket to house artifacts

# Read the region through the session's public attribute instead of the
# private `_region_name`, which is an implementation detail of the SDK.
region = sess.boto_region_name
account_id = sess.account_id()

# Low-level clients: S3 for artifacts, SageMaker for model/endpoint
# management, SageMaker runtime for invoking the endpoint.
s3_client = boto3.client("s3")
sm_client = boto3.client("sagemaker")
smr_client = boto3.client("sagemaker-runtime")

In [None]:
s3_model_prefix = f"aigc-llm-models/{model_name}"  # folder where model checkpoint will go
s3_code_prefix = f"aigc-llm-models/{model_name}_deploy_code"

# Locate the downloaded snapshot directory. Prefer the directory named after
# the pinned revision so the choice is deterministic even when several
# snapshots exist in the cache; fall back to any snapshot (sorted for a stable
# pick) when the revision is not a directory name (e.g. a branch name was used).
snapshot_dirs = sorted(local_model_path.glob(f"**/snapshots/{commit_hash}")) or sorted(
    local_model_path.glob("**/snapshots/*")
)
if not snapshot_dirs:
    raise FileNotFoundError(
        f"No model snapshot found under {local_model_path}; run the snapshot_download cell first."
    )
model_snapshot_path = snapshot_dirs[0]

print(f"s3_code_prefix: {s3_code_prefix}")
print(f"model_snapshot_path: {model_snapshot_path}")

In [None]:
# Copy the downloaded snapshot files to the model prefix in the session's default bucket.
!aws s3 cp --recursive {model_snapshot_path} s3://{bucket}/{s3_model_prefix}

### 3. 模型部署准备（entrypoint脚本，容器镜像，服务配置）

In [None]:
# Newer container versions can be found at https://github.com/aws/deep-learning-containers/blob/master/available_images.md#large-model-inference-containers

# In AWS China regions, replace the image with the image_uri below
# inference_image_uri = (
#     f"727897471807.dkr.ecr.{region}.amazonaws.com.cn/djl-inference:0.26.0-deepspeed0.12.6-cu121"
# )

# DJL inference container image, resolved for the current region.
inference_image_uri = f"763104351884.dkr.ecr.{region}.amazonaws.com/djl-inference:0.26.0-deepspeed0.12.6-cu121"
print(f"Image going to be used is ---- > {inference_image_uri}")

In [None]:
# Local staging directory for the deployment code, named after the last path segment of s3_code_prefix.
local_code_dir = s3_code_prefix.split('/')[-1]
!mkdir -p {local_code_dir}

In [None]:
%%writefile {local_code_dir}/model.py
from djl_python import Input, Output
import torch
import logging
import math
import os

from transformers import AutoTokenizer,AutoModelForCausalLM
from transformers.generation import GenerationConfig
# from auto_gptq import AutoGPTQForCausalLM


STOP_flag = "[DONE]"


def load_model(properties):
    """Load the tokenizer, model and generation config used for serving.

    Args:
        properties: Mapping of serving options. ``model_id`` is used as the
            model location when present; otherwise ``model_dir`` is used.

    Returns:
        A ``(model, tokenizer, generation_config)`` tuple. The generation
        config is the same object that is attached to the model as
        ``model.generation_config``.

    Raises:
        KeyError: If neither ``model_id`` nor ``model_dir`` is present.
    """
    # Prefer an explicit model_id; otherwise fall back to the model directory.
    if "model_id" in properties:
        model_location = properties["model_id"]
    else:
        model_location = properties["model_dir"]
    logging.info("Loading model in %s", model_location)

    tokenizer = AutoTokenizer.from_pretrained(model_location, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_location, device_map="auto", trust_remote_code=True
    ).eval()
    # Generation hyper-parameters (generation length, top_p, ...) can be
    # adjusted on this config object.
    model.generation_config = GenerationConfig.from_pretrained(model_location, trust_remote_code=True)

    return model, tokenizer, model.generation_config


model = None
tokenizer = None
generator = None
config = None

def stream_items(prompt, history, max_length, top_p, temperature):
    """Yield the model's streamed answer as incremental text chunks.

    Sets ``max_new_tokens`` and ``top_p`` on the module-level generation
    config, then iterates over ``model.chat_stream(...)``. Each item from that
    iterator is treated as the full text generated so far; only the part not
    yet emitted is yielded.

    Args:
        prompt: User prompt passed to ``chat_stream``.
        history: Conversation history passed to ``chat_stream``.
        max_length: Value stored in ``config.max_new_tokens``.
        top_p: Value stored in ``config.top_p``.
        temperature: Accepted but not applied (see note below).

    Yields:
        ``{"outputs": <new text>, "finished": False}`` for every streamed item.
    """
    config.max_new_tokens = max_length
    config.top_p = top_p

    # Passing temperature raised an error, so it is deliberately not applied:
    # model.generation_config.temperature = temperature

    # NOTE(review): `chat_stream` is not defined in this file; presumably it is
    # provided by the model's remote code - confirm the deployed checkpoint exposes it.
    emitted_chars = 0
    for text_so_far in model.chat_stream(tokenizer, prompt, history=history, generation_config=config):
        new_text = text_so_far[emitted_chars:]
        emitted_chars = len(text_so_far)
        yield {"outputs": new_text, "finished": False}
    # A final "stop" record is intentionally not emitted:
    # yield {"query": prompt, "outputs": STOP_flag, "response": response, "history": [], "finished": True}


def handle(inputs: Input):
    """Serve one inference request.

    The model is loaded lazily on the first call. The request JSON is read
    with these keys:

    * ``inputs`` (required): the user prompt.
    * ``parameters`` (optional): ``max_length`` and ``top_p`` for the
      non-streaming path; when streaming, every key is forwarded to
      ``stream_items`` as a keyword argument.
    * ``history`` (optional): forwarded to ``stream_items`` when streaming.
    * ``stream`` (optional): a truthy value selects the streaming response.

    Returns:
        ``None`` when the input is empty, otherwise an ``Output`` carrying
        either a JSON-lines stream or ``{"outputs": <text>, "history": []}``.
    """
    global model, tokenizer, config
    if not model:
        model, tokenizer, config = load_model(inputs.get_properties())

    if inputs.is_empty():
        return None
    data = inputs.get_as_json()

    input_sentences = data["inputs"]
    # "parameters" is optional: the non-streaming path has defaults for every
    # value it reads, so a missing or null entry must not fail the request.
    params = data.get("parameters") or {}
    history = data.get("history", [])
    stream = data.get('stream')
    print(f'input prompt:{input_sentences}')
    print(f'generation config:{config}')
    outputs = Output()

    if stream:
        outputs.add_property("content-type", "application/jsonlines")
        outputs.add_stream_content(stream_items(input_sentences, history=history, **params))
        return outputs

    # Non-streaming path. NOTE: `history` is not used here; only the current
    # prompt is sent to the model, and the response reports an empty history.
    config.max_new_tokens = params.get('max_length', 1024)
    config.top_p = params.get('top_p', 1)

    messages = [
        {"role": "user", "content": input_sentences}
    ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    # Follow the device the model was actually placed on (it is loaded with
    # device_map="auto") instead of hard-coding "cuda".
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    # Pass the whole encoding so the attention mask accompanies input_ids.
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=config.max_new_tokens
    )
    # Drop the prompt tokens so that only the newly generated part is decoded.
    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

    outputs.add_as_json({"outputs": response, "history": []})
    return outputs

In [None]:
# S3 location of the uploaded model files; this value is substituted into serving.properties as option.s3url.
s3_path = f"s3://{bucket}/{s3_model_prefix}/"
print(f"option.s3url ==> {s3_path}")

#### Note: option.s3url 必须指向自己账号下的模型路径；下面的 sed 命令会自动把占位符 S3PATH 替换为上面打印的路径

In [None]:
%%writefile {local_code_dir}/serving.properties
engine=Python
option.tensor_parallel_degree=1
option.enable_streaming=True
option.predict_timeout=240
option.s3url = S3PATH

In [None]:
# Replace the S3PATH placeholder in serving.properties with the actual S3 model location.
!sed -i "s|option.s3url = S3PATH|option.s3url = {s3_path}|" {local_code_dir}/serving.properties

#### 注意: 必须把transformers升级到4.37.0以上，否则会出现  [Issue34](https://github.com/QwenLM/Qwen1.5/issues/34)

如果是中国区建议添加国内的pip镜像,如下代码所示
```
%%writefile {local_code_dir}/requirements.txt
-i https://pypi.tuna.tsinghua.edu.cn/simple
transformers==4.37.0
```

In [None]:
%%writefile {local_code_dir}/requirements.txt
transformers==4.37.0
accelerate
tiktoken
einops
scipy
transformers_stream_generator==0.0.4
peft
deepspeed
auto-gptq
optimum

In [None]:
# !pip install auto-gptq

In [None]:
!rm model.tar.gz
!cd {local_code_dir} && rm -rf ".ipynb_checkpoints"
!tar czvf model.tar.gz {local_code_dir}

In [None]:
# Upload the code archive under s3_code_prefix; the returned value is later passed as ModelDataUrl.
s3_code_artifact = sess.upload_data("model.tar.gz", bucket, s3_code_prefix)
print(f"S3 Code or Model tar ball uploaded to --- > {s3_code_artifact}")

### 4. 创建模型 & 创建endpoint

In [None]:
from sagemaker.utils import name_from_base
import boto3

# NOTE(review): this rebinds `model_name`, which earlier held the Hugging Face repo id used to
# build s3_model_prefix / s3_code_prefix. Re-running the prefix cell after this cell would
# therefore produce different S3 prefixes.
model_name = name_from_base(f"qwen15-14B-int4") #Note: Need to specify model_name
print(model_name)
print(f"Image going to be used is ---- > {inference_image_uri}")

# Register the SageMaker model: container image plus the uploaded code archive.
create_model_response = sm_client.create_model(
    ModelName=model_name,
    ExecutionRoleArn=role,
    PrimaryContainer={
        "Image": inference_image_uri,
        "ModelDataUrl": s3_code_artifact
    },
    
)
model_arn = create_model_response["ModelArn"]

print(f"Created Model: {model_arn}")

In [None]:
# Names of the endpoint configuration and endpoint, both derived from the SageMaker model name.
endpoint_config_name = f"{model_name}-config"
endpoint_name = f"{model_name}-endpoint"

# Note: ml.g4dn.2xlarge can also be used
endpoint_config_response = sm_client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[
        {
            "VariantName": "variant1",
            "ModelName": model_name,
            "InstanceType": "ml.g5.2xlarge",
            "InitialInstanceCount": 1,
            # "VolumeSizeInGB" : 400,
            # "ModelDataDownloadTimeoutInSeconds": 2400,
            # Health-check timeout for container startup: 10 minutes, expressed in seconds.
            "ContainerStartupHealthCheckTimeoutInSeconds": 10*60,
        },
    ],
)
# Bare expression: shows the API response as the notebook cell output.
endpoint_config_response

In [None]:
# Start creating the endpoint from the endpoint configuration; the call returns the endpoint ARN.
create_endpoint_response = sm_client.create_endpoint(
    EndpointName=f"{endpoint_name}", EndpointConfigName=endpoint_config_name
)
print(f"Created Endpoint: {create_endpoint_response['EndpointArn']}")

#### 持续检测模型部署进度

In [None]:
import time
# Poll the endpoint once a minute until it leaves the "Creating" state.
resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
status = resp["EndpointStatus"]
print("Status: " + status)

while status == "Creating":
    time.sleep(60)
    resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
    status = resp["EndpointStatus"]
    print("Status: " + status)

print("Arn: " + resp["EndpointArn"])
print("Status: " + status)

# Stop here when creation failed, so that later cells do not register or
# invoke an endpoint that cannot serve traffic.
if status == "Failed":
    raise RuntimeError(
        f"Endpoint {endpoint_name} failed to deploy: "
        f"{resp.get('FailureReason', 'no failure reason reported')}"
    )

#### [optional] 注册模型到 AWS Systems Manager (SSM) Parameter Store，方便获取账号内所有已部署的 LLM endpoint

In [None]:
SMM_KEY_AVAIL_LLM_ENDPOINTS = 'avail_llm_endpoints'

def get_all_private_llm():
    """Return the LLM endpoint registry stored in SSM Parameter Store.

    Returns:
        The dict decoded from the JSON value of the parameter named by
        ``SMM_KEY_AVAIL_LLM_ENDPOINTS``. An empty dict is returned when the
        parameter does not exist yet or its value is not valid JSON.

    Raises:
        botocore.exceptions.ClientError: For any other failure (for example
            access denied or throttling). These are deliberately not
            swallowed: a caller that writes the registry back would otherwise
            mistake a failed read for an empty registry and overwrite the
            existing entries.
    """
    # only get the llm endpoint from this account
    ssm = boto3.client('ssm')
    try:
        parameter = ssm.get_parameter(Name=SMM_KEY_AVAIL_LLM_ENDPOINTS, WithDecryption=False)
    except ssm.exceptions.ParameterNotFound as e:
        # Expected before the first registration: there is no registry yet.
        print(str(e))
        return {}

    try:
        return json.loads(parameter['Parameter']['Value'])
    except ValueError as e:
        # Unparseable value: report it and start from an empty registry.
        print(str(e))
        return {}

def llm_endpoint_regist(model_id, model_endpoint):
    """Add or replace one ``model_id -> model_endpoint`` entry in the SSM registry.

    The current registry is read with ``get_all_private_llm``, the entry is
    set, and the whole mapping is written back as a JSON string, overwriting
    the stored parameter.

    Args:
        model_id: Key under which the endpoint is registered.
        model_endpoint: Endpoint name stored for that key.
    """
    ssm_client = boto3.client('ssm')

    registry = get_all_private_llm()
    registry[model_id] = model_endpoint

    ssm_client.put_parameter(
        Name=SMM_KEY_AVAIL_LLM_ENDPOINTS,
        Overwrite=True,
        Type='String',
        Value=json.dumps(registry),
    )
    
llm_endpoint_regist('qwen1.5_14B_GPTQ_INT4', endpoint_name)

### 5. 模型测试

In [None]:
%%time
import json
import boto3

smr_client = boto3.client("sagemaker-runtime")

# Generation parameters sent with each request under the "parameters" key.
# NOTE(review): the non-streaming path of the model.py written by this notebook only reads
# max_length and top_p; temperature is sent but not applied there.
parameters = {
  "max_length": 1024,
  "temperature": 0.1,
  "top_p":0.8
}

## No stream (Qwen1.5 only supports non-streaming invocation)

In [None]:
# Sample prompt 1 (immediately replaced by the next assignment; kept as an alternative test input).
prompts1 = """AWS Clean Rooms 的FAQ文档有提到 Q: 是否发起者和数据贡献者都会被收费？A: 是单方收费，只有查询的接收方会收费。
请问AWS Clean Rooms是多方都会收费吗？
"""
# Sample prompt 2: this is the prompt that is actually sent.
prompts1 = """写一篇500字的科幻小说，背景关于宇宙战争"""
start = time.time()
# Non-streaming request: no "stream" key is sent in the JSON body.
response_model = smr_client.invoke_endpoint(
            EndpointName=endpoint_name,
            Body=json.dumps(
            {
                "inputs": prompts1,
                "parameters": parameters,
                "history" : [],
            }
            ),
            ContentType="application/json",
        )

# Read the whole response body, report the elapsed wall-clock time, then print the generated text.
resp = response_model['Body'].read()
print (f"\ntime:{time.time()-start} s")
# print(resp.decode('utf8'))
print(json.loads(resp)['outputs'])


#### 清除模型 Endpoint 和 config（仅限清除资源时使用）

In [None]:
# Delete the endpoint.
!aws sagemaker delete-endpoint --endpoint-name {endpoint_name}

In [None]:
# Delete the endpoint configuration.
!aws sagemaker delete-endpoint-config --endpoint-config-name {endpoint_config_name}

In [None]:
# Delete the SageMaker model (at this point `model_name` holds the SageMaker model name).
!aws sagemaker delete-model --model-name {model_name}