### 1. 安装HuggingFace 并下载模型到本地

In [None]:
!pip install huggingface-hub -Uqq
!pip install -Uqq sagemaker

In [None]:
from huggingface_hub import snapshot_download
from pathlib import Path

# Hugging Face repository to fetch and the file patterns to include in the download
model_name = "baichuan-inc/Baichuan2-13B-Chat"
allow_patterns = ["*.json", "*.pt", "*.bin", "*.model", "*.py", "*.txt"]

# Local directory passed to snapshot_download as its cache directory
local_model_path = Path("./baichuan2_13b_model")
local_model_path.mkdir(exist_ok=True)

# snapshot_download returns the local path of the downloaded snapshot
model_download_path = snapshot_download(
    repo_id=model_name,
    allow_patterns=allow_patterns,
    cache_dir=local_model_path,
)

In [None]:
import boto3
import sagemaker
from sagemaker import Model, image_uris, serializers, deserializers

role = sagemaker.get_execution_role()  # execution role for the endpoint
sess = sagemaker.session.Session()  # sagemaker session for interacting with different AWS APIs
# Read the region through the public Session.boto_region_name property
# instead of the private `_region_name` attribute.
region = sess.boto_region_name  # region name of the current SageMaker Studio environment
account_id = sess.account_id()  # account_id of the current SageMaker Studio environment
bucket = sess.default_bucket()  # default S3 bucket of the session; used below for model and code uploads
s3_client = boto3.client("s3")
sm_client = boto3.client("sagemaker")  # control-plane client (create model / endpoint)
smr_client = boto3.client("sagemaker-runtime")  # runtime client (invoke endpoint)

In [None]:
s3_model_prefix = "LLM-RAG/workshop/baichuan2_13b_vllm_model"  # folder where model checkpoint will go
s3_code_prefix = "LLM-RAG/workshop/baichuan2_13b_model_deploy_code"

if region in ['cn-north-1', 'cn-northwest-1']:
    # In the China regions the model files are expected directly under <local_model_path>/<model_name>
    model_snapshot_path = f'{local_model_path}/{model_name}'
else:
    # Elsewhere, locate the snapshot directory inside the local Hugging Face cache layout
    snapshot_dirs = list(local_model_path.glob("**/snapshots/*"))
    if not snapshot_dirs:
        # Fail with an actionable message instead of a bare IndexError
        raise FileNotFoundError(
            f"No snapshot directory found under {local_model_path}; "
            "run the model download cell first."
        )
    model_snapshot_path = snapshot_dirs[0]

print(f"s3_code_prefix: {s3_code_prefix}")
print(f"model_snapshot_path: {model_snapshot_path}")

In [None]:
# Replace the tokenization_baichuan.py inside the model snapshot with the copy
# located in the current working directory (that local file must exist).
# NOTE(review): the explicit rm before `cp -f` presumably removes a cache symlink so the
# copy creates a regular file instead of writing through the link — confirm before simplifying.
!rm {model_snapshot_path}/tokenization_baichuan.py
!cp -f tokenization_baichuan.py {model_snapshot_path}/tokenization_baichuan.py

In [None]:
# Recursively copy the local model snapshot directory to the model prefix in the session bucket
!aws s3 cp --recursive {model_snapshot_path} s3://{bucket}/{s3_model_prefix}

In [None]:
# Show the S3 location that is later written to serving.properties as option.model_id
print(f"option.model_id ==> s3://{bucket}/{s3_model_prefix}/")

### 3. 模型部署准备（entrypoint脚本，容器镜像，服务配置）

In [None]:
# Look up the djl-deepspeed 0.25.0 container image URI for the session's region
image_uri = image_uris.retrieve(
    framework="djl-deepspeed",
    version="0.25.0",
    region=sess.boto_session.region_name,
)

In [None]:
# Create the directory that will hold the deployment files (no error if it already exists)
!mkdir -p baichuan2_13b_model_deploy_code

In [None]:
import os

# Make sure the deployment directory exists before writing into it
if not os.path.exists("baichuan2_13b_model_deploy_code"):
    os.mkdir("baichuan2_13b_model_deploy_code")

# One entry per line of serving.properties; the model weights are read from S3
serving_properties = [
    "engine=Python",
    f"option.model_id=s3://{bucket}/{s3_model_prefix}/",
    "option.task=text-generation",
    "option.trust_remote_code=true",
    "option.tensor_parallel_degree=4",
    "option.rolling_batch=vllm",
    "option.dtype=fp16",
    "option.enable_streaming=true",
]

# Lines are newline-separated with no trailing newline after the last entry
with open('baichuan2_13b_model_deploy_code/serving.properties', 'w') as f:
    f.write("\n".join(serving_properties))

In [None]:
!rm model.tar.gz
!cd baichuan2_13b_model_deploy_code && rm -rf ".ipynb_checkpoints"
!tar czvf model.tar.gz baichuan2_13b_model_deploy_code

In [None]:
# Upload the packaged deployment code to the session bucket; the returned S3 URI
# is used as ModelDataUrl when the model is created
s3_code_artifact = sess.upload_data(
    path="model.tar.gz",
    bucket=bucket,
    key_prefix=s3_code_prefix,
)
print(f"S3 Code or Model tar ball uploaded to --- > {s3_code_artifact}")

### 4. 创建模型 & 创建endpoint

In [None]:
from sagemaker.utils import name_from_base
import boto3

# Note: a model name must be specified.
# NOTE(review): this rebinds `model_name`, which earlier held the Hugging Face repo id and is
# read by the cell that computes model_snapshot_path; re-running that cell after this one
# will pick up the SageMaker model name instead.
model_name = name_from_base("baichuan2-13b")
print(model_name)
print(f"Image going to be used is ---- > {image_uri}")

# The container image plus the uploaded deployment-code archive define the model
primary_container = {
    "Image": image_uri,
    "ModelDataUrl": s3_code_artifact,
}
create_model_response = sm_client.create_model(
    ModelName=model_name,
    ExecutionRoleArn=role,
    PrimaryContainer=primary_container,
)
model_arn = create_model_response["ModelArn"]

print(f"Created Model: {model_arn}")

In [None]:
# Endpoint config and endpoint names are both derived from the model name
endpoint_config_name = f"{model_name}-config"
endpoint_name = f"{model_name}-endpoint"

# Note: ml.g4dn.2xlarge can also be chosen.
# NOTE(review): serving.properties sets option.tensor_parallel_degree=4 — confirm that any
# alternative instance type has enough GPUs before switching.
production_variant = {
    "VariantName": "variant1",
    "ModelName": model_name,
    "InstanceType": "ml.g4dn.12xlarge",
    "InitialInstanceCount": 1,
    # "VolumeSizeInGB" : 400,
    # "ModelDataDownloadTimeoutInSeconds": 2400,
    "ContainerStartupHealthCheckTimeoutInSeconds": 15 * 60,  # 15 minutes
}

endpoint_config_response = sm_client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[production_variant],
)
endpoint_config_response

In [None]:
# Start creating the endpoint from the endpoint configuration defined above
create_endpoint_response = sm_client.create_endpoint(
    EndpointName=endpoint_name,
    EndpointConfigName=endpoint_config_name,
)
endpoint_arn = create_endpoint_response['EndpointArn']
print(f"Created Endpoint: {endpoint_arn}")

#### 持续检测模型部署进度

In [None]:
import time

# Poll once a minute until the endpoint leaves the "Creating" state
while True:
    resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
    status = resp["EndpointStatus"]
    print("Status: " + status)
    if status != "Creating":
        break
    time.sleep(60)

print("Arn: " + resp["EndpointArn"])
print("Status: " + status)

# Stop here on a failed deployment so later cells do not invoke a broken endpoint,
# and surface the reason reported by SageMaker instead of discarding it.
if status == "Failed":
    failure_reason = resp.get("FailureReason", "no FailureReason returned")
    raise RuntimeError(f"Endpoint {endpoint_name} failed to deploy: {failure_reason}")

### 5. 模型测试

In [None]:
import json
import boto3

# Runtime client used to invoke the deployed endpoint
smr_client = boto3.client("sagemaker-runtime")

# Generation parameters sent with each request
parameters = dict(
    max_tokens=1024,
    temperature=0.1,
    top_p=0.8,
)

In [None]:
import io

class StreamScanner:
    """Buffer streamed bytes and hand back only complete, newline-terminated lines.

    Chunks are appended with ``write``. ``readlines`` yields every complete line
    that has not been returned yet, without its trailing newline. A trailing
    partial line stays in the buffer until a later ``write`` completes it.
    """

    def __init__(self):
        self.buff = io.BytesIO()
        self.read_pos = 0  # offset of the first byte not yet returned by readlines()

    def write(self, content):
        """Append ``content`` (bytes) to the end of the buffer."""
        self.buff.seek(0, io.SEEK_END)
        self.buff.write(content)

    def readlines(self):
        """Yield each unread complete line as bytes, without its trailing newline."""
        self.buff.seek(self.read_pos)
        for line in self.buff.readlines():
            # Slice (line[-1:]) instead of indexing: indexing bytes gives an int,
            # which never equals b'\n'. Only newline-terminated lines are consumed,
            # so an incomplete last line is neither truncated nor skipped.
            if line[-1:] == b'\n':
                self.read_pos += len(line)
                yield line[:-1]

    def reset(self):
        """Rewind the read position to the start; buffered data is kept."""
        self.read_pos = 0

In [None]:
import time

# Function-selection prompt: lists the available API schemas, gives few-shot <query>/<output>
# examples, and ends with an open `{"func":` for the model to complete.
# NOTE(review): the literal opens with four quote characters, so the prompt text itself starts
# with a `"`; and because this is not a raw string, every `\"` is stored as a plain `"`.
# Confirm both are intended.
prompts1 = """"Human:Here is a list of aimed functions: <api_schemas><api_schema> {"name": "service_availability", "description": "query the availability of service in specified region", "parameters": {"type": "object", "properties": {"service": {"type": "string", "description": "the AWS service name"}, "region": {"type": "string", "description": "the AWS region name where the service is located in, for example us-east-1(N.Virginal), us-west-2(Oregon), eu-west-2(London), ap-southeast-1(Singapore)"}}, "required": ["service", "region"]}}, {"name": "get_contact", "description": "query the contact person in the 'SSO' organization", "parameters": {"type": "object", "properties": {"employee": {"type": "string", "description": "employee name in the 'SSO' organization"}, "role": {"type": "string", "description": "employee's role, usually it's Sales, Product Manager, Tech, Program Manager, Leader"}, "domain": {"type": "string", "description": "Techical domain for the employee，For Example AIML, Analytics, Compute"}, "scope": {"type": "string", "description": "employee's scope of responsibility. 
For Sales role, it could be territory like north/east/south/west, For tech role, it could be specific service"}}, "required": ["service"]}}, {"name": "QA", "description": "answer question according to searched relevant content"} </api_schema></api_schemas> You should follow below examples to choose the corresponding function and params according to user's query <examples> <query>北京region 有没有clean room服务？</query> <output>"{\"func\": \"service_availability\", \"parameters\": {\"service\": \"clean room\", \"region\": \"cn-north-1\"}}"</output> <query>数据治理的GTMS是谁？</query> <output>"{\"func\": \"get_contact\", \"param\": {\"role\": \"Product Manager\", \"scope\": \"Analytics\"}}"</output> <query>Amazon Rekognition 支持哪些图像和视频格式？</query> <output>"{\"func\": \"QA\"}"</output> <query>怎么看现有的Capacity？</query> <output>"{\"func\": \"QA\"}"</output> <query>请问Lex是哪位SSA老师负责啊？有个api的问题请教一下</query> <output>"{\"func\": \"get_contact\", \"param\": {\"role\": \"Tech\", \"scope\": \"Lex\"}}"</output> </examples> Assistant:<query>clean room 支持哪些数据源</query> <output>{"func":"""
import codecs

history = []
start = time.time()

# Send the prompt and ask the endpoint to stream the response back
response_model = smr_client.invoke_endpoint_with_response_stream(
    EndpointName=endpoint_name,
    Body=json.dumps(
        {
            "inputs": prompts1,
            "parameters": parameters,
            "history": history,
            "stream": True,
        }
    ),
    ContentType="application/json",
)

event_stream = response_model['Body']
scanner = StreamScanner()
# Decode incrementally: a multi-byte UTF-8 character can be split across two payload
# chunks, and decoding each chunk on its own would then raise UnicodeDecodeError.
utf8_decoder = codecs.getincrementaldecoder("utf-8")()
for event in event_stream:
    output = utf8_decoder.decode(event['PayloadPart']['Bytes'])
    print(output)
    # scanner.write(event['PayloadPart']['Bytes'])
    # for line in scanner.readlines():
    #     try:
    #         print(line)
    #         # resp = json.loads(line)
    #         # print(resp)
    #         # print(resp.get("outputs")['outputs'], end='')
    #     except Exception as e:
    #         print(e)
    #         # print(line)
    #         continue
print (f"time:{time.time()-start} s")