In [None]:
# Upgrade pip itself, then install/upgrade the packages this notebook relies on:
# `sagemaker` and `boto3` are imported in the next cell; `awscli` is installed alongside them.
# NOTE(review): no versions are pinned, so `--upgrade` pulls whatever is latest at run time —
# consider pinning (e.g. `sagemaker==<version>`) for reproducible runs.
%pip install --upgrade pip --quiet
%pip install sagemaker boto3 awscli --upgrade --quiet

In [None]:
import boto3
import sagemaker
from sagemaker import Model, serializers, deserializers

# Shared SageMaker context reused by every later cell.
role = sagemaker.get_execution_role()  # execution role for the endpoint
sess = sagemaker.session.Session()  # sagemaker session for interacting with different AWS APIs
bucket = sess.default_bucket()  # bucket to house artifacts
# Use the public `boto_region_name` accessor rather than the private `_region_name`
# attribute, which is an implementation detail of the SDK and may change without notice.
region = sess.boto_region_name  # region name of the current SageMaker Studio environment
account_id = sess.account_id()  # account_id of the current SageMaker Studio environment

# Key prefix inside `bucket` under which the packaged serving code is uploaded.
s3_code_prefix = "east-ai-models/chatglm2/accelerate"

In [None]:
!mkdir mymodel

In [None]:
%%writefile ./mymodel/serving.properties
# Serving configuration that is packaged into model.tar.gz together with model.py.
# NOTE(review): presumably selects the DJL Serving Python engine that invokes model.py's handle() — confirm against the DJL docs.
engine=Python
# Model identifier to load. NOTE(review): presumably surfaced to model.py as properties["model_id"]
# (i.e. with the "option." prefix stripped) — confirm against the DJL docs.
option.model_id=THUDM/chatglm2-6b
# Tensor-parallel degree of 1; the deployment cell picks a single-GPU instance type for this reason.
option.tensor_parallel_degree=1

In [None]:
%%writefile ./mymodel/model.py
from djl_python import Input, Output
import os
import torch
from transformers import AutoTokenizer, AutoModel
from typing import Any, Dict, Tuple
import warnings
import json

# Module-level cache: both start out empty and are filled in by handle() on the
# first call, so the weights are loaded once instead of on every request.
model = tokenizer = None


def get_model(properties):
    """Load the model and tokenizer named by the serving properties.

    Args:
        properties: Mapping of serving properties. Must contain ``"model_id"``,
            the identifier handed to ``from_pretrained`` for both the model
            and the tokenizer.

    Returns:
        A ``(model, tokenizer)`` tuple. The model has been converted with
        ``.half()`` and moved with ``.cuda()`` before it is returned.

    Raises:
        KeyError: If ``properties`` has no ``"model_id"`` entry.
    """
    model_name = properties["model_id"]
    # trust_remote_code=True allows transformers to execute modelling code that
    # ships with the checkpoint, so model_id must point at a trusted repository.
    model = AutoModel.from_pretrained(model_name, trust_remote_code=True).half().cuda()
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    return model, tokenizer


def handle(inputs: Input) -> "Output | None":
    """Entry point for one inference request.

    On the first call the model and tokenizer are loaded via ``get_model`` and
    cached in the module-level ``model`` / ``tokenizer`` globals; later calls
    reuse them.

    Args:
        inputs: Request object. This function uses its ``get_properties()``,
            ``is_empty()`` and ``get_as_json()`` methods. The JSON payload is
            expected to carry the prompt under ``"inputs"`` and optional
            keyword arguments for ``model.chat`` under ``"parameters"``.

    Returns:
        ``None`` for an empty (warm-up) request; otherwise an ``Output`` holding
        a JSON string with the keys ``"response"`` and ``"history"``.
    """
    global model, tokenizer
    print("print inputs: " + str(inputs) + '.'*20)
    if model is None:
        model, tokenizer = get_model(inputs.get_properties())
        # Put the model in eval mode once at load time instead of on every request.
        model = model.eval()

    if inputs.is_empty():
        # Model server makes an empty call to warmup the model on startup
        return None
    input_map = inputs.get_as_json()
    # NOTE(review): when "inputs" is missing, the whole payload dict is used as
    # the prompt — confirm this fallback is intended.
    data = input_map.pop("inputs", input_map)
    # Treat a missing or explicit-null "parameters" entry as "no extra arguments"
    # so that `**parameters` below cannot fail on None.
    parameters = input_map.pop("parameters", None) or {}
    print("print data: " + str(data) + '.'*20)
    response, history = model.chat(tokenizer, data, **parameters)
    print("print response: " + response + '.'*20)
    print("print history: " + str(history) + '.'*20)
    out = {'response': response, 'history': history}
    return Output().add(json.dumps(out))

In [None]:
!rm -f model.tar.gz
!rm -rf mymodel/.ipynb_checkpoints
!tar czvf model.tar.gz -C mymodel .
s3_code_artifact = sess.upload_data("model.tar.gz", bucket, s3_code_prefix)
print(f"S3 Code or Model tar uploaded to --- > {s3_code_prefix}")

In [None]:
# Look up the serving container image for the chosen framework/version in this region.
image_uri = sagemaker.image_uris.retrieve(
    framework="djl-deepspeed", region=region, version="0.23.0"
)
print(image_uri)

# Pass the shared session explicitly so the model uses the same session that
# uploaded the artifact and that the Predictor is bound to later on.
model = Model(
    image_uri=image_uri,
    model_data=s3_code_artifact,
    role=role,
    sagemaker_session=sess,
)

In [None]:
# One GPU is enough here because serving.properties sets tensor_parallel_degree=1.
instance_type = "ml.g5.2xlarge"
endpoint_name = sagemaker.utils.name_from_base("chatglm2-lmi-model")

# Create the endpoint on a single instance, with a container startup
# health-check timeout of 900.
model.deploy(
    endpoint_name=endpoint_name,
    instance_type=instance_type,
    initial_instance_count=1,
    container_startup_health_check_timeout=900,
)

# Requests and responses are exchanged as JSON, so attach the matching
# serializer/deserializer pair to the predictor bound to the new endpoint.
json_serializer = serializers.JSONSerializer()
json_deserializer = deserializers.JSONDeserializer()
predictor = sagemaker.Predictor(
    endpoint_name=endpoint_name,
    sagemaker_session=sess,
    serializer=json_serializer,
    deserializer=json_deserializer,
)

In [None]:
import time

# Measure end-to-end request latency with perf_counter(): it is monotonic, so
# the elapsed value cannot be skewed by wall-clock adjustments as time.time() can.
tic = time.perf_counter()
# Prompt (Chinese): "Hello, suggest two places suitable for a vacation."
# An empty history starts a fresh conversation.
res = predictor.predict(
    {"inputs": "你好，建议两个适合度假的地方。", "parameters": {"history": []}}
)
toc = time.perf_counter()
print(res)
print(toc - tic)