In [2]:
# Upgrade pip itself in the kernel's environment before installing the notebook's dependencies.
%pip install --upgrade pip --quiet

[33mDEPRECATION: pyodbc 4.0.0-unsupported has a non-standard version number. pip 23.3 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pyodbc or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


In [3]:
# Install/upgrade the SageMaker SDK, boto3 and the AWS CLI (no versions are pinned).
# NOTE(review): the recorded output reports a botocore/aiobotocore version conflict after this
# upgrade — consider pinning versions if anything in this environment relies on aiobotocore.
%pip install sagemaker boto3 awscli --upgrade --quiet

[33mDEPRECATION: pyodbc 4.0.0-unsupported has a non-standard version number. pip 23.3 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pyodbc or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
aiobotocore 2.4.2 requires botocore<1.27.60,>=1.27.59, but you have botocore 1.31.23 which is incompatible.[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


In [4]:
import boto3
import sagemaker
from sagemaker import Model, serializers, deserializers

# Resolve the AWS context that the later cells rely on.
role = sagemaker.get_execution_role()  # execution role for the endpoint
sess = sagemaker.session.Session()  # SageMaker session for interacting with the AWS APIs
bucket = sess.default_bucket()  # bucket to house artifacts
# Use the public accessor instead of reaching into the session's private `_region_name` attribute.
region = sess.boto_region_name  # region name of the current SageMaker session
account_id = sess.account_id()  # account ID of the current SageMaker session

# S3 key prefix under which the packaged serving code (model.tar.gz) is uploaded.
s3_code_prefix_deepspeed = "hf-large-model-djl-/llama2-chinese-7b-chat/deepspeed"

In [5]:
# Show the resolved execution role, region and account ID on one space-separated line.
print(f"{role} {region} {account_id}")

arn:aws:iam::851108988172:role/service-role/AmazonSageMaker-ExecutionRole-20191011T144691 us-east-1 851108988172


In [6]:
!mkdir mymodel

In [7]:
# S3 location (inside the session's default bucket) designated for the pretrained weights.
# NOTE(review): no cell visible here reads this variable afterwards, and serving.properties
# sets option.model_id to a model identifier rather than this S3 path — confirm which is intended.
pretrained_model_location = "s3://{}/models/llama2-chinese-7b-chat/".format(bucket)
print("Pretrained model will be downloaded from ---- > " + pretrained_model_location)

Pretrained model will be downloaded from ---- > s3://sagemaker-us-east-1-851108988172/models/llama2-chinese-7b-chat/


In [8]:
%%writefile ./mymodel/serving.properties
# Serving configuration; packaged into model.tar.gz together with model.py.
# Inference engine selected for this model.
engine=DeepSpeed
# Model identifier; presumably surfaced to model.py as properties["model_id"] — confirm against DJL docs.
option.model_id=FlagAlpha/Llama2-Chinese-7b-Chat
# Tensor-parallel degree; model.py forwards properties["tensor_parallel_degree"] to DeepSpeed as mp_size.
option.tensor_parallel_degree=1

Writing ./mymodel/serving.properties


In [9]:
%%writefile ./mymodel/model.py
import json
import os
import warnings
from typing import Any, Dict, Optional, Tuple

import deepspeed
import torch
from djl_python import Input, Output
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
from transformers import LlamaTokenizer, LlamaForCausalLM
from transformers.models.llama.modeling_llama import LlamaDecoderLayer

# Module-level cache for the loaded model and tokenizer: both start as None, are assigned
# by handle() the first time it runs, and are reused by every later call.
model = None
tokenizer = None


def get_model(properties):
    """Load the Llama model and tokenizer and wrap the model for DeepSpeed inference.

    Args:
        properties: Mapping of serving properties. Must contain ``model_id`` (passed to
            ``from_pretrained``) and ``tensor_parallel_degree`` (an int or a numeric string).

    Returns:
        A ``(model, tokenizer)`` tuple, where ``model`` is the object returned by
        ``deepspeed.init_inference``.

    Raises:
        KeyError: If a required property is missing.
        ValueError: If ``tensor_parallel_degree`` cannot be converted to an integer.
    """
    model_name = properties["model_id"]
    # Property values may arrive as strings; coerce so DeepSpeed always receives an integer.
    tensor_parallel_degree = int(properties["tensor_parallel_degree"])

    model = LlamaForCausalLM.from_pretrained(
        model_name, low_cpu_mem_usage=True, trust_remote_code=True, torch_dtype=torch.bfloat16
    )
    model = deepspeed.init_inference(model, mp_size=tensor_parallel_degree)
    tokenizer = LlamaTokenizer.from_pretrained(model_name, trust_remote_code=True)
    return model, tokenizer


def handle(inputs: Input) -> Optional[Output]:
    """Handle one request: lazily load the model, then generate text.

    The request JSON is expected to carry the prompt under ``"inputs"`` and optional
    generation keyword arguments under ``"parameters"``.

    Args:
        inputs: The incoming request object.

    Returns:
        ``None`` for an empty request (after making sure the model is loaded), otherwise
        an ``Output`` holding a JSON string of the form ``{"generated_text": ...}``.
    """
    global model, tokenizer
    print("print inputs: " + str(inputs) + '.'*20)
    # Compare against None explicitly instead of relying on the truthiness of the model object.
    if model is None:
        model, tokenizer = get_model(inputs.get_properties())

    if inputs.is_empty():
        # Model server makes an empty call to warmup the model on startup
        return None

    input_map = inputs.get_as_json()
    data = input_map.pop("inputs", input_map)
    # Treat an explicit `"parameters": null` like a missing key so `**parameters` cannot fail.
    parameters = input_map.pop("parameters", None) or {}
    print("print data: " + str(data) + '.'*20)

    # Tokenize the prompt and move the resulting tensors to the current CUDA device.
    input_tokens = tokenizer(data, return_tensors="pt").to(torch.cuda.current_device())
    with torch.no_grad():
        output_tokens = model.generate(input_tokens.input_ids, **parameters)

    # Only the first generated sequence is decoded and returned.
    generated_text = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
    print("print generated_text: " + generated_text + '.'*20)
    out = {'generated_text': generated_text}
    return Output().add(json.dumps(out))

Writing ./mymodel/model.py


In [10]:
# Drop any archive left over from a previous run.
!rm -f model.tar.gz
# Remove the notebook checkpoint directory so it is not packed into the archive.
!rm -rf mymodel/.ipynb_checkpoints
# Pack the contents of mymodel/ (relative to that directory) into a gzipped tarball.
!tar czvf model.tar.gz -C mymodel .
# Upload the tarball to the bucket under the code prefix; the returned value is reused by later cells.
s3_code_artifact_deepspeed = sess.upload_data("model.tar.gz", bucket, s3_code_prefix_deepspeed)
print(f"S3 Code or Model tar for deepspeed uploaded to --- > {s3_code_artifact_deepspeed}")

./
./model.py
./serving.properties
S3 Code or Model tar for deepspeed uploaded to --- > s3://sagemaker-us-east-1-851108988172/hf-large-model-djl-/llama2-chinese-7b-chat/deepspeed/model.tar.gz


In [11]:
# Look up the DJL DeepSpeed serving container image for the current region.
image_uri = sagemaker.image_uris.retrieve(
    framework="djl-deepspeed",
    region=region,
    version="0.23.0",
)
print(image_uri)

# Describe the deployable model: container image + uploaded code archive + execution role.
model = Model(
    image_uri=image_uri,
    model_data=s3_code_artifact_deepspeed,
    role=role,
)

763104351884.dkr.ecr.us-east-1.amazonaws.com/djl-inference:0.23.0-deepspeed0.9.5-cu118


In [12]:
# A single-GPU instance is enough here because the tensor-parallel degree is 1.
instance_type = "ml.g5.2xlarge"

# Derive the endpoint name from a fixed base string.
endpoint_name = sagemaker.utils.name_from_base("llama2-chinese-7b-chat-lmi-model")

# Create the endpoint, allowing the container an extended startup health-check window.
model.deploy(
    endpoint_name=endpoint_name,
    instance_type=instance_type,
    initial_instance_count=1,
    container_startup_health_check_timeout=900,
)

# Requests and responses are JSON, so attach the matching serializer and deserializer.
predictor = sagemaker.Predictor(
    endpoint_name=endpoint_name,
    sagemaker_session=sess,
    serializer=serializers.JSONSerializer(),
    deserializer=deserializers.JSONDeserializer(),
)

---------------!

In [16]:
import time

# Send one prompt to the endpoint and measure the round-trip latency in seconds.
# perf_counter() is a monotonic, high-resolution clock meant for measuring intervals;
# time.time() is wall-clock time and can jump if the system clock is adjusted.
tic = time.perf_counter()
res = predictor.predict(
    {"inputs": "你好，睡眠不好怎么办？", "parameters": {"max_new_tokens": 256}}
)
toc = time.perf_counter()
print(res)
print(toc - tic)

# NOTE(review): no cell visible here deletes the endpoint; call predictor.delete_endpoint()
# once finished so it does not keep running.

{'generated_text': '你好，睡眠不好怎么办？\nЋ睡眠不好，怎么办？\n\n睡眠不好是一种常见的疾病，它可能会对你的身体和心理健康产生负面影响。以下是一些可能有助于缓解睡眠不好的建议：\n\n1. 制定一个安静的睡眠环境，包括室内温度和光线的调整。\n\n2. 避免吸烟和饮酒，这些活动可能会对睡眠产生负面影响。\n\n3. 建立一个安静的睡眠习惯，例如每晚在床上睡觉，并在床上保持一定'}
10.166033506393433
