### 1. 安装HuggingFace 并下载模型到本地

In [None]:
!pip install huggingface-hub -Uqq
!pip install -U sagemaker

In [3]:
from huggingface_hub import snapshot_download
from pathlib import Path

# Hugging Face repository and the exact revision to download.
model_name = "csdc-atl/buffer-cross-001"
commit_hash = "46d270928463db49b317e5ea469a8ac8152f4a13"

# Local directory used as the download cache; created if it does not exist yet.
local_model_path = Path("./buffer-cross-001-model")
local_model_path.mkdir(exist_ok=True)

In [4]:
# Download the pinned revision into the local cache directory; the cell displays
# the value returned by snapshot_download.
snapshot_download(
    repo_id=model_name,
    revision=commit_hash,
    cache_dir=local_model_path,
)

Fetching 10 files:   0%|          | 0/10 [00:00<?, ?it/s]

'buffer-cross-001-model/models--csdc-atl--buffer-cross-001/snapshots/46d270928463db49b317e5ea469a8ac8152f4a13'

In [5]:
import os
from dotenv import load_dotenv
# Load variables from a .env file into the process environment.
load_dotenv()
# AWS profile name, taken from the ATL environment variable.
profile_name = os.environ["ATL"]
# Export the profile so that AWS clients created afterwards pick it up.
%env AWS_DEFAULT_PROFILE = {profile_name}
# NOTE(review): s3_bucket_name is displayed below but not referenced again in the
# visible notebook; later cells use the SageMaker session's default bucket.
s3_bucket_name=os.environ["ATL_S3_BUCKET_NAME"]
# IAM role passed as ExecutionRoleArn when the SageMaker model is created.
role=os.environ["ATL_ROLE"]
s3_bucket_name

env: AWS_DEFAULT_PROFILE=atl


'aws-gcr-csdc-atl-exp-us-west-2'

In [6]:
from datetime import datetime

# Take a single timestamp so the three components can never straddle midnight.
now = datetime.now()
currentDay = now.day
currentMonth = now.month
currentYear = now.year

# Zero-padded YYYYMMDD. Without padding, e.g. Jan 11 and Nov 1 would both
# render as "2023111", making the stamp ambiguous and unsortable.
current_time = now.strftime("%Y%m%d")

### 2. 把模型拷贝到S3为后续部署做准备

In [8]:
import sagemaker
from sagemaker import image_uris
import boto3
import os
import time
import json

# `role` (execution role for the endpoint) is defined in the environment cell above.
sess = sagemaker.session.Session()  # sagemaker session for interacting with different AWS APIs
bucket = sess.default_bucket()  # bucket to house artifacts

# Use the public accessor rather than the private `_region_name` attribute.
region = sess.boto_region_name
account_id = sess.account_id()

# Low-level clients: S3, SageMaker control plane, and SageMaker runtime (invocations).
s3_client = boto3.client("s3")
sm_client = boto3.client("sagemaker")
smr_client = boto3.client("sagemaker-runtime")

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ubuntu/.config/sagemaker/config.yaml


In [9]:
s3_model_prefix = "LLM-RAG/workshop/buffer-cross-001-model"  # folder where model checkpoint will go
s3_code_prefix = "LLM-RAG/workshop/buffer_cross_001_deploy_code"

# Select the snapshot directory of the pinned revision explicitly. Taking the
# first match of "**/snapshots/*" would pick an arbitrary revision whenever the
# cache directory holds more than one snapshot.
snapshot_candidates = sorted(local_model_path.glob(f"**/snapshots/{commit_hash}"))
if not snapshot_candidates:
    raise FileNotFoundError(
        f"No snapshot for revision {commit_hash} found under {local_model_path}; "
        "run the snapshot_download cell first."
    )
model_snapshot_path = snapshot_candidates[0]

print(f"s3_code_prefix: {s3_code_prefix}")
print(f"model_snapshot_path: {model_snapshot_path}")

s3_code_prefix: LLM-RAG/workshop/buffer_cross_001_deploy_code
model_snapshot_path: buffer-cross-001-model/models--csdc-atl--buffer-cross-001/snapshots/46d270928463db49b317e5ea469a8ac8152f4a13


In [10]:
# Recursively copy the downloaded snapshot directory to s3://{bucket}/{s3_model_prefix}.
!aws s3 cp --recursive {model_snapshot_path} s3://{bucket}/{s3_model_prefix}

upload: buffer-cross-001-model/models--csdc-atl--buffer-cross-001/snapshots/46d270928463db49b317e5ea469a8ac8152f4a13/added_tokens.json to s3://sagemaker-us-west-2-316327952690/LLM-RAG/workshop/buffer-cross-001-model/added_tokens.json
upload: buffer-cross-001-model/models--csdc-atl--buffer-cross-001/snapshots/46d270928463db49b317e5ea469a8ac8152f4a13/.gitattributes to s3://sagemaker-us-west-2-316327952690/LLM-RAG/workshop/buffer-cross-001-model/.gitattributes
upload: buffer-cross-001-model/models--csdc-atl--buffer-cross-001/snapshots/46d270928463db49b317e5ea469a8ac8152f4a13/README.md to s3://sagemaker-us-west-2-316327952690/LLM-RAG/workshop/buffer-cross-001-model/README.md
upload: buffer-cross-001-model/models--csdc-atl--buffer-cross-001/snapshots/46d270928463db49b317e5ea469a8ac8152f4a13/cross_model.py to s3://sagemaker-us-west-2-316327952690/LLM-RAG/workshop/buffer-cross-001-model/cross_model.py
upload: buffer-cross-001-model/models--csdc-atl--buffer-cross-001/snapshots/46d270928463db49

### 3. 模型部署准备（entrypoint脚本，容器镜像，服务配置）

In [11]:
# DJL inference image; the registry host name is built from the session's region.
djl_image = "djl-inference:0.21.0-deepspeed0.8.3-cu117"
inference_image_uri = f"763104351884.dkr.ecr.{region}.amazonaws.com/{djl_image}"

# For the AWS China regions, use the following image URI instead:
# inference_image_uri = f"727897471807.dkr.ecr.{region}.amazonaws.com.cn/{djl_image}"

print(f"Image going to be used is ---- > {inference_image_uri}")

Image going to be used is ---- > 763104351884.dkr.ecr.us-west-2.amazonaws.com/djl-inference:0.21.0-deepspeed0.8.3-cu117


In [12]:
# Directory that will hold model.py and serving.properties; -p makes this safe to re-run.
!mkdir -p buffer_cross_001_deploy_code

In [13]:
%%writefile buffer_cross_001_deploy_code/model.py
from djl_python import Input, Output
import torch
import logging
import math
import os
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, AutoModel

# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# Printed once, when this module is imported.
print(f'--device={device}')


def load_model(properties):
    """Load the cross model and its tokenizer and prepare them for inference.

    Args:
        properties: Mapping of serving options. ``model_dir`` is required;
            ``model_id``, when present, takes precedence as the load location.

    Returns:
        A ``(model, tokenizer)`` tuple. The model has been moved to ``device``,
        has gradients disabled and is in eval mode.
    """
    model_location = properties['model_dir']
    if "model_id" in properties:
        model_location = properties['model_id']
    logging.info(f"Loading model in {model_location}")

    tokenizer = AutoTokenizer.from_pretrained(model_location, use_fast=False)
    # NOTE: trust_remote_code=True executes Python code shipped with the model
    # files; only point this at model artifacts you trust.
    model = AutoModel.from_pretrained(model_location, trust_remote_code=True)
    # Half precision is only applied on GPU: this module explicitly falls back
    # to CPU, where fp16 kernels are not reliably available.
    if device.type == "cuda":
        model = model.half()
    model.to(device)
    model.requires_grad_(False)
    model.eval()

    return model, tokenizer


# Populated by handle() on the first request and reused by later requests.
model = None
tokenizer = None
# NOTE(review): `generator` is not referenced anywhere else in the visible file.
generator = None

def handle(inputs: Input):
    """Serving entry point: run the cross model on one query/document pair.

    The model and tokenizer are loaded on the first call and cached in the
    module-level ``model`` / ``tokenizer`` globals.

    Args:
        inputs: Incoming request. Its JSON body must contain the keys
            ``"inputs"`` (query text) and ``"docs"`` (document text).

    Returns:
        ``None`` when the request is empty (the model is still loaded first),
        otherwise an ``Output`` whose JSON body is ``{"scores": ...}`` with the
        raw model output converted to nested Python lists.

    Raises:
        KeyError: If ``"inputs"`` or ``"docs"`` is missing from the request body.
    """
    global model, tokenizer
    # Explicit None check: the load-once guard must not depend on the
    # truthiness of the model object.
    if model is None:
        model, tokenizer = load_model(inputs.get_properties())

    if inputs.is_empty():
        return None
    data = inputs.get_as_json()

    queries = data["inputs"]
    docs = data["docs"]

    # Encode the (query, document) pair as one sequence; only the input ids are
    # passed to the model.
    encoded_input = tokenizer(
        text=[queries],
        text_pair=[docs],
        padding=True,
        truncation=True,
        max_length=2048,
        return_tensors='pt',
    )['input_ids'].to(device)

    with torch.no_grad():
        model_output = model(input_ids=encoded_input)

    # Convert to plain nested lists so the payload is serializable by any JSON
    # encoder, without relying on the serving library to understand NumPy arrays.
    result = {"scores": model_output.cpu().numpy().tolist()}
    return Output().add_as_json(result)

Writing buffer_cross_001_deploy_code/model.py


In [14]:
# Show the S3 location of the uploaded model files, formatted as the
# option.s3url value used in serving.properties.
print(f"option.s3url ==> s3://{bucket}/{s3_model_prefix}/")

option.s3url ==> s3://sagemaker-us-west-2-316327952690/LLM-RAG/workshop/buffer-cross-001-model/


#### Note: option.s3url 需要按照自己的账号进行修改, 可以拷贝上一个cell的输出

In [15]:
# Generate serving.properties from this notebook's own bucket and prefix so that
# option.s3url always points at the model uploaded above, instead of a bucket
# name hard-coded for one specific AWS account.
serving_properties_path = Path("buffer_cross_001_deploy_code") / "serving.properties"
serving_properties_path.write_text(
    "engine=Python\n"
    "option.tensor_parallel_degree=1\n"
    f"option.s3url = s3://{bucket}/{s3_model_prefix}/\n"
)
print(serving_properties_path.read_text())

Writing buffer_cross_001_deploy_code/serving.properties


In [16]:
!rm cross_model.tar.gz
!cd buffer_cross_001_deploy_code && rm -rf ".ipynb_checkpoints"
!tar czvf cross_model.tar.gz buffer_cross_001_deploy_code

rm: cannot remove 'cross_model.tar.gz': No such file or directory
buffer_cross_001_deploy_code/
buffer_cross_001_deploy_code/model.py
buffer_cross_001_deploy_code/serving.properties


In [17]:
# Upload the packaged entrypoint/config archive; the returned S3 URI is used
# later as the model's ModelDataUrl.
s3_code_artifact = sess.upload_data(
    path="cross_model.tar.gz",
    bucket=bucket,
    key_prefix=s3_code_prefix,
)
print(f"S3 Code or Model tar ball uploaded to --- > {s3_code_artifact}")

S3 Code or Model tar ball uploaded to --- > s3://sagemaker-us-west-2-316327952690/LLM-RAG/workshop/buffer_cross_001_deploy_code/cross_model.tar.gz


### 4. 创建模型 & 创建endpoint

In [18]:
from sagemaker.utils import name_from_base
import boto3

# NOTE: this rebinds `model_name`, which earlier in the notebook held the
# Hugging Face repo id; from here on it is the SageMaker model name.
model_name = name_from_base("buffer-cross-001")
print(model_name)
print(f"Image going to be used is ---- > {inference_image_uri}")

# Container definition: the serving image plus the packaged code archive in S3.
primary_container = {
    "Image": inference_image_uri,
    "ModelDataUrl": s3_code_artifact,
}
create_model_response = sm_client.create_model(
    ModelName=model_name,
    ExecutionRoleArn=role,
    PrimaryContainer=primary_container,
)
model_arn = create_model_response["ModelArn"]

print(f"Created Model: {model_arn}")

buffer-cross-001-2023-11-09-06-02-30-135
Image going to be used is ---- > 763104351884.dkr.ecr.us-west-2.amazonaws.com/djl-inference:0.21.0-deepspeed0.8.3-cu117
Created Model: arn:aws:sagemaker:us-west-2:316327952690:model/buffer-cross-001-2023-11-09-06-02-30-135


#### 推理机型选择 (https://aws.amazon.com/cn/sagemaker/pricing/)
- GPU
  + ml.g4dn.xlarge 按需价格 0.526 USD/Hour
- CPU
  + ml.c5.xlarge   按需价格 0.204 USD/Hour

In [21]:
# Both names are derived from the SageMaker model name.
endpoint_config_name = f"{model_name}-config"
endpoint_name = f"{model_name}-endpoint"

# Single production variant serving the model on one instance.
production_variant = {
    "VariantName": "variant1",
    "ModelName": model_name,
    "InstanceType": "ml.g4dn.xlarge",
    "InitialInstanceCount": 1,
    # Optional settings, currently disabled:
    # "VolumeSizeInGB" : 400,
    # "ModelDataDownloadTimeoutInSeconds": 2400,
    "ContainerStartupHealthCheckTimeoutInSeconds": 15*60,
}

endpoint_config_response = sm_client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[production_variant],
)
endpoint_config_response

ClientError: An error occurred (ValidationException) when calling the CreateEndpointConfig operation: Cannot create already existing endpoint configuration "arn:aws:sagemaker:us-west-2:316327952690:endpoint-config/buffer-cross-001-2023-11-09-06-02-30-135-config".

In [22]:
# Version tag: date stamp plus the pinned model revision.
tag = f"{current_time}-{commit_hash}"
version_tags = [{"Key": "version", "Value": tag}]

create_endpoint_response = sm_client.create_endpoint(
    EndpointName=endpoint_name,
    EndpointConfigName=endpoint_config_name,
    Tags=version_tags,
)
print(f"Created Endpoint: {create_endpoint_response['EndpointArn']}")

Created Endpoint: arn:aws:sagemaker:us-west-2:316327952690:endpoint/buffer-cross-001-2023-11-09-06-02-30-135-endpoint


In [23]:
import time

# Poll once a minute until the endpoint leaves the "Creating" state.
resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
status = resp["EndpointStatus"]
print("Status: " + status)

while status == "Creating":
    time.sleep(60)
    resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
    status = resp["EndpointStatus"]
    print("Status: " + status)

print("Arn: " + resp["EndpointArn"])
print("Status: " + status)

# Stop here if creation did not succeed, instead of letting the test cells
# below fail later with a less helpful invocation error.
if status != "InService":
    failure_reason = resp.get("FailureReason", "no failure reason reported")
    raise RuntimeError(
        f"Endpoint {endpoint_name} ended in status {status}: {failure_reason}"
    )

Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: InService
Arn: arn:aws:sagemaker:us-west-2:316327952690:endpoint/buffer-cross-001-2023-11-09-06-02-30-135-endpoint
Status: InService


### 5. 模型测试

In [None]:
def get_vector_by_sm_endpoint(questions, docs, sm_client, endpoint_name):
    """Invoke the endpoint with a query/document pair and return one score.

    Args:
        questions: Query text, sent under the ``"inputs"`` key.
        docs: Document text, sent under the ``"docs"`` key.
        sm_client: Client object exposing ``invoke_endpoint``.
        endpoint_name: Name of the endpoint to invoke.

    Returns:
        The element at ``["scores"][0][1]`` of the JSON response.
    """
    request_body = json.dumps({"inputs": questions, "docs": docs})
    response = sm_client.invoke_endpoint(
        EndpointName=endpoint_name,
        Body=request_body,
        ContentType="application/json",
    )
    payload = json.loads(response['Body'].read().decode('utf8'))
    # First row, second column of the returned scores.
    return payload['scores'][0][1]

In [None]:
# Smoke test: score a question against a document with identical text.
prompts1 = "请问AWS Clean Rooms是多方都会收费吗？"
docs1 = "请问AWS Clean Rooms是多方都会收费吗？"
smoke_test_score = get_vector_by_sm_endpoint(prompts1, docs1, smr_client, endpoint_name)
print(smoke_test_score)

In [None]:
!aws sagemaker delete-endpoint --endpoint-name buffer-cross-001-2023-07-06-16-01-01-786-endpoint

In [None]:
!aws sagemaker delete-endpoint-config --endpoint-config-name buffer-cross-001-2023-07-06-16-01-01-786-config

In [None]:
!aws sagemaker delete-model --model-name buffer-cross-001-2023-07-06-16-01-01-786