### 1. 安装 huggingface-hub 与 sagemaker SDK，并下载模型到本地

In [1]:
# Use the %pip magic so the packages are installed into the environment of the
# running kernel, not whichever `pip` happens to be first on the shell PATH.
%pip install huggingface-hub -Uqq
%pip install -U sagemaker

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting sagemaker
  Downloading sagemaker-2.157.0.tar.gz (790 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m790.6/790.6 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting attrs<24,>=23.1.0
  Downloading attrs-23.1.0-py3-none-any.whl (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.2/61.2 kB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting boto3<2.0,>=1.26.131
  Downloading boto3-1.26.137-py3-none-any.whl (135 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.6/135.6 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hCollecting cloudpickle==2.2.1
  Downloading cloudpickle-2.2.1-py3-none-any.whl (25 kB)
Collecting PyYAML==6.0
  Downloading PyYAML-6.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.

In [2]:
from huggingface_hub import snapshot_download
from pathlib import Path

# Hugging Face repository to fetch and the exact commit that is requested from it.
model_name = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
commit_hash = "ef15aed8b328d308d7237b9bf15269f2cd19e268"

# Local directory used as the download cache; created here if it does not exist yet.
local_model_path = Path("./sentence2emb-model")
local_model_path.mkdir(exist_ok=True)

In [3]:
# Fetch the pinned revision into the local cache directory; the value returned
# by the call is shown as the cell output.
snapshot_download(
    repo_id=model_name,
    revision=commit_hash,
    cache_dir=local_model_path,
)

Fetching 13 files:   0%|          | 0/13 [00:00<?, ?it/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Downloading (…)9e268/.gitattributes:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading (…)d19e268/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading (…)cd19e268/config.json:   0%|          | 0.00/723 [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/402 [00:00<?, ?B/s]

Downloading (…)f2cd19e268/README.md:   0%|          | 0.00/3.77k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading tf_model.h5:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Downloading (…)9e268/tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

'sentence2emb-model/models--sentence-transformers--paraphrase-multilingual-mpnet-base-v2/snapshots/ef15aed8b328d308d7237b9bf15269f2cd19e268'

### 2. 把模型拷贝到S3为后续部署做准备

In [4]:
import sagemaker
from sagemaker import image_uris
import boto3
import os
import time
import json

role = sagemaker.get_execution_role()  # execution role for the endpoint
sess = sagemaker.session.Session()  # sagemaker session for interacting with different AWS APIs
bucket = sess.default_bucket()  # bucket to house artifacts

# Read the region through the public attribute rather than the private
# `_region_name`, which is an implementation detail of the SDK.
region = sess.boto_region_name
account_id = sess.account_id()

# Low-level clients used later: S3, the SageMaker control plane (models,
# endpoint configs, endpoints) and the runtime used to invoke the endpoint.
s3_client = boto3.client("s3")
sm_client = boto3.client("sagemaker")
smr_client = boto3.client("sagemaker-runtime")

In [5]:
s3_model_prefix = "LLM-RAG/workshop/sentence2emb-model"  # folder where model checkpoint will go
s3_code_prefix = "LLM-RAG/workshop/sentence2emb_deploy_code"  # folder where the serving code tarball will go

# Select the snapshot directory of the pinned commit explicitly. Taking the
# first match of "**/snapshots/*" is order-dependent when several revisions are
# cached and fails with a bare IndexError when nothing has been downloaded.
snapshot_dirs = sorted(local_model_path.glob(f"**/snapshots/{commit_hash}"))
if not snapshot_dirs:
    raise FileNotFoundError(
        f"No snapshot for revision {commit_hash} found under {local_model_path}; "
        "run the download cell first."
    )
model_snapshot_path = snapshot_dirs[0]

print(f"s3_code_prefix: {s3_code_prefix}")
print(f"model_snapshot_path: {model_snapshot_path}")

s3_code_prefix: LLM-RAG/workshop/sentence2emb_deploy_code
model_snapshot_path: sentence2emb-model/models--sentence-transformers--paraphrase-multilingual-mpnet-base-v2/snapshots/ef15aed8b328d308d7237b9bf15269f2cd19e268


In [6]:
# Recursively copy the local snapshot directory to the model prefix in the bucket.
!aws s3 cp --recursive {model_snapshot_path} s3://{bucket}/{s3_model_prefix}

upload: sentence2emb-model/models--sentence-transformers--paraphrase-multilingual-mpnet-base-v2/snapshots/ef15aed8b328d308d7237b9bf15269f2cd19e268/.gitattributes to s3://sagemaker-ap-northeast-1-946277762357/LLM-RAG/workshop/sentence2emb-model/.gitattributes
upload: sentence2emb-model/models--sentence-transformers--paraphrase-multilingual-mpnet-base-v2/snapshots/ef15aed8b328d308d7237b9bf15269f2cd19e268/1_Pooling/config.json to s3://sagemaker-ap-northeast-1-946277762357/LLM-RAG/workshop/sentence2emb-model/1_Pooling/config.json
upload: sentence2emb-model/models--sentence-transformers--paraphrase-multilingual-mpnet-base-v2/snapshots/ef15aed8b328d308d7237b9bf15269f2cd19e268/README.md to s3://sagemaker-ap-northeast-1-946277762357/LLM-RAG/workshop/sentence2emb-model/README.md
upload: sentence2emb-model/models--sentence-transformers--paraphrase-multilingual-mpnet-base-v2/snapshots/ef15aed8b328d308d7237b9bf15269f2cd19e268/sentence_bert_config.json to s3://sagemaker-ap-northeast-1-946277762357/

### 3. 模型部署准备（entrypoint脚本，容器镜像，服务配置）

In [7]:
# DJL inference container image, resolved against the ECR registry of the current region.
inference_image_uri = f"763104351884.dkr.ecr.{region}.amazonaws.com/djl-inference:0.21.0-deepspeed0.8.3-cu117"

# For the AWS China regions, replace the value above with the image URI below:
# inference_image_uri = f"727897471807.dkr.ecr.{region}.amazonaws.com.cn/djl-inference:0.21.0-deepspeed0.8.3-cu117"

print(f"Image going to be used is ---- > {inference_image_uri}")

Image going to be used is ---- > 763104351884.dkr.ecr.ap-northeast-1.amazonaws.com/djl-inference:0.21.0-deepspeed0.8.3-cu117


In [8]:
# Create the folder for the serving code; -p makes the command a no-op when it already exists.
!mkdir -p sentence2emb_deploy_code

In [55]:
%%writefile sentence2emb_deploy_code/model.py
from djl_python import Input, Output
import torch
import logging
import math
import os
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, AutoModel

# Run on the first GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(f'--device={device}')


def load_model(properties):
    """Load the embedding model and its tokenizer for inference.

    Args:
        properties: Mapping of serving properties. ``model_id`` is used as the
            load location when present, otherwise ``model_dir``.

    Returns:
        A ``(model, tokenizer)`` tuple. The model has been moved to ``device``,
        has gradients disabled and is in eval mode.

    Raises:
        KeyError: If neither ``model_id`` nor ``model_dir`` is present.
    """
    # The former `tensor_parallel_degree` lookup was never used and only made
    # loading fail with a KeyError when that property was not configured.
    if "model_id" in properties:
        model_location = properties["model_id"]
    else:
        model_location = properties["model_dir"]
    logging.info(f"Loading model in {model_location}")

    tokenizer = AutoTokenizer.from_pretrained(model_location)
    model = AutoModel.from_pretrained(model_location)

    # Inference only: place the model on the selected device, freeze the
    # parameters and switch off training-time behaviour.
    model.to(device)
    model.requires_grad_(False)
    model.eval()

    return model, tokenizer


# Module-level handler state. `model` and `tokenizer` are filled in by handle()
# on the first request; `generator` is not used by the code in this file.
model = tokenizer = generator = None

def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked positions.

    Args:
        model_output: Model output whose first element holds the token
            embeddings; it is reduced over dimension 1 here.
        attention_mask: Mask tensor that is expanded to the shape of the token
            embeddings; positions with value 0 do not contribute.

    Returns:
        Tensor with dimension 1 reduced away: the masked sum of the embeddings
        divided by the number of unmasked positions.
    """
    token_embeddings = model_output[0]  # first element contains all token embeddings
    # Keep the mask on the same device as the embeddings instead of routing
    # both through the module-level `device`; the function then works for any
    # tensors it is handed and performs no redundant transfers.
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    mask = mask.to(token_embeddings.device)
    summed = torch.sum(token_embeddings * mask, 1)
    # Clamp the divisor so a fully masked sequence yields zeros, not NaN.
    counts = torch.clamp(mask.sum(1), min=1e-9)
    return summed / counts


def handle(inputs: Input):
    """Serve one request: turn the input sentences into sentence embeddings.

    The request body is JSON with an ``inputs`` field (the text passed to the
    tokenizer) and an optional ``parameters`` field, which is only logged.

    Args:
        inputs: The incoming request.

    Returns:
        ``None`` for an empty request, otherwise an ``Output`` whose JSON body
        is ``{"sentence_embeddings": [...]}``.

    Raises:
        KeyError: If the request JSON has no ``inputs`` field.
    """
    global model, tokenizer
    # Load lazily on the first request. Compare against None explicitly rather
    # than relying on the truthiness of a model object.
    if model is None:
        model, tokenizer = load_model(inputs.get_properties())

    if inputs.is_empty():
        return None
    data = inputs.get_as_json()

    input_sentences = data["inputs"]
    # `parameters` has no effect on the embeddings, so a request without it
    # must not fail.
    params = data.get("parameters")
    logging.info(f"inputs: {input_sentences}")
    logging.info(f"parameters: {params}")

    encoded_input = tokenizer(input_sentences, padding=True, truncation=True, return_tensors='pt').to(device)
    # Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input)

    # Perform pooling. In this case, attention-masked mean pooling.
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    # Convert to nested Python lists so the result is JSON-serializable
    # without depending on the encoder knowing about numpy arrays.
    result = {"sentence_embeddings": sentence_embeddings.cpu().numpy().tolist()}
    return Output().add_as_json(result)

Overwriting sentence2emb_deploy_code/model.py


#### Note: option.s3url 需要按照自己的账号进行修改

In [56]:
# Render serving.properties from the bucket and prefix the model was uploaded
# to, so option.s3url always points at the current account's bucket and no
# manual, account-specific edit is needed.
serving_properties = (
    "engine=Python\n"
    "option.tensor_parallel_degree=1\n"
    f"option.s3url = s3://{bucket}/{s3_model_prefix}/\n"
)
with open("sentence2emb_deploy_code/serving.properties", "w", encoding="utf-8") as properties_file:
    properties_file.write(serving_properties)
print(serving_properties)

Overwriting sentence2emb_deploy_code/serving.properties


In [57]:
# Rebuild the code tarball from scratch. `-f` avoids a spurious "No such file"
# message when no archive exists yet (e.g. on the first run).
!rm -f s2e_model.tar.gz
# Notebook checkpoint folders must not end up in the archive.
!rm -rf sentence2emb_deploy_code/.ipynb_checkpoints
!tar czvf s2e_model.tar.gz sentence2emb_deploy_code

sentence2emb_deploy_code/
sentence2emb_deploy_code/serving.properties
sentence2emb_deploy_code/model.py


In [58]:
# Upload the code tarball under the code prefix; the call returns the S3 URI
# of the uploaded object, which is used as the model data URL later.
s3_code_artifact = sess.upload_data(
    path="s2e_model.tar.gz",
    bucket=bucket,
    key_prefix=s3_code_prefix,
)
print(f"S3 Code or Model tar ball uploaded to --- > {s3_code_artifact}")

S3 Code or Model tar ball uploaded to --- > s3://sagemaker-ap-northeast-1-946277762357/LLM-RAG/workshop/sentence2emb_deploy_code/s2e_model.tar.gz


### 4. 创建模型 & 创建endpoint

In [59]:
from sagemaker.utils import name_from_base
import boto3

# The original line assigned nothing (`model_name = # ...`), which is a
# SyntaxError. Use the fixed name this notebook was recorded with; switch to
# name_from_base("st-paraphrase-mpnet-base-v2") for a unique, timestamped name.
model_name = "st-paraphrase-mpnet-base-v2-cuda-embedding"
print(model_name)
print(f"Image going to be used is ---- > {inference_image_uri}")

# Register the model: the DJL container image plus the serving-code tarball.
create_model_response = sm_client.create_model(
    ModelName=model_name,
    ExecutionRoleArn=role,
    PrimaryContainer={
        "Image": inference_image_uri,
        "ModelDataUrl": s3_code_artifact,
    },
)
model_arn = create_model_response["ModelArn"]

print(f"Created Model: {model_arn}")

st-paraphrase-mpnet-base-v2-cuda-embedding
Image going to be used is ---- > 763104351884.dkr.ecr.ap-northeast-1.amazonaws.com/djl-inference:0.21.0-deepspeed0.8.3-cu117
Created Model: arn:aws:sagemaker:ap-northeast-1:946277762357:model/st-paraphrase-mpnet-base-v2-cuda-embedding


In [60]:
# Both names are derived from the model name.
endpoint_config_name = f"{model_name}-config"
endpoint_name = f"{model_name}-endpoint"

# Single production variant: one ml.g4dn.xlarge instance serving the model,
# with a 15-minute container startup health-check timeout.
production_variant = {
    "VariantName": "variant1",
    "ModelName": model_name,
    "InstanceType": "ml.g4dn.xlarge",
    "InitialInstanceCount": 1,
    # "VolumeSizeInGB" : 400,
    # "ModelDataDownloadTimeoutInSeconds": 2400,
    "ContainerStartupHealthCheckTimeoutInSeconds": 15 * 60,
}

endpoint_config_response = sm_client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[production_variant],
)
endpoint_config_response

{'EndpointConfigArn': 'arn:aws:sagemaker:ap-northeast-1:946277762357:endpoint-config/st-paraphrase-mpnet-base-v2-cuda-embedding-config',
 'ResponseMetadata': {'RequestId': '0f4a30ed-3981-4673-b948-538f796aa105',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '0f4a30ed-3981-4673-b948-538f796aa105',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '135',
   'date': 'Mon, 22 May 2023 01:50:20 GMT'},
  'RetryAttempts': 0}}

In [61]:
# Start creating the endpoint from the endpoint configuration defined above.
create_endpoint_response = sm_client.create_endpoint(
    EndpointName=endpoint_name,
    EndpointConfigName=endpoint_config_name,
)
endpoint_arn = create_endpoint_response["EndpointArn"]
print(f"Created Endpoint: {endpoint_arn}")

Created Endpoint: arn:aws:sagemaker:ap-northeast-1:946277762357:endpoint/st-paraphrase-mpnet-base-v2-cuda-embedding-endpoint


In [62]:
import time

# Poll once a minute until the endpoint leaves the "Creating" state.
resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
status = resp["EndpointStatus"]
print("Status: " + status)

while status == "Creating":
    time.sleep(60)
    resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
    status = resp["EndpointStatus"]
    print("Status: " + status)

print("Arn: " + resp["EndpointArn"])
print("Status: " + status)

# Stop here when the deployment did not succeed, instead of letting the test
# cells below fail with a less helpful error.
if status != "InService":
    failure_reason = resp.get("FailureReason", "no failure reason reported")
    raise RuntimeError(f"Endpoint {endpoint_name} ended in status {status}: {failure_reason}")

Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: InService
Arn: arn:aws:sagemaker:ap-northeast-1:946277762357:endpoint/st-paraphrase-mpnet-base-v2-cuda-embedding-endpoint
Status: InService


### 5. 模型测试

In [63]:
def get_vector_by_sm_endpoint(questions, sm_client, endpoint_name):
    """Request sentence embeddings for `questions` from a deployed endpoint.

    Args:
        questions: Value sent as the "inputs" field of the JSON request body.
        sm_client: Client object exposing `invoke_endpoint`.
        endpoint_name: Name of the endpoint to invoke.

    Returns:
        The "sentence_embeddings" entry of the decoded JSON response.
    """
    payload = {
        "inputs": questions,
        "parameters": {
            "max_new_tokens": 50,
            "temperature": 0,
            "min_length": 10,
            "no_repeat_ngram_size": 2,
        },
    }
    response = sm_client.invoke_endpoint(
        EndpointName=endpoint_name,
        Body=json.dumps(payload),
        ContentType="application/json",
    )
    # The response body is a stream: read it fully, decode, then parse.
    raw_body = response['Body'].read()
    return json.loads(raw_body.decode('utf8'))['sentence_embeddings']

In [64]:
# Smoke test: embed a single question through the runtime client and print the result.
prompts1 = "请问AWS Clean Rooms是多方都会收费吗？"

question_embeddings = get_vector_by_sm_endpoint(prompts1, smr_client, endpoint_name)
print(question_embeddings)

[[-0.09426412731409073, 0.1308506429195404, -0.01236279308795929, -0.02249860391020775, -0.04928739741444588, 0.07633629441261292, 0.11948582530021667, -0.048521965742111206, 0.031562887132167816, -3.7288758903741837e-05, -0.07421009987592697, -0.1545608639717102, -0.006521969567984343, 0.02532505989074707, -0.040099505335092545, 0.0862278938293457, 0.03407088667154312, 0.1100526973605156, -0.03708254545927048, -0.07843822240829468, -0.08710747957229614, -0.09370113909244537, -0.042732104659080505, -0.03769661858677864, 0.06182694807648659, -0.11406316608190536, -0.05065455287694931, -0.05727192014455795, 0.16234728693962097, 0.10307775437831879, 0.22929474711418152, 0.10204464197158813, 0.08882834762334824, 0.13194499909877777, 0.06006615236401558, -0.07494837045669556, -0.030907675623893738, 0.04655185714364052, -0.17468029260635376, 0.06435616314411163, -0.11814874410629272, 0.11599691212177277, 0.0395064502954483, -0.010311184450984001, -0.06457937508821487, -0.16605383157730103, 0

In [53]:
# Clean up: delete the endpoint created above, using the notebook variable so
# the command always targets the endpoint this run created.
!aws sagemaker delete-endpoint --endpoint-name {endpoint_name}

In [54]:
# Clean up: delete the endpoint configuration created above, using the notebook
# variable instead of a hard-coded name.
!aws sagemaker delete-endpoint-config --endpoint-config-name {endpoint_config_name}

In [None]:
# Clean up: delete the SageMaker model created above, using the notebook
# variable instead of a hard-coded name.
!aws sagemaker delete-model --model-name {model_name}