### 1. 安装依赖（huggingface-hub、sagemaker）并下载模型到本地

In [1]:
!pip install huggingface-hub -Uqq
!pip install -U sagemaker



In [2]:
# bge-rerank

from huggingface_hub import snapshot_download
from pathlib import Path

# Hugging Face repository to deploy, pinned to an exact commit so that re-runs
# download identical files.
model_hf_name = "BAAI/bge-reranker-large"
commit_hash = "27c9168d479987529781de8474dff94d69beca11"
# Repository id without the organisation part; reused below for S3 prefixes
# and SageMaker resource names.
model_name = model_hf_name.split('/')[-1]

# Local directory used as the Hugging Face cache for the download.
local_model_path_name = "./BAAI_bge_reranker_large"
local_model_path = Path(local_model_path_name)
local_model_path.mkdir(exist_ok=True)

In [3]:
# Download the pinned revision, using local_model_path as the cache directory.
# The call returns the path of the snapshot directory (shown as the cell output).
snapshot_download(repo_id=model_hf_name, revision=commit_hash, cache_dir=local_model_path)

Fetching 8 files:   0%|          | 0/8 [00:00<?, ?it/s]

'BAAI_bge_reranker_large/models--BAAI--bge-reranker-large/snapshots/27c9168d479987529781de8474dff94d69beca11'

In [4]:
import os
from dotenv import load_dotenv
# Load settings through python-dotenv (presumably from a local .env file - confirm its location).
load_dotenv()
# Prefix of the environment variables that describe the target account/region.
name="ATL_UW2"
# os.environ[...] raises KeyError when the variable is not set.
profile_name = os.environ[f"{name}"]
# Export the profile name as AWS_DEFAULT_PROFILE for the rest of the notebook.
%env AWS_DEFAULT_PROFILE = {profile_name}
# Bucket used below for the model files and the deployment code archive.
s3_bucket_name=os.environ[f"{name}_S3_BUCKET_NAME"]
# Passed as ExecutionRoleArn when the SageMaker model is created.
role=os.environ[f"{name}_ROLE"]
s3_bucket_name

env: AWS_DEFAULT_PROFILE=atl-uw2


'aws-gcr-csdc-atl-exp-us-west-2'

In [5]:
from datetime import datetime

# Read the clock once so that day, month and year always belong to the same
# instant (three separate now() calls could straddle midnight).
now = datetime.now()
currentDay = now.day
currentMonth = now.month
currentYear = now.year

# Zero-padded YYYYMMDD stamp. Without padding, different dates collide:
# Jan 11 and Nov 1 of 2023 would both render as "2023111".
current_time = now.strftime("%Y%m%d")
current_time

'20231121'

In [6]:
import sagemaker
from sagemaker import image_uris
import boto3
import os
import time
import json

# role = sagemaker.get_execution_role()  # execution role for the endpoint
sess = sagemaker.session.Session()  # sagemaker session for interacting with different AWS APIs
# Use the bucket configured in the environment, as the setup cell in section 2
# does. sess.default_bucket() was dropped here: it creates the account's
# default SageMaker bucket when it does not exist, and its value was
# overwritten before any use.
bucket = s3_bucket_name

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ubuntu/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ubuntu/.config/sagemaker/config.yaml


### 2. 把模型拷贝到S3为后续部署做准备

In [7]:
import sagemaker
from sagemaker import image_uris
import boto3
import os
import time
import json

#role = sagemaker.get_execution_role()  # execution role for the endpoint
sess = sagemaker.session.Session()  # sagemaker session for interacting with different AWS APIs
# bucket = sess.default_bucket()  # bucket to house artifacts
bucket = s3_bucket_name  # bucket to house artifacts

# Use the public accessor instead of the private `_region_name` attribute.
region = sess.boto_region_name
account_id = sess.account_id()

# Low-level clients: S3, the SageMaker control plane (models/endpoints) and
# the SageMaker runtime (endpoint invocation).
s3_client = boto3.client("s3")
sm_client = boto3.client("sagemaker")
smr_client = boto3.client("sagemaker-runtime")

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ubuntu/.config/sagemaker/config.yaml


In [8]:
s3_model_prefix = f"aigc-embedding-models/{model_name}"  # folder where model checkpoint will go
s3_code_prefix = f"aigc-embedding-models/{model_name}_deploy_code"

# Select the snapshot directory of the pinned commit. Taking the first match of
# "**/snapshots/*" would pick an arbitrary revision when the cache holds more
# than one download.
snapshot_dirs = sorted(local_model_path.glob(f"**/snapshots/{commit_hash}"))
if not snapshot_dirs:
    raise FileNotFoundError(
        f"No snapshot for revision {commit_hash} found under {local_model_path}; "
        "run the download cell first."
    )
model_snapshot_path = snapshot_dirs[0]

print(f"s3_code_prefix: {s3_code_prefix}")
print(f"model_snapshot_path: {model_snapshot_path}")

s3_code_prefix: aigc-embedding-models/bge-reranker-large_deploy_code
model_snapshot_path: BAAI_bge_reranker_large/models--BAAI--bge-reranker-large/snapshots/27c9168d479987529781de8474dff94d69beca11


In [9]:
# Recursively upload the local snapshot directory to the model prefix in the bucket.
# The resulting S3 location is what serving.properties later refers to as option.s3url.
!aws s3 cp --recursive {model_snapshot_path} s3://{bucket}/{s3_model_prefix}

upload: BAAI_bge_reranker_large/models--BAAI--bge-reranker-large/snapshots/27c9168d479987529781de8474dff94d69beca11/tokenizer_config.json to s3://aws-gcr-csdc-atl-exp-us-west-2/aigc-embedding-models/bge-reranker-large/tokenizer_config.json
upload: BAAI_bge_reranker_large/models--BAAI--bge-reranker-large/snapshots/27c9168d479987529781de8474dff94d69beca11/README.md to s3://aws-gcr-csdc-atl-exp-us-west-2/aigc-embedding-models/bge-reranker-large/README.md
upload: BAAI_bge_reranker_large/models--BAAI--bge-reranker-large/snapshots/27c9168d479987529781de8474dff94d69beca11/.gitattributes to s3://aws-gcr-csdc-atl-exp-us-west-2/aigc-embedding-models/bge-reranker-large/.gitattributes
upload: BAAI_bge_reranker_large/models--BAAI--bge-reranker-large/snapshots/27c9168d479987529781de8474dff94d69beca11/special_tokens_map.json to s3://aws-gcr-csdc-atl-exp-us-west-2/aigc-embedding-models/bge-reranker-large/special_tokens_map.json
upload: BAAI_bge_reranker_large/models--BAAI--bge-reranker-large/snapshots

### 3. 模型部署准备（entrypoint脚本，容器镜像，服务配置）

In [10]:
# DJL inference container image (tag 0.21.0-deepspeed0.8.3-cu117), resolved
# against the ECR registry of the session's region.
inference_image_uri = (
    f"763104351884.dkr.ecr.{region}.amazonaws.com/djl-inference:0.21.0-deepspeed0.8.3-cu117"
)

# For the AWS China regions, use the image_uri below instead
# inference_image_uri = (
#     f"727897471807.dkr.ecr.{region}.amazonaws.com.cn/djl-inference:0.21.0-deepspeed0.8.3-cu117"
# )

print(f"Image going to be used is ---- > {inference_image_uri}")

Image going to be used is ---- > 763104351884.dkr.ecr.us-west-2.amazonaws.com/djl-inference:0.21.0-deepspeed0.8.3-cu117


In [11]:
# Local staging directory for the deployment code, named after the last
# path segment of s3_code_prefix.
local_code_dir = s3_code_prefix.split('/')[-1]
!mkdir -p {local_code_dir}

In [12]:
%%writefile {local_code_dir}/model.py
from djl_python import Input, Output
import torch
import logging
import math
import os
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer, AutoModel

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(f'--device={device}')


def load_model(properties):
    """Load the sequence-classification (reranker) model and its tokenizer.

    Args:
        properties: Mapping of serving properties. ``model_dir`` is required;
            when ``model_id`` is present it takes precedence as the location
            the model is loaded from.

    Returns:
        A ``(model, tokenizer)`` tuple. The model has been moved to the
        module-level ``device`` and switched to evaluation mode.

    Raises:
        KeyError: If ``properties`` has no ``model_dir`` entry.
    """
    model_location = properties['model_dir']
    if "model_id" in properties:
        model_location = properties['model_id']
    logging.info(f"Loading model in {model_location}")

    # NOTE(review): trust_remote_code=True lets the loader execute Python code
    # shipped with the model files; keep the model source pinned and trusted.
    tokenizer = AutoTokenizer.from_pretrained(model_location, trust_remote_code=True)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_location,
        trust_remote_code=True
    )
    # Move the weights to the selected device (GPU when available, else CPU).
    model.to(device)
    model.eval()

    return model, tokenizer


# Module-level cache: handle() fills `model` and `tokenizer` on its first call.
model = None
tokenizer = None
# Not referenced anywhere else in this file.
generator = None

def mean_pooling(model_output, attention_mask):
    """Average the token embeddings over dimension 1, weighted by the attention mask.

    ``model_output[0]`` is taken as the token embeddings. The mask gets a
    trailing axis, is expanded to the embeddings' size and cast to float. The
    divisor is clamped to at least 1e-9 so that an all-zero mask does not
    divide by zero. This helper is not called by ``handle`` in this file.
    """
    embeddings = model_output[0].to(device)
    mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float().to(device)
    weighted_sum = torch.sum(embeddings * mask, 1)
    mask_total = torch.clamp(mask.sum(1), min=1e-9)
    return weighted_sum / mask_total


def handle(inputs: Input):
    """Serve one request: score the given inputs with the reranker model.

    The request body is JSON of the form ``{"inputs": ...}``; the value is
    handed to the tokenizer as-is (the notebook's test cell sends a list of
    ``[query, passage]`` pairs).

    Args:
        inputs: Request object; also provides the serving properties used to
            load the model on the first call.

    Returns:
        ``None`` when the request is empty, otherwise an ``Output`` whose JSON
        body is ``{"rerank_scores": [...]}`` holding the flattened logits as
        plain Python floats.
    """
    global model, tokenizer
    # Load lazily on the first call and reuse through the module globals.
    if model is None:
        model, tokenizer = load_model(inputs.get_properties())

    if inputs.is_empty():
        return None
    data = inputs.get_as_json()

    input_sentences = data["inputs"]
    logging.info(f"inputs: {input_sentences}")

    # Inference only: no gradients needed.
    with torch.no_grad():
        encoded_input = tokenizer(input_sentences, padding=True, truncation=True, return_tensors='pt', max_length=512).to(device)
        scores = model(**encoded_input, return_dict=True).logits.view(-1, ).float()

    # A NumPy array cannot be encoded by Python's json module, so hand the
    # scores over as a plain list of floats.
    output = scores.cpu().tolist()

    result = {"rerank_scores": output}
    return Output().add_as_json(result)

Overwriting bge-reranker-large_deploy_code/model.py


In [13]:
# S3 location of the uploaded model files; a later cell writes it into serving.properties.
s3_path = f"s3://{bucket}/{s3_model_prefix}/"
print(f"option.s3url ==> {s3_path}")

option.s3url ==> s3://aws-gcr-csdc-atl-exp-us-west-2/aigc-embedding-models/bge-reranker-large/


#### Note: option.s3url 需要指向自己账号下的模型路径（即上一个 cell 的输出）。下面文件中的 S3PATH 只是占位符，后面的 sed cell 会自动将其替换为该路径，一般无需手动修改

In [14]:
%%writefile {local_code_dir}/serving.properties
engine=Python
option.tensor_parallel_degree=1
# S3PATH below is a placeholder: a later notebook cell substitutes the real
# s3://<bucket>/<prefix>/ model location with sed. Keep the next line unchanged.
option.s3url = S3PATH

Overwriting bge-reranker-large_deploy_code/serving.properties


In [15]:
%%writefile {local_code_dir}/requirements.txt
protobuf==3.20.0

Overwriting bge-reranker-large_deploy_code/requirements.txt


In [16]:
# Replace the S3PATH placeholder in serving.properties with the model's S3 location
# ("|" is used as the sed delimiter because the path contains "/").
!sed -i "s|option.s3url = S3PATH|option.s3url = {s3_path}|" {local_code_dir}/serving.properties

In [17]:
!rm s2e_model.tar.gz
!cd {local_code_dir} && rm -rf ".ipynb_checkpoints"
!tar czvf s2e_model.tar.gz {local_code_dir}

bge-reranker-large_deploy_code/
bge-reranker-large_deploy_code/model.py
bge-reranker-large_deploy_code/serving.properties
bge-reranker-large_deploy_code/requirements.txt


In [18]:
# Upload the code archive under s3_code_prefix in the bucket. The returned S3 URI
# is passed as ModelDataUrl when the SageMaker model is created.
s3_code_artifact = sess.upload_data("s2e_model.tar.gz", bucket, s3_code_prefix)
print(f"S3 Code or Model tar ball uploaded to --- > {s3_code_artifact}")

S3 Code or Model tar ball uploaded to --- > s3://aws-gcr-csdc-atl-exp-us-west-2/aigc-embedding-models/bge-reranker-large_deploy_code/s2e_model.tar.gz


### 4. 创建模型 & 创建endpoint

In [19]:
from sagemaker.utils import name_from_base
import boto3

# Unique resource name derived from model_name (name_from_base appends a timestamp,
# see the printed value). '.' and '_' are replaced by '-', presumably because
# SageMaker resource names do not accept them - confirm.
model_name_ep = name_from_base(f"{model_name}").replace('.','-').replace('_','-')
print(model_name_ep)
print(f"Image going to be used is ---- > {inference_image_uri}")

# Register the model: container image plus the code archive uploaded above.
# `role` comes from the environment cell at the top of the notebook.
create_model_response = sm_client.create_model(
    ModelName=model_name_ep,
    ExecutionRoleArn=role,
    PrimaryContainer={
        "Image": inference_image_uri,
        "ModelDataUrl": s3_code_artifact
    },
    
)
model_arn = create_model_response["ModelArn"]

print(f"Created Model: {model_arn}")

bge-reranker-large-2023-11-21-06-11-35-036
Image going to be used is ---- > 763104351884.dkr.ecr.us-west-2.amazonaws.com/djl-inference:0.21.0-deepspeed0.8.3-cu117


Created Model: arn:aws:sagemaker:us-west-2:316327952690:model/bge-reranker-large-2023-11-21-06-11-35-036


#### 推理机型选择 (https://aws.amazon.com/cn/sagemaker/pricing/)
- GPU
  + ml.g4dn.xlarge 按需价格 0.526 USD/Hour
- CPU
  + ml.c5.xlarge   按需价格 0.204 USD/Hour

注意：下面的 endpoint 配置实际使用的是 ml.g5.2xlarge；如需使用上述机型，请修改 InstanceType。

In [20]:
# Names of the endpoint config and the endpoint, both derived from the model name.
endpoint_config_name = f"{model_name_ep}-config"
endpoint_name = f"{model_name_ep}-endpoint"

# NOTE(review): the instance-type list in the markdown above mentions
# ml.g4dn.xlarge / ml.c5.xlarge, while this config uses ml.g5.2xlarge.
endpoint_config_response = sm_client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[
        {
            "VariantName": "variant1",
            "ModelName": model_name_ep,
            "InstanceType": "ml.g5.2xlarge",
            "InitialInstanceCount": 1,
            # "VolumeSizeInGB" : 400,
            # "ModelDataDownloadTimeoutInSeconds": 2400,
            # 15 minutes, expressed in seconds.
            "ContainerStartupHealthCheckTimeoutInSeconds": 15*60,
        },
    ],
)
endpoint_config_response

{'EndpointConfigArn': 'arn:aws:sagemaker:us-west-2:316327952690:endpoint-config/bge-reranker-large-2023-11-21-06-11-35-036-config',
 'ResponseMetadata': {'RequestId': '74a2aa52-227c-441c-a628-31c1edc1c147',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '74a2aa52-227c-441c-a628-31c1edc1c147',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '130',
   'date': 'Tue, 21 Nov 2023 06:11:35 GMT'},
  'RetryAttempts': 0}}

In [21]:
endpoint_name

'bge-reranker-large-2023-11-21-06-11-35-036-endpoint'

In [22]:
# Version tag: deployment date stamp followed by the pinned model commit.
tag = "-".join([current_time, commit_hash])

# Create the endpoint from the config defined above and attach the version tag.
create_endpoint_response = sm_client.create_endpoint(
    EndpointName=str(endpoint_name),
    EndpointConfigName=endpoint_config_name,
    Tags=[{"Key": "version", "Value": tag}],
)
print(f"Created Endpoint: {create_endpoint_response['EndpointArn']}")

Created Endpoint: arn:aws:sagemaker:us-west-2:316327952690:endpoint/bge-reranker-large-2023-11-21-06-11-35-036-endpoint


In [23]:
import time

resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
status = resp["EndpointStatus"]
print("Status: " + status)

# Poll once a minute for as long as the endpoint is still being created.
while status == "Creating":
    time.sleep(60)
    resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
    status = resp["EndpointStatus"]
    print("Status: " + status)

print("Arn: " + resp["EndpointArn"])
print("Status: " + status)

# Fail loudly instead of letting the test cells below run against an endpoint
# that never came up.
if status == "Failed":
    raise RuntimeError(
        f"Endpoint {endpoint_name} failed to deploy: "
        f"{resp.get('FailureReason', 'no failure reason reported')}"
    )

Status: Creating


Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: InService
Arn: arn:aws:sagemaker:us-west-2:316327952690:endpoint/bge-reranker-large-2023-11-21-06-11-35-036-endpoint
Status: InService


### 5. 模型测试

In [None]:
def get_vector_by_sm_endpoint(questions, sm_client, endpoint_name):
    """Invoke an endpoint and return the ``rerank_scores`` field of its JSON reply.

    Args:
        questions: JSON-serialisable payload, sent as ``{"inputs": questions}``.
            The rerank test cell passes a list of ``[query, passage]`` pairs.
        sm_client: Object exposing ``invoke_endpoint`` (the notebook passes the
            boto3 ``sagemaker-runtime`` client).
        endpoint_name: Name of the endpoint to call.

    Returns:
        The value stored under ``"rerank_scores"`` in the decoded response body.

    Raises:
        KeyError: If the response JSON has no ``"rerank_scores"`` key.
    """
    response_model = sm_client.invoke_endpoint(
        EndpointName=endpoint_name,
        Body=json.dumps({"inputs": questions}),
        ContentType="application/json",
    )
    # The body is a stream of UTF-8 encoded JSON.
    payload = json.loads(response_model['Body'].read().decode('utf8'))
    return payload['rerank_scores']

In [25]:
# Two [query, passage] pairs for the same query: an unrelated passage and a relevant one.
pairs = [['what is panda?', 'hi'], ['what is panda?', 'The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China.']]

# To test an endpoint deployed earlier, set its name here, e.g.:
# endpoint_name = "bge-reranker-large-2023-11-21-03-59-28-576-endpoint"

# NOTE(review): the saved output of this cell (KeyError: 'sentence_embeddings') does not
# match the current helper, which reads 'rerank_scores' - re-run to refresh it.
res = get_vector_by_sm_endpoint(pairs, smr_client, endpoint_name)

KeyError: 'sentence_embeddings'

In [None]:
res

[[0.0029659271240234375,
  0.020538330078125,
  -0.028289794921875,
  0.005054473876953125,
  0.04986572265625,
  0.0618896484375,
  -0.02288818359375,
  -0.006893157958984375,
  -0.0166015625,
  0.03216552734375,
  0.00055694580078125,
  0.01629638671875,
  0.0005846023559570312,
  -0.01776123046875,
  0.0380859375,
  -0.027435302734375,
  0.11285400390625,
  0.0258636474609375,
  0.027313232421875,
  -0.00858306884765625,
  0.047149658203125,
  -0.022430419921875,
  -0.00611114501953125,
  0.0090484619140625,
  -0.0687255859375,
  -0.043670654296875,
  -0.029052734375,
  0.016082763671875,
  0.0411376953125,
  -0.01116943359375,
  -0.0254974365234375,
  0.0306549072265625,
  -0.0157470703125,
  0.039031982421875,
  0.0164337158203125,
  -0.00998687744140625,
  -0.0133209228515625,
  -0.018341064453125,
  -0.0007719993591308594,
  -0.009552001953125,
  0.03173828125,
  0.056488037109375,
  -0.0142364501953125,
  -0.035919189453125,
  0.034881591796875,
  -0.0292205810546875,
  -0.0386

In [None]:
# test for zh endpoint
import numpy as np

# NOTE(review): this overwrites the global endpoint_name with a hard-coded name of a
# different (bge-large-zh) endpoint, so cells run afterwards no longer target the
# endpoint created above.
endpoint_name = "bge-large-zh-v1-5-2023-11-15-06-52-26-105-endpoint"


# Two groups of two sentences each; every sentence of cand1 is compared with every sentence of cand2.
cand1 = []
cand1.append("AWS提供的关键服务有哪些，以支持不同类型的企业需求和工作负载？")
cand1.append("Amazon SageMaker提供哪些关键功能，以支持企业在机器学习领域的需求和挑战？")
cand2 = []
cand2.append("AWS的全球基础架构是如何支持高可靠性和可扩展性，以满足企业的云计算需求？")
cand2.append("Amazon SageMaker如何简化机器学习模型的开发、训练和部署过程，以提高企业的生产效率和创新能力？")

cand = cand1 + cand2

# NOTE(review): get_vector_by_sm_endpoint in this notebook returns the 'rerank_scores'
# field, while the scoring below treats each entry of res as a vector - confirm
# against the endpoint that is actually called.
res = get_vector_by_sm_endpoint(cand, smr_client, endpoint_name)

# First two results belong to cand1, the remaining ones to cand2.
cand1_embed = res[0:2]
cand2_embed = res[2:]

# Not used further in this cell.
results = {}

# Print the dot product for every (cand1, cand2) combination.
for idx1, cand1_score in enumerate(cand1_embed):
    for idx2, cand2_score in enumerate(cand2_embed):
        # print(cand2_score)
        print(f"{cand1[idx1]}")
        print(f"{cand2[idx2]}") 
        print(f"score: {np.dot(np.array(cand1_score), np.array(cand2_score).T)}")

AWS提供的关键服务有哪些，以支持不同类型的企业需求和工作负载？
AWS的全球基础架构是如何支持高可靠性和可扩展性，以满足企业的云计算需求？
score: 0.7131936687519556
AWS提供的关键服务有哪些，以支持不同类型的企业需求和工作负载？
Amazon SageMaker如何简化机器学习模型的开发、训练和部署过程，以提高企业的生产效率和创新能力？
score: 0.528399250037694
Amazon SageMaker提供哪些关键功能，以支持企业在机器学习领域的需求和挑战？
AWS的全球基础架构是如何支持高可靠性和可扩展性，以满足企业的云计算需求？
score: 0.5657403004042862
Amazon SageMaker提供哪些关键功能，以支持企业在机器学习领域的需求和挑战？
Amazon SageMaker如何简化机器学习模型的开发、训练和部署过程，以提高企业的生产效率和创新能力？
score: 0.717500172299026


In [None]:
# test for en endpoint
import numpy as np

# NOTE(review): this overwrites the global endpoint_name with a hard-coded name of a
# different (bge-large-en) endpoint, so cells run afterwards no longer target the
# endpoint created above.
endpoint_name = "bge-large-en-v1-5-2023-11-15-06-19-29-526-endpoint"


# Two groups of two sentences each; every sentence of cand1 is compared with every sentence of cand2.
cand1 = []
cand1.append("What are some key advantages of adopting AWS for cloud computing compared to traditional on-premises infrastructure?")
cand1.append("What are the key advantages of using Amazon SageMaker for machine learning compared to traditional ML development workflows?")
cand2 = []
cand2.append("Can you explain the main benefits that businesses can gain by leveraging AWS services for their cloud computing needs?")
cand2.append("How does Amazon SageMaker simplify the process of building, training, and deploying machine learning models compared to traditional approaches?")

cand = cand1 + cand2

# NOTE(review): get_vector_by_sm_endpoint in this notebook returns the 'rerank_scores'
# field, while the scoring below treats each entry of res as a vector - confirm
# against the endpoint that is actually called.
res = get_vector_by_sm_endpoint(cand, smr_client, endpoint_name)

# First two results belong to cand1, the remaining ones to cand2.
cand1_embed = res[0:2]
cand2_embed = res[2:]

# Not used further in this cell.
results = {}

# Print the dot product for every (cand1, cand2) combination.
for idx1, cand1_score in enumerate(cand1_embed):
    for idx2, cand2_score in enumerate(cand2_embed):
        # print(cand2_score)
        print(f"{cand1[idx1]}")
        print(f"{cand2[idx2]}") 
        print(f"score: {np.dot(np.array(cand1_score), np.array(cand2_score).T)}")

What are some key advantages of adopting AWS for cloud computing compared to traditional on-premises infrastructure?
Can you explain the main benefits that businesses can gain by leveraging AWS services for their cloud computing needs?
score: 0.8515167698601687
What are some key advantages of adopting AWS for cloud computing compared to traditional on-premises infrastructure?
How does Amazon SageMaker simplify the process of building, training, and deploying machine learning models compared to traditional approaches?
score: 0.5905170103072663
What are the key advantages of using Amazon SageMaker for machine learning compared to traditional ML development workflows?
Can you explain the main benefits that businesses can gain by leveraging AWS services for their cloud computing needs?
score: 0.5681599540677098
What are the key advantages of using Amazon SageMaker for machine learning compared to traditional ML development workflows?
How does Amazon SageMaker simplify the process of buildi

In [None]:
# Single Chinese prompt, sent as a one-element list to whichever endpoint
# endpoint_name currently refers to (it is reassigned by the test cells above).
prompts1 = """请问AWS Clean Rooms是多方都会收费吗？"""

res = get_vector_by_sm_endpoint([prompts1], smr_client, endpoint_name)

In [None]:
# Single English prompt, sent as a one-element list to whichever endpoint
# endpoint_name currently refers to (it is reassigned by the test cells above).
prompts1 = """What is the purpose of Amazon S3 in AWS, and how is it typically used?"""

res = get_vector_by_sm_endpoint([prompts1], smr_client, endpoint_name)

In [None]:
res

[[0.02490234375,
  0.023651123046875,
  -0.027740478515625,
  0.0184173583984375,
  -0.04144287109375,
  0.0265350341796875,
  -0.01412200927734375,
  0.024078369140625,
  0.0567626953125,
  0.03436279296875,
  -0.0024242401123046875,
  -0.0408935546875,
  -0.0012235641479492188,
  -0.0306854248046875,
  -0.039947509765625,
  -0.0174407958984375,
  -0.02435302734375,
  0.01317596435546875,
  -0.02569580078125,
  -0.01448822021484375,
  0.067626953125,
  -0.006134033203125,
  -0.040252685546875,
  -0.032440185546875,
  -0.0265655517578125,
  0.0374755859375,
  -0.0300750732421875,
  0.0093536376953125,
  0.07977294921875,
  0.059112548828125,
  -0.046112060546875,
  -0.00732421875,
  -0.0013217926025390625,
  -0.04296875,
  0.0223388671875,
  -0.010772705078125,
  -0.0182342529296875,
  -0.0014886856079101562,
  -0.048614501953125,
  -0.031494140625,
  0.0305328369140625,
  0.01317596435546875,
  -0.004276275634765625,
  -0.0253143310546875,
  -0.054840087890625,
  0.01605224609375,
  0