### 1. 安装HuggingFace 并下载模型到本地

In [None]:
!pip install huggingface-hub -Uqq
!pip install -U sagemaker

In [None]:
# # bge-en

# from huggingface_hub import snapshot_download
# from pathlib import Path

# # local_model_path = Path("./buffer-embedding-002-model")
# local_model_path_name = "./BAAI_bge_large_en_v1.5"
# model_hf_name = "BAAI/bge-large-en-v1.5"
# model_name = model_hf_name.split('/')[-1]
# commit_hash = "5888da4a3a013e65d33dd6f612ecd4625eb87a7d"

# local_model_path = Path(local_model_path_name)
# local_model_path.mkdir(exist_ok=True)
# model_name = model_name
# commit_hash = commit_hash

In [None]:
# bge-zh

from huggingface_hub import snapshot_download
from pathlib import Path

# Local cache directory for the snapshot, plus the Hugging Face repo id and the revision to fetch.
local_model_path_name = "./BAAI_bge_large_zh_v1.5"
model_hf_name = "BAAI/bge-large-zh-v1.5"
# Repo id without the organisation prefix; later cells build the S3 prefixes from it.
model_name = model_hf_name.split('/')[-1]
commit_hash = "b5c9d86d763d9945f7c0a73e549a4a39c423d520"

local_model_path = Path(local_model_path_name)
# parents=True keeps this working if the path is later changed to a nested directory.
local_model_path.mkdir(parents=True, exist_ok=True)

In [None]:
# Download the requested revision of the model repo into the local cache directory.
snapshot_download(
    repo_id=model_hf_name,
    revision=commit_hash,
    cache_dir=local_model_path,
)

In [None]:
import os
from dotenv import load_dotenv
load_dotenv()
name="ATL_UW2"
profile_name = os.environ[f"{name}"]
%env AWS_DEFAULT_PROFILE = {profile_name}
s3_bucket_name=os.environ[f"{name}_S3_BUCKET_NAME"]
role=os.environ[f"{name}_ROLE"]
s3_bucket_name

In [None]:
from datetime import datetime

# Take one timestamp so year/month/day cannot disagree across a midnight boundary.
_now = datetime.now()
currentDay = _now.day
currentMonth = _now.month
currentYear = _now.year

# Zero-padded YYYYMMDD: without padding, 2023-01-11 and 2023-11-01 would both become "2023111".
current_time = _now.strftime("%Y%m%d")
current_time

In [None]:
import sagemaker
from sagemaker import image_uris
import boto3
import os
import time
import json

# role = sagemaker.get_execution_role()  # execution role for the endpoint
sess = sagemaker.session.Session()  # sagemaker session for interacting with different AWS APIs
# Use the bucket configured in the environment rather than sess.default_bucket():
# the next cell sets `bucket = s3_bucket_name` anyway, and default_bucket() may create
# the account's default SageMaker bucket as a side effect when it does not exist yet.
bucket = s3_bucket_name

### 2. 把模型拷贝到S3为后续部署做准备

In [None]:
import sagemaker
from sagemaker import image_uris
import boto3
import os
import time
import json

#role = sagemaker.get_execution_role()  # execution role for the endpoint
sess = sagemaker.session.Session()  # sagemaker session for interacting with different AWS APIs
# bucket = sess.default_bucket()  # bucket to house artifacts
bucket = s3_bucket_name  # bucket to house artifacts

# Read the region through the public attribute instead of the private `_region_name`.
region = sess.boto_region_name
account_id = sess.account_id()

# Low-level clients: S3, the SageMaker control plane, and the SageMaker runtime used for invocations.
s3_client = boto3.client("s3")
sm_client = boto3.client("sagemaker")
smr_client = boto3.client("sagemaker-runtime")

In [None]:
s3_model_prefix = f"aigc-embedding-models/{model_name}"  # folder where model checkpoint will go

# Locate the downloaded snapshot directory. Prefer the one named after the requested commit so
# that a snapshot of another revision sitting in the same cache is not picked by accident;
# fall back to the first directory (sorted, so the choice is deterministic) otherwise.
_snapshot_dirs = sorted(local_model_path.glob("**/snapshots/*"))
if not _snapshot_dirs:
    raise FileNotFoundError(
        f"No snapshot found under {local_model_path}; run the snapshot_download cell first."
    )
model_snapshot_path = next(
    (p for p in _snapshot_dirs if p.name == commit_hash),
    _snapshot_dirs[0],
)

s3_code_prefix = f"aigc-embedding-models/{model_name}_deploy_code"
print(f"s3_code_prefix: {s3_code_prefix}")
print(f"model_snapshot_path: {model_snapshot_path}")

In [None]:
# Recursively copy the local snapshot directory to the model prefix in the artifact bucket.
!aws s3 cp --recursive {model_snapshot_path} s3://{bucket}/{s3_model_prefix}

### 3. 模型部署准备（entrypoint脚本，容器镜像，服务配置）

In [None]:
# DJL inference (DeepSpeed) container image. AWS China regions ("cn-*") use a different
# registry account and the amazonaws.com.cn domain, so both are chosen from `region`
# instead of requiring a manual edit of this cell.
if region.startswith("cn-"):
    _ecr_account, _ecr_domain = "727897471807", "amazonaws.com.cn"
else:
    _ecr_account, _ecr_domain = "763104351884", "amazonaws.com"

inference_image_uri = (
    f"{_ecr_account}.dkr.ecr.{region}.{_ecr_domain}/djl-inference:0.21.0-deepspeed0.8.3-cu117"
)

print(f"Image going to be used is ---- > {inference_image_uri}")

In [None]:
local_code_dir = s3_code_prefix.split('/')[-1]
!mkdir -p {local_code_dir}

In [None]:
%%writefile {local_code_dir}/model.py
from djl_python import Input, Output
import torch
import logging
import math
import os
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, AutoModel

# Use the first CUDA device when one is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print('--device={}'.format(device))


def load_model(properties):
    """Load the embedding model and its tokenizer for inference.

    Args:
        properties: Mapping of serving properties. ``model_dir`` is required;
            ``model_id``, when present, takes precedence as the load location.

    Returns:
        A ``(model, tokenizer)`` tuple. The model is in eval mode with gradients
        disabled and has been moved to the module-level ``device``.

    Raises:
        KeyError: If ``properties`` has no ``model_dir`` entry.
    """
    model_location = properties['model_dir']
    if "model_id" in properties:
        model_location = properties['model_id']
    logging.info(f"Loading model in {model_location}")

    # NOTE(review): trust_remote_code=True lets Python code shipped with the checkpoint run
    # in this process; only point this at model sources you trust.
    tokenizer = AutoTokenizer.from_pretrained(model_location, trust_remote_code=True)
    tokenizer.padding_side = 'right'
    model = AutoModel.from_pretrained(
        model_location,
        trust_remote_code=True
    )
    # Use fp16 only on GPU. On CPU many PyTorch builds lack half-precision kernels for
    # the layers involved, so the model stays in its default dtype there.
    if device.type == 'cuda':
        model = model.half()
    model.to(device)
    model.requires_grad_(False)
    model.eval()

    return model, tokenizer


# Module-level slots, empty until first use; `handle` fills `model` and `tokenizer` on its first call.
model = tokenizer = generator = None

def mean_pooling(model_output, attention_mask):
    """Average token embeddings over dimension 1, ignoring masked-out positions.

    Args:
        model_output: Indexable whose first element holds the token embeddings
            (presumably shaped (batch, seq, hidden) -- confirm against the model).
        attention_mask: Mask with one entry per token; it is broadcast over the
            last dimension, so positions holding 0 do not contribute.

    Returns:
        Tensor of masked means on ``device``. The divisor is clamped to at
        least 1e-9, so a fully masked row yields zeros instead of dividing by zero.
    """
    hidden = model_output[0].to(device)
    mask = attention_mask.unsqueeze(-1).expand(hidden.size()).float().to(device)
    weighted_sum = torch.sum(hidden * mask, 1)
    token_counts = torch.clamp(mask.sum(1), min=1e-9)
    return weighted_sum / token_counts


def handle(inputs: Input):
    """Request handler for the djl_python runtime: embed the texts of a JSON request.

    The request body must be JSON with an ``"inputs"`` key holding the text(s)
    to embed. The texts are tokenized (padded, truncated to 512 tokens) and run
    through the model; each text is represented by the L2-normalised hidden
    state at sequence position 0 (first-token pooling, not mean or max pooling).

    Args:
        inputs: The djl_python ``Input`` for this invocation.

    Returns:
        ``None`` when the input is empty, otherwise an ``Output`` whose JSON
        body is ``{"sentence_embeddings": [[...], ...]}``.

    Raises:
        KeyError: If the request JSON has no ``"inputs"`` key.
    """
    global model, tokenizer
    # Load lazily on the first call and reuse the objects afterwards.
    if model is None:
        model, tokenizer = load_model(inputs.get_properties())

    if inputs.is_empty():
        return None
    data = inputs.get_as_json()

    input_sentences = data["inputs"]
    logging.info(f"inputs: {input_sentences}")

    encoded_input = tokenizer(input_sentences, padding=True, truncation=True, max_length=512, return_tensors='pt').to(device)
    with torch.no_grad():
        model_output = model(**encoded_input)
        # Keep only the hidden state at position 0 of every sequence.
        sentence_embeddings = model_output[0][:, 0]

    # Unit-length vectors, so a dot product between two embeddings is their cosine similarity.
    sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)

    # Convert to nested Python lists so the payload is JSON-serialisable without relying on
    # the encoder knowing about numpy arrays.
    result = {"sentence_embeddings": sentence_embeddings.cpu().numpy().tolist()}
    return Output().add_as_json(result)

In [None]:
# S3 location (with trailing slash) of the uploaded model files; used as option.s3url.
s3_path = "s3://{}/{}/".format(bucket, s3_model_prefix)
print("option.s3url ==> {}".format(s3_path))

#### Note: option.s3url 需要指向自己账号下的模型路径（即上一个 cell 输出的 s3 路径）；后面的替换 cell 会自动把占位符 S3PATH 替换为该路径，通常无需手动修改

In [None]:
%%writefile {local_code_dir}/serving.properties
engine=Python
option.tensor_parallel_degree=1
# The s3url value below is a placeholder; a later notebook cell substitutes the real s3:// model prefix.
option.s3url = S3PATH

In [None]:
from pathlib import Path

# Fill in the real model location. Done in Python rather than `sed -i` so it also works where
# sed's in-place flag behaves differently (e.g. BSD/macOS) and is unaffected by sed
# metacharacters such as '&' or '|' appearing in the path.
_props_file = Path(local_code_dir) / "serving.properties"
_props_text = _props_file.read_text().replace(
    "option.s3url = S3PATH", f"option.s3url = {s3_path}"
)
with open(_props_file, "w") as _fh:
    _fh.write(_props_text)

In [None]:
!rm s2e_model.tar.gz
!cd {local_code_dir} && rm -rf ".ipynb_checkpoints"
!tar czvf s2e_model.tar.gz {local_code_dir}

In [None]:
# Upload the packaged archive under the code prefix; the returned value is used later as ModelDataUrl.
s3_code_artifact = sess.upload_data(
    "s2e_model.tar.gz",
    bucket,
    s3_code_prefix,
)
print("S3 Code or Model tar ball uploaded to --- > {}".format(s3_code_artifact))

### 4. 创建模型 & 创建endpoint

In [None]:
from sagemaker.utils import name_from_base
import boto3

# Derive the SageMaker model name from the Hugging Face repo id rather than from the current
# value of `model_name`: this cell overwrites `model_name`, so building on it would stack one
# more timestamp onto the name every time the cell is re-run.
# '.' and '_' are replaced with '-' to keep the name within what SageMaker resource names accept.
model_name = name_from_base(model_hf_name.split('/')[-1]).replace('.','-').replace('_','-')
print(model_name)
print(f"Image going to be used is ---- > {inference_image_uri}")

# Register the model: the container image plus the tarball holding model.py and serving.properties.
create_model_response = sm_client.create_model(
    ModelName=model_name,
    ExecutionRoleArn=role,
    PrimaryContainer={
        "Image": inference_image_uri,
        "ModelDataUrl": s3_code_artifact
    },
)
model_arn = create_model_response["ModelArn"]

print(f"Created Model: {model_arn}")

#### 推理机型选择 (https://aws.amazon.com/cn/sagemaker/pricing/)
- GPU
  + ml.g4dn.xlarge 按需价格 0.526 USD/Hour
- CPU
  + ml.c5.xlarge   按需价格 0.204 USD/Hour

注意：下面的 endpoint 配置实际使用的机型是 ml.g4dn.2xlarge；如需使用上面列出的机型，请相应修改 `InstanceType`。

In [None]:
endpoint_config_name = "{}-config".format(model_name)
endpoint_name = "{}-endpoint".format(model_name)

# Single production variant that serves the model on one instance.
_variant = {
    "VariantName": "variant1",
    "ModelName": model_name,
    "InstanceType": "ml.g4dn.2xlarge",
    "InitialInstanceCount": 1,
    # Optional settings, currently disabled:
    # "VolumeSizeInGB" : 400,
    # "ModelDataDownloadTimeoutInSeconds": 2400,
    # Startup health-check timeout of 15 minutes, expressed in seconds.
    "ContainerStartupHealthCheckTimeoutInSeconds": 15 * 60,
}

endpoint_config_response = sm_client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[_variant],
)
endpoint_config_response

In [None]:
# Display the endpoint name.
endpoint_name

In [None]:
# Version tag: the date stamp followed by the model commit hash.
tag = "{}-{}".format(current_time, commit_hash)

create_endpoint_response = sm_client.create_endpoint(
    EndpointName=endpoint_name,
    EndpointConfigName=endpoint_config_name,
    Tags=[{"Key": "version", "Value": tag}],
)
print("Created Endpoint: {}".format(create_endpoint_response['EndpointArn']))

In [None]:
import time

# Poll once a minute until the endpoint leaves the "Creating" state.
resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
status = resp["EndpointStatus"]
print("Status: " + status)

while status == "Creating":
    time.sleep(60)
    resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
    status = resp["EndpointStatus"]
    print("Status: " + status)

print("Arn: " + resp["EndpointArn"])
print("Status: " + status)

# Leaving "Creating" does not mean success (the endpoint may have failed). Stop here with the
# reported reason instead of letting later cells invoke an endpoint that is not serving.
if status != "InService":
    raise RuntimeError(
        f"Endpoint {endpoint_name} ended in status {status}: "
        f"{resp.get('FailureReason', 'no failure reason reported')}"
    )

### 5. 模型测试

In [None]:
def get_vector_by_sm_endpoint(questions, sm_client, endpoint_name):
    """Return the embeddings a deployed endpoint produces for ``questions``.

    Args:
        questions: Text(s) to embed; sent as the ``"inputs"`` field of the JSON request.
        sm_client: Client object exposing ``invoke_endpoint``. Despite the name, the
            callers in this notebook pass the ``sagemaker-runtime`` client.
        endpoint_name: Name of the endpoint to invoke.

    Returns:
        The value stored under ``"sentence_embeddings"`` in the endpoint's JSON response.

    Raises:
        KeyError: If the response JSON has no ``"sentence_embeddings"`` key.
    """
    # Only the texts are sent; the handler takes no generation-style parameters.
    payload = json.dumps({"inputs": questions})
    response_model = sm_client.invoke_endpoint(
        EndpointName=endpoint_name,
        Body=payload,
        ContentType="application/json",
    )
    json_str = response_model['Body'].read().decode('utf8')
    return json.loads(json_str)['sentence_embeddings']

In [None]:
sentences = ['你分享的信息显示你想查看语法"ALTER SYSTEM SET SMTP_OUT_SERVER =…"的文档？  从RDS Oracle公共文档进一步研究，我能找到：[+]<https://docs.aws.amazon.com/AmazonRDS/latest/UserGuide/Oracle.Options.UTLMAIL.html>  SMTP_OUT_SERVER是一个客户可以设置的参数。为了更改系统，客户需要修改该参数。  "您可以使用DB参数组为DB实例设置默认SMTP_OUT_SERVER。您可以通过在DB实例上运行以下代码来为会话设置SMTP_OUT_SERVER参数。   ALTER SESSION SET smtp_out_server = mailserver.domain.com:25 ;"  希望以上内容有所帮助。', 'Title:有关Oracle"ALTER SYSTEM SET SMTP_OUT_SERVER =…"的rdsadmin语法请求帮助。 Content:这似乎是那些没有被文档化为"通用示例"的特定命令之一，当然我们无法查看rdsadmin_util软件包中的过程。至于邮件服务器，它的解析源是URL而不是IP地址。谢谢。 Answer:你分享的信息显示你想查看语法"ALTER SYSTEM SET SMTP_OUT_SERVER =…"的文档？  从RDS Oracle公共文档进一步研究，我能找到：[+]<https://docs.aws.amazon.com/AmazonRDS/latest/UserGuide/Oracle.Options.UTLMAIL.html>  SMTP_OUT_SERVER是一个客户可以设置的参数。为了更改系统，客户需要修改该参数。  "您可以使用DB参数组为DB实例设置默认SMTP_OUT_SERVER。您可以通过在DB实例上运行以下代码来为会话设置SMTP_OUT_SERVER参数。   ALTER SESSION SET smtp_out_server = mailserve', '在本地环境中，最佳的处理数据库的方式是什么？ ', '我们目前正在使用Aurora RDS postgres数据库。最初，我们的数据库很小，容易管理，我们将转储发送到S3并提供给开发人员开发新功能和解决错误。  目前我们的数据库重达700GB，**最好的策略是什么，以便开发人员可以获得数据？**下载整个数据库是不可能的，这将浪费大量时间成本。最重要的是，开发人员能够获得数据库的副本来构建开发环境，以支持他们开发新功能和解决缺陷。 ', '首先，考虑一下是否拥有一个所有开发人员都连接的单个副本可以满足您的需求。他们可以远程连接，并且它将是您生产数据的一个副本。因此，它非常棒，因为很容易定期更新并保证覆盖所有生产用例。它有很多优势。  但这可能不能满足您的需求。您说您需要本地数据库。在这种情况下，我认为您有两条主要路径要考虑。  1. 生成示例数据 2. 创建提取生产数据子集的过程  在开始时，生成示例数据可能需要更多的工作。但它有一些不错的优点。很容易确保您生成所需的数据。它将是参数化的，因此每个开发人员在那个时刻生成他/她关心的数据。下载大数据集没有网络问题。  但如果您确实需要提取主数据库的一部分，那么您需要将其视为提取-转换-加载(ETL)项目。使用数据集成(DI/ETL)工具连接到主数据库并提取一些子集。理想情况下，子集将很容易定义。也许对于大多数表，您只需取最近2个月的数据，对于其他表(如参考表)，则取整个表。定义所有单个映射可能需要很大的努力...但实际上并不复杂。您可以决定详细信息，如将数据加载到另一个数据库或保存到CSV文件中。然后使数据库转储或CSV文件可供您的开发人员使用。作为一名开发人员，您可能会倾向于编写自己的脚本来执行']

In [None]:
# Cap every sentence at 512 characters before sending it to the endpoint.
sent_all = list(map(lambda text: text[:512], sentences))

# NOTE(review): hard-coded name of a previously deployed endpoint; it replaces the
# `endpoint_name` set by the deployment cells -- confirm it exists in your account.
endpoint_name = "bge-large-zh-v1-5-2023-11-15-06-52-26-105-endpoint"

res = get_vector_by_sm_endpoint(sent_all, smr_client, endpoint_name)

In [None]:
# Display the embeddings returned by the previous call.
res

In [None]:
# test for zh endpoint
import numpy as np

# NOTE(review): hard-coded name of a previously deployed endpoint -- confirm it exists in your account.
endpoint_name = "bge-large-zh-v1-5-2023-11-15-06-52-26-105-endpoint"

# Two groups of questions; cand1[i] and cand2[i] are about the same topic.
cand1 = [
    "AWS提供的关键服务有哪些，以支持不同类型的企业需求和工作负载？",
    "Amazon SageMaker提供哪些关键功能，以支持企业在机器学习领域的需求和挑战？",
]
cand2 = [
    "AWS的全球基础架构是如何支持高可靠性和可扩展性，以满足企业的云计算需求？",
    "Amazon SageMaker如何简化机器学习模型的开发、训练和部署过程，以提高企业的生产效率和创新能力？",
]

cand = cand1 + cand2

res = get_vector_by_sm_endpoint(cand, smr_client, endpoint_name)

# Split by the size of the first group rather than a hard-coded 2, so the lists can grow.
cand1_embed = res[:len(cand1)]
cand2_embed = res[len(cand1):]

# Print the dot product of every cand1/cand2 pair. If the endpoint runs the model.py written
# above, the vectors are L2-normalised and this equals their cosine similarity.
for question1, embedding1 in zip(cand1, cand1_embed):
    for question2, embedding2 in zip(cand2, cand2_embed):
        print(f"{question1}")
        print(f"{question2}")
        print(f"score: {np.dot(np.array(embedding1), np.array(embedding2))}")

In [None]:
# test for en endpoint
import numpy as np

# NOTE(review): hard-coded name of a previously deployed endpoint -- confirm it exists in your account.
endpoint_name = "bge-large-en-v1-5-2023-11-15-06-19-29-526-endpoint"

# Two groups of questions; cand1[i] and cand2[i] are about the same topic.
cand1 = [
    "What are some key advantages of adopting AWS for cloud computing compared to traditional on-premises infrastructure?",
    "What are the key advantages of using Amazon SageMaker for machine learning compared to traditional ML development workflows?",
]
cand2 = [
    "Can you explain the main benefits that businesses can gain by leveraging AWS services for their cloud computing needs?",
    "How does Amazon SageMaker simplify the process of building, training, and deploying machine learning models compared to traditional approaches?",
]

cand = cand1 + cand2

res = get_vector_by_sm_endpoint(cand, smr_client, endpoint_name)

# Split by the size of the first group rather than a hard-coded 2, so the lists can grow.
cand1_embed = res[:len(cand1)]
cand2_embed = res[len(cand1):]

# Print the dot product of every cand1/cand2 pair. If the endpoint runs the model.py written
# above, the vectors are L2-normalised and this equals their cosine similarity.
for question1, embedding1 in zip(cand1, cand1_embed):
    for question2, embedding2 in zip(cand2, cand2_embed):
        print(f"{question1}")
        print(f"{question2}")
        print(f"score: {np.dot(np.array(embedding1), np.array(embedding2))}")

In [None]:
# Single-sentence request with a Chinese prompt.
# NOTE(review): `endpoint_name` is whatever the previously executed cell left it as -- confirm
# it refers to the zh endpoint before running this.
prompts1 = "请问AWS Clean Rooms是多方都会收费吗？"

res = get_vector_by_sm_endpoint(
    [prompts1],
    smr_client,
    endpoint_name,
)

In [None]:
# Single-sentence request with an English prompt; uses the current value of `endpoint_name`.
prompts1 = "What is the purpose of Amazon S3 in AWS, and how is it typically used?"

res = get_vector_by_sm_endpoint(
    [prompts1],
    smr_client,
    endpoint_name,
)

In [None]:
# Display the embeddings returned by the previous call.
res