### 1. 安装 huggingface-hub 与 SageMaker SDK，并下载模型到本地

In [1]:
!pip install huggingface-hub -Uqq
!pip install -U sagemaker

Collecting sagemaker
  Using cached sagemaker-3.0.1-py3-none-any.whl.metadata (12 kB)
Collecting sagemaker-core<3.0.0,>=2.0.0 (from sagemaker)
  Using cached sagemaker_core-2.0.1-py3-none-any.whl.metadata (5.4 kB)
Collecting sagemaker-train<2.0.0 (from sagemaker)
  Using cached sagemaker_train-1.0-py3-none-any.whl.metadata (7.6 kB)
Collecting sagemaker-serve<2.0.0 (from sagemaker)
  Using cached sagemaker_serve-1.0-py3-none-any.whl.metadata (1.6 kB)
Collecting sagemaker-mlops<2.0.0 (from sagemaker)
  Using cached sagemaker_mlops-1.0-py3-none-any.whl.metadata (5.7 kB)
Collecting rich<14.0.0,>=13.0.0 (from sagemaker-core<3.0.0,>=2.0.0->sagemaker)
  Using cached rich-13.9.4-py3-none-any.whl.metadata (18 kB)
Collecting torch>=1.9.0 (from sagemaker-core<3.0.0,>=2.0.0->sagemaker)
  Using cached torch-2.6.0-cp310-cp310-manylinux1_x86_64.whl.metadata (28 kB)
Collecting deepdiff (from sagemaker-serve<2.0.0->sagemaker)
  Downloading deepdiff-8.6.1-py3-none-any.whl.metadata (8.6 kB)
Collecting ml

In [2]:
from huggingface_hub import snapshot_download
from pathlib import Path

# Hugging Face repository to fetch, pinned to one exact commit.
model_name = "BAAI/bge-reranker-v2-m3"
commit_hash = "12e974610ba9083ed95f3edf08d7e899581f4de4"

# Local directory that receives the download; created if it does not exist yet.
local_model_path = Path("./bge-reranker-v2-m3")
local_model_path.mkdir(exist_ok=True)

In [3]:
# Fetch the pinned revision of the model into local_model_path.
# The call is the last expression of the cell, so its return value (the local
# snapshot directory) is displayed as the cell output.
snapshot_download(
    repo_id=model_name,
    revision=commit_hash,
    cache_dir=local_model_path,
)

Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 13 files:   0%|          | 0/13 [00:00<?, ?it/s]

'bge-reranker-v2-m3/models--BAAI--bge-reranker-v2-m3/snapshots/12e974610ba9083ed95f3edf08d7e899581f4de4'

### 2. 把模型拷贝到S3为后续部署做准备

In [4]:
import sagemaker
from sagemaker import image_uris
import boto3
import os
import time
import json

role = sagemaker.get_execution_role()  # execution role for the endpoint
sess = sagemaker.session.Session()  # sagemaker session for interacting with different AWS APIs
bucket = sess.default_bucket()  # bucket to house artifacts

# Read the region through the public property instead of the private
# `_region_name` attribute, which is an implementation detail of Session.
region = sess.boto_region_name
account_id = sess.account_id()

# Low-level clients: S3, the SageMaker control plane (create model/endpoint),
# and the SageMaker runtime (invoke endpoint).
s3_client = boto3.client("s3")
sm_client = boto3.client("sagemaker")
smr_client = boto3.client("sagemaker-runtime")

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [5]:
s3_model_prefix = f"aigc-llm-models/{model_name}"  # folder where model checkpoint will go
s3_code_prefix = f"aigc-llm-models/{model_name}_deploy_code"  # folder for the deployment code tarball

# Select the snapshot directory of the pinned commit explicitly. Taking the
# first match of "snapshots/*" would pick an arbitrary revision if more than one
# is cached, and would fail with a bare IndexError if nothing was downloaded.
snapshot_dirs = sorted(local_model_path.glob(f"**/snapshots/{commit_hash}"))
if not snapshot_dirs:
    raise FileNotFoundError(
        f"No snapshot {commit_hash} found under {local_model_path}; "
        "run the download cell first."
    )
model_snapshot_path = snapshot_dirs[0]

print(f"s3_code_prefix: {s3_code_prefix}")
print(f"model_snapshot_path: {model_snapshot_path}")

s3_code_prefix: aigc-llm-models/BAAI/bge-reranker-v2-m3_deploy_code
model_snapshot_path: bge-reranker-v2-m3/models--BAAI--bge-reranker-v2-m3/snapshots/12e974610ba9083ed95f3edf08d7e899581f4de4


In [6]:
# Recursively copy every file of the local snapshot directory to the model
# prefix in the session's default bucket (uses the AWS CLI from the shell).
!aws s3 cp --recursive {model_snapshot_path} s3://{bucket}/{s3_model_prefix}

upload: bge-reranker-v2-m3/models--BAAI--bge-reranker-v2-m3/snapshots/12e974610ba9083ed95f3edf08d7e899581f4de4/README.md to s3://sagemaker-us-east-1-687752207838/aigc-llm-models/BAAI/bge-reranker-v2-m3/README.md
upload: bge-reranker-v2-m3/models--BAAI--bge-reranker-v2-m3/snapshots/12e974610ba9083ed95f3edf08d7e899581f4de4/.gitattributes to s3://sagemaker-us-east-1-687752207838/aigc-llm-models/BAAI/bge-reranker-v2-m3/.gitattributes
upload: bge-reranker-v2-m3/models--BAAI--bge-reranker-v2-m3/snapshots/12e974610ba9083ed95f3edf08d7e899581f4de4/assets/BEIR-bge-en-v1.5.png to s3://sagemaker-us-east-1-687752207838/aigc-llm-models/BAAI/bge-reranker-v2-m3/assets/BEIR-bge-en-v1.5.png
upload: bge-reranker-v2-m3/models--BAAI--bge-reranker-v2-m3/snapshots/12e974610ba9083ed95f3edf08d7e899581f4de4/assets/llama-index.png to s3://sagemaker-us-east-1-687752207838/aigc-llm-models/BAAI/bge-reranker-v2-m3/assets/llama-index.png
upload: bge-reranker-v2-m3/models--BAAI--bge-reranker-v2-m3/snapshots/12e974610b

### 3. 模型部署准备（entrypoint脚本，容器镜像，服务配置）

In [7]:
# Pick the DJL inference container image for the current region.
# The China regions use a different registry account and domain suffix.
if region in ('cn-north-1', 'cn-northwest-1'):
    inference_image_uri = f"727897471807.dkr.ecr.{region}.amazonaws.com.cn/djl-inference:0.23.0-deepspeed0.9.5-cu118"
else:
    inference_image_uri = f"763104351884.dkr.ecr.{region}.amazonaws.com/djl-inference:0.23.0-deepspeed0.9.5-cu118"

print(f"Image going to be used is ---- > {inference_image_uri}")

Image going to be used is ---- > 763104351884.dkr.ecr.us-east-1.amazonaws.com/djl-inference:0.23.0-deepspeed0.9.5-cu118


In [8]:
local_code_dir = s3_code_prefix.split('/')[-1]
!mkdir -p {local_code_dir}

In [9]:
%%writefile {local_code_dir}/model.py
from djl_python import Input, Output
import torch
import logging
import math
import os
from FlagEmbedding import FlagReranker

# Log which device torch can see when the module is loaded. In this file the
# value is only printed; it is not passed to the reranker.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(f'--device={device}')


def load_model(properties):
    """Create the FlagReranker used to score (query, document) pairs.

    Args:
        properties: Mapping of serving properties, as returned by
            ``inputs.get_properties()`` in ``handle``. ``model_id`` is used
            as the model location when present, otherwise ``model_dir``.

    Returns:
        A ``FlagReranker`` built from the resolved location with
        ``use_fp16=True``.

    Raises:
        KeyError: If neither ``model_id`` nor ``model_dir`` is present.
    """
    # Only the model location is needed here. The previous unconditional read
    # of "tensor_parallel_degree" was unused and raised KeyError whenever the
    # property was absent.
    if "model_id" in properties:
        model_location = properties['model_id']
    else:
        model_location = properties['model_dir']
    logging.info(f"Loading model in {model_location}")

    return FlagReranker(model_location, use_fp16=True)

# Reranker instance, created on the first call to handle() and reused afterwards.
model = None


def _as_score_list(scores):
    """Return ``scores`` as a list, wrapping a bare scalar in a one-element list."""
    if isinstance(scores, (list, tuple)):
        return list(scores)
    return [scores]


def handle(inputs: Input):
    """Score query/document pairs from one request.

    The JSON request body must contain two equally long lists:
    ``"inputs"`` (queries) and ``"docs"`` (documents). ``inputs[i]`` is scored
    against ``docs[i]``.

    Args:
        inputs: The request object; its properties are used to load the model
            on first use and its JSON body carries the pairs to score.

    Returns:
        ``None`` when the request is empty, otherwise an ``Output`` whose JSON
        body is ``{"scores": [...]}`` with one score per pair.

    Raises:
        ValueError: If ``"inputs"`` and ``"docs"`` differ in length.
    """
    global model
    if model is None:
        model = load_model(inputs.get_properties())

    if inputs.is_empty():
        return None
    data = inputs.get_as_json()

    queries = data["inputs"]
    docs = data["docs"]
    # Pairing is positional, so a length mismatch means the request is malformed.
    # Report it instead of failing with IndexError or silently dropping documents.
    if len(queries) != len(docs):
        raise ValueError(
            f'"inputs" and "docs" must have the same length, '
            f"got {len(queries)} and {len(docs)}"
        )

    pairs = [[query, doc] for query, doc in zip(queries, docs)]

    if pairs:
        # compute_score can return a bare float for a single pair (FlagEmbedding
        # 1.2.0 does); normalise so "scores" is always a JSON array.
        scores = _as_score_list(model.compute_score(pairs))
    else:
        # Nothing to score: skip the model call.
        scores = []

    return Output().add_as_json({"scores": scores})

Writing bge-reranker-v2-m3_deploy_code/model.py


In [10]:
# S3 location of the uploaded model files; this value is substituted for the
# S3PATH placeholder in serving.properties by a later cell.
s3_path = f"s3://{bucket}/{s3_model_prefix}/"
print(f"option.s3url ==> {s3_path}")

option.s3url ==> s3://sagemaker-us-east-1-687752207838/aigc-llm-models/BAAI/bge-reranker-v2-m3/


#### Note: option.s3url 必须指向你自己账号下的 S3 路径。下面先写入占位符 `S3PATH`，随后的 `sed` 单元会自动将其替换为上面打印的 `s3_path`，一般无需手动修改。

In [12]:
%%writefile {local_code_dir}/serving.properties
# Serving configuration shipped alongside model.py in the code tarball.
engine=Python
option.tensor_parallel_degree=1
# The value below is a placeholder; the next notebook cell replaces it with the
# real S3 location of the model files.
option.s3url = S3PATH

Overwriting bge-reranker-v2-m3_deploy_code/serving.properties


In [13]:
# Replace the S3PATH placeholder in serving.properties with the real s3_path.
# "|" is used as the sed delimiter because the S3 URL itself contains "/".
!sed -i "s|option.s3url = S3PATH|option.s3url = {s3_path}|" {local_code_dir}/serving.properties

In [21]:
%%writefile {local_code_dir}/requirements.txt
# Provides FlagReranker, which model.py imports; pinned to an exact version.
FlagEmbedding==1.2.0

Overwriting bge-reranker-v2-m3_deploy_code/requirements.txt


In [22]:
!rm model.tar.gz
!cd {local_code_dir} && rm -rf ".ipynb_checkpoints"
!tar czvf model.tar.gz {local_code_dir}

bge-reranker-v2-m3_deploy_code/
bge-reranker-v2-m3_deploy_code/serving.properties
bge-reranker-v2-m3_deploy_code/requirements.txt
bge-reranker-v2-m3_deploy_code/model.py


In [23]:
# Upload the code tarball under the code prefix and keep the resulting S3 URI,
# which is later passed to create_model as ModelDataUrl.
s3_code_artifact = sess.upload_data(
    path="model.tar.gz",
    bucket=bucket,
    key_prefix=s3_code_prefix,
)
print(f"S3 Code or Model tar ball uploaded to --- > {s3_code_artifact}")

S3 Code or Model tar ball uploaded to --- > s3://sagemaker-us-east-1-687752207838/aigc-llm-models/BAAI/bge-reranker-v2-m3_deploy_code/model.tar.gz


### 4. 创建模型 & 创建endpoint

In [24]:
from sagemaker.utils import name_from_base
import boto3

# Generate a unique SageMaker model name from the base name.
# NOTE(review): this rebinds `model_name`, which earlier in the notebook holds
# the Hugging Face repo id used to build the S3 prefixes. Re-running those
# earlier cells after this one would produce different prefixes.
model_name = name_from_base("bge-reranker-v2-m3")
print(model_name)
print(f"Image going to be used is ---- > {inference_image_uri}")

# The container image plus the code tarball define the SageMaker model.
primary_container = {
    "Image": inference_image_uri,
    "ModelDataUrl": s3_code_artifact,
}
create_model_response = sm_client.create_model(
    ModelName=model_name,
    ExecutionRoleArn=role,
    PrimaryContainer=primary_container,
)
model_arn = create_model_response["ModelArn"]

print(f"Created Model: {model_arn}")

bge-reranker-v2-m3-2025-11-26-10-31-02-575
Image going to be used is ---- > 763104351884.dkr.ecr.us-east-1.amazonaws.com/djl-inference:0.23.0-deepspeed0.9.5-cu118
Created Model: arn:aws:sagemaker:us-east-1:687752207838:model/bge-reranker-v2-m3-2025-11-26-10-31-02-575


#### 推理机型选择 (https://aws.amazon.com/cn/sagemaker/pricing/)
- GPU
  + ml.g4dn.xlarge 按需价格 0.526 USD/Hour
- CPU
  + ml.c5.xlarge   按需价格 0.204 USD/Hour

In [25]:
# Names of the endpoint config and endpoint are derived from the model name.
endpoint_config_name = f"{model_name}-config"
endpoint_name = f"{model_name}-endpoint"

# Single production variant: one ml.g4dn.xlarge instance serving the model.
production_variant = {
    "VariantName": "variant1",
    "ModelName": model_name,
    "InstanceType": "ml.g4dn.xlarge",
    "InitialInstanceCount": 1,
    # Optional settings, currently disabled:
    # "VolumeSizeInGB" : 400,
    # "ModelDataDownloadTimeoutInSeconds": 2400,
    "ContainerStartupHealthCheckTimeoutInSeconds": 10 * 60,  # 10 minutes
}

endpoint_config_response = sm_client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[production_variant],
)
# Last expression of the cell: display the raw API response.
endpoint_config_response

{'EndpointConfigArn': 'arn:aws:sagemaker:us-east-1:687752207838:endpoint-config/bge-reranker-v2-m3-2025-11-26-10-31-02-575-config',
 'ResponseMetadata': {'RequestId': 'e22adc4a-8077-4405-b8bc-b6e078988fd4',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'e22adc4a-8077-4405-b8bc-b6e078988fd4',
   'strict-transport-security': 'max-age=47304000; includeSubDomains',
   'x-frame-options': 'DENY',
   'content-security-policy': "frame-ancestors 'none'",
   'cache-control': 'no-cache, no-store, must-revalidate',
   'x-content-type-options': 'nosniff',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '130',
   'date': 'Wed, 26 Nov 2025 10:31:05 GMT'},
  'RetryAttempts': 0}}

In [26]:
# Start creating the endpoint from the config above. The call returns right
# away; the next cell polls until creation has finished.
create_endpoint_response = sm_client.create_endpoint(
    EndpointName=endpoint_name,
    EndpointConfigName=endpoint_config_name,
)
print(f"Created Endpoint: {create_endpoint_response['EndpointArn']}")

Created Endpoint: arn:aws:sagemaker:us-east-1:687752207838:endpoint/bge-reranker-v2-m3-2025-11-26-10-31-02-575-endpoint


In [27]:
import time

# Poll once a minute until the endpoint leaves the "Creating" state.
resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
status = resp["EndpointStatus"]
print("Status: " + status)

while status == "Creating":
    time.sleep(60)
    resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
    status = resp["EndpointStatus"]
    print("Status: " + status)

print("Arn: " + resp["EndpointArn"])
print("Status: " + status)

# Stop here if creation did not succeed, so that later cells do not try to
# invoke an endpoint that is not in service.
if status != "InService":
    raise RuntimeError(
        f"Endpoint {endpoint_name} ended in status {status}: "
        f"{resp.get('FailureReason', 'no FailureReason reported')}"
    )

Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: InService
Arn: arn:aws:sagemaker:us-east-1:687752207838:endpoint/bge-reranker-v2-m3-2025-11-26-10-31-02-575-endpoint
Status: InService


### 5. 模型测试

In [28]:
def rerank_by_sm_endpoint(questions, docs, sm_client, endpoint_name):
    """Score question/document pairs with the deployed reranker endpoint.

    Args:
        questions: List of query strings; ``questions[i]`` is paired with
            ``docs[i]`` by the endpoint.
        docs: List of document strings, one per question.
        sm_client: Client object exposing ``invoke_endpoint`` (the notebook
            passes the ``sagemaker-runtime`` boto3 client).
        endpoint_name: Name of the endpoint to invoke.

    Returns:
        A list of scores taken from the ``"scores"`` field of the response.
    """
    response_model = sm_client.invoke_endpoint(
        EndpointName=endpoint_name,
        Body=json.dumps(
            {
                "inputs": questions,
                "docs": docs
            }
        ),
        ContentType="application/json",
    )
    json_str = response_model['Body'].read().decode('utf8')
    scores = json.loads(json_str)['scores']
    # The endpoint may answer with a bare number when only one pair was sent;
    # always hand a list back to the caller instead of failing on iteration.
    if not isinstance(scores, list):
        return [scores]
    return list(scores)

In [35]:
import time
# Time one round trip to the endpoint, including building the request.
start = time.time()

# The same question is repeated once per candidate document, because the
# request pairs each entry of the question list with the document at the same index.
prompts1 = 2 * ["请问AWS Clean Rooms是多方都会收费吗？"]
docs1 = [
    "会收费",
    "生成式AI(generative AI/Gen AI)是一种AI技术,可以创造新的内容和想法的人工智能，例如图像、视频、文本、代码、音乐等。它利用机器学习模型基于大量数据进行预训练得到的超大模型也即基础模型来提供支持。",
]
rerank_scores = rerank_by_sm_endpoint(prompts1, docs1, smr_client, endpoint_name)
print(rerank_scores)

end = time.time()
print(f"运行时间: {end - start:.4f} 秒")

[-1.6298828125, -11.0390625]
运行时间: 0.0374 秒
