In [None]:
!pip install --upgrade boto3 -i https://pypi.tuna.tsinghua.edu.cn/simple/
!pip install --upgrade sagemaker==2.183.0 -i https://pypi.tuna.tsinghua.edu.cn/simple/

In [None]:
# For notebook instances (Amazon Linux)
!sudo yum update -y
!sudo yum install amazon-linux-extras
!sudo amazon-linux-extras install epel -y
!sudo yum update -y
!sudo yum install git-lfs git -y

In [None]:
import sagemaker
import boto3
# Bucket that SageMaker uses for uploaded data, models and logs.
# Leave it as None to fall back to the session's default bucket, which
# SageMaker creates automatically when it does not exist yet.
sagemaker_session_bucket = None

sess = sagemaker.Session()
if sess is not None and sagemaker_session_bucket is None:
    # no explicit bucket name given -> use the session default
    sagemaker_session_bucket = sess.default_bucket()

# Resolve the execution role; when get_execution_role() raises ValueError,
# look the role up in IAM by its fixed name instead.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

# Re-create the session so that it is bound to the selected bucket.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

# Summary of the resolved configuration.
print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

In [None]:
!mkdir code

In [None]:
%%writefile code/inference.py

from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

def mean_pooling(model_output, attention_mask):
    """Average the token embeddings, weighting each token by its attention mask.

    Args:
        model_output: Model output whose first element holds the token embeddings.
        attention_mask: Mask with the same leading dimensions as the token
            embeddings; a trailing axis is added and expanded to match them.

    Returns:
        Tensor of masked sums over dimension 1 divided by the mask totals.
    """
    embeddings = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand_as(embeddings).float()
    masked_total = (embeddings * mask).sum(1)
    # Clamp the denominator so a fully masked row does not divide by zero.
    mask_total = mask.sum(1).clamp(min=1e-9)
    return masked_total / mask_total


def model_fn(model_dir):
    """Load the tokenizer and the model stored under ``model_dir``.

    Args:
        model_dir: Path passed to both ``from_pretrained`` calls.

    Returns:
        A ``(model, tokenizer)`` tuple; the model is switched to eval mode.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    # eval() returns the module itself, so the call can be chained.
    model = AutoModel.from_pretrained(model_dir).eval()
    return model, tokenizer

def predict_fn(data, model_and_tokenizer):
    """Embed the request sentences with a fixed retrieval instruction prepended.

    Args:
        data: Request payload. Either a dict whose "inputs" entry holds one
            sentence or a list of sentences, or the sentence / list itself.
        model_and_tokenizer: The ``(model, tokenizer)`` pair returned by model_fn.

    Returns:
        A JSON-serializable list with one L2-normalized embedding (a list of
        floats) per input sentence; an empty list when there is no input.
    """
    model, tokenizer = model_and_tokenizer

    # Read the sentences without mutating the caller's payload.
    sentences = data.get("inputs", data) if isinstance(data, dict) else data
    # A bare string is ONE sentence; iterating it below would otherwise
    # produce a separate embedding for every character.
    if isinstance(sentences, str):
        sentences = [sentences]
    if not sentences:
        return []

    # Every sentence is prefixed with the same instruction before tokenizing.
    instruction = "为这个句子生成表示以用于检索相关文章："
    encoded_input = tokenizer([instruction + q for q in sentences], padding=True, truncation=True, return_tensors='pt')

    # Compute token embeddings without tracking gradients.
    with torch.no_grad():
        model_output = model(**encoded_input)
        # CLS pooling: keep the embedding at position 0 of every sequence.
        sentence_embeddings = model_output[0][:, 0]

    # Scale every embedding to unit L2 norm.
    sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)

    # Nested Python lists are JSON serializable.
    return sentence_embeddings.tolist()


In [None]:
# Repository id of the embedding model.
repository = "BAAI/bge-large-zh-v1.5"
# Model name: everything after the last "/" of the repository id.
model_id = repository.rpartition("/")[2]
# S3 destination of the packaged model archive inside the session bucket.
s3_location = f"s3://{sess.default_bucket()}/custom_inference/{model_id}/model.tar.gz"

In [None]:
# Set up Git LFS before cloning the model repository.
!git lfs install
# Alternative source (disabled): clone from huggingface.co using `repository`.
#!git clone https://huggingface.co/$repository
# Clone the model from ModelScope. The following cells address the clone
# through $model_id, so the cloned directory name has to match it.
!git clone https://www.modelscope.cn/Xorbits/bge-large-zh-v1.5.git

In [None]:
!cp -r code/ $model_id/code/

In [None]:
%cd $model_id
!tar zcvf model.tar.gz *

In [None]:
# Upload the archive to $s3_location. model.tar.gz is a relative path, so this
# relies on the working directory set by the earlier %cd into the model directory.
!aws s3 cp model.tar.gz $s3_location

In [None]:
import sagemaker
from sagemaker.huggingface.model import HuggingFaceModel

from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer

# `role` is reused from the session-setup cell. Calling
# sagemaker.get_execution_role() again here would skip that cell's IAM
# fallback for the ValueError case and fail where the setup cell succeeded.

# Hub settings for loading the model by id; currently unused because the
# `env=hub` argument below is commented out.
hub = {
    'HF_MODEL_ID': 'BAAI/bge-large-zh-v1.5',
    'HF_TASK': 'feature-extraction',
}

# create Hugging Face Model Class
huggingface_model = HuggingFaceModel(
    model_data=s3_location,       # path to your model and script
    role=role,                    # iam role with permissions to create an Endpoint
    transformers_version="4.12",  # transformers version used
    pytorch_version="1.9",        # pytorch version used
    py_version='py38',            # python version used
    # env=hub,
)

endpoint_name = 'huggingface-inference-eb'
# instance_type = 'ml.g4dn.2xlarge'
instance_type = 'ml.g4dn.xlarge'
instance_count = 1

# Deploy the endpoint; requests and responses are (de)serialized as JSON.
predictor = huggingface_model.deploy(
    endpoint_name=endpoint_name,
    instance_type=instance_type,
    initial_instance_count=instance_count,
    serializer=JSONSerializer(),
    deserializer=JSONDeserializer(),
)


In [None]:
# Inference testing
import time
import sagemaker
from sagemaker.huggingface import HuggingFaceModel

# Attach a predictor to the deployed endpoint by its name.
hfp = sagemaker.huggingface.model.HuggingFacePredictor(
    endpoint_name='huggingface-inference-eb',
)

# Send one test sentence, then show the response and its length.
embedding = hfp.predict(data={'inputs': ['测试一下']})
print(embedding)
print(len(embedding))