In [None]:
!pip install sentence-transformers -Uq

In [None]:
!pip install datasets
!pip install  accelerate -U 

# finetune 模型

In [None]:
from sentence_transformers import SentenceTransformer, LoggingHandler
from datasets import load_dataset
from sentence_transformers import InputExample
from torch.utils.data import DataLoader
from sentence_transformers import losses
import torch
import logging

In [None]:
# Send INFO-level records through sentence-transformers' LoggingHandler.
# The seconds directive must be `%S`; a bare `S` prints a literal letter "S"
# in every timestamp.
logging.basicConfig(
    format='%(asctime)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    level=logging.INFO,
    handlers=[LoggingHandler()],
)

## 从hf加载模型

In [None]:
# Checkpoint that the training cells below fine-tune.
modelB = SentenceTransformer(
    model_name_or_path='sentence-transformers/paraphrase-multilingual-mpnet-base-v2'
)
# Alternative checkpoint:
# modelB=SentenceTransformer('intfloat/multilingual-e5-large')

In [None]:
# dataset_id = "embedding-data/sentence-compression"
# dataset = load_dataset(dataset_id)

### 加载TopWar原始语料

In [None]:
import os,datasets
# filenames = os.listdir('topwar_faq')
# FAQ corpus files, in the order they are parsed.
filenames = [
    'cleaned_topwar_enrich_faq_0911.faq',
    'topwarfaq230908.faq',
]

In [None]:
def parse_faq(file_content, QA_SEP='====='):
    """Parse FAQ text into a list of ``(answer, question)`` tuples.

    The text is split on ``QA_SEP``. Within each chunk the first line is the
    question and everything after the first newline is the answer. The
    ``"Question: "`` and ``"Answer: "`` markers are removed from the
    respective parts.

    Args:
        file_content: Full text of one FAQ file.
        QA_SEP: Separator string between consecutive Q/A entries.

    Returns:
        A list of ``(answer, question)`` tuples, answer first, in file order.
        Chunks that are blank (e.g. after a trailing separator) or that have
        no answer line are skipped instead of raising ``ValueError``.
    """
    pairs = []
    for chunk in file_content.split(QA_SEP):
        chunk = chunk.strip()
        if not chunk:
            # A leading/trailing separator or an empty file yields an empty chunk.
            continue
        question, newline, answer = chunk.partition("\n")
        if not newline:
            # Question line without any answer text: nothing usable to pair up.
            continue
        question = question.replace("Question: ", "")
        answer = answer.replace("Answer: ", "")
        pairs.append((answer, question))
    return pairs

# Read every FAQ file under docs/ and collect its (answer, question) pairs.
all_datas = []
for fn in filenames:
    if fn == '.ipynb_checkpoints':
        # Only matters when `filenames` is produced by os.listdir().
        continue
    # Decode explicitly so the result does not depend on the platform's
    # default encoding (assumes the .faq files are UTF-8 — TODO confirm).
    with open(f"docs/{fn}", encoding="utf-8") as f:
        data = f.read()
        all_datas += parse_faq(data)
print(f"data size:{len(all_datas)}")

### 加载enrich之后的语料

In [None]:
# import pickle
# pklfn = 'docs/topwarfaq0817.pkl'
# all_datas = []
# with open(pklfn, 'rb') as f:
#     all_datas = pickle.load(f)

In [None]:
# print(f"data size:{len(all_datas)}")

## 加载csv语料

In [None]:
# import pandas as pd
# import re

In [None]:
# df = pd.read_csv('docs/topwar_faq_new.csv')
# df=df[['answer','question']].dropna()

In [None]:
# df['question'] = df['question'].map(lambda x: re.sub(r'^(\d+.\s?)','',x.strip()))

In [None]:
# df.sample(10)

In [None]:
#增加问题对标志

# def convert_vocab_to_token_id(vocab, word):
#     vocab_dict = {word: idx for idx, word in enumerate(vocab)}
#     # token_ids = [vocab_dict.get(word, -1) for word in words]
#     token_id = vocab_dict.get(word, -1)
#     return token_id


# vocab = list(set(df['answer']))

In [None]:
# df['idx'] = df['answer'].map(lambda x:convert_vocab_to_token_id(vocab,x)) 

In [None]:
# all_datas=df[['answer','question']].values.tolist()

## 准备dataset

In [None]:
# One InputExample per corpus entry, with texts ordered [answer, question].
train_examples = [
    InputExample(texts=[pair[0], pair[1]])
    for pair in all_datas
]

In [None]:
# Spot-check one training example. The index is clamped to the last element
# so corpora with fewer than 4001 entries do not raise IndexError.
train_examples[min(4000, len(train_examples) - 1)].texts

In [None]:

num_epochs = 2
# NOTE(review): batch_size=2 leaves this loss very few in-batch negatives per
# pair; consider a larger batch if GPU memory allows — confirm.
train_dataloader = DataLoader(dataset=train_examples, batch_size=2, shuffle=True)
train_loss = losses.MultipleNegativesRankingLoss(model=modelB)
# Warm up over the first 10% of all training steps (batches per epoch * epochs).
warmup_steps = int(len(train_dataloader) * num_epochs * 0.1)

In [None]:
# Release cached, unused GPU memory before training starts.
torch.cuda.empty_cache()
# Fine-tune and save the result to ./finetuned-sentence-embedding.
modelB.fit(
    train_objectives=[(train_dataloader, train_loss)],
    warmup_steps=warmup_steps,
    epochs=num_epochs,
    output_path='./finetuned-sentence-embedding',
    # evaluator=evaluator,
    # evaluation_steps=5,
)

## 从本地加载finetuned的模型

In [None]:
# modelB = SentenceTransformer('./finetuned-sentence-embedding')

## 抽样

In [None]:
from sentence_transformers import evaluation,util
import pandas as pd

In [None]:
# Tabular view of the corpus: one row per (answer, question) pair.
df = pd.DataFrame(data=all_datas, columns=['answer', 'question'])
df

## 抽样20%数据拿出来算距离

In [None]:
### 抽取一部分，做交叉负样本
# sample_size = int(len(all_datas)*0.2)
# df_sample_1 = df.sample(sample_size)
# df_sample_2 = df.sample(sample_size)

# ##去除sample1 和 sample 2中的重复部分
# df_dup_sample = df_sample_1.join(df_sample_2,rsuffix='_r')
# df_dup_sample = df_dup_sample[~df_dup_sample.question_r.isna()][['question','answer']]

# df_sample_1_dedup = df_sample_1.join(df_dup_sample,how='left',rsuffix ='_r')
# df_sample_1_dedup = df_sample_1_dedup[df_sample_1_dedup.question_r.isna()]

# df_sample_2_dedup = df_sample_2.join(df_dup_sample,how='left',rsuffix='_r')
# df_sample_2_dedup = df_sample_2_dedup[df_sample_2_dedup.question_r.isna()]


# #将sample 1 的答案和sample 2 的问题组成负样本
# input_answer  = []
# input_question  = []
# for a,b in zip(df_sample_1_dedup.answer,df_sample_2_dedup.question):
#     input_answer.append(a)
#     input_question.append(b)
# print(f'negative sample size:{df_sample_2_dedup.shape[0]}')

In [None]:
# emb_answer = modelB.encode(input_answer)
# emb_question = modelB.encode(input_question)

In [None]:
# Draw 20% of the corpus for the similarity analysis.
# random_state is fixed so the sample (and every statistic/plot derived from
# it) is reproducible across kernel restarts.
# NOTE(review): the sample comes from the same data the model was fine-tuned
# on, so these scores are not a held-out evaluation.
sample_size = int(len(all_datas) * 0.2)
df_sample = df.sample(sample_size, random_state=42)
# Parallel lists: input_answer[i] is the answer to input_question[i].
input_answer = df_sample.answer.tolist()
input_question = df_sample.question.tolist()
print(f'sample size:{df_sample.shape[0]}')

In [None]:
# Embed the sampled answers and questions with the fine-tuned model.
emb_answer = modelB.encode(sentences=input_answer)
emb_question = modelB.encode(sentences=input_question)

## 将question和answer进行cross 对比，查看这个分布。
### 输出矩阵的对角线的结果代表的是正样本，其他非对角线的则是交叉样本（负样本）

In [None]:
# Cosine similarity of every answer against every question, flattened to 1-D
# (entry i*N + j is answer i vs. question j).
cross_simsvalues = util.cos_sim(a=emb_answer, b=emb_question).flatten()

In [None]:
# Wrap the flattened scores in a Series so they can be indexed and summarised.
cross_sims_s = pd.Series(data=cross_simsvalues)

In [None]:
N = len(input_question)
# Flat positions of the diagonal (i*N + i == i*(N+1)), i.e. the matching
# answer/question pairs only.
pos_indices = list(range(0, N * N, N + 1))

In [None]:
# Scores of the positive (matching) pairs.
pos_cross_sims_s = cross_sims_s.loc[pos_indices]
pos_cross_sims_s.describe()

In [None]:
# Drop the diagonal entries; what remains are the crossed (negative) pairs.
neg_cross_sims_s = cross_sims_s.drop(index=pos_indices)
neg_cross_sims_s.describe()

In [None]:
import seaborn as sns

# Positive-pair scores in green.
sns.histplot(data=pos_cross_sims_s, kde=True, color='green')
# There are many more negatives than positives, so plot only a random sample
# of them (same size as the positives) in red.
sns.histplot(
    data=neg_cross_sims_s.sample(n=len(pos_cross_sims_s)),
    kde=True,
    color='red',
)

## 输出具体的结果

In [None]:
import numpy as np
def similarity(v1, v2):
    """Return the cosine similarity of two vectors.

    Computed as ``dot(v1, v2) / (||v1|| * ||v2||)`` using NumPy.
    """
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    return np.dot(v1, v2) / norm_product

In [None]:
# Print every sampled question/answer pair with its cosine similarity.
for idx, question in enumerate(input_question):
    sims = similarity(emb_answer[idx], emb_question[idx])
    print(f"Question:{question}\nAnswer:{input_answer[idx]}\n{sims}\n")

In [None]:
# Look up the 3 sampled questions closest to an ad-hoc query.
input_queries = ['雷电将军好不好使？']
q_embedding = modelB.encode(input_queries)
results = util.semantic_search(q_embedding, emb_question, top_k=3)
results

In [None]:
# `corpus_id` is a position in the corpus that was searched, i.e. in
# emb_question / input_question (the 20% sample), NOT in all_datas. Indexing
# all_datas with it printed unrelated FAQ entries, so resolve hits through the
# sampled lists instead.
for ret in results[0]:
    hit = (input_answer[ret['corpus_id']], input_question[ret['corpus_id']])
    print(f"{hit} score:{ret['score']}")

# 使用pre trained 模型对比

In [None]:
# Load the original, un-fine-tuned checkpoint as the comparison baseline.
modelA = SentenceTransformer(
    model_name_or_path='sentence-transformers/paraphrase-multilingual-mpnet-base-v2'
)

In [None]:
# Same answer-vs-question similarity computation, using the baseline model
# on the same sampled texts.
emb_answer_2 = modelA.encode(sentences=input_answer)
emb_question_2 = modelA.encode(sentences=input_question)
cross_simsvalues_2 = util.cos_sim(a=emb_answer_2, b=emb_question_2).flatten()
cross_sims_s_2 = pd.Series(data=cross_simsvalues_2)

In [None]:
# Number of answer embeddings produced by the fine-tuned model (sanity check).
len(emb_answer)

In [None]:
# Baseline scores of the negative (off-diagonal) pairs.
N = len(input_question)
# Diagonal positions in the flattened N x N matrix: i*N + i == i*(N+1).
pos_indices = list(range(0, N * N, N + 1))
neg_cross_sims_s_2 = cross_sims_s_2.drop(index=pos_indices)
neg_cross_sims_s_2.describe()

In [None]:
# Baseline scores of the positive (matching) pairs.
pos_cross_sims_s_2 = cross_sims_s_2.loc[pos_indices]
pos_cross_sims_s_2.describe()

In [None]:
import seaborn as sns

# Baseline positive-pair scores in green.
sns.histplot(data=pos_cross_sims_s_2, kde=True, color='green')
# There are many more negatives than positives, so plot only N randomly
# sampled negatives in red.
sns.histplot(
    data=neg_cross_sims_s_2.sample(n=N),
    kde=True,
    color='red',
)

# 部署模型到sagemaker

In [None]:
!pip install --upgrade pip -q
!pip install -U sagemaker -q

## 2. 把模型拷贝到S3为后续部署做准备

In [None]:
import sagemaker
from sagemaker import image_uris
import boto3
import os
import time
import json

role = sagemaker.get_execution_role()  # execution role for the endpoint
sess = sagemaker.session.Session()  # sagemaker session for interacting with different AWS APIs
bucket = sess.default_bucket()  # bucket to house artifacts

# Read the region through the public boto session rather than the private
# `sess._region_name` attribute; this matches the image-URI lookup cell.
region = sess.boto_session.region_name
account_id = sess.account_id()

# Low-level clients: S3, SageMaker control plane, and SageMaker runtime
# (the runtime client is the one used to invoke endpoints).
s3_client = boto3.client("s3")
sm_client = boto3.client("sagemaker")
smr_client = boto3.client("sagemaker-runtime")

In [None]:
# Local directory holding the fine-tuned model.
model_snapshot_path = "./finetuned-sentence-embedding"
# S3 key prefixes: where the model checkpoint goes, and where the packaged
# deployment code goes.
s3_model_prefix = "LLM-RAG/workshop/finetuned-sentence2emb-model"
s3_code_prefix = "LLM-RAG/workshop/finetuned-sentence2emb_deploy_code"

print(f"s3_code_prefix: {s3_code_prefix}")
print(f"model_snapshot_path: {model_snapshot_path}")

In [None]:
# Recursively copy the local model directory to s3://<bucket>/<s3_model_prefix>.
!aws s3 cp --recursive {model_snapshot_path} s3://{bucket}/{s3_model_prefix}

In [None]:
!pip install -U sagemaker -q

### 3. 模型部署准备（entrypoint脚本，容器镜像，服务配置）

In [None]:
# China Region
# inference_image_uri = (
#     f"763104351884.dkr.ecr.{region}.amazonaws.com/djl-inference:0.21.0-deepspeed0.8.3-cu117"
# )

# Resolve the DJL-DeepSpeed 0.23.0 inference container image for the
# session's region.
inference_image_uri = image_uris.retrieve(
    region=sess.boto_session.region_name,
    framework="djl-deepspeed",
    version="0.23.0",
)
print(f"Image going to be used is ---- > {inference_image_uri}")

In [None]:
# Create the directory that the %%writefile cells below write into
# (-p: no error if it already exists).
!mkdir -p sentence2emb_deploy_code

In [None]:
%%writefile sentence2emb_deploy_code/model.py
from djl_python import Input, Output
import torch
import logging
import math
import os
from sentence_transformers import SentenceTransformer


def load_model(properties):
    """Load the SentenceTransformer model named by the serving properties.

    Args:
        properties: Mapping of serving properties. ``model_dir`` is required;
            ``model_id``, when present, overrides it as the load location.

    Returns:
        The loaded ``SentenceTransformer`` instance.

    Raises:
        KeyError: If ``model_dir`` is missing from ``properties``.
    """
    # The original also read properties["tensor_parallel_degree"] into an
    # unused local; it was dropped because it only added a KeyError risk when
    # that property is not configured.
    model_location = properties['model_dir']
    if "model_id" in properties:
        model_location = properties['model_id']
    logging.info(f"Loading model in {model_location}")

    return SentenceTransformer(model_location)

# Module-level model, loaded by the first call to handle() and reused after.
model = None


def handle(inputs: Input):
    """Serving entry point: embed the sentence(s) found in the request JSON.

    Reads the ``"inputs"`` field of the JSON payload, which may be a single
    value or a list, encodes it with normalized embeddings, and returns the
    result under the ``"sentence_embeddings"`` key. Returns ``None`` for an
    empty request.
    """
    global model
    if not model:
        model = load_model(inputs.get_properties())

    if inputs.is_empty():
        return None

    payload = inputs.get_as_json()["inputs"]
    # Always hand the encoder a list, even for a single input.
    input_sentences = payload if isinstance(payload, list) else [payload]
    logging.info(f"inputs: {input_sentences}")

    sentence_embeddings = model.encode(input_sentences, normalize_embeddings=True)
    return Output().add_as_json({"sentence_embeddings": sentence_embeddings})

#### Note: option.s3url 需要按照自己的账号进行修改

In [None]:
%%writefile sentence2emb_deploy_code/serving.properties
# Serve the handler with the Python engine, without tensor parallelism.
engine=Python
option.tensor_parallel_degree=1
# NOTE: this bucket name is specific to one AWS account and region. Replace it
# with s3://<your default bucket>/<s3_model_prefix>/ before deploying.
option.s3url = s3://sagemaker-us-east-2-946277762357/LLM-RAG/workshop/finetuned-sentence2emb-model/

In [None]:
%%writefile sentence2emb_deploy_code/requirements.txt
# Extra Python packages for the inference handler.
# NOTE(review): sentence-transformers is unpinned while transformers is pinned;
# consider pinning it to the version used for fine-tuning — confirm.
transformers==4.30.2
sentence-transformers

In [None]:
!rm s2e_model.tar.gz
!cd sentence2emb_deploy_code && rm -rf ".ipynb_checkpoints"
!tar czvf s2e_model.tar.gz sentence2emb_deploy_code

In [None]:
# Upload the packaged deployment code and keep the resulting S3 URI.
s3_code_artifact = sess.upload_data(
    path="s2e_model.tar.gz",
    bucket=bucket,
    key_prefix=s3_code_prefix,
)
print(f"S3 Code or Model tar ball uploaded to --- > {s3_code_artifact}")

### 4. 创建模型 & 创建endpoint

In [None]:
from sagemaker.utils import name_from_base
import boto3

# Note: Need to specify model_name. name_from_base derives the name from the
# given base string.
model_name = name_from_base("finetuned-mpnet")
print(model_name)
print(f"Image going to be used is ---- > {inference_image_uri}")

# Register a SageMaker model that pairs the inference image with the uploaded
# code tarball.
create_model_response = sm_client.create_model(
    ModelName=model_name,
    PrimaryContainer={
        "ModelDataUrl": s3_code_artifact,
        "Image": inference_image_uri,
    },
    ExecutionRoleArn=role,
)
model_arn = create_model_response["ModelArn"]

print(f"Created Model: {model_arn}")

###  如果批量创建索引量较多，建议改成"InstanceType": "ml.g4dn.xlarge",

In [None]:
# Config and endpoint names are both derived from the model name.
endpoint_config_name = f"{model_name}-config"
endpoint_name = f"{model_name}-endpoint"

# A single variant on one ml.g4dn.xlarge instance.
endpoint_config_response = sm_client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[
        {
            "VariantName": "variant1",
            "ModelName": model_name,
            "InitialInstanceCount": 1,
            "InstanceType": "ml.g4dn.xlarge",
            # "VolumeSizeInGB" : 400,
            # "ModelDataDownloadTimeoutInSeconds": 2400,
            # Container start-up health-check timeout: 10 minutes.
            "ContainerStartupHealthCheckTimeoutInSeconds": 600,
        },
    ],
)
endpoint_config_response

In [None]:
# Start creating the endpoint from the config defined above.
create_endpoint_response = sm_client.create_endpoint(
    EndpointConfigName=endpoint_config_name,
    EndpointName=endpoint_name,
)
print(f"Created Endpoint: {create_endpoint_response['EndpointArn']}")

In [None]:
import time

# Poll once a minute until the endpoint leaves the "Creating" state, printing
# the status after every check.
while True:
    resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
    status = resp["EndpointStatus"]
    print("Status: " + status)
    if status != "Creating":
        break
    time.sleep(60)

# Final summary. The loop also exits on non-success states, so check the
# printed status before using the endpoint.
print("Arn: " + resp["EndpointArn"])
print("Status: " + status)

## 5. 模型测试

In [None]:
def get_vector_by_sm_endpoint(questions, sm_client, endpoint_name):
    """Request embeddings for ``questions`` from a deployed endpoint.

    Args:
        questions: Value sent as the ``"inputs"`` field of the JSON request.
        sm_client: Client object exposing ``invoke_endpoint``.
        endpoint_name: Name of the endpoint to invoke.

    Returns:
        The ``"sentence_embeddings"`` value of the endpoint's JSON response.
    """
    request_body = json.dumps({"inputs": questions})
    response_model = sm_client.invoke_endpoint(
        EndpointName=endpoint_name,
        Body=request_body,
        ContentType="application/json",
    )
    # The response body is a stream of UTF-8 encoded JSON.
    payload = json.loads(response_model['Body'].read().decode('utf8'))
    return payload['sentence_embeddings']

In [None]:
# The original assigned prompts1 twice in a row, so the first value was dead.
# Only the effective (second) prompt is kept; the other stays as a commented
# alternative to switch to.
# prompts1 = """专属技能碎片在哪里获得？"""
prompts1 = """中国首都在哪里？"""

# Embed the question through the deployed endpoint, using the runtime client.
emb1 = get_vector_by_sm_endpoint(prompts1, smr_client, endpoint_name)

In [None]:
# Embed a candidate answer text through the same endpoint.
prompts2 = "专属技能碎片可以通过多种途径获得，例如礼包商城-特惠礼包界面可以购买专属技能碎片礼包"
emb2 = get_vector_by_sm_endpoint(
    questions=prompts2,
    sm_client=smr_client,
    endpoint_name=endpoint_name,
)

In [None]:
# Cosine similarity between the two endpoint embeddings.
util.cos_sim(a=emb1, b=emb2)