In [None]:
!pip install git+https://github.com/FlagOpen/FlagEmbedding.git -Uq
!pip install -Uq sentence-transformers 
!pip install faiss-cpu -Uq

### Generate random negative label

In [42]:
# use chatgpt synthesis data
import random
import json
import os
FAQ_FILE = 'chatgpt_synthesis.jsonl'

def _write_retrieval_records(path, pairs, neg_column, num_neg_candidates, query_prefix=''):
    """Write one JSONL training record per (query, positive) pair.

    For the pair at position ``idx`` a set of pair indices is drawn at random;
    the negatives are column ``neg_column`` of the drawn pairs, with ``idx``
    itself filtered out. Each record has the keys ``query``, ``pos`` and ``neg``.
    """
    population = range(len(pairs))
    # random.sample raises ValueError when asked for more items than the
    # population holds, so cap the draw for small splits.
    draw_size = min(num_neg_candidates, len(pairs))
    with open(path, 'w', encoding='utf-8') as out_file:
        for idx, (query, positive) in enumerate(pairs):
            drawn = random.sample(population, draw_size)
            neg_list = [pairs[i][neg_column] for i in drawn if i != idx]
            record = json.dumps(
                {"query": query_prefix + query, "pos": [positive], "neg": neg_list},
                ensure_ascii=False,
            )
            out_file.write(record)
            out_file.write('\n')


def generate_train_data(train_data, output_suffix='train',
                        output_dir='chatgpt_synthesis', num_neg_candidates=20):
    """Build contrastive training files with random negatives.

    Args:
        train_data: iterable of JSON strings, each with the keys
            ``origin_question``, ``origin_answer``, ``generate_question`` and
            ``generate_answer``. Blank lines are ignored.
        output_suffix: suffix used in the output file names.
        output_dir: directory the files are written to (created if missing).
        num_neg_candidates: number of indices drawn per record when sampling
            negatives; capped at the number of records.

    Writes four files into ``output_dir``:
        qq1_<suffix>.jsonl   origin question -> generated question,
                             negatives are other generated questions
        qq2_<suffix>.jsonl   origin question -> generated question,
                             negatives are other origin questions
        oqgd_<suffix>.jsonl  origin question -> generated document
        gqod_<suffix>.jsonl  generated question -> origin document
    The two question/document files prefix the query with the retrieval
    instruction string.
    """
    qq_labels = []
    oqgd_labels = []
    gqod_labels = []
    for line in train_data:
        if not line.strip():
            continue
        json_obj = json.loads(line)

        o_query = json_obj['origin_question'].strip()
        o_answer = json_obj['origin_answer']
        g_query = json_obj['generate_question']
        g_answer = json_obj['generate_answer']
        o_doc = f"Question: {o_query}\nAnswer: {o_answer}"
        g_doc = f"Question: {g_query}\nAnswer: {g_answer}"

        # Possible recall strategy:
        # when building the knowledge base, embed o_query, g_query, o_doc and g_doc;
        # at query time run both QQ recall and QD recall (QA recall is assumed to
        # be no better than QD recall).

        # origin_Q vs generated_Q similarity: checks feasibility of QQ recall
        qq_labels.append((o_query, g_query))

        # origin_Q vs generated_D similarity: checks feasibility of QD recall
        oqgd_labels.append((o_query, g_doc))

        # generated_Q vs origin_D similarity: checks feasibility of QD recall
        gqod_labels.append((g_query, o_doc))

    os.makedirs(output_dir, exist_ok=True)

    query_instruction = "为这个句子生成表示以用于检索相关文章："
    # (file prefix, pairs, column negatives are taken from, query prefix).
    # The order is kept as qq1, qq2, oqgd, gqod so the sequence of random draws
    # is unchanged for a given seed.
    outputs = [
        ("qq1", qq_labels, 1, ''),
        ("qq2", qq_labels, 0, ''),
        ("oqgd", oqgd_labels, 1, query_instruction),
        ("gqod", gqod_labels, 1, query_instruction),
    ]
    for file_prefix, pairs, neg_column, query_prefix in outputs:
        path = os.path.join(output_dir, f"{file_prefix}_{output_suffix}.jsonl")
        _write_retrieval_records(path, pairs, neg_column, num_neg_candidates, query_prefix)
    

def generate_FAQ(data_arr, sep='\n=====\n', faq_name='enhanced_faq'):
    """Write an enhanced FAQ file built from the synthetic records.

    Args:
        data_arr: iterable of JSON strings with the keys ``origin_question``,
            ``origin_answer``, ``generate_question`` and ``generate_answer``.
            Blank lines are ignored.
        sep: separator written between FAQ entries.
        faq_name: output path without extension; ``.faq`` is appended.

    Each record contributes up to three entries, in this order:
        1. origin question + origin answer (only if the stripped origin
           question is longer than 5 characters)
        2. generated question + origin answer
        3. generated question + generated answer
    """
    faq_template = "Question: {}\nAnswer: {}"

    def iter_entries(lines):
        for line in lines:
            if not line.strip():
                continue
            json_obj = json.loads(line)

            o_query = json_obj['origin_question'].strip()
            o_answer = json_obj['origin_answer']
            g_query = json_obj['generate_question']
            g_answer = json_obj['generate_answer']

            if len(o_query) > 5:
                yield faq_template.format(o_query, o_answer)
            yield faq_template.format(g_query, o_answer)
            # The third entry pairs the generated question with the generated
            # answer; it must not repeat the second entry.
            yield faq_template.format(g_query, g_answer)

    with open(f"{faq_name}.faq", 'w', encoding='utf-8') as outfile:
        outfile.write(sep.join(iter_entries(data_arr)))
            
    
# Split the synthetic records by position into three disjoint subsets:
#   first ~10% -> test, middle ~80% -> train, last ~10% -> valid.
with open(FAQ_FILE, 'r', encoding='utf-8') as file:
    data_arr = file.readlines()

data_count = len(data_arr)
# train_count is the index where the validation slice starts (90% of the data).
train_count = int(data_count * 0.9)
test_count = data_count - train_count

test_data = data_arr[:test_count]
# Stop the training slice at train_count. Letting it run to the end of the list
# would put every validation record into the training set as well.
train_data = data_arr[test_count:train_count]
valid_data = data_arr[train_count:]

os.makedirs('chatgpt_synthesis', exist_ok=True)
generate_train_data(test_data, 'test')
generate_train_data(train_data, 'train')
generate_train_data(valid_data, 'valid')
# The FAQ file is built from all records, not from a single split.
generate_FAQ(data_arr, faq_name="chatgpt_enhanced_faq")

### check train/test data

In [None]:
!cat ./chatgpt_synthesis/*train.jsonl > chatgpt_synthesis/train_merged.jsonl

In [None]:
!wc -l chatgpt_synthesis/train_merged.jsonl

In [None]:
!cat ./chatgpt_synthesis/*test.jsonl > chatgpt_synthesis/test_merged.jsonl

In [None]:
!wc -l chatgpt_synthesis/test_merged.jsonl

In [None]:
!cat ./chatgpt_synthesis/*valid.jsonl > chatgpt_synthesis/valid_merged.jsonl

In [None]:
!wc -l chatgpt_synthesis/valid_merged.jsonl

### Generate hard negative label (optional)

In [None]:
!python -m FlagEmbedding.baai_general_embedding.finetune.hn_mine \
--model_name_or_path BAAI/bge-large-zh \
--input_file chatgpt_synthesis/train_merged.jsonl \
--output_file chatgpt_synthesis/train_merged_minedHN.jsonl \
--range_for_sampling 2-200

In [None]:
# Re-serialise every JSON line of the mined hard-negative file with
# ensure_ascii=False so non-ASCII text is stored as readable characters.
# NOTE(review): the hn_mine cell above writes
# chatgpt_synthesis/train_merged_minedHN.jsonl, not the file read here.
# Confirm which input file is intended.
# Both files are managed by `with` so they are closed even if a line fails to
# parse. The input is opened first so a missing input does not leave an empty
# output file behind.
with open('random_sample_traindata_minedHN.jsonl', 'r', encoding='utf-8') as file, \
        open('random_sample_traindata_hardneg.jsonl', 'w', encoding='utf-8') as out_file:
    for line in file:
        json_obj = json.loads(line)
        out_file.write(json.dumps(json_obj, ensure_ascii=False))
        out_file.write('\n')

### Finetune with Torch

In [None]:
!torchrun --nproc_per_node 4 \
-m FlagEmbedding.baai_general_embedding.finetune.run \
--output_dir ./finetune_bge_large_zh15 \
--model_name_or_path BAAI/bge-large-zh-v1.5 \
--train_data ./chatgpt_synthesis/train_merged.jsonl \
--learning_rate 1e-5 \
--fp16 \
--num_train_epochs 5 \
--per_device_train_batch_size 1 \
--normlized True \
--temperature 0.02 \
--query_max_len 128 \
--passage_max_len 512 \
--train_group_size 9 \
--logging_steps 100 

### Deploy Model endpoint

In [None]:
import sagemaker
from sagemaker import image_uris
import boto3
import os
import time
import json

role = sagemaker.get_execution_role()  # execution role for the endpoint
sess = sagemaker.session.Session()  # sagemaker session for interacting with different AWS APIs
bucket = sess.default_bucket()  # bucket to house artifacts

# Read the region through the public Session property instead of the private
# `_region_name` attribute, which is an implementation detail of the SDK.
region = sess.boto_region_name
account_id = sess.account_id()

# Low-level clients: S3, the SageMaker control plane (models/endpoints) and
# the SageMaker runtime (endpoint invocation).
s3_client = boto3.client("s3")
sm_client = boto3.client("sagemaker")
smr_client = boto3.client("sagemaker-runtime")

In [None]:
# S3 key prefixes and the local checkpoint directory used by the deployment cells.
s3_model_prefix = "LLM-RAG/workshop/finetuned-bge15-large-zh-model"  # folder where model checkpoint will go
# Local directory holding the fine-tuned checkpoint (same path as --output_dir in the torchrun cell).
model_snapshot_path = "./finetune_bge_large_zh15"
# Prefix under which the inference code tarball is uploaded.
s3_code_prefix = "LLM-RAG/workshop/finetuned-bge15-large-zh-code"
print(f"s3_code_prefix: {s3_code_prefix}")
print(f"model_snapshot_path: {model_snapshot_path}")

In [None]:
!aws s3 cp --recursive {model_snapshot_path} s3://{bucket}/{s3_model_prefix}

In [None]:
# Container image (DJL inference) used to serve the model; the ECR host is
# built from the session region.
inference_image_uri = (
    f"763104351884.dkr.ecr.{region}.amazonaws.com/djl-inference:0.23.0-deepspeed0.9.5-cu118"
)

# For AWS China regions, replace the value above with the image_uri below.
# inference_image_uri = (
#     f"727897471807.dkr.ecr.{region}.amazonaws.com.cn/djl-inference:0.21.0-deepspeed0.8.3-cu117"
# )

print(f"Image going to be used is ---- > {inference_image_uri}")

In [None]:
!mkdir -p finetuned-bge15-large-zh-code

In [None]:
%%writefile finetuned-bge15-large-zh-code/model.py
from djl_python import Input, Output
import torch
import logging
import math
import os
from FlagEmbedding import FlagModel

# Report whether a GPU is visible to the worker. `device` is only printed here;
# nothing else in this file passes it to the model.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(f'--device={device}')

def load_model(properties):
    """Create the FlagModel used to compute embeddings.

    Args:
        properties: mapping of serving properties. ``model_dir`` must be
            present; ``model_id``, when present, takes precedence as the
            location to load from.

    Returns:
        The ``FlagModel`` instance built from the chosen location.

    Raises:
        KeyError: if ``model_dir`` is missing from ``properties``.
    """
    # The tensor_parallel_degree property was read but never used. Requiring it
    # only made loading fail when the property was absent, so it is no longer read.
    model_location = properties['model_dir']
    if "model_id" in properties:
        model_location = properties['model_id']
    logging.info(f"Loading model in {model_location}")

    return FlagModel(model_location)

# Loaded lazily on the first request and then reused by later calls.
model = None

def handle(inputs: Input):
    """Entry point called for each request: embed the sentences in the payload.

    The JSON payload is expected to contain:
        inputs: a single sentence or a list of sentences (required).
        is_query: truthy when the sentences are search queries (optional,
            defaults to False).
        instruction: text prepended to every sentence when ``is_query`` is
            truthy and the instruction is non-empty (optional, defaults to "").

    Returns:
        ``None`` for an empty request, otherwise an ``Output`` whose JSON body
        is ``{"sentence_embeddings": ...}`` with the result of ``model.encode``.
    """
    global model
    if model is None:
        model = load_model(inputs.get_properties())

    if inputs.is_empty():
        return None
    data = inputs.get_as_json()

    # Keep the request object and the payload under different names; the
    # payload's "inputs" entry used to overwrite the `inputs` parameter.
    raw_sentences = data["inputs"]
    if isinstance(raw_sentences, list):
        input_sentences = raw_sentences
    else:
        input_sentences = [raw_sentences]

    # Both fields only matter for queries, so a passage-only payload may omit them.
    is_query = data.get("is_query", False)
    instruction = data.get("instruction", "")
    logging.info(f"inputs: {input_sentences}")
    logging.info(f"is_query: {is_query}")
    logging.info(f"instruction: {instruction}")

    if is_query and instruction:
        input_sentences = [instruction + sent for sent in input_sentences]

    sentence_embeddings = model.encode(input_sentences)

    result = {"sentence_embeddings": sentence_embeddings}
    return Output().add_as_json(result)

In [None]:
print(f"option.s3url ==> s3://{bucket}/{s3_model_prefix}/")

#### 设置 serving.properties，requirements.txt

In [None]:
# Write serving.properties for the inference container: Python engine, tensor
# parallel degree 1, and the S3 location of the uploaded fine-tuned checkpoint.
with open('finetuned-bge15-large-zh-code/serving.properties', 'w') as file:
    prop = f"""engine=Python
option.tensor_parallel_degree=1
option.s3url = s3://{bucket}/{s3_model_prefix}/"""
    file.write(prop)

In [None]:
# Write the requirements.txt that is packaged alongside model.py.
# transformers is pinned to 4.28.1; FlagEmbedding is left unpinned.
with open('finetuned-bge15-large-zh-code/requirements.txt', 'w') as file:
    requirements = """transformers==4.28.1\nFlagEmbedding"""
    file.write(requirements)

In [None]:
!rm s2e_model.tar.gz
!cd finetuned-bge15-large-zh-code && rm -rf ".ipynb_checkpoints"
!tar czvf s2e_model.tar.gz finetuned-bge15-large-zh-code

In [None]:
# Upload the packaged inference code to the bucket under s3_code_prefix; the
# returned value is later passed as ModelDataUrl when the model is created.
s3_code_artifact = sess.upload_data("s2e_model.tar.gz", bucket, s3_code_prefix)
print(f"S3 Code or Model tar ball uploaded to --- > {s3_code_artifact}")

### 4. 创建模型 & 创建endpoint

In [None]:
from sagemaker.utils import name_from_base
import boto3

# Derive a model name from the "bge15-finetuned" base via name_from_base.
model_name = name_from_base("bge15-finetuned") # Note: a model_name must be specified
print(model_name)
print(f"Image going to be used is ---- > {inference_image_uri}")

# Register the model: the serving container image plus the uploaded code tarball.
create_model_response = sm_client.create_model(
    ModelName=model_name,
    ExecutionRoleArn=role,
    PrimaryContainer={
        "Image": inference_image_uri,
        "ModelDataUrl": s3_code_artifact
    },
    
)
model_arn = create_model_response["ModelArn"]

print(f"Created Model: {model_arn}")

In [None]:
# Config and endpoint names are both derived from the model name.
endpoint_config_name = f"{model_name}-config"
endpoint_name = f"{model_name}-endpoint"

# One production variant: a single ml.g4dn.xlarge instance serving the model.
endpoint_config_response = sm_client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[
        {
            "VariantName": "variant1",
            "ModelName": model_name,
            "InstanceType": "ml.g4dn.xlarge",
            "InitialInstanceCount": 1,
            # Optional settings, currently disabled:
            # "VolumeSizeInGB" : 400,
            # "ModelDataDownloadTimeoutInSeconds": 2400,
            # Allow the container up to 15 minutes to pass its startup health check.
            "ContainerStartupHealthCheckTimeoutInSeconds": 15*60,
        },
    ],
)
# Bare expression so the notebook displays the API response.
endpoint_config_response

In [None]:
# Start creating the endpoint from the configuration defined above.
create_endpoint_response = sm_client.create_endpoint(
    EndpointName=endpoint_name,
    EndpointConfigName=endpoint_config_name,
)
print(f"Created Endpoint: {create_endpoint_response['EndpointArn']}")

In [None]:
import time

# Poll the endpoint once a minute until it leaves the "Creating" state,
# printing the status after every check.
while True:
    resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
    status = resp["EndpointStatus"]
    print("Status: " + status)
    if status != "Creating":
        break
    time.sleep(60)

# Final summary; the status may be something other than a successful one.
print("Arn: " + resp["EndpointArn"])
print("Status: " + status)

In [None]:
def get_vector_by_sm_endpoint(
    questions,
    sm_client,
    endpoint_name,
    is_query=True,
    instruction="Represent this sentence for searching relevant passages:",
):
    """Request embeddings for ``questions`` from a deployed endpoint.

    Args:
        questions: a sentence or list of sentences, sent as the ``inputs`` field.
        sm_client: client object exposing ``invoke_endpoint`` (the notebook
            passes its "sagemaker-runtime" boto3 client).
        endpoint_name: name of the endpoint to invoke.
        is_query: sent as ``is_query``; the handler only prepends the
            instruction when this is truthy.
        instruction: sent as ``instruction``. The default is the English
            prompt. The Chinese equivalent, which the training data in this
            notebook prefixes to queries, is
            "为这个句子生成表示以用于检索相关文章：".

    Returns:
        The ``sentence_embeddings`` value of the endpoint's JSON response.
    """
    payload = {
        "inputs": questions,
        "is_query": is_query,
        "instruction": instruction,
    }
    response_model = sm_client.invoke_endpoint(
        EndpointName=endpoint_name,
        Body=json.dumps(payload),
        ContentType="application/json",
    )
    json_str = response_model['Body'].read().decode('utf8')
    json_obj = json.loads(json_str)
    return json_obj['sentence_embeddings']

In [None]:
# Smoke test: embed two sample queries through the deployed endpoint.
prompts1 = ["what is the default brightness setting on this device?", "how are you going"]

# The runtime client (smr_client) is the one passed in to invoke the endpoint.
emb = get_vector_by_sm_endpoint(prompts1, smr_client, endpoint_name)
# Length of the first embedding, followed by the full returned vectors.
print(len(emb[0]))
print(emb)