In [7]:
%%capture

!pip install tiktoken==0.3.3
!pip install tqdm

In [None]:
!pip install transformers

In [4]:
from tqdm import tqdm
import tiktoken
import requests
import logging
import os

In [76]:
tokenizer = tiktoken.get_encoding('cl100k_base')
DOC_DIR_PATH = './docs'
CHUNK_SIZE = 256

In [77]:
type(tokenizer)

tiktoken.core.Encoding

In [7]:
logger = logging.getLogger('sagemaker')
logger.setLevel(logging.DEBUG)
logger.addHandler(logging.StreamHandler())

In [10]:
def doc_iterator(dir_path: str):
    for root, _, filenames in os.walk(dir_path):
        for filename in filenames:
            file_path = os.path.join(root, filename)
            if os.path.isfile(file_path):
                with open(file_path, 'r') as file:
                    file_contents = file.read()
                    yield filename, file_contents

### 按照token等长切分文档

In [12]:
!mkdir -p chunks

In [93]:
def segment_doc_by_token_length(input_path, tokenizer):
    n_docs = 0
    n_passages = 0

    for doc_name, doc in tqdm(doc_iterator(DOC_DIR_PATH)):
        print(f"doc_name: {doc_name}")
        doc_id = doc_name.split('.')[0]
        tokens = tokenizer.encode(doc)
        chunks = []
        chunk_id = 1
        n_docs += 1
        for i in range(0, len(tokens), CHUNK_SIZE):
            chunk_tokens = tokens[i: i+CHUNK_SIZE]
            if not len(chunk_tokens) < 256:
                chunk = tokenizer.decode(chunk_tokens)
                with open(f'./chunks/{doc_id}_{chunk_id}', 'w') as f:
                    f.write(chunk)
                chunk_id += 1
                n_passages += 1
    logger.info(f'{n_docs} documents segmented into {n_passages} passages')

In [94]:
segment_doc_by_token_length('./docs', tokenizer)

1it [00:00,  5.47it/s]

doc_name: Cleanroom_FAQ.txt
doc_name: Cleanroom_FAQ-checkpoint.txt


2it [00:00,  6.14it/s]
2 documents segmented into 30 passages


In [87]:
import json
import boto3
import numpy as np

smr_client = boto3.client("sagemaker-runtime")

def get_st_embedding(smr_client, text_input):
    endpoint_name = "st-paraphrase-mpnet-base-v2-2023-04-14-04-17-29-625-endpoint"
    parameters = {
      #"early_stopping": True,
      #"length_penalty": 2.0,
      "max_new_tokens": 50,
      "temperature": 0,
      "min_length": 10,
      "no_repeat_ngram_size": 2,
    }

    response_model = smr_client.invoke_endpoint(
                EndpointName=endpoint_name,
                Body=json.dumps(
                {
                    "inputs": [text_input],
                    "parameters": parameters
                }
                ),
                ContentType="application/json",
            )
    
    json_str = response_model['Body'].read().decode('utf8')
    json_obj = json.loads(json_str)
    embeddings = json_obj["sentence_embeddings"]
    
    return embeddings[0]

def get_bloom_embedding(smr_client, text_input):
    TEXT_EMBEDDING_MODEL_ENDPOINT_NAME='huggingface-textembedding-bloom-7b1-fp1-2023-04-13-11-29-28-700'
    payload = {'text_inputs': [text_input]}
    payload = json.dumps(payload).encode('utf-8')

    response = smr_client.invoke_endpoint(EndpointName=TEXT_EMBEDDING_MODEL_ENDPOINT_NAME, 
                                                ContentType='application/json', 
                                                Body=payload)
    body = json.loads(response['Body'].read())
    embedding = body['embedding'][0]
    
    return embedding

def calulate_cosine(vector1,vector2):
    """
    Calculate cosine similarity between two vectors
    """
    return np.dot(vector1,vector2)/(np.linalg.norm(vector1)*np.linalg.norm(vector2))

def calulate_semantic_distance(smr_client, q_str, a_str, get_emb_func):
    q_vec = get_emb_func(smr_client, q_str)
    a_vec = get_emb_func(smr_client, a_str)
    return calulate_cosine(q_vec, a_vec)

### 按照Question&Answer Pair 测试paraphrase-mpnet-base-v2的语义召回能力

In [None]:
for doc_name, doc in tqdm(doc_iterator(DOC_DIR_PATH)):
    if doc_name == "Cleanroom_FAQ.txt":
        lines = doc.splitlines()
        q_lines = [ line for line in lines if line.startswith('Question') ]
        a_lines = [ line for line in lines if line.startswith('Answer') ]
        for q_idx, q_line in enumerate(q_lines):
            max_cos = 0.0
            max_a_line = ""
            for a_idx, a_line in enumerate(a_lines):
                cos_val = calulate_semantic_distance(smr_client, q_line, a_line, get_st_embedding)
                if cos_val > max_cos:
                    max_cos = cos_val
                    max_a_line = a_line
            print(f'{max_cos} | {q_line} | {max_a_line}')

### 按照Question&Answer Pair 测试bloom的语义召回能力

In [None]:
for doc_name, doc in tqdm(doc_iterator(DOC_DIR_PATH)):
    if doc_name == "Cleanroom_FAQ.txt":
        lines = doc.splitlines()
        q_lines = [ line for line in lines if line.startswith('Question') ]
        a_lines = [ line for line in lines if line.startswith('Answer') ]
        for q_idx, q_line in enumerate(q_lines):
            max_cos = 0.0
            max_a_line = ""
            for a_idx, a_line in enumerate(a_lines):
                cos_val = calulate_semantic_distance(smr_client, q_line, a_line, get_bloom_embedding)
                if cos_val > max_cos:
                    max_cos = cos_val
                    max_a_line = a_line
            print(f'{max_cos} | {q_line} | {max_a_line}')

### 按照Question&Chunk 测试paraphrase-mpnet-base-v2的语义召回能力

In [None]:
CHUNK_DIR_PATH='./chunks'
for doc_name, doc in tqdm(doc_iterator(DOC_DIR_PATH)):
    if doc_name == "Cleanroom_FAQ.txt":
        lines = doc.splitlines()
        q_lines = [ line for line in lines if line.startswith('Question') ]
        a_lines = [ line for line in lines if line.startswith('Answer') ]
        for q_idx, q_line in enumerate(q_lines):
            max_cos = 0.0
            max_a_doc = ""
            for doc_name, a_doc in tqdm(doc_iterator(CHUNK_DIR_PATH)):
                cos_val = calulate_semantic_distance(smr_client, q_line, a_doc, get_st_embedding)
                if cos_val > max_cos:
                    max_cos = cos_val
                    max_a_doc = a_doc
            print(f'{max_cos} | {q_line} | {max_a_doc}')

### 按照Question&Chunk 测试bloom的语义召回能力

In [None]:
CHUNK_DIR_PATH='./chunks'
for doc_name, doc in tqdm(doc_iterator(DOC_DIR_PATH)):
    if doc_name == "Cleanroom_FAQ.txt":
        lines = doc.splitlines()
        q_lines = [ line for line in lines if line.startswith('Question') ]
        a_lines = [ line for line in lines if line.startswith('Answer') ]
        for q_idx, q_line in enumerate(q_lines):
            max_cos = 0.0
            max_a_doc = ""
            for doc_name, a_doc in tqdm(doc_iterator(CHUNK_DIR_PATH)):
                cos_val = calulate_semantic_distance(smr_client, q_line, a_doc, get_bloom_embedding)
                if cos_val > max_cos:
                    max_cos = cos_val
                    max_a_doc = a_doc
            print(f'{max_cos} | {q_line} | {max_a_doc}')

### 按照段落进行分组(Token Size 限制)

In [100]:
def segment_doc_by_paragraph(input_path, tokenizer):
    paragraphs = []
    paragraph_embeddings = []
    q_line_vec_arr = []

    for doc_name, doc in tqdm(doc_iterator(DOC_DIR_PATH)):
        if doc_name == "Cleanroom_FAQ.txt":
            lines = doc.splitlines()
            max_len = len(lines)
            print(f"max_len : {max_len}")

            q_line_vec_arr = [ (line, get_st_embedding(smr_client, line)) for line in lines if line.startswith('Question') ]

            for line_idx in range(len(lines)):
                if lines[line_idx] == '':
                    continue
                span = 0
                len_token = 0
                while len_token < 128:
                    # print(f"line_idx+span : {line_idx+span}, span : {span}")
                    delta_token_len = len(tokenizer.encode(lines[line_idx+span]))
                    span += 1
                    if line_idx+span == max_len:
                        break
                    len_token += delta_token_len

                paragraph = '\n'.join(lines[line_idx:line_idx+span])
                paragraphs.append(paragraph)
                paragraph_emb = get_st_embedding(smr_client, paragraph)
                # print(paragraph_emb)
                paragraph_embeddings.append(paragraph_emb)
                
    return paragraphs, paragraph_embeddings, q_line_vec_arr

In [72]:
paragraphs, paragraph_embeddings, q_line_vec_arr = segment_doc_by_paragraph('./docs', tokenizer)

### 按照Question&Paragraph 测试paraphrase-mpnet-base-v2的语义召回能力

In [None]:
print("start to calulate similiarity")
for q_doc, q_vec in q_line_vec_arr:
    max_cos = 0.0
    a_doc = ""
    for idx in range(len(paragraphs)):
        cos_val = calulate_cosine(q_vec, paragraph_embeddings[idx])
        if cos_val > max_cos:
            max_cos = cos_val
            a_doc = paragraphs[idx]
    print(f"***{q_doc}***\n{a_doc}\n[score]:{max_cos}\n-----\n")

### 存贮段落到磁盘进行搜索分析

In [None]:
!mkdir -p paragraphs

In [None]:
# 把这些paragraphs写入文件
for idx, paragraph in enumerate(paragraphs):
    with open(f'./paragraphs/{idx}.txt', 'w') as f:
        f.write(paragraph)