In [1]:
# Enable the autoreload extension in mode 2 so edits to imported modules are
# picked up without restarting the kernel.
%load_ext autoreload 
%autoreload 2

In [2]:
!pip install python_docx

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting python_docx
  Downloading python-docx-0.8.11.tar.gz (5.6 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 5.6/5.6 MB 3.5 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Collecting lxml>=2.3.2
  Downloading lxml-4.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (7.1 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 7.1/7.1 MB 12.0 MB/s eta 0:00:00
Building wheels for collected packages: python_docx
  Building wheel for python_docx (setup.py) ... done
  Created wheel for python_docx: filename=python_docx-0.8.11-py3-none-any.whl size=184490 sha256=07d5d6f36867ab26f5556964c5be844786193250a5fbcb296b8f8ede37640ff0
  Stored in directory: /home/ec2-user/.cache/pip/wheels/65/e1/9b/0c38fe6cfe02a9fe31cb6b

In [3]:
#Sagemaker Endpoint Deploy
from sagemaker.huggingface import HuggingFaceModel
import sagemaker

# Execution role of this notebook; handed to the model definition below.
role = sagemaker.get_execution_role()

# Hugging Face Hub settings handed to the model via `env`.
# Model catalogue: https://huggingface.co/models
hub = dict(
    HF_MODEL_ID='shibing624/text2vec-base-chinese',
    HF_TASK='feature-extraction',
)

# Model definition: which Hub model to serve and the framework versions to use.
huggingface_model = HuggingFaceModel(
    role=role,
    env=hub,
    transformers_version='4.17.0',
    pytorch_version='1.10.2',
    py_version='py38',
)

# Deploy the model behind a named SageMaker endpoint with a single instance.
# Alternative instance type that was tried: instance_type='ml.m5.xlarge'
predictor = huggingface_model.deploy(
    initial_instance_count=1,
    instance_type='ml.p3.2xlarge',
    endpoint_name='huggingface-inference-text2vec-base-chinese-v1',
)

---------------!

In [9]:
# Inference testing
import time

# Attach to the endpoint by name.
hfp = sagemaker.huggingface.model.HuggingFacePredictor(
    'huggingface-inference-text2vec-base-chinese-v1'
)

# Time ten sequential requests, each carrying the two-character string repeated
# 100 times, and print the total elapsed seconds.
t0 = time.time()
for attempt in range(10):
    hfp.predict({'inputs': '打印' * 100})[0][0][0]
print(time.time() - t0)

6.187071323394775


In [10]:
#Preprocess Data
import os
import docx
import pandas as pd
from docx import Document
import sagemaker
from sagemaker.huggingface import HuggingFaceModel
import json
import boto3
import requests

def is_all_black(s):
    """Return True when ``s`` is empty or consists solely of whitespace.

    Besides the ASCII space this also treats tabs, line breaks and the
    full-width space (U+3000) as blank, so that fragments containing only
    such characters are not mistaken for real sentences.
    """
    # str.strip() with no argument removes every Unicode whitespace character.
    return not s.strip()

def read_doc(path):
    """Split the paragraphs of a document into sentences and tabulate them.

    For every paragraph with non-empty text, '. ' is replaced by '。' and the
    result is split on '。'. Each fragment that is neither empty nor blank
    (according to is_all_black) becomes one row.

    Returns a DataFrame with the columns:
        title        -- get_title(path), repeated on every row
        paragraph    -- the unmodified paragraph text the sentence came from
        sentence     -- the sentence fragment
        paragraph_id -- index of the paragraph within the document
        sentence_id  -- index of the fragment within its split paragraph
    """
    title = get_title(path)
    document = Document(path)  # load the file

    rows = []
    for para_idx, para in enumerate(document.paragraphs):
        raw_text = para.text
        normalized = raw_text.replace('. ', '。')
        if normalized == '':
            continue
        for sent_idx, fragment in enumerate(normalized.split('。')):
            if fragment == '' or is_all_black(fragment):
                continue
            rows.append((title, raw_text, fragment, para_idx, sent_idx))

    return pd.DataFrame({
        'title': [row[0] for row in rows],
        'paragraph': [row[1] for row in rows],
        'sentence': [row[2] for row in rows],
        'paragraph_id': [row[3] for row in rows],
        'sentence_id': [row[4] for row in rows],
    })

def get_title(path):
    """Derive a document title from a file path.

    The file name without its directory and extension is taken, every '——'
    is normalised to '-', and the name is split on '-'. When there is at
    least one separator the second segment is the title; otherwise the whole
    name is returned.
    """
    stem = os.path.split(os.path.splitext(path)[0])[1]
    segments = stem.replace('——', '-').split('-')
    # Explicit length check instead of a bare ``except:`` so that unrelated
    # errors (and KeyboardInterrupt) are never swallowed.
    if len(segments) > 1:
        return segments[1]
    return stem

In [11]:
# Predictor bound to the embedding endpoint; get_vector() sends its requests through it.
hfp = sagemaker.huggingface.model.HuggingFacePredictor(
    endpoint_name='huggingface-inference-text2vec-base-chinese-v1',
)

def get_vector(q):
    """Return the embedding vector for the text ``q``.

    Texts longer than 400 characters are not sent to the endpoint; a list of
    768 copies of -1000 is returned for them instead. Otherwise ``q`` is sent
    as a one-element batch through ``hfp`` and element ``[0][0][0]`` of the
    response is returned.
    """
    if len(q) <= 400:
        response = hfp.predict({'inputs': [q]})
        return response[0][0][0]
    # Placeholder returned for over-length inputs.
    return [-1000] * 768

def embbeding(df):
    """Add 'title_vector' and 'sentence_vector' columns to ``df``.

    The title is read from the first row of the first column and embedded
    once; each sentence is read from the third column and embedded
    individually. Both vectors are stored as their ``str()`` representation.
    ``df`` is modified in place and also returned.

    An empty frame gets the two (empty) columns and is returned without
    calling the embedding endpoint.
    """
    total = len(df)
    if total == 0:
        # Nothing to embed; df.iloc[0, 0] would raise IndexError here.
        df['title_vector'] = ''
        df['sentence_vector'] = ''
        return df

    title_vector = str(get_vector(df.iloc[0, 0]))
    sentence_vectors = []
    for i, sentence in enumerate(df.iloc[:, 2]):
        sentence_vectors.append(str(get_vector(sentence)))
        # Report the number of completed rows (1-based).
        print('\r embbeding %i out of %i finished' % (i + 1, total), end='')

    # Assign by column name rather than by position so the right columns are
    # written regardless of how many columns the frame already has.
    df['title_vector'] = title_vector
    df['sentence_vector'] = sentence_vectors
    return df

In [12]:
#import data to OpenSearch
import boto3
import requests
import json


# Connection settings for the OpenSearch domain; fill these in before running
# the import cells.
host = '' # cluster endpoint including the scheme, for example: https://my-test-domain.us-east-1.es.amazonaws.com/
region = '' # e.g. cn-north-1
index_name = ""
username = ""
password = ""

service = 'es'
credentials = boto3.Session().get_credentials()


# Username/password pair passed as `auth` to requests by import_data().
awsauth = (username, password)


# Bulk API URL. Normalise the trailing slash so both 'https://host' and
# 'https://host/' produce 'https://host/_bulk'.
url = host.rstrip('/') + '/_bulk'

headers = { "Content-Type": "application/json" }

def import_data(df, id_start=0, before_import=0, timeout=60):
    """Bulk-index the rows of ``df`` into the configured OpenSearch index.

    Columns are read by position: 0 title, 1 paragraph, 2 sentence,
    3 paragraph_id, 4 sentence_id, 5 title_vector, 6 sentence_vector. The
    first five are sent as strings; the two vector columns are expected to
    hold JSON text and are decoded back into lists.

    Parameters:
        df            -- frame (or slice of a frame) to index
        id_start      -- offset of the first row of ``df`` within its document
        before_import -- number of records indexed for earlier documents
        timeout       -- seconds to wait for the HTTP request

    The document id of row ``k`` is ``str(k + id_start + before_import)``.

    Returns the ``requests`` response, or None when ``df`` is empty and no
    request was sent. Failures are reported with ``print`` instead of being
    silently dropped.
    """
    if len(df) == 0:
        # Do not send a bulk request with an empty body.
        return None

    lines = []
    for row in range(len(df)):
        action = {"index": {"_index": index_name,
                            "_id": str(row + id_start + before_import)}}
        document = {
            "title": str(df.iloc[row, 0]),
            "paragraph": str(df.iloc[row, 1]),
            "sentence": str(df.iloc[row, 2]),
            "paragraph_id": str(df.iloc[row, 3]),
            "sentence_id": str(df.iloc[row, 4]),
            "title_vector": json.loads(df.iloc[row, 5]),
            "sentence_vector": json.loads(df.iloc[row, 6]),
        }
        lines.append(json.dumps(action, ensure_ascii=False))
        lines.append(json.dumps(document, ensure_ascii=False))
    # Newline-delimited JSON; the body must end with a newline.
    payloads = "\n".join(lines) + "\n"

    r = requests.post(url, auth=awsauth, headers=headers,
                      data=payloads.encode(), timeout=timeout)
    if not r.ok:
        print('bulk import failed with HTTP %i: %s' % (r.status_code, r.text[:500]))
    else:
        # A bulk call can return HTTP 200 while individual items failed.
        try:
            has_errors = bool(r.json().get('errors', False))
        except ValueError:
            has_errors = False
        if has_errors:
            print('bulk import reported item-level errors: %s' % r.text[:500])
    return r

In [13]:
#Preprocess Data and Import

# Predictor used by get_vector(). The name must match the endpoint created in
# the deployment cell ('...-v1'); the previous '...-v1-gpu' name is not
# deployed anywhere in this notebook.
hfp = sagemaker.huggingface.model.HuggingFacePredictor('huggingface-inference-text2vec-base-chinese-v1')

folder_path = ''  # directory that contains the .docx files to import

batch_size = 10  # records per bulk request (named so the builtin `slice` is not shadowed)

if not os.path.isdir(folder_path):
    raise FileNotFoundError(
        "folder_path %r is not a directory; set it to the folder holding the .docx files"
        % folder_path
    )

# Only .docx files can be parsed; skip Word lock files ('~$...') and other
# entries. Sorting makes the id assignment reproducible between runs.
names = sorted(
    n for n in os.listdir(folder_path)
    if n.lower().endswith('.docx') and not n.startswith('~$')
)
before_import = 0
for j, name in enumerate(names):
    df = read_doc(os.path.join(folder_path, name))
    if len(df) > 0:
        df = embbeding(df)
        # Ceiling division: no empty trailing batch when len(df) is a multiple of batch_size.
        n_batches = (len(df) + batch_size - 1) // batch_size
        for i in range(n_batches):
            import_data(df[batch_size*i:batch_size*(i+1)], batch_size*i, before_import)
            print('\r import %i out of %i finished'%(i + 1, n_batches), end='')
        before_import += len(df)
    print(' file %i out of %i finished'%(j + 1, len(names)))
    

FileNotFoundError: [Errno 2] No such file or directory: ''

In [14]:
#Create Dynamo DB
client = boto3.client('dynamodb', region_name='us-west-2')

table_name = "FeedbackRecordsSEWCFAQ"

try:
    resp = client.create_table(
        TableName=table_name,
        # Declare your Primary Key in the KeySchema argument
        KeySchema=[
            {
                "AttributeName": "SearchInputs",
                "KeyType": "HASH"
            },
            {
                "AttributeName": "_id",
                "KeyType": "RANGE"
            }
        ],
        # Any attributes used in KeySchema or Indexes must be declared in AttributeDefinitions
        AttributeDefinitions=[
            {
                "AttributeName": "SearchInputs",
                "AttributeType": "S"
            },
            {
                "AttributeName": "_id",
                "AttributeType": "S"
            }
        ],
        # ProvisionedThroughput controls the amount of data you can read or write to DynamoDB per second.
        # You can control read and write capacity independently.
        ProvisionedThroughput={
            "ReadCapacityUnits": 50,
            "WriteCapacityUnits": 50
        }
    )
    # create_table returns while the table is still being created; wait until
    # it exists before reporting success.
    client.get_waiter('table_exists').wait(TableName=table_name)
    print("Table created successfully!")
except client.exceptions.ResourceInUseException:
    # Raised when the table already exists (or is being created), e.g. when
    # this cell is re-run.
    print("Table %s already exists; nothing to create." % table_name)
except Exception as e:
    print("Error creating table:")
    print(e)

Table created successfully!
