In [1]:
# Enable autoreload so edits to imported local modules are picked up live,
# without restarting the kernel.
%load_ext autoreload 
%autoreload 2

In [2]:
!pip install python_docx
!pip install --upgrade boto3
!pip install --upgrade sagemaker

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting python_docx
  Downloading python-docx-0.8.11.tar.gz (5.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m51.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting lxml>=2.3.2
  Downloading lxml-4.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m43.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hBuilding wheels for collected packages: python_docx
  Building wheel for python_docx (setup.py) ... [?25ldone
[?25h  Created wheel for python_docx: filename=python_docx-0.8.11-py3-none-any.whl size=184490 sha256=061816c1880f685e9969ac1f17503b95097ee76072101c0fcba9acc6e84143a1
  Stored in directory: /home/ec2-user/.cache/pip/wheels/65/e1/9b/0c38fe6cfe02a9fe31cb6b

## Hyperparameters

In [45]:
# Name of the OpenSearch index to create and ingest into.
# NOTE(review): the value below is a blank placeholder; set it before running.
index_name = ' '

# Name of the embedding model endpoint; the default can usually be kept.
eb_endpoint = 'huggingface-inference-text2vec-base-chinese-v1'

# Embedding vector dimension; the default can usually be kept.
v_dimension = 768

# Folder containing the document files to be processed and ingested.
# NOTE(review): the value below is a blank placeholder; set it before running.
folder_path = ' '

In [46]:
import os
import docx
import pandas as pd
from docx import Document
import sagemaker
from sagemaker.huggingface import HuggingFaceModel
import json
import boto3
import requests

# Predictor bound to the embedding endpoint named in the configuration cell
# (`eb_endpoint`), so changing the configuration actually changes the endpoint used.
hfp = sagemaker.huggingface.model.HuggingFacePredictor(eb_endpoint)

#===================Function Definition=================
def is_all_black(s):
    """Return True when every character of ``s`` is an ASCII space.

    An empty string contains no non-space character, so it also yields True.
    Other whitespace (tabs, full-width spaces, ...) does not count as blank.
    """
    return all(ch == ' ' for ch in s)

def read_doc(path):
    """Split a Word document into sentences, one DataFrame row per sentence.

    Args:
        path: Path of the document file; passed to ``Document`` and ``get_title``.

    Returns:
        A pandas DataFrame with the columns, in this order:
        ``title`` (result of ``get_title(path)``, repeated on every row),
        ``paragraph`` (the unmodified text of the paragraph the sentence came from),
        ``sentence`` (one non-blank fragment of that paragraph),
        ``paragraph_id`` (index of the paragraph in the document, empty
        paragraphs included in the count) and
        ``sentence_id`` (index of the fragment within its paragraph, blank
        fragments included in the count).
    """
    title = get_title(path)
    titles = []
    paragraphs = []
    sentences = []
    paragraphs_id = []
    sentences_id = []

    # Fetch the paragraph list once. Re-reading `document.paragraphs` inside the
    # loop (as index lookups) repeats the lookup on every iteration; python-docx
    # rebuilds the list on each access, which makes the loop quadratic.
    doc_paragraphs = Document(path).paragraphs
    for i, para in enumerate(doc_paragraphs):
        p0 = para.text
        if p0 == '':
            continue
        # Normalise the Latin ". " terminator to the CJK full stop so that a
        # single split handles both kinds of sentence ending.
        fragments = p0.replace('. ', '。').split('。')
        for j, fragment in enumerate(fragments):
            # Skip empty fragments and fragments made only of spaces.
            if fragment != '' and not is_all_black(fragment):
                titles.append(title)
                paragraphs.append(p0)
                sentences.append(fragment)
                paragraphs_id.append(i)
                sentences_id.append(j)
    df = pd.DataFrame({'title':titles, 'paragraph':paragraphs, 'sentence':sentences,
                      'paragraph_id':paragraphs_id, 'sentence_id':sentences_id})
    return df

def get_title(path):
    """Derive a document title from its file name.

    The directory and extension are dropped, every '——' is treated as '-',
    and the name is split on '-'. When there are at least two segments the
    second one is the title (e.g. ``001-Handbook-v2.docx`` -> ``Handbook``);
    otherwise the whole extension-less file name is returned.

    Args:
        path: Path of the document file.

    Returns:
        The title string.
    """
    stem = os.path.split(os.path.splitext(path)[0])[1]
    segments = stem.replace('——', '-').split('-')
    # Explicit length check instead of a bare `except:` around `[1]`, so that
    # unrelated errors are no longer silently swallowed.
    if len(segments) > 1:
        return segments[1]
    return stem

def get_vector(q, max_len=400, dimension=None):
    """Return the embedding vector for the text ``q``.

    Args:
        q: Text to embed.
        max_len: Texts longer than this many characters are not sent to the
            endpoint; a placeholder vector is returned instead.
        dimension: Length of the placeholder vector. Defaults to the
            notebook-level ``v_dimension`` so it always matches the index mapping.

    Returns:
        For over-long text, a list of ``dimension`` values all equal to -1000.
        Otherwise the element ``[0][0][0]`` of the endpoint response
        (presumably the first token embedding of the single input; confirm
        against the endpoint's output format).
    """
    if len(q) > max_len:
        if dimension is None:
            dimension = v_dimension
        return [-1000] * dimension
    return hfp.predict({'inputs':[q]})[0][0][0]

def embbeding(df):
    """Add ``title_vector`` and ``sentence_vector`` columns to ``df``.

    The title is read from the first column of the first row and embedded
    once; the same stringified vector is stored on every row. Each sentence
    is read from the third column and embedded individually. Vectors are
    stored as ``str(list)``.

    Args:
        df: DataFrame whose first column holds the title and whose third
            column holds the sentence. It is modified in place.

    Returns:
        The same DataFrame. An empty DataFrame gets the two (empty) columns
        and is returned without calling the embedding endpoint.
    """
    df['title_vector'] = ''
    df['sentence_vector'] = ''
    total = len(df)
    if total == 0:
        # A document without any sentence: nothing to embed, and reading the
        # first row would raise IndexError.
        return df

    df['title_vector'] = str(get_vector(df.iloc[0, 0]))
    sentence_vectors = []
    for i, sentence in enumerate(df.iloc[:, 2]):
        sentence_vectors.append(str(get_vector(sentence)))
        # Report the number of rows completed (1-based), so the last message
        # reads "total out of total".
        print('\r embbeding %i out of %i finished'%(i + 1, total), end='')
    # Write by column name rather than by position 5/6, which was only correct
    # when the frame had exactly five columns beforehand.
    df['sentence_vector'] = sentence_vectors
    return df

# ==============OpenSearch Related=====================
# Connection settings are read from AWS Secrets Manager with boto3.
sm_client = boto3.client('secretsmanager')
# Secret 'opensearch-host-url': a JSON string whose 'host' key is the cluster endpoint.
# NOTE(review): `master_user` and `data` are reused below for the credentials secret.
master_user = sm_client.get_secret_value(SecretId='opensearch-host-url')['SecretString']
data= json.loads(master_user)
es_host_name = data.get('host')
# Make sure the endpoint ends with '/' so that index names and '_bulk' can be appended directly.
# NOTE(review): this fails if the secret has no 'host' key or it is empty (None / '' cannot be indexed with [-1]).
host = es_host_name+'/' if es_host_name[-1] != '/' else es_host_name# cluster endpoint, for example: my-test-domain.us-east-1.es.amazonaws.com/
# NOTE(review): `region` is not referenced anywhere else in the visible code.
region = boto3.Session().region_name # e.g. cn-north-1
# sm_client = boto3.client('secretsmanager')
# Secret 'opensearch-master-user': a JSON string with 'username' and 'password' keys.
master_user = sm_client.get_secret_value(SecretId='opensearch-master-user')['SecretString']
data= json.loads(master_user)
username = data.get('username')
password = data.get('password')
# service = 'es'
# credentials = boto3.Session().get_credentials()
# (username, password) tuple passed as `auth=` to the requests calls below.
awsauth = (username, password)
# Bulk endpoint used by import_data.
url = host+'_bulk'
headers = { "Content-Type": "application/json" }

# Body of the index-creation request: k-NN enabled index settings plus the
# field mappings for the two vector fields and the five text fields.
payloads = {
    "settings": {
        "index": {
            "knn": True,
            "knn.algo_param.ef_search": 100,
        },
    },
    "mappings": {
        "properties": {
            # Embedding of the document title.
            "title_vector": {
                "type": "knn_vector",
                "dimension": v_dimension,
                "method": {
                    "name": "hnsw",
                    "space_type": "l2",
                    "engine": "nmslib",
                    "parameters": {
                        "ef_construction": 256,
                        "m": 48,
                    },
                },
            },
            # Embedding of the individual sentence.
            "sentence_vector": {
                "type": "knn_vector",
                "dimension": v_dimension,
                "method": {
                    "name": "hnsw",
                    "space_type": "l2",
                    "engine": "nmslib",
                    "parameters": {
                        "ef_construction": 256,
                        "m": 48,
                    },
                },
            },
            "title": {"type": "text"},
            "sentence": {"type": "text"},
            "paragraph": {"type": "text"},
            "sentence_id": {"type": "text"},
            "paragraph_id": {"type": "text"},
        },
    },
}

# Create Index
r = requests.put(host+index_name, auth=awsauth, headers=headers, json=payloads)
# Report a failed creation instead of ignoring the response. This is only a
# warning (not an exception) so that re-running the cell against an index that
# already exists does not abort the notebook.
# NOTE(review): if the index is not created with this mapping, later ingestion
# may not index the vector fields as knn_vector; confirm against the cluster.
if not r.ok:
    print('create index %s failed: HTTP %s %s'%(index_name, r.status_code, r.text))

def import_data(df, id_start=0, before_import=0):
    """Send the rows of ``df`` to the bulk endpoint as "index" actions.

    Columns are read by position: 0 title, 1 paragraph, 2 sentence,
    3 paragraph_id, 4 sentence_id, 5 title_vector, 6 sentence_vector. The two
    vector columns must hold JSON-parsable strings.

    Args:
        df: Batch of rows to ingest. An empty batch is skipped without
            sending a request.
        id_start: Offset of this batch within its document.
        before_import: Number of rows already ingested from earlier documents.
            The document id of row ``k`` of the batch is
            ``str(id_start + k + before_import)``.

    Returns:
        None. Failures reported by the service are printed, not raised.
    """
    row_count = len(df)
    if row_count == 0:
        # Nothing to send; a bulk request with an empty body is not valid.
        return

    lines = []
    for k in range(row_count):
        action = {"index": {"_index": index_name, "_id": str(id_start + k + before_import)}}
        document = {"title": str(df.iloc[k, 0]),
                    "paragraph": str(df.iloc[k, 1]),
                    "sentence": str(df.iloc[k, 2]),
                    "paragraph_id": str(df.iloc[k, 3]),
                    "sentence_id": str(df.iloc[k, 4]),
                    "title_vector": json.loads(df.iloc[k, 5]),
                    "sentence_vector": json.loads(df.iloc[k, 6])}
        lines.append(json.dumps(action, ensure_ascii=False))
        lines.append(json.dumps(document, ensure_ascii=False))
    # Newline-delimited JSON: every line, including the last, ends with "\n".
    body = "\n".join(lines) + "\n"

    r = requests.post(url, auth=awsauth, headers=headers, data=body.encode())
    if not r.ok:
        print('\n bulk import failed: HTTP %s %s'%(r.status_code, r.text))
        return
    # The bulk API can answer 200 while individual items failed; that is
    # signalled by a top-level "errors" flag in the JSON response.
    try:
        has_item_errors = bool(r.json().get('errors'))
    except ValueError:
        has_item_errors = False
    if has_item_errors:
        print('\n bulk import reported item errors for ids %i-%i'%(
            id_start + before_import, id_start + before_import + row_count - 1))

#==============Main Preprocess Data and Import===============

# Number of rows sent per bulk request (named so that it does not shadow the
# built-in `slice`).
batch_size = 10
names = os.listdir(folder_path)
# Keep only Word files. Sorting makes the order, and therefore the document ids
# assigned below, reproducible between runs.
# NOTE(review): python-docx is documented for .docx; legacy .doc files may fail
# inside read_doc. Confirm before relying on '.doc' here.
doc_names = sorted(name for name in names if os.path.splitext(name)[1] in ['.doc','.docx'])
# Running count of rows already ingested; used as the id offset for the next file.
before_import = 0
for j, name in enumerate(doc_names):
    df = read_doc(os.path.join(folder_path, name))
    if len(df) == 0:
        # No sentence was extracted: nothing to embed or ingest for this file.
        print(' file %i out of %i skipped (no sentences found)'%(j + 1, len(doc_names)))
        continue
    df = embbeding(df)
    # Ceiling division: number of batches actually sent for this file.
    n_batches = -(-len(df) // batch_size)
    # Stepping by batch_size never yields an empty trailing batch, unlike
    # iterating over range(len(df)//batch_size + 1).
    for i, start in enumerate(range(0, len(df), batch_size)):
        import_data(df[start:start + batch_size], start, before_import)
        print('\r import %i out of %i finished'%(i + 1, n_batches), end='')
    before_import += len(df)
    print(' file %i out of %i finished'%(j + 1, len(doc_names)))

 import 11 out of 12 finished file 0 out of 1 finished
