In [None]:
%load_ext autoreload 
%autoreload 2

!pip install --upgrade boto3
!pip install --upgrade sagemaker
!pip install python_docx
!pip install langchain
!pip install pypdf
!pip install docx2txt
!pip install unstructured

In [None]:
import os
import shutil
from langchain.chains import RetrievalQA
from langchain.chains.question_answering import load_qa_chain
from langchain.document_loaders import TextLoader
from langchain.document_loaders import UnstructuredMarkdownLoader
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import Docx2txtLoader
from langchain.document_loaders import UnstructuredPowerPointLoader
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.document_loaders import UnstructuredExcelLoader
from langchain.text_splitter import NLTKTextSplitter
from langchain.text_splitter import CharacterTextSplitter
from chinese_text_splitter import ChineseTextSplitter
import json
from typing import Dict, List, Tuple
from tqdm import tqdm
from datetime import datetime

import boto3
import nltk
import pandas as pd
import logging

import sagemaker
from sagemaker.huggingface import HuggingFaceModel
import requests

# Download the NLTK 'punkt' resource (presumably needed by NLTKTextSplitter,
# which load_file uses for English text -- TODO confirm).
nltk.download('punkt')
# Raise the root logger threshold so that only CRITICAL records are emitted.
logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)

## Hyperparameters

In [None]:
# Name of the OpenSearch index, read from the 'opensearch-index-name' secret
# in AWS Secrets Manager. The secret string is JSON; the name is under 'index'.
sm_client = boto3.client('secretsmanager')
index_name = sm_client.get_secret_value(SecretId='opensearch-index-name')['SecretString']
data= json.loads(index_name)
index_name = data.get('index')
print('pre-defined index name in deployment/cdk.json-->',index_name)
# Uncomment and fill in to override the index name manually:
#index_name = ''

# Language of the documents: 'chinese' or 'english' (selects the text splitter)
language = 'chinese'

# Name of the embedding model endpoint; usually you can keep the default
eb_endpoint = 'huggingface-inference-eb'

# Embedding vector dimension; usually you can keep the default
v_dimension = 768

# Folder containing the documents to be processed and ingested
folder_path = 'docs/'
print('Please put data in this folder-->',folder_path)

# Paragraph size / chunk size, in characters: read_doc closes a paragraph once
# its accumulated text reaches this length
chunck_size = 200

# Number of documents already imported into the same index_name; it is added to
# every document ID. Keep it as 0 if you are creating a new index.
before_import = 0

In [None]:
# Predictor bound to the SageMaker endpoint named by `eb_endpoint`; get_vector()
# calls its predict() method to obtain embeddings.
hfp = sagemaker.huggingface.model.HuggingFacePredictor(eb_endpoint)

#===================Function Definition=================

def load_file(filepath, language):
    """Load one document and split it with a language-specific text splitter.

    Args:
        filepath: Path of the file to load. The loader is chosen from the
            case-insensitive file suffix (.pdf, .docx, .pptx, .csv, .xlsx);
            every other suffix, including .txt, is read with TextLoader.
        language: 'chinese' (ChineseTextSplitter) or 'english'
            (NLTKTextSplitter sized by the module-level `chunck_size`).

    Returns:
        The result of ``loader.load_and_split(textsplitter)``.

    Raises:
        ValueError: if `language` is neither 'chinese' nor 'english'.
    """
    # Validate first: an unsupported language used to leave `textsplitter`
    # unassigned and only surfaced as an UnboundLocalError after the loader
    # had already been constructed.
    if language not in ("chinese", "english"):
        raise ValueError(
            "unsupported language %r, expected 'chinese' or 'english'" % (language,)
        )

    # Suffix -> loader class; the first matching suffix wins.
    loaders_by_suffix = (
        (".pdf", PyPDFLoader),
        (".docx", Docx2txtLoader),
        (".pptx", UnstructuredPowerPointLoader),
        (".csv", CSVLoader),
        (".xlsx", UnstructuredExcelLoader),
    )
    lowered_path = filepath.lower()
    loader_cls = next(
        (cls for suffix, cls in loaders_by_suffix if lowered_path.endswith(suffix)),
        TextLoader,  # plain-text fallback for .txt and unknown suffixes
    )
    loader = loader_cls(filepath)

    if language == "chinese":
        textsplitter = ChineseTextSplitter()
    else:
        textsplitter = NLTKTextSplitter(chunk_size=chunck_size, chunk_overlap=10)

    return loader.load_and_split(textsplitter)


def get_title(path):
    """Derive a document title from a file path.

    The file name without directory and extension is taken, every '——' is
    normalised to '-', and the name is split on '-'. When there are at least
    two parts the second one is the title (e.g. 'docs/01-Intro.pdf' gives
    'Intro'); otherwise the whole name is returned unchanged.

    Args:
        path: File path as a string.

    Returns:
        The title string.
    """
    stem = os.path.split(os.path.splitext(path)[0])[1]
    parts = stem.replace('——', '-').split('-')
    # Explicit length check instead of a bare `except:` around parts[1], which
    # would also have swallowed KeyboardInterrupt and unrelated errors.
    return parts[1] if len(parts) > 1 else stem

def read_doc(path, chunck_size = chunck_size):
    """Load a document and return one DataFrame row per split chunk.

    The file is split with load_file() using the module-level `language`.
    Consecutive chunks are concatenated into a paragraph until the paragraph
    is at least `chunck_size` characters long; every chunk of that group then
    carries the complete paragraph text. Chunks left over at the end form a
    final, possibly shorter, paragraph.

    Args:
        path: Path of the document to read.
        chunck_size: Minimum paragraph length, in characters, before a new
            paragraph is started.

    Returns:
        DataFrame with the columns 'title', 'paragraph' and 'sentence'.
    """
    chunks = load_file(path, language)
    doc_title = get_title(path)

    sentences = [chunk.page_content for chunk in chunks]
    titles = [doc_title] * len(sentences)

    paragraphs = []
    buffer = ''      # text of the paragraph currently being assembled
    pending = 0      # number of sentences collected into `buffer` so far
    for text in sentences:
        pending += 1
        buffer += text
        if len(buffer) >= chunck_size:
            # Paragraph is full: assign it to every sentence that fed it.
            paragraphs.extend([buffer] * pending)
            buffer, pending = '', 0
    # Flush the sentences of the unfinished trailing paragraph, if any.
    paragraphs.extend([buffer] * pending)

    print(len(titles), len(sentences),len(paragraphs))
    return pd.DataFrame({'title':titles, 'paragraph':paragraphs, 'sentence':sentences})

def get_vector(q):
    """Embed one text with the SageMaker endpoint behind `hfp`.

    Args:
        q: Text to embed.

    Returns:
        ``hfp.predict({'inputs': [q]})[0][0][0]`` on success (presumably the
        embedding vector of the text -- TODO confirm the response layout).
        If the call or the indexing fails, a placeholder list holding
        `v_dimension` copies of -1000 is returned so ingestion can continue.
    """
    try:
        return hfp.predict({'inputs':[q]})[0][0][0]
    except Exception as ex:
        # Deliberate best effort: keep going with a sentinel vector, but leave
        # a trace of the failure. The record is only visible if the logging
        # level configured for this notebook admits WARNING.
        logging.getLogger(__name__).warning('embedding request failed: %s', ex)
        return [-1000 for _ in range(v_dimension)]

def embbeding(df):
    """Compute an embedding for every sentence of `df` and store it in the frame.

    `df` is modified in place and also returned. It is expected to have the
    three columns built by read_doc, in this order: title, paragraph, sentence.
    Two string columns are appended ('title_vector' at position 3 and
    'sentence_vector' at position 4), and the stringified result of
    get_vector() for each sentence is written to position 3.

    Args:
        df: DataFrame with the sentence text in column position 2.

    Returns:
        The same DataFrame with the two extra columns.
    """
    df['title_vector'] = ''
    df['sentence_vector'] = ''
    total = len(df)
    # The previous version also embedded the title once and never used the
    # result; that wasted an endpoint call and raised IndexError on an empty
    # frame, so it has been removed.
    for i in range(total):
        # NOTE(review): position 3 is the column *named* 'title_vector', yet it
        # receives the sentence embedding. import_data reads position 3 as the
        # sentence vector, so the position is kept; only the name is misleading.
        df.iloc[i, 3] = str(get_vector(df.iloc[i, 2]))
        print('\r embbeding %i out of %i finished'%(i + 1, total), end='')
    return df

# ==============OpenSearch Related=====================
# Read the OpenSearch endpoint and credentials from AWS Secrets Manager (boto3).
sm_client = boto3.client('secretsmanager')
# NOTE: `master_user` and `data` first hold the 'opensearch-host-url' secret and
# are reassigned to the 'opensearch-master-user' secret a few lines below.
master_user = sm_client.get_secret_value(SecretId='opensearch-host-url')['SecretString']
data= json.loads(master_user)
es_host_name = data.get('host')
# Cluster endpoint with a guaranteed trailing '/', so that paths can be appended
# directly, for example: my-test-domain.us-east-1.es.amazonaws.com/
host = es_host_name+'/' if es_host_name[-1] != '/' else es_host_name
region = boto3.Session().region_name # e.g. cn-north-1
# sm_client = boto3.client('secretsmanager')
master_user = sm_client.get_secret_value(SecretId='opensearch-master-user')['SecretString']
data= json.loads(master_user)
username = data.get('username')
password = data.get('password')
# service = 'es'
# credentials = boto3.Session().get_credentials()
# (username, password) tuple handed to requests through the `auth=` argument
awsauth = (username, password)
# Endpoint that import_data posts its bulk payloads to
url = host+'_bulk'
headers = { "Content-Type": "application/json" }

# Request body used to create the index: k-NN is switched on and two
# knn_vector fields of `v_dimension` dimensions are declared next to the
# plain text fields.
payloads = {
    "settings": {
        "index": {
            "knn": True,
            "knn.algo_param.ef_search": 100,
        },
    },
    "mappings": {
        "properties": {
            "title_vector": {
                "type": "knn_vector",
                "dimension": v_dimension,
                "method": {
                    "name": "hnsw",
                    "space_type": "l2",
                    "engine": "nmslib",
                    "parameters": {
                        "ef_construction": 256,
                        "m": 128,
                    },
                },
            },
            "sentence_vector": {
                "type": "knn_vector",
                "dimension": v_dimension,
                "method": {
                    "name": "hnsw",
                    "space_type": "l2",
                    "engine": "nmslib",
                    "parameters": {
                        "ef_construction": 256,
                        "m": 128,
                    },
                },
            },
            "title": {"type": "text"},
            "sentence": {"type": "text"},
            "paragraph": {"type": "text"},
            "sentence_id": {"type": "text"},
            "paragraph_id": {"type": "text"},
        },
    },
}

# Create the index: PUT to host + index_name with the `payloads` request body.
r = requests.put(host+index_name, auth=awsauth, headers=headers, json=payloads)
# Report a non-2xx answer instead of silently discarding it. This does not
# raise, so re-running against an index that already exists still lets the
# notebook continue.
if not r.ok:
    print('create index %s returned HTTP %s: %s' % (index_name, r.status_code, r.text))

def import_data(df, id_start=0, before_import=0):
    """Send the rows of `df` to the OpenSearch bulk endpoint in one request.

    Columns are read by position: 0 = title, 1 = paragraph, 2 = sentence,
    3 = sentence vector stored as a JSON string (this matches what embbeding
    writes). Row k of `df` is indexed into `index_name` with the document ID
    ``str(id_start + k + before_import)``.

    Args:
        df: Slice of the embedded DataFrame to import.
        id_start: Offset of this slice inside the whole document frame.
        before_import: Number of documents already present in the index.

    Returns:
        None. Transport or indexing problems are printed, not raised.
    """
    row_count = len(df)
    if row_count == 0:
        # Nothing to index; avoid posting an empty bulk body.
        return

    # Collect the newline-delimited JSON lines and join once instead of
    # growing a string inside the loop.
    lines = []
    for offset in range(row_count):
        action = {"index": {"_index": index_name, "_id": str(id_start + offset + before_import)}}
        document = {"title": str(df.iloc[offset, 0]),
                    "paragraph": str(df.iloc[offset, 1]),
                    "sentence": str(df.iloc[offset, 2]),
                    "sentence_vector": json.loads(df.iloc[offset, 3])}
        lines.append(json.dumps(action, ensure_ascii=False))
        lines.append(json.dumps(document, ensure_ascii=False))
    body = "\n".join(lines) + "\n"  # the bulk body must end with a newline

    r = requests.post(url, auth=awsauth, headers=headers, data=body.encode())

    # The response used to be ignored, so rejected documents went unnoticed.
    if not r.ok:
        print('\n bulk import returned HTTP %s: %s' % (r.status_code, r.text[:500]))
        return
    try:
        has_item_errors = bool(r.json().get('errors'))
    except (ValueError, AttributeError):
        # Body was not a JSON object; nothing more can be checked.
        has_item_errors = False
    if has_item_errors:
        # The bulk API answers 200 even when individual items were rejected.
        print('\n bulk import reported item errors: %s' % r.text[:500])

#==============Main Preprocess Data and Import===============
# For every file in `folder_path`: split it, embed each sentence, and push the
# rows to OpenSearch in batches. Files that raise are collected in
# `failed_files` and the loop moves on.

batch_size = 10  # rows per bulk request (was named `slice`, shadowing the builtin)
# Sorted so that document IDs are assigned in a reproducible order.
names = sorted(os.listdir(folder_path))
failed_files = []
for j, name in enumerate(names):
    print('filename-->',name)
    file_path = os.path.join(folder_path, name)
    # Test the path inside folder_path; the bare name would be resolved
    # against the current working directory and never match a sub-folder.
    if os.path.isdir(file_path):continue
    try:
        df = read_doc(file_path)
        df = embbeding(df)
        # Ceiling division: no empty trailing batch when len(df) is an exact
        # multiple of batch_size.
        n_batches = (len(df) + batch_size - 1) // batch_size
        for i in range(n_batches):
            start = batch_size * i
            import_data(df[start:start + batch_size], start, before_import)
            print('\r import %i out of %i finished'%(i + 1, n_batches), end='')
        # Shift the ID offset so the next file does not overwrite these rows.
        before_import += len(df)
        print(' file %i out of %i finished'%(j + 1, len(names)))
    except Exception as ex:
        failed_files.append(name)
        print(f"=================Exception================={ex}")

if failed_files:
    print('failed files-->', failed_files)