In [29]:
import boto3
import json

#### Make sure the name of the embedding model endpoint is correct

In [30]:
# Name of the SageMaker endpoint that query_endpoint passes as EndpointName.
endpoint_name = "jumpstart-dft-mx-tcembedding-robertafin-large-uncased-2"

In [31]:
# Sample text that is passed to parse_response further down to try out the endpoint.
payload = """
99\n2022 ANNUAL REPORT\nShareholders’ Rights\nTo safeguard the Shareholders’ interests and rights, separate resolutions are proposed at the Shareholders’ meetings on \neach substantial issue, including the election of individual directors, for the Shareholders’ consideration and voting. All resolutions put forward at the Shareholders’ meetings will be voted by poll pursuant to the Listing Rules and poll results will be posted on the websites of the Company (www.mi.com) and the Stock Exchange after each Shareholders’ meeting.\nPursuant to the Articles of the Company, extraordinary general meetings shall be convened on the written requisition \nof any one or more Shareholders holding, as of the date of deposit of the requisition, in aggregate shares representing not less than one-tenth of the paid up capital of the Company which carry the right of voting at general meetings of the Company. A written requisition shall be deposited at the principal office of the Company in Hong Kong to the Board or the joint company secretaries for the purpose of requiring an extraordinary general meeting to be called by the Board for the transaction of any business specified in such requisition. Such meeting shall be held within two months after the deposit of such requisition. If within 21 days of such deposit, the Board fails to proceed to convene such meeting, the requisitionist(s) themselves may convene the general meeting in the same manner, and all reasonable expenses \nincurred by the requisitionist(s) as a result of the failure of the Board shall be reimbursed to them by the Company.\nFor the avoidance of doubt, Shareholders must deposit and send the original duly signed written requisition, notice \nor statement (as the case may be) to the Company’s principal place of business in Hong Kong and provide their full name, contact details and identification in order to give effect thereto. 
Shareholders’ information may be disclosed as required by law.\nThere is no provision allowing the Shareholders to move new resolutions at general meetings under the Cayman \nIslands Companies Law or the Articles. Shareholders who wish to move a resolution may request the Company to \nconvene a general meeting following the procedures set out in the preceding paragraph.\nWith respect to the Shareholders’ right in proposing persons for election as Directors, please refer to the procedures \navailable on the website of the Company.\nDuring the Reporting Period, the Company has amended its Memorandum and Articles of Association by way of a \nspecial resolution passed on June 2, 2022. Details of the amendments are set out in the circular dated April 27, 2022 to the Shareholders.\nThe up-to-date version of the Memorandum and Articles of Association is available on the websites of the Company \nand the Stock Exchange.\nEvents after the Reporting Period\nSave as disclosed in this Corporate Governance Report, there has been no other significant event subsequent to the Reporting Period and up to the Latest Practicable Date that might affect the Group. 
"""

#### Note: some embedding models accept fewer input tokens than others, so make sure the chunk size fits within your embedding model's input limit

In [32]:
def chunk_words(sequence, chunk_size):
    """Split text into chunks of at most ``chunk_size`` whitespace-separated words.

    Args:
        sequence: Text to split. Words are delimited by any run of whitespace,
            so the original spacing and line breaks are not preserved.
        chunk_size: Maximum number of words per chunk; must be positive.

    Returns:
        A list of strings, each holding up to ``chunk_size`` words joined by
        single spaces. Empty or whitespace-only text yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is zero or negative.
    """
    if chunk_size <= 0:
        # A negative step makes range() empty, which would silently drop all
        # of the text; zero already raised ValueError, so treat both alike.
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")
    words = sequence.split()
    return [
        ' '.join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]

def query_endpoint(payload, chunk_size=400, client=None):
    """Embed ``payload`` chunk by chunk using the SageMaker endpoint ``endpoint_name``.

    The text is split with ``chunk_words`` and each chunk is sent in its own
    ``invoke_endpoint`` request.

    Args:
        payload: Text to embed.
        chunk_size: Maximum number of words sent per request (default 400).
            Lower it if the embedding model accepts shorter inputs.
        client: Optional SageMaker runtime client to reuse across calls. When
            omitted, a new ``boto3.client("sagemaker-runtime")`` is created.

    Returns:
        A list with one entry per chunk, each being the value of the
        ``"embedding"`` field of the endpoint's JSON response. The list is
        empty when the text contains no words.
    """
    if client is None:
        client = boto3.client("sagemaker-runtime")

    embeddings = []
    for chunk in chunk_words(payload, chunk_size):
        # NOTE(review): json.dumps wraps the chunk in double quotes and escapes
        # non-ASCII characters before it is sent as "application/x-text" --
        # confirm the endpoint expects a JSON-encoded string rather than raw text.
        response = client.invoke_endpoint(
            EndpointName=endpoint_name,
            ContentType="application/x-text",
            Body=json.dumps(chunk),
        )
        result = json.loads(response["Body"].read().decode("utf8"))
        embeddings.append(result["embedding"])
    return embeddings

In [33]:
from sklearn.preprocessing import normalize
import numpy as np 

def parse_response(query_response):
    """Embed text and return one max-pooled, L2-normalised vector.

    Despite its name, ``query_response`` is the text to embed: it is passed to
    ``query_endpoint``, which returns one embedding per chunk of the text.

    Args:
        query_response: Text to embed.

    Returns:
        A NumPy array holding the element-wise maximum over the chunk
        embeddings, L2-normalised (so it can be compared by inner product)
        and with the singleton row dimension squeezed out.

    Raises:
        ValueError: If no chunk embeddings were produced, e.g. because the
            text is empty or whitespace-only.
    """
    embeddings = np.array(query_endpoint(query_response))
    if embeddings.size == 0:
        # np.max over an empty array raises an opaque "zero-size array"
        # ValueError; fail with a message that names the actual cause instead.
        raise ValueError("cannot embed text: no chunk embeddings were returned")

    # Max pooling across chunks (axis 0); mean pooling was the earlier alternative.
    pooled = np.max(embeddings, axis=0)

    # normalize() works on 2-D input, hence the temporary (1, n) shape.
    unit = normalize(pooled.reshape(1, -1), axis=1, norm='l2')
    return np.squeeze(unit)

In [34]:
# Embed the sample payload to check the endpoint round trip end to end.
embedding = parse_response(payload)

In [35]:
print(embedding)
# Sanity check: the returned vector should have unit L2 norm (within 1e-5).
print(abs(np.linalg.norm(embedding) - 1) < 1e-5)

[ 0.00119371  0.02671994  0.01099941 ...  0.02109275 -0.01836102
  0.05742953]
True


In [36]:
# Location of the JSON input file that is parsed into `data`
pdf_json_file_name = './data/content.json'

with open(pdf_json_file_name, mode='r') as f:
    data = json.loads(f.read())

In [37]:
# Attach an embedding to every page record, reporting progress every 100 pages
for page_index, page in enumerate(data):
    if not page_index % 100:
        print(f'Processing page {page_index}')
    page["embedding"] = parse_response(page["content"]).tolist()

Processing page 0
Processing page 100
Processing page 200
Processing page 300


In [38]:
output_file_name = './data/embedding.json'
# Serialise `data` (now carrying the embeddings) as indented JSON
with open(output_file_name, mode='w') as f:
    f.write(json.dumps(data, indent=4))