#### Once all content has been embedded into vectors, we can save the vectors to a vector database

Here we use the pgvector extension on Amazon RDS for PostgreSQL.

In [1]:
!pip install -U psycopg2-binary pgvector tqdm 

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting psycopg2-binary
  Downloading psycopg2_binary-2.9.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m45.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting pgvector
  Downloading pgvector-0.2.3-py2.py3-none-any.whl (9.3 kB)
Installing collected packages: psycopg2-binary, pgvector
  Attempting uninstall: psycopg2-binary
    Found existing installation: psycopg2-binary 2.9.7
    Uninstalling psycopg2-binary-2.9.7:
      Successfully uninstalled psycopg2-binary-2.9.7
  Attempting uninstall: pgvector
    Found existing installation: pgvector 0.2.2
    Uninstalling pgvector-0.2.2:
      Successfully uninstalled pgvector-0.2.2
Successfully installed pgvector-0.2.3 psycopg2-binary-2.9.9


In [9]:
import psycopg2
from pgvector.psycopg2 import register_vector
import boto3 
import json 

# Read the database connection details, stored as a JSON document,
# from AWS Secrets Manager.
client = boto3.client('secretsmanager')
response = client.get_secret_value(SecretId='rdspg-vector-secret')
database_secrets = json.loads(response['SecretString'])

In [10]:
# Connection settings are read from the `database_secrets` dictionary.
dbhost, dbport = database_secrets['host'], database_secrets['port']
dbuser, dbpass = database_secrets['username'], database_secrets['password']

dbconn = psycopg2.connect(
    host=dbhost,
    user=dbuser,
    password=dbpass,
    port=dbport,
    connect_timeout=10,
)
# Autocommit: each statement is committed as soon as it executes.
dbconn.set_session(autocommit=True)

cur = dbconn.cursor()

# Enable pgvector in the database, then register its type with psycopg2.
cur.execute("CREATE EXTENSION IF NOT EXISTS vector;")
register_vector(dbconn)

# Drop any previous copy of the table so the load starts from an empty table.
cur.execute("DROP TABLE IF EXISTS pdfembedding;")

In [11]:
# Table for the embedded PDF content: text, source-file metadata and a
# 1536-dimensional pgvector column.
create_table_sql = """CREATE TABLE IF NOT EXISTS pdfembedding(
               id bigserial primary key, 
               content text, 
               pdf_file_name text, 
               page_number int, 
               pdf_file_path text, 
               content_embeddings vector(1536));"""
cur.execute(create_table_sql)

In [12]:
# Path of the JSON file that holds the embedding records to load.
data_file_name = "./data/embedding.json"

In [13]:
# Read the embedding records. JSON text is UTF-8, so the encoding is given
# explicitly instead of depending on the platform's locale default.
with open(data_file_name, 'r', encoding='utf-8') as f:
    data = json.load(f)

#### Check that the embedding size equals 1536

In [15]:
# Check every record (not only the first) against the vector(1536) column size.
# A record with a missing embedding counts as a mismatch.
print(all(len(item.get('embedding') or []) == 1536 for item in data))

True


In [16]:
# Insert the records one at a time. The connection is in autocommit mode,
# so each INSERT is committed as soon as it runs.
insert_sql = """INSERT INTO pdfembedding
                      (content, pdf_file_name, page_number, pdf_file_path, content_embeddings) 
                  VALUES(%s, %s, %s, %s, %s);"""
record_keys = ('content', 'pdf_file_name', 'page_number', 'pdf_file_path', 'embedding')

for item in data:
    # Missing keys become None, which is sent to the database as NULL.
    cur.execute(insert_sql, tuple(item.get(key) for key in record_keys))

In [17]:
# The similarity search in this notebook orders rows with the cosine-distance
# operator (<=>), so the index is built with the matching operator class
# (vector_cosine_ops). An index built with vector_l2_ops only serves
# L2-distance (<->) queries and would not be used by that search.
cur.execute("""CREATE INDEX ON pdfembedding 
               USING ivfflat (content_embeddings vector_cosine_ops) WITH (lists = 100);""")

In [18]:
# Refresh the table's statistics after the bulk load and index build. This
# relies on the connection being in autocommit mode (set when it was opened).
cur.execute("VACUUM ANALYZE pdfembedding;")

In [19]:
# Sanity check: read a few rows back to confirm the load worked.
preview_sql = """SELECT id, content, pdf_file_name, page_number, pdf_file_path, content_embeddings 
            FROM pdfembedding 
            limit 3;"""
cur.execute(preview_sql)
r = cur.fetchall()
r

[(1,
  '2022 ANNUAL REPORT\n2022 ANNUAL REPORT',
  'annual_report.pdf',
  0,
  './pdfs/annual_report.pdf',
  array([ 0.07473046, -0.03020543, -0.0295342 , ...,  0.01029222,
          0.03624651, -0.01622143], dtype=float32)),
 (2,
  'This annual report (in both English and Chinese versions) \nhas been posted on the Company’s website at www.mi.com and the Stock Exchange’s website at www.hkexnews.hk. Shareholders who have chosen to rely on copies of the corporate communications (including but not limited to annual report and \n(where applicable) summary financial report, interim report and \n(where applicable) summary interim report, notice of meeting, listing document, circular and proxy form) posted on the aforesaid websites in lieu of any or all the printed copies thereof may request the printed copy of the annual report.\nShareholders who have chosen or are deemed to have consented \nto receive the corporate communications using electronic means and who have difficulty in receiving o

In [20]:
# Loading is finished: release the cursor, then the connection it belongs to.
cur.close()
dbconn.close()
print("Vector embeddings has been successfully loaded into PostgreSQL")
     

Vector embeddings has been successfully loaded into PostgreSQL


## Evaluate PostgreSQL vector Search Results

In this step we use Amazon Bedrock (the Titan text-embedding model) to generate an embedding for the query, then use that embedding to search PostgreSQL for the nearest neighbours and retrieve the relevant content.

In [21]:
# Open a fresh connection for querying.
dbconn = psycopg2.connect(host=dbhost, user=dbuser, password=dbpass, port=dbport, connect_timeout=10)
dbconn.set_session(autocommit=True)
# Register the pgvector type on this connection too. The earlier registration
# was made against a connection that has since been closed. Registering again
# keeps vector parameters and vector result columns adapted whether the
# installed pgvector version registers per connection or globally.
register_vector(dbconn)
cur = dbconn.cursor()

#### The embedding model used for the query must be the same one that was used to embed the knowledge base (PDF)

In [22]:
# NOTE(review): this name is not referenced by any other cell shown in this
# notebook; the query embedding is produced through Bedrock using
# `embed_llm_id` instead. Presumably left over from a SageMaker JumpStart
# version of the notebook. Confirm before removing.
endpoint_name = "jumpstart-dft-mx-tcembedding-robertafin-large-uncased-2"

In [23]:
# Sample query text used for the similarity search. This is a normal (non-raw)
# string, so each \n escape below becomes a real newline character.
# NOTE(review): this looks like one page of the ingested annual report copied
# verbatim, so the nearest neighbour is expected to be that page. Confirm.
query = """
99\n2022 ANNUAL REPORT\nShareholders’ Rights\nTo safeguard the Shareholders’ interests and rights, separate resolutions are proposed at the Shareholders’ meetings on \neach substantial issue, including the election of individual directors, for the Shareholders’ consideration and voting. All resolutions put forward at the Shareholders’ meetings will be voted by poll pursuant to the Listing Rules and poll results will be posted on the websites of the Company (www.mi.com) and the Stock Exchange after each Shareholders’ meeting.\nPursuant to the Articles of the Company, extraordinary general meetings shall be convened on the written requisition \nof any one or more Shareholders holding, as of the date of deposit of the requisition, in aggregate shares representing not less than one-tenth of the paid up capital of the Company which carry the right of voting at general meetings of the Company. A written requisition shall be deposited at the principal office of the Company in Hong Kong to the Board or the joint company secretaries for the purpose of requiring an extraordinary general meeting to be called by the Board for the transaction of any business specified in such requisition. Such meeting shall be held within two months after the deposit of such requisition. If within 21 days of such deposit, the Board fails to proceed to convene such meeting, the requisitionist(s) themselves may convene the general meeting in the same manner, and all reasonable expenses \nincurred by the requisitionist(s) as a result of the failure of the Board shall be reimbursed to them by the Company.\nFor the avoidance of doubt, Shareholders must deposit and send the original duly signed written requisition, notice \nor statement (as the case may be) to the Company’s principal place of business in Hong Kong and provide their full name, contact details and identification in order to give effect thereto. 
Shareholders’ information may be disclosed as required by law.\nThere is no provision allowing the Shareholders to move new resolutions at general meetings under the Cayman \nIslands Companies Law or the Articles. Shareholders who wish to move a resolution may request the Company to \nconvene a general meeting following the procedures set out in the preceding paragraph.\nWith respect to the Shareholders’ right in proposing persons for election as Directors, please refer to the procedures \navailable on the website of the Company.\nDuring the Reporting Period, the Company has amended its Memorandum and Articles of Association by way of a \nspecial resolution passed on June 2, 2022. Details of the amendments are set out in the circular dated April 27, 2022 to the Shareholders.\nThe up-to-date version of the Memorandum and Articles of Association is available on the websites of the Company \nand the Stock Exchange.\nEvents after the Reporting Period\nSave as disclosed in this Corporate Governance Report, there has been no other significant event subsequent to the Reporting Period and up to the Latest Practicable Date that might affect the Group. 
"""

In [27]:
from sklearn.preprocessing import normalize
import numpy as np 
import boto3

# Client for the Bedrock runtime API, used to invoke the embedding model.
runtime_client = boto3.client("bedrock-runtime")
# Identifier of the embedding model passed to every invoke_model call.
embed_llm_id = 'amazon.titan-embed-text-v1'


def get_embedding(sent, bedrock_runtime, embed_llm_id):
    """Embed one piece of text with a Bedrock embedding model.

    Args:
        sent: Text to embed; sent as the ``inputText`` field of the request.
        bedrock_runtime: Object exposing ``invoke_model`` (for example the
            boto3 ``bedrock-runtime`` client).
        embed_llm_id: Model identifier passed as ``modelId``.

    Returns:
        The embedding as a flat list of floats.

    Raises:
        ValueError: If the response body has no (or an empty) ``embedding``.
    """
    request_body = json.dumps({"inputText": sent})

    response = bedrock_runtime.invoke_model(
        body=request_body,
        modelId=embed_llm_id,
        accept="application/json",
        contentType="application/json",
    )
    response_body = json.loads(response.get("body").read())
    embedding = response_body.get("embedding")
    if not embedding:
        # Fail here with a clear message instead of returning None and letting
        # a later numpy operation fail far from the cause.
        raise ValueError(f"Model {embed_llm_id!r} returned no 'embedding' in its response")

    # Flatten to one dimension so the result is always a plain list of floats.
    return np.asarray(embedding, dtype=float).reshape(-1).tolist()

def chunk_words(sequence, chunk_size):
    """Split text on whitespace and regroup it into chunks of ``chunk_size`` words.

    Each chunk is returned as one space-joined string; the final chunk holds
    whatever words remain and may be shorter.
    """
    words = sequence.split()
    chunks = []
    for start in range(0, len(words), chunk_size):
        chunks.append(' '.join(words[start:start + chunk_size]))
    return chunks

def query_endpoint(payload, runtime_client, embed_llm_id):
    """Embed ``payload`` in 400-word chunks and return one embedding per chunk.

    The text is split with ``chunk_words`` and every chunk is embedded with
    ``get_embedding``; the embeddings are returned in chunk order.
    """
    return [
        get_embedding(chunk, runtime_client, embed_llm_id)
        for chunk in chunk_words(payload, 400)
    ]


def parse_response(query_response, runtime_client, embed_llm_id):
    """Turn query text into a single L2-normalised embedding vector.

    Despite its name, ``query_response`` is the text to embed: it is handed to
    ``query_endpoint``, which embeds it chunk by chunk. The per-chunk
    embeddings are combined with an element-wise maximum (max pooling) and the
    pooled vector is scaled to unit L2 norm.

    Returns:
        A numpy array with singleton dimensions squeezed out.
    """
    chunk_embeddings = np.array(query_endpoint(query_response, runtime_client, embed_llm_id))

    # Max pooling across chunks (axis 0), not an average.
    pooled = np.max(chunk_embeddings, axis=0)

    # normalize() is given a single row of shape (1, n_features); the vector is
    # normalised before it is used for similarity search.
    pooled_row = pooled.reshape(1, -1)
    unit_row = normalize(pooled_row, axis=1, norm='l2')
    return np.squeeze(unit_row)

In [28]:
# Embed the sample query and print the length of the resulting vector.
query_response = parse_response(query, runtime_client, embed_llm_id)
print(len(query_response))

1536


In [29]:
# Alias for the query embedding; this is the value bound to the SQL parameter.
query_vector = query_response

In [30]:
# Show the vector's Python type and its values, one per line.
print(type(query_vector), query_vector, sep="\n")

<class 'numpy.ndarray'>
[ 0.06288187 -0.0086542  -0.02921784 ...  0.0128622   0.05716533
 -0.00952756]


#### Let's extract the top 5 most relevant pages from our database

In [31]:
# Nearest-neighbour search: order rows by cosine distance (<=>) to the query
# vector and keep the closest five.
nearest_sql = """
    SELECT id, content, pdf_file_name, page_number, pdf_file_path, content_embeddings
    FROM pdfembedding
    ORDER BY content_embeddings <=> %s limit 5;
    """
cur.execute(nearest_sql, (query_vector,))

In [32]:
# Fetch all rows returned by the similarity query (at most five, per its LIMIT).
results = cur.fetchall()

In [33]:
# Display the fetched rows; each row also carries the full content text and embedding.
results

[(101,
  '99\n2022 ANNUAL REPORT\nShareholders’ Rights\nTo safeguard the Shareholders’ interests and rights, separate resolutions are proposed at the Shareholders’ meetings on \neach substantial issue, including the election of individual directors, for the Shareholders’ consideration and voting. All resolutions put forward at the Shareholders’ meetings will be voted by poll pursuant to the Listing Rules and poll results will be posted on the websites of the Company (www.mi.com) and the Stock Exchange after each Shareholders’ meeting.\nPursuant to the Articles of the Company, extraordinary general meetings shall be convened on the written requisition \nof any one or more Shareholders holding, as of the date of deposit of the requisition, in aggregate shares representing not less than one-tenth of the paid up capital of the Company which carry the right of voting at general meetings of the Company. A written requisition shall be deposited at the principal office of the Company in Hong K

In [34]:
# Done querying: release the cursor, then the connection.
cur.close()
dbconn.close()