In [1]:
!pip install -r requirements.txt



In [15]:
import base64
import codecs
import glob
import json
import logging
import os
from pathlib import Path
from typing import Optional

import boto3
import botocore
import numpy as np
from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth

import globals as g
from utils import upload_to_s3, get_cfn_outputs, download_image_files_from_s3

# Configure logging at INFO level; every record shows timestamp, process id,
# source file:line and level ahead of the message.
logging.basicConfig(
    level=logging.INFO,
    format='[%(asctime)s] p%(process)s {%(filename)s:%(lineno)d} %(levelname)s - %(message)s',
)
logger = logging.getLogger(__name__)

In [22]:
# Fetch the slide images from S3; later cells convert these into embeddings.
image_files = download_image_files_from_s3(
    g.BUCKET_NAME,
    g.BUCKET_IMG_PREFIX,
    g.IMAGE_DIR,
    g.IMAGE_FILE_EXTN,
)
logger.info(f"downloaded {len(image_files)} from s3")

[2024-01-06 21:12:00,034] p1281 {utils.py:30} INFO - downloaded sagemaker-us-east-1-015469603702/multimodal/img/CMP301_TrainDeploy_E1_20230607_SPEdited_image_1.jpg to img/CMP301_TrainDeploy_E1_20230607_SPEdited_image_1.jpg
[2024-01-06 21:12:00,096] p1281 {utils.py:30} INFO - downloaded sagemaker-us-east-1-015469603702/multimodal/img/CMP301_TrainDeploy_E1_20230607_SPEdited_image_10.jpg to img/CMP301_TrainDeploy_E1_20230607_SPEdited_image_10.jpg
[2024-01-06 21:12:00,149] p1281 {utils.py:30} INFO - downloaded sagemaker-us-east-1-015469603702/multimodal/img/CMP301_TrainDeploy_E1_20230607_SPEdited_image_11.jpg to img/CMP301_TrainDeploy_E1_20230607_SPEdited_image_11.jpg
[2024-01-06 21:12:00,197] p1281 {utils.py:30} INFO - downloaded sagemaker-us-east-1-015469603702/multimodal/img/CMP301_TrainDeploy_E1_20230607_SPEdited_image_12.jpg to img/CMP301_TrainDeploy_E1_20230607_SPEdited_image_12.jpg
[2024-01-06 21:12:00,270] p1281 {utils.py:30} INFO - downloaded sagemaker-us-east-1-015469603702/multi

In [23]:
def encode_image_to_base64(image_file_path: str) -> str:
    """Base64-encode an image file and store the result as a ``.b64`` file.

    The encoded content is written into ``g.B64_ENCODED_IMAGES_DIR`` using the
    stem of the source file name, e.g. ``slide_1.jpg`` -> ``slide_1.b64``.

    Args:
        image_file_path: Path of the image file to encode.

    Returns:
        Path of the ``.b64`` file that was written.

    Raises:
        OSError: If the image cannot be read or the output cannot be written.
    """
    # Read the whole image first so the source handle is closed before writing.
    with open(image_file_path, "rb") as image_file:
        b64_image = base64.b64encode(image_file.read())

    # Create the target directory here so the function works on its own,
    # regardless of which cells ran before it.
    os.makedirs(g.B64_ENCODED_IMAGES_DIR, exist_ok=True)
    b64_image_path = os.path.join(g.B64_ENCODED_IMAGES_DIR, f"{Path(image_file_path).stem}.b64")

    # b64encode() already returns ASCII bytes, so they are written as-is
    # (no str round trip needed).
    with open(b64_image_path, "wb") as b64_image_file:
        b64_image_file.write(b64_image)

    return b64_image_path

In [24]:
# Ensure the output directory exists, then encode every file in the image
# directory that matches the configured extension.
os.makedirs(g.B64_ENCODED_IMAGES_DIR, exist_ok=True)
image_glob_pattern = os.path.join(g.IMAGE_DIR, f"*{g.IMAGE_FILE_EXTN}")
for image_file_path in glob.glob(image_glob_pattern):
    encode_image_to_base64(image_file_path)

In [25]:
def get_embeddings(bedrock: botocore.client.BaseClient, image: str) -> Optional[np.ndarray]:
    """Request an embedding for one base64-encoded image via ``invoke_model``.

    Args:
        bedrock: Client whose ``invoke_model`` method is called with the model
            id and encodings configured in ``g``.
        image: Base64-encoded image, sent as the ``inputImage`` field of the
            request body.

    Returns:
        A float32 array wrapping the ``embedding`` value of the response in an
        outer list (so shape ``(1, N)`` for a flat list of N numbers), or
        ``None`` if the call fails or the response has no ``embedding``.
    """
    body = json.dumps(dict(inputImage=image))
    try:
        response = bedrock.invoke_model(
            body=body, modelId=g.FMC_MODEL_ID, accept=g.ACCEPT_ENCODING, contentType=g.CONTENT_ENCODING
        )
        response_body = json.loads(response.get("body").read())
        embedding = response_body.get("embedding")
        if embedding is None:
            # Report the actual problem instead of letting the float32
            # conversion fail with an unrelated-looking error.
            logger.error(f"no embedding in response for image(truncated)={image[:10]}")
            return None
        return np.array([embedding]).astype(np.float32)
    except Exception as e:
        # Deliberately best-effort: a single failing image is logged and
        # reported to the caller as None rather than aborting the whole run.
        logger.error(f"exception while creating embeddings, image(truncated)={image[:10]}, exception={e}")
        return None

In [29]:
%%time

embeddings_list = []
bedrock = boto3.client(service_name="bedrock-runtime", region_name=g.AWS_REGION, endpoint_url=g.FMC_URL)
for image_file_path in glob.glob(os.path.join(g.B64_ENCODED_IMAGES_DIR, "*.b64")):
    print(image_file_path)
    # MAX image size supported is 2048 * 2048 pixels
    with open(image_file_path, "rb") as image_file:
        input_image_b64 = image_file.read().decode('utf-8')
    
    embeddings = get_embeddings(bedrock, input_image_b64)
    if embeddings is None:
        logger.error(f"error creating embeddings for {os.path.basename(image_file_path)}")
        continue

    data = {
        "image_path": f"s3://{g.BUCKET_NAME}/{g.BUCKET_IMG_PREFIX}/{Path(image_file_path).stem}{g.IMAGE_FILE_EXTN}",
        "metadata": {
          "slide_filename": g.SLIDE_DECK,
          "model_id": g.FMC_MODEL_ID,
          "slide_description": ""
        },
        "vector_embedding": embeddings[0].tolist()
      }
    
    embeddings_list.append(data)

    json.dump(embeddings_list,
              codecs.open(f"{Path(g.SLIDE_DECK).stem}.json", 'w', encoding='utf-8'), 
              separators=(',', ':'), 
              sort_keys=True, 
              indent=2)

img/b64_images/CMP301_TrainDeploy_E1_20230607_SPEdited_image_3.b64
img/b64_images/CMP301_TrainDeploy_E1_20230607_SPEdited_image_22.b64
img/b64_images/CMP301_TrainDeploy_E1_20230607_SPEdited_image_6.b64
img/b64_images/CMP301_TrainDeploy_E1_20230607_SPEdited_image_17.b64
img/b64_images/CMP301_TrainDeploy_E1_20230607_SPEdited_image_25.b64
img/b64_images/CMP301_TrainDeploy_E1_20230607_SPEdited_image_10.b64
img/b64_images/CMP301_TrainDeploy_E1_20230607_SPEdited_image_15.b64
img/b64_images/CMP301_TrainDeploy_E1_20230607_SPEdited_image_2.b64
img/b64_images/CMP301_TrainDeploy_E1_20230607_SPEdited_image_8.b64
img/b64_images/CMP301_TrainDeploy_E1_20230607_SPEdited_image_28.b64
img/b64_images/CMP301_TrainDeploy_E1_20230607_SPEdited_image_14.b64
img/b64_images/CMP301_TrainDeploy_E1_20230607_SPEdited_image_4.b64
img/b64_images/CMP301_TrainDeploy_E1_20230607_SPEdited_image_11.b64
img/b64_images/CMP301_TrainDeploy_E1_20230607_SPEdited_image_9.b64
img/b64_images/CMP301_TrainDeploy_E1_20230607_SPEdited

In [31]:
# Look up the stack outputs and take the collection endpoint from them. The
# client is given a bare host name, so the part up to and including "//" is
# dropped from the endpoint URL.
outputs = get_cfn_outputs(g.CFN_STACK_NAME)
os_collection_endpoint = outputs['MultimodalCollectionEndpoint'].split('//')[1]
host = os_collection_endpoint

# Build the request auth from the credentials of the default boto3 session.
session = boto3.Session()
credentials = session.get_credentials()
auth = AWSV4SignerAuth(credentials, g.AWS_REGION, g.OS_SERVICE)

# HTTPS on port 443 with certificate verification enabled.
os_client = OpenSearch(
    hosts=[{'host': host, 'port': 443}],
    http_auth=auth,
    use_ssl=True,
    verify_certs=True,
    connection_class=RequestsHttpConnection,
    pool_maxsize=20,
)

[2024-01-06 21:14:36,671] p1281 {credentials.py:1075} INFO - Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


jcbl0nhke4fyxk2bfjz2.us-east-1.aoss.amazonaws.com


In [32]:
# Display the stack outputs fetched in the previous cell.
# NOTE(review): the rendered dict contains account-specific ARNs and endpoint
# URLs — clear this cell's output before sharing the notebook.
outputs

{'MultimodalNotebookInstanceId': 'arn:aws:sagemaker:us-east-1:015469603702:notebook-instance/MultimodalNotebookInstance-AhdR74NPXvN8',
 'SQSQueueARN': 'arn:aws:sqs:us-east-1:015469603702:InputfromS3',
 'SQSQueueURL': 'https://sqs.us-east-1.amazonaws.com/015469603702/InputfromS3',
 'OpenSearchIndexName': 'multimodalslidesindex',
 'MultimodalCollectionEndpoint': 'https://jcbl0nhke4fyxk2bfjz2.us-east-1.aoss.amazonaws.com'}

In [37]:
index_name = outputs['OpenSearchIndexName']

# Index definition: a k-NN enabled index that stores the embedding vector next
# to the image path and the slide metadata (all mapped as text fields).
index_body = {
    "settings": {
        "index.knn": True
    },
    "mappings": {
        "properties": {
            "vector_embedding": {
                "type": "knn_vector",
                # NOTE(review): has to match the length of the vectors stored
                # in "vector_embedding" — confirm 1024 for g.FMC_MODEL_ID.
                "dimension": 1024,
                "method": {
                    "name": "hnsw",
                    "engine": "nmslib",
                    "parameters": {}
                }
            },
            "image_path": {
                "type": "text"
            },
            "metadata": {
                "properties": {
                    "slide_filename": {
                        "type": "text"
                    },
                    "model_id": {
                        "type": "text"
                    },
                    "slide_description": {
                        "type": "text"
                    }
                }
            }
        }
    }
}

# Creating an index that already exists fails, which would break re-running
# this cell (or the whole notebook); only create the index when it is missing.
if os_client.indices.exists(index=index_name):
    logger.info(f"index {index_name} already exists, skipping creation")
    response = None
else:
    response = os_client.indices.create(index=index_name, body=index_body)
response


[2024-01-06 21:23:08,332] p1281 {base.py:259} INFO - PUT https://jcbl0nhke4fyxk2bfjz2.us-east-1.aoss.amazonaws.com:443/multimodalslidesindex [status:200 request:0.506s]


{'acknowledged': True,
 'shards_acknowledged': True,
 'index': 'multimodalslidesindex'}

In [38]:
# Upload the embeddings JSON (named after the slide deck) to the embeddings
# prefix of the bucket.
upload_to_s3(
    f"{Path(g.SLIDE_DECK).stem}.json",
    g.BUCKET_NAME,
    g.BUCKET_EMB_PREFIX,
)

[2024-01-06 21:24:27,519] p1281 {utils.py:18} INFO - File CMP301_TrainDeploy_E1_20230607_SPEdited.json uploaded to sagemaker-us-east-1-015469603702/multimodal/osi-embeddings-json/CMP301_TrainDeploy_E1_20230607_SPEdited.json.
