In [None]:
import sys
!{sys.executable} -m pip install -r requirements.txt

In [None]:
import base64
import codecs
import glob
import json
import logging
import os
from pathlib import Path
from typing import Dict, Optional

import boto3
import numpy as np
from langchain.llms.bedrock import Bedrock
from pandas.core.series import Series

# Log line layout: timestamp, process id, source file and line, level, then the message.
_log_format = '[%(asctime)s] p%(process)s {%(filename)s:%(lineno)d} %(levelname)s - %(message)s'
logging.basicConfig(level=logging.INFO, format=_log_format)

# Logger shared by the cells of this notebook.
logger = logging.getLogger(__name__)

In [None]:
# Execute globals.py inside this notebook's namespace (-i) so the names it defines
# remain available to the cells below.
# NOTE(review): constants and helpers used later (e.g. FMC_URL, IMAGE_DIR,
# download_from_s3, upload_to_s3) are not defined in this notebook — presumably
# they come from globals.py; confirm.
%run -i globals.py

In [None]:
# Bedrock runtime client in us-east-1, pointed at the FMC_URL endpoint; used to invoke the model.
bedrock = boto3.client(
    "bedrock-runtime",
    endpoint_url=FMC_URL,
    region_name="us-east-1",
)

# S3 client created with default settings.
s3 = boto3.client(service_name="s3")

In [None]:
# NOTE(review): download_from_s3 is not defined in this notebook — presumably it is
# provided by globals.py and fetches the input images from S3; confirm what it
# downloads and where it puts the files.
download_from_s3()

In [None]:
def encode_image_to_base64(image_file_path: str) -> str:
    """Base64-encode an image file and store the result under B64_ENCODED_IMAGES_DIR.

    The encoded text is written to ``<B64_ENCODED_IMAGES_DIR>/<image stem>.b64``.
    The output directory is created when it does not exist yet.

    Args:
        image_file_path: Path of the image file to encode.

    Returns:
        Path of the ``.b64`` file that was written.

    Raises:
        OSError: If the image cannot be read or the output cannot be written.
    """
    # Read the whole image first so the input handle is closed before the output is opened.
    with open(image_file_path, "rb") as image_file:
        b64_image = base64.b64encode(image_file.read())

    b64_image_path = os.path.join(B64_ENCODED_IMAGES_DIR, f"{Path(image_file_path).stem}.b64")
    output_dir = os.path.dirname(b64_image_path)
    if output_dir:
        # An empty dirname means "current directory", which needs no creation.
        os.makedirs(output_dir, exist_ok=True)

    # b64encode already returns ASCII bytes, so they are written as-is.
    with open(b64_image_path, "wb") as b64_image_file:
        b64_image_file.write(b64_image)

    return b64_image_path

In [None]:
# Produce a .b64 file for every JPEG found directly inside IMAGE_DIR.
jpg_pattern = os.path.join(IMAGE_DIR, "*.jpg")
for jpg_path in glob.glob(jpg_pattern):
    encode_image_to_base64(jpg_path)

In [None]:
def get_embeddings(image: str) -> Optional[np.ndarray]:
    """Request an embedding for one base64-encoded image from the Bedrock model.

    Sends ``{"inputImage": image}`` to the model identified by ``FMC_MODEL_ID``
    through the module-level ``bedrock`` client and reads the ``"embedding"``
    field of the JSON response.

    Args:
        image: The image content as base64 text.

    Returns:
        A float32 array built from the embedding wrapped in an outer list (so a
        flat embedding of length n gives shape ``(1, n)``), or ``None`` when the
        call fails or the response carries no embedding. Failures are logged,
        not raised.
    """
    body = json.dumps({"inputImage": image})

    try:
        response = bedrock.invoke_model(
            body=body,
            modelId=FMC_MODEL_ID,
            accept=ACCEPT_ENCODING,
            contentType=CONTENT_ENCODING,
        )
        response_body = json.loads(response.get("body").read())
        embedding = response_body.get("embedding")
        if embedding is None:
            # Report the real cause instead of letting numpy fail on converting None.
            logger.error(f"no embedding in response for image(truncated)={image[:10]}")
            return None
        return np.array([embedding]).astype(np.float32)
    except Exception as e:
        # Best-effort by design: the caller checks for None and skips the image.
        logger.error(f"exception while image(truncated)={image[:10]}, exception={e}")
        return None

In [None]:
# Show the local JSON file name derived from the slide deck's name.
embeddings_json_name = f"./{Path(SLIDE_DECK).stem}.json"
print(embeddings_json_name)

In [None]:
%%time

# Build one record per encoded image and persist all records as a single JSON file.
embeddings_list = []
# sorted() makes the processing order, and therefore the JSON output, reproducible across runs.
for image_file_path in sorted(glob.glob(os.path.join(B64_ENCODED_IMAGES_DIR, "*.b64"))):
    print(image_file_path)
    # MAX image size supported is 2048 * 2048 pixels
    with open(image_file_path, "rb") as image_file:
        input_image_b64 = image_file.read().decode('utf-8')

    embeddings = get_embeddings(input_image_b64)
    if embeddings is None:
        # get_embeddings reports failure by returning None; skip this image and keep going.
        logger.error(f"error creating embeddings for {os.path.basename(image_file_path)}")
        continue

    data = {
        # The source image is addressed by its S3 location, using the same stem as the .b64 file.
        "image_path": f"s3://{BUCKET_NAME}/{BUCKET_PREFIX}/{Path(image_file_path).stem}.jpg",
        "metadata": {
            "slide_filename": SLIDE_DECK,
            "model_id": FMC_MODEL_ID,
            "slide_description": "",
        },
        "vector_embedding": embeddings[0].tolist(),
    }
    embeddings_list.append(data)

if embeddings_list:
    # Write once after the loop instead of rewriting the whole file per image, and use a
    # context manager so the file is flushed and closed before the upload in the next cell.
    # newline="\n" keeps line endings untranslated, as the previous codecs.open call did.
    embeddings_json_path = f"./{Path(SLIDE_DECK).stem}.json"
    with open(embeddings_json_path, "w", encoding="utf-8", newline="\n") as embeddings_file:
        json.dump(
            embeddings_list,
            embeddings_file,
            separators=(',', ':'),
            sort_keys=True,
            indent=4,
        )
else:
    # As before, no file is written when no embedding could be created.
    logger.error("no embeddings were created, nothing written")

In [None]:
# Upload the slide deck's embeddings JSON under the BUCKET_EMB_PREFIX prefix.
# NOTE(review): upload_to_s3 is not defined in this notebook — presumably provided by globals.py; confirm.
embeddings_json_file = f"./{Path(SLIDE_DECK).stem}.json"
upload_to_s3(embeddings_json_file, BUCKET_EMB_PREFIX)