## Step 1: Setup

In [None]:
import sys
# Install the packages listed in requirements.txt using the interpreter that is
# running this kernel (sys.executable), via pip invoked as a module.
!{sys.executable} -m pip install -r requirements.txt

In [None]:
import os
import time
import glob
import json
import time
import boto3
import codecs
import base64
import logging
import botocore
import sagemaker
import jsonlines
import numpy as np
import pandas as pd 
import globals as g
import requests as req
from typing import List
from pathlib import Path
from requests_auth_aws_sigv4 import AWSSigV4
from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth
from utils import get_cfn_outputs, get_bucket_name, download_image_files_from_s3, get_text_embedding

# Root logging configuration: INFO and above, each record prefixed with the
# timestamp, process id, source file:line and level name.
logging.basicConfig(
    level=logging.INFO,
    format='[%(asctime)s] p%(process)s {%(filename)s:%(lineno)d} %(levelname)s - %(message)s',
)
# Module-level logger used by every cell below.
logger = logging.getLogger(__name__)

In [None]:
# Create a SageMaker session and keep references to the two clients it exposes
# (its `sagemaker_client` and `sagemaker_runtime_client` attributes).
sagemaker_session = sagemaker.Session()
sm_client = sagemaker_session.sagemaker_client
sm_runtime_client = sagemaker_session.sagemaker_runtime_client

In [None]:
# "bedrock-runtime" client used for the model invocations below; the region and
# endpoint URL are taken from the `globals` module (imported as `g`).
bedrock = boto3.client(
    service_name="bedrock-runtime",
    region_name=g.AWS_REGION,
    endpoint_url=g.TITAN_URL,
)

## Step 2: Create the OpenSearch Service Serverless index

In [None]:
# Read the CloudFormation stack outputs and derive the endpoints used below.
outputs = get_cfn_outputs(g.CFN_STACK_NAME)
# The collection endpoint output is a URL; keep only the part after the "//".
host = outputs['MultimodalCollectionEndpoint'].split('//')[1]
# The index name is hard-coded here instead of being read from the stack outputs:
# index_name = outputs['OpenSearchIndexName']
index_name = "blog3slides"
logger.info(f"opensearchhost={host}, index={index_name}")

# The braces are required: without them the f-string contains the literal text
# "outputs['OpenSearchPipelineEndpoint']" instead of the pipeline endpoint value.
# NOTE(review): assumes the stack output is a bare host name without a scheme - confirm.
osi_endpoint = f"https://{outputs['OpenSearchPipelineEndpoint']}/data/ingest"

In [None]:
# Build a SigV4 signer from the credentials of a default boto3 session; the
# region and service name come from the `globals` module (imported as `g`).
session = boto3.Session()
credentials = session.get_credentials()
auth = AWSV4SignerAuth(credentials, g.AWS_REGION, g.OS_SERVICE)

# OpenSearch client talking to `host` on port 443 over SSL with certificate
# verification, signing every request with `auth`.
os_client = OpenSearch(
    hosts=[{'host': host, 'port': 443}],
    http_auth=auth,
    use_ssl=True,
    verify_certs=True,
    connection_class=RequestsHttpConnection,
    pool_maxsize=20,
)

In [None]:
# Index definition: k-NN is enabled and the mapping declares one 1536-dimension
# knn_vector field (hnsw / nmslib) plus text fields for the image URL, the slide
# text and the deck metadata.
index_body = {
    "settings": {
        "index.knn": True,
    },
    "mappings": {
        "properties": {
            "vector_embedding": {
                "type": "knn_vector",
                "dimension": 1536,
                "method": {
                    "name": "hnsw",
                    "engine": "nmslib",
                    "parameters": {},
                },
            },
            "image_url": {"type": "text"},
            "slide_text": {"type": "text"},
            "metadata": {
                "properties": {
                    "deck_name": {"type": "text"},
                    "deck_url": {"type": "text"},
                },
            },
        },
    },
}

# If the index already exists the create call is expected to raise; that case is
# fine, so any exception is only logged and the notebook carries on.
try:
    response = os_client.indices.create(index_name, body=index_body)
    logger.info(f"response received for the create index -> {response}")
except Exception as e:
    logger.error(f"error in creating index={index_name}, exception={e}")

## Step 3: Download images locally, get embeddings, and ingest them into the OSI pipeline

In [None]:
def download_image(url: str, image_dir: str, timeout: float = 30) -> str:
    """Download the file at `url` into `image_dir`.

    Args:
        url: Address to fetch; redirects are followed. The last path component
            of the URL is used as the local file name.
        image_dir: Directory the file is written into (must already exist).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook indefinitely.

    Returns:
        The path of the written file, or "" if the request failed or the
        server answered with a status other than 200.
    """
    logger.info(f"downloading image at {url}")
    local_file: str = os.path.join(image_dir, os.path.basename(url))
    try:
        r = req.get(url, allow_redirects=True, timeout=timeout)
    except req.RequestException as e:
        # Connection errors and timeouts are reported the same way as a bad
        # status: the caller only checks for the empty-string result.
        logger.error(f"failed to download {url}, exception={e}")
        return ""
    if r.status_code != 200:
        logger.error(f"failed to download {url}, status code={r.status_code}")
        return ""
    logger.info(f"{url} downloaded successfully")
    with open(local_file, "wb") as f:
        f.write(r.content)
    logger.info(f"{url} written to {local_file}")
    return local_file

In [None]:
def encode_image_to_base64(image_file_path: str) -> str:
    """Base64-encode an image file and store the result as a `.b64` file.

    The output file is placed in `g.B64_ENCODED_IMAGES_DIR` and named after the
    stem of `image_file_path` with a `.b64` extension.

    Args:
        image_file_path: Path of the image file to encode.

    Returns:
        The path of the written `.b64` file.
    """
    source = Path(image_file_path)
    # b64encode already returns the bytes that end up in the output file.
    encoded: bytes = base64.b64encode(source.read_bytes())
    target = os.path.join(g.B64_ENCODED_IMAGES_DIR, f"{source.stem}.b64")
    with open(target, "wb") as out:
        out.write(encoded)
    return target

In [None]:
def get_img_desc(
    image_file_path: str,
    prompt: str,
    media_type: str = "image/jpeg",
    max_tokens: int = 1000,
) -> str:
    """Send a base64-encoded image and a prompt to the Claude model and return its reply.

    Args:
        image_file_path: Path of a file whose content is the base64 text of the
            image (not the raw image itself); it is passed to the model as-is.
        prompt: Text instruction sent alongside the image.
        media_type: Media type declared for the image. Defaults to "image/jpeg",
            the value that was previously hard-coded; pass e.g. "image/png" for
            other image formats.
        max_tokens: Upper bound on the number of tokens in the model's reply.

    Returns:
        The text of the first content item in the model response, with every
        double quote replaced by a single quote.
    """
    # The file holds base64 text, so it decodes cleanly as UTF-8.
    # Original author's note: MAX image size supported is 2048 * 2048 pixels.
    with open(image_file_path, "rb") as image_file:
        input_image_b64 = image_file.read().decode('utf-8')

    body = json.dumps(
        {
            "anthropic_version": "bedrock-2023-05-31",
            "max_tokens": max_tokens,
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": media_type,
                                "data": input_image_b64,
                            },
                        },
                        {"type": "text", "text": prompt},
                    ],
                }
            ],
        }
    )

    response = bedrock.invoke_model(
        modelId=g.CLAUDE_MODEL_ID,
        body=body,
    )

    resp_body = json.loads(response['body'].read().decode("utf-8"))
    # Double quotes are swapped for single quotes before the text is returned.
    resp_text = resp_body['content'][0]['text'].replace('"', "'")

    return resp_text

In [None]:
# Instruction text passed to get_img_desc together with each slide image.
prompt = """
Please provide a detailed description of the image. Describe the overall layout and design of the slide. Identify and describe any tables, charts, or other visual elements present, including the specific data or information contained within them. Provide as much detail as possible about the content and format of the slide. Your response should be extremely detailed and data oriented. Be completely accurate
"""

# Show the prompt in the cell output.
print(prompt)

In [None]:
# Make sure the local working directories exist before anything is downloaded.
os.makedirs(g.IMAGE_DIR, exist_ok=True)
os.makedirs(g.B64_ENCODED_IMAGES_DIR, exist_ok=True)

# For every deck listed in qa.jsonl: download each slide image, base64-encode it,
# get a text description of it, embed that description and POST the resulting
# document to the ingestion endpoint.
cols = ['url']
with jsonlines.open('qa.jsonl') as f:
    for line in f.iter():
        deck_name = line['deck_name']
        deck_url = line['deck_url']
        img_df = pd.DataFrame(line['image_urls'], columns=cols)
        for img_url in img_df['url']:
            img_path = download_image(img_url, g.IMAGE_DIR)
            if img_path == "":
                # download_image returns "" when it could not fetch the image.
                logger.warning(f"skipping {img_url}, image could not be downloaded")
                continue

            b64_img_path = encode_image_to_base64(img_path)

            logger.info(f"going to convert {img_url} into embeddings")
            resp_text = get_img_desc(b64_img_path, prompt)
            embedding = get_text_embedding(bedrock, resp_text)

            # convert the data we want to ingest for this image into JSON; this includes the metadata as well,
            # which can be used later as part of hybrid search from the vector db
            data = json.dumps([{
                "image_url": img_url,
                "slide_text": resp_text,
                "metadata": {
                    "deck_name": deck_name,
                    "deck_url": deck_url
                },
                "vector_embedding": embedding
            }])

            # Log before sending so the message reflects what is about to happen.
            logger.info("Ingesting data into pipeline")
            r = req.request(
                method='POST',
                url=osi_endpoint,
                data=data,
                auth=AWSSigV4('osis'),
            )

            # Surface rejected requests as errors instead of logging them like successes.
            if r.ok:
                logger.info(f"Response: {img_url} - {r.text}")
            else:
                logger.error(f"ingestion failed for {img_url}, status code={r.status_code}, response={r.text}")