## Step 1: Setup

In [None]:
# Install the packages listed in requirements.txt using the interpreter that is
# running this kernel (sys.executable), so they land in the kernel's environment.
import sys
!{sys.executable} -m pip install -r requirements.txt

In [1]:
import os
import time
import glob
import json
import time
import boto3
import codecs
import base64
import logging
import botocore
import sagemaker
import jsonlines
import numpy as np
import pandas as pd 
import globals as g
import requests as req
from typing import List
from pathlib import Path
from requests_auth_aws_sigv4 import AWSSigV4
from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth
from utils import get_cfn_outputs, get_bucket_name, download_image_files_from_s3, get_text_embedding
from utils import download_image_from_url, encode_image_to_base64, get_img_desc

# Notebook-wide logging: every record carries a timestamp, the process id,
# the source file and line, and the level ahead of the message.
logging.basicConfig(
    level=logging.INFO,
    format='[%(asctime)s] p%(process)s {%(filename)s:%(lineno)d} %(levelname)s - %(message)s',
)
logger = logging.getLogger(__name__)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [2]:
# Bedrock runtime client; region and endpoint URL come from the globals module.
# It is handed to get_img_desc / get_text_embedding later in the notebook.
bedrock = boto3.client(
    service_name="bedrock-runtime",
    endpoint_url=g.TITAN_URL,
    region_name=g.AWS_REGION,
)

## Step 2: Create the OpenSearch Service Serverless index

In [3]:
# Read the outputs of the CloudFormation stack named in the globals module.
outputs = get_cfn_outputs(g.CFN_STACK_NAME)

# The collection endpoint is a URL; keep only what follows the "//" separator.
collection_endpoint = outputs['MultimodalCollectionEndpoint']
host = collection_endpoint.split('//')[1]

# The index name is fixed here instead of being taken from the stack outputs.
# index_name = outputs['OpenSearchIndexName']
index_name = "blog3slides-app2"
logger.info(f"opensearchhost={host}, index={index_name}")

# URL that documents are POSTed to, built from the pipeline endpoint stack output.
osi_endpoint = f"https://{outputs['OpenSearchPipelineEndpoint']}/data/ingest"

[2024-05-28 22:19:20,858] p19090 {3829838013.py:5} INFO - opensearchhost=7uiiz7d87b3q8u2kfmtd.us-east-1.aoss.amazonaws.com, index=blog3slides-app2


In [5]:
# Build a SigV4 signer from the default boto3 session credentials; region and
# service name are taken from the globals module.
session = boto3.Session()
credentials = session.get_credentials()
auth = AWSV4SignerAuth(credentials, g.AWS_REGION, g.OS_SERVICE)

# OpenSearch client talking to the collection host over verified HTTPS (port 443).
os_hosts = [{'host': host, 'port': 443}]
os_client = OpenSearch(
    hosts=os_hosts,
    http_auth=auth,
    use_ssl=True,
    verify_certs=True,
    connection_class=RequestsHttpConnection,
    pool_maxsize=20,
)

[2024-05-28 22:19:31,300] p19090 {credentials.py:1075} INFO - Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


In [8]:
# Definition of the vector index: k-NN is enabled and each document stores the
# embedding, the image URL, the slide text and deck-level metadata.
# "dimension" must match the length of the vectors written to "vector_embedding".
index_body = {
    "settings": {
        "index.knn": True
    },
    "mappings": {
        "properties": {
            "vector_embedding": {
                "type": "knn_vector",
                "dimension": 1536,
                "method": {
                    "name": "hnsw",
                    "engine": "nmslib",
                    "parameters": {}
                }
            },
            "image_url": {
                "type": "text"
            },
            "slide_text": {
                "type": "text"
            },
            "metadata": {
                "properties": {
                    "deck_name": {
                        "type": "text"
                    },
                    "deck_url": {
                        "type": "text"
                    }
                }
            }
        }
    }
}

# Create the index. An "index already exists" error is expected when this cell is
# re-run and is fine; any other failure (auth, network, invalid mapping) is
# re-raised so the notebook does not continue against a missing index.
try:
    response = os_client.indices.create(index=index_name, body=index_body)
    logger.info(f"response received for the create index -> {response}")
except Exception as e:
    if "resource_already_exists_exception" in str(e):
        logger.info(f"index={index_name} already exists, skipping creation")
    else:
        logger.error(f"error in creating index={index_name}, exception={e}")
        raise

[2024-05-28 22:20:25,477] p19090 {base.py:259} INFO - PUT https://7uiiz7d87b3q8u2kfmtd.us-east-1.aoss.amazonaws.com:443/blog3slides-app2 [status:200 request:0.386s]
[2024-05-28 22:20:25,478] p19090 {3324873869.py:43} INFO - response received for the create index -> {'acknowledged': True, 'shards_acknowledged': True, 'index': 'blog3slides-app2'}


In [7]:
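# Optional and destructive: uncomment to delete the index (for example, to recreate it with a different mapping).
# os_client.indices.delete(index_name)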

[2024-05-28 22:20:17,136] p19090 {base.py:259} INFO - DELETE https://7uiiz7d87b3q8u2kfmtd.us-east-1.aoss.amazonaws.com:443/blog3slides-app2 [status:200 request:0.092s]


{'acknowledged': True}

## Step 3: Download images locally, generate descriptions and embeddings, and ingest into the OpenSearch Ingestion (OSI) pipeline

In [9]:
# Instruction text passed to get_img_desc together with each slide image.
# Adjacent string literals are concatenated; the value starts and ends with a newline.
prompt = (
    "\n"
    "Please provide a detailed description of the image. "
    "Describe the overall layout and design of the slide. "
    "Identify and describe any tables, charts, or other visual elements present, "
    "including the specific data or information contained within them. "
    "Provide as much detail as possible about the content and format of the slide. "
    "Your response should be extremely detailed and data oriented. "
    "Be completely accurate"
    "\n"
)

print(prompt)


Please provide a detailed description of the image. Describe the overall layout and design of the slide. Identify and describe any tables, charts, or other visual elements present, including the specific data or information contained within them. Provide as much detail as possible about the content and format of the slide. Your response should be extremely detailed and data oriented. Be completely accurate



In [10]:
# Seconds to wait on the ingestion endpoint before giving up; without a timeout
# a stalled connection would block the loop indefinitely.
OSI_REQUEST_TIMEOUT_SECS = 60

# Local working directories for the downloaded and base64-encoded images.
os.makedirs(g.IMAGE_DIR, exist_ok=True)
os.makedirs(g.B64_ENCODED_IMAGES_DIR, exist_ok=True)

# qa.jsonl holds one record per deck: its name, its URL and the slide image URLs.
cols = ['url']
with jsonlines.open('qa.jsonl') as f:
    for line in f.iter():
        deck_name = line['deck_name']
        deck_url = line['deck_url']
        img_df = pd.DataFrame(line['image_urls'], columns=cols)
        for img_url in img_df['url']:
            img_path = download_image_from_url(img_url, g.IMAGE_DIR)
            if img_path == "":
                # presumably an empty path means the download failed -- confirm in utils
                logger.warning(f"no local file for {img_url}, skipping")
                continue

            b64_img_path = encode_image_to_base64(img_path)

            # Obtain a text description for the image, then embed that text.
            logger.info(f"going to convert {img_url} into embeddings")
            resp_text = get_img_desc(bedrock, b64_img_path, prompt)
            embedding = get_text_embedding(bedrock, resp_text, g.TITAN_MODEL_ID)

            # Serialize the document for this image as a one-element JSON array; this includes
            # the metadata, which can be used later as part of hybrid search from the vector db.
            data = json.dumps([{
                "image_url": img_url,
                "slide_text": resp_text,
                "metadata": {
                    "deck_name": deck_name,
                    "deck_url": deck_url
                },
                "vector_embedding": embedding
            }])

            logger.info("Ingesting data into pipeline")
            r = req.request(
                method='POST',
                url=osi_endpoint,
                data=data,
                auth=AWSSigV4('osis'),
                timeout=OSI_REQUEST_TIMEOUT_SECS)

            # Surface rejected requests instead of logging them like successes.
            if r.ok:
                logger.info(f"Response: {img_url} - {r.text}")
            else:
                logger.error(f"Ingestion failed for {img_url}: status={r.status_code}, response={r.text}")

[2024-05-28 22:20:37,957] p19090 {utils.py:57} INFO - downloading image at https://image.slidesharecdn.com/2012-02-20fy11roadshow-120221022442-phpapp02/95/feb-20-2012-nestl-2011-fullyear-roadshow-presentation-1-1024.jpg
[2024-05-28 22:20:37,989] p19090 {utils.py:61} INFO - https://image.slidesharecdn.com/2012-02-20fy11roadshow-120221022442-phpapp02/95/feb-20-2012-nestl-2011-fullyear-roadshow-presentation-1-1024.jpg downloaded successfully
[2024-05-28 22:20:37,990] p19090 {utils.py:64} INFO - https://image.slidesharecdn.com/2012-02-20fy11roadshow-120221022442-phpapp02/95/feb-20-2012-nestl-2011-fullyear-roadshow-presentation-1-1024.jpg written to img/feb-20-2012-nestl-2011-fullyear-roadshow-presentation-1-1024.jpg
[2024-05-28 22:20:37,994] p19090 {975082958.py:16} INFO - going to convert https://image.slidesharecdn.com/2012-02-20fy11roadshow-120221022442-phpapp02/95/feb-20-2012-nestl-2011-fullyear-roadshow-presentation-1-1024.jpg into embeddings
[2024-05-28 22:20:49,802] p19090 {credenti