# Data ingestion

***This notebook works best with the `conda_python3` kernel on an `ml.t3.xlarge` instance***.

---

In this notebook we download the images corresponding to the slide deck that we uploaded into Amazon S3 in the [1_data_prep.ipynb](./1_data_prep) notebook, convert them into embeddings and then ingest these embeddings into a vector database i.e. [Amazon OpenSearch Service Serverless](https://aws.amazon.com/opensearch-service/features/serverless/).

1. We use the [Anthropic’s Claude 3 Sonnet foundation model](https://aws.amazon.com/about-aws/whats-new/2024/03/anthropics-claude-3-sonnet-model-amazon-bedrock/) available on Bedrock to convert image to text.

1. We then use [Amazon Titan Text Embeddings](https://docs.aws.amazon.com/bedrock/latest/userguide/titan-embedding-models.html) model to convert the text into embeddings.

1. The embeddings are then ingested into OpenSearch Service Serverless using the [Amazon OpenSearch Ingestion](https://docs.aws.amazon.com/opensearch-service/latest/developerguide/ingestion.html) pipeline. We ingest the embeddings into an OpenSearch Serverless index via the OpenSearch Ingestion API.

1. The OpenSearch Service Serverless Collection is created via the AWS CloudFormation stack for this blog post.


## Step 1. Setup

Install the required Python packages and import the relevant files.

In [None]:
import sys
!{sys.executable} -m pip install -r requirements.txt

In [None]:
# import the libraries that are needed to run this notebook
import os
import re
import ray
import time
import glob
import json
import yaml
import time
import nltk
import boto3
import codecs
import base64
import logging
import requests
import botocore
import sagemaker
import numpy as np
import globals as g
from pathlib import Path
from nltk.tag import pos_tag
from typing import List, Dict
from nltk.tokenize import word_tokenize
from requests_auth_aws_sigv4 import AWSSigV4
from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth
from utils import get_cfn_outputs, get_bucket_name, download_image_files_from_s3, get_text_embedding

In [None]:
# Configure logging for the notebook (timestamp, process id, file:line, level)
# and create the module-level logger used by the cells below.
logging.basicConfig(
    level=logging.INFO,
    format='[%(asctime)s] p%(process)s {%(filename)s:%(lineno)d} %(levelname)s - %(message)s',
)
logger = logging.getLogger(__name__)

In [None]:
# Module-level Bedrock runtime client, built from the region and endpoint URL
# constants of the project's `globals` module (imported as `g`).
bedrock = boto3.client(service_name="bedrock-runtime", region_name=g.AWS_REGION, endpoint_url=g.TITAN_URL)

In [None]:
# Set the `__path__` attribute of the `g` (globals) module to the current
# working directory.
# NOTE(review): presumably needed so that code run in Ray workers can resolve
# the `g` module — confirm.
module_path=os.getcwd()
g.__path__=module_path

In [None]:
# Restart Ray from a clean state: shut down a runtime that is already
# initialised (e.g. from an earlier run of this cell) before calling init()
# with default settings.
if ray.is_initialized():
    ray.shutdown()
# ray.init(runtime_env={"working_dir": "./"})
ray.init()
# ray.init(num_cpus=40)

In [None]:
# global constants
# Relative path of the YAML configuration file read by this notebook.
CONFIG_FILE_PATH = "config.yaml"

In [None]:
# read the config yaml file
# The file is parsed with yaml.safe_load and the resulting object is kept in
# the module-level `config` that the rest of the notebook reads from.
fpath = CONFIG_FILE_PATH
with open(fpath, 'r') as yaml_in:
    config = yaml.safe_load(yaml_in)
# Echo the full parsed configuration to the log.
logger.info(f"config read from {fpath} -> {json.dumps(config, indent=2)}")

In [None]:
# endpoint_url=g.TITAN_URL
# AWS region and Bedrock settings taken from the config file.
region: str = config['aws']['region']
# The configured endpoint URL is formatted with the region (`region=` keyword),
# so it is expected to contain a `{region}` placeholder.
endpoint_url: str = config['bedrock_model_info']['bedrock_ep_url'].format(region=region)
# Model id passed as `modelId` when invoking Claude in get_img_desc.
claude_model_id: str = config['bedrock_model_info']['claude_sonnet_model_id']

In [None]:
# Look up the S3 bucket name through the project helper get_bucket_name, which
# is given the CloudFormation stack name from the config.
bucket_name: str = get_bucket_name(config['aws']['cfn_stack_name'])
logger.info(f"Bucket name being used to store extracted images and texts from data: {bucket_name}")
# S3 client created with the default boto3 configuration.
s3 = boto3.client('s3')

In [None]:
# SageMaker session and the two clients it exposes
# (`sagemaker_client` and `sagemaker_runtime_client`).
sagemaker_session = sagemaker.Session()
sm_client = sagemaker_session.sagemaker_client
sm_runtime_client = sagemaker_session.sagemaker_runtime_client

In [None]:
# Read the outputs of the CloudFormation stack named in the config
# (via the project helper get_cfn_outputs).
outputs = get_cfn_outputs(config['aws']['cfn_stack_name'])
# Keep only the part after the '//' scheme separator of the collection endpoint.
host = outputs['MultimodalCollectionEndpoint'].split('//')[1]
text_index_name = outputs['OpenSearchTextIndexName']
img_index_name = outputs['OpenSearchImgIndexName']
logger.info(f"opensearchhost={host}, text index={text_index_name}, image index={img_index_name}")
# Ingestion URLs of the text and image pipelines: https://<pipeline endpoint>/data/ingest
osi_text_endpoint = f"https://{outputs['OpenSearchPipelineTextEndpoint']}/data/ingest"
osi_img_endpoint = f"https://{outputs['OpenSearchPipelineImgEndpoint']}/data/ingest"

We use the OpenSearch Python client to create the image index and the text index if they do not already exist.

In [None]:
# Sign OpenSearch requests with SigV4 using the credentials of the default
# boto3 session.
session = boto3.Session()
credentials = session.get_credentials()
auth = AWSV4SignerAuth(credentials, g.AWS_REGION, g.OS_SERVICE)


def _build_os_client() -> OpenSearch:
    """Return a new OpenSearch client for the collection host over HTTPS (port 443)."""
    return OpenSearch(
        hosts=[{'host': host, 'port': 443}],
        http_auth=auth,
        use_ssl=True,
        verify_certs=True,
        connection_class=RequestsHttpConnection,
        pool_maxsize=20,
    )


# OpenSearch client used for the image index
img_os_client = _build_os_client()

# OpenSearch client used for the text index (same host and settings)
text_os_client = _build_os_client()

In [None]:
# Index definition shared by the image and the text index: a k-NN enabled index
# with a 1536-dimension `vector_embedding` field plus the text/metadata fields
# that the ingestion code below populates.
index_body = """
{
  "settings": {
    "index.knn": true
  },
  "mappings": {
    "properties": {
      "vector_embedding": {
        "type": "knn_vector",
        "dimension": 1536,
        "method": {
          "name": "hnsw",
          "engine": "nmslib",
          "parameters": {}
        }
      },
      "file_path": {
        "type": "text"
      },
      "file_text": {
        "type": "text"
      },
      "page_number": {
        "type": "text"
      },
      "metadata": {
        "properties": {
          "filename": {
            "type": "text"
          },
          "entities": {
            "type": "keyword"
          }
        }
      }
    }
  }
}

"""

index_body = json.loads(index_body)

# Create each index only if it does not exist yet. Every index gets its own
# try/except so that a failure on one of them does not prevent the other one
# from being checked and created.
for os_client, index_name, singular, plural in (
    (img_os_client, img_index_name, "image", "images"),
    (text_os_client, text_index_name, "text", "texts"),
):
    try:
        if os_client.indices.exists(index=index_name):
            logger.info(f"The {singular} index '{index_name}' already exists.")
        else:
            create_response = os_client.indices.create(index=index_name, body=index_body)
            logger.info(f"response received for the create index for {plural} -> {create_response}")
    except Exception as e:
        # Best effort: log and carry on with the next index.
        logger.error(f"Error in creating {singular} index '{index_name}', exception: {e}")

## Step 2. Download the images files from S3 and convert to Base64

Now we download the image files from the S3 bucket. Once downloaded these files are converted into [Base64](https://en.wikipedia.org/wiki/Base64) encoding so that we can create embeddings from the images.

In [None]:
# Local target directories for the PDF flow.
os.makedirs(g.PDF_IMAGE_DIR, exist_ok=True)
os.makedirs(g.PDF_TEXT_DIR, exist_ok=True)

content_type = config['content_info']['content_type']
if content_type == 'slide_deck':
    # slide decks only have images; these get converted to embeddings later
    image_files: List = download_image_files_from_s3(
        bucket_name, g.BUCKET_IMG_PREFIX, g.IMAGE_DIR, g.IMAGE_FILE_EXTN
    )
    logger.info(f"downloaded {len(image_files)} from s3")
elif content_type == 'pdf':
    # PDFs have page images as well as extracted page texts
    image_files: List = download_image_files_from_s3(
        bucket_name, g.BUCKET_IMG_PREFIX, g.PDF_IMAGE_DIR, g.IMAGE_FILE_EXTN
    )
    text_files: List = download_image_files_from_s3(
        bucket_name, g.BUCKET_PDF_TEXT_PREFIX, g.PDF_TEXT_DIR, g.TEXT_FILE_EXTN
    )
    logger.info(f"downloaded {len(image_files) + len(text_files)} files from s3")
else:
    # anything else is only reported; nothing is downloaded
    logger.error(f"No content type provided. Must be either a 'pdf' or a 'slide_deck'")

#### Convert jpg files into Base64.

In [None]:
def encode_image_to_base64(image_file_path: str) -> str:
    """Write a Base64 copy of an image file and return the new file's path.

    The image is read as raw bytes, Base64 encoded, and stored in
    ``g.B64_ENCODED_IMAGES_DIR`` as ``<original stem>.b64``. The output
    directory must already exist.

    Args:
        image_file_path: Path of the image file to encode.

    Returns:
        Path of the ``.b64`` file that was written.
    """
    encoded_bytes = base64.b64encode(Path(image_file_path).read_bytes())
    b64_image_path = os.path.join(g.B64_ENCODED_IMAGES_DIR, f"{Path(image_file_path).stem}.b64")
    Path(b64_image_path).write_bytes(encoded_bytes)
    return b64_image_path

## Step 3. Get embeddings for the base64 encoded images

Now we are ready to use Amazon Bedrock: Anthropic’s Claude 3 Sonnet foundation model converts the base64 version of each image into a text description, and the Amazon Titan Text Embeddings model converts that text into embeddings. We ingest the embeddings into the pipeline using the [requests](https://pypi.org/project/requests/) HTTP library.

You must sign all HTTP requests to the pipeline using [Signature Version 4](https://docs.aws.amazon.com/general/latest/gr/signature-version-4.html).

In [None]:
def get_img_desc(image_file_path: str, prompt: str):
    """Ask the Claude model on Bedrock to answer ``prompt`` about one image.

    Args:
        image_file_path: Path of a file whose content is the Base64 text of
            the image (the content is decoded as UTF-8 and sent as the
            ``base64`` image source, declared as ``image/jpeg``).
        prompt: Instruction sent together with the image.

    Returns:
        The text of the first content item in the model response, with every
        double quote replaced by a single quote.
    """
    # A new client is created on every call, from the notebook-level region/endpoint.
    bedrock = boto3.client(service_name="bedrock-runtime", region_name=region, endpoint_url=endpoint_url)

    # Original author's note: MAX image size supported is 2048 * 2048 pixels
    with open(image_file_path, "rb") as b64_file:
        encoded_image = b64_file.read().decode('utf-8')

    image_part = {
        "type": "image",
        "source": {
            "type": "base64",
            "media_type": "image/jpeg",
            "data": encoded_image
        },
    }
    text_part = {"type": "text", "text": prompt}
    request = {
        "anthropic_version": "bedrock-2023-05-31",
        "max_tokens": 1000,
        "messages": [
            {
                "role": "user",
                "content": [image_part, text_part],
            }
        ],
    }

    response = bedrock.invoke_model(modelId=claude_model_id, body=json.dumps(request))
    answer = json.loads(response['body'].read().decode("utf-8"))
    return answer['content'][0]['text'].replace('"', "'")

### Download image files from S3 

In [None]:
# Download the image files for the configured content type. This performs the
# same image download as the cell in Step 2; any other content type is
# silently ignored here.
if config['content_info']['content_type'] == 'pdf':
    # this is for the pdf file images
    image_files: List = download_image_files_from_s3(bucket_name, g.BUCKET_IMG_PREFIX, g.PDF_IMAGE_DIR, g.IMAGE_FILE_EXTN)
    logger.info(f"downloaded {len(image_files)} from s3")
elif config['content_info']['content_type'] == 'slide_deck':
    # download images from S3, we would be converting these to embeddings
    image_files: List = download_image_files_from_s3(bucket_name, g.BUCKET_IMG_PREFIX, g.IMAGE_DIR, g.IMAGE_FILE_EXTN)
    logger.info(f"downloaded {len(image_files)} from s3")

In [None]:
# Directory that receives the .b64 files written by encode_image_to_base64.
os.makedirs(g.B64_ENCODED_IMAGES_DIR, exist_ok=True)

# Collect the downloaded image files for the configured content type.
if config['content_info']['content_type'] == 'pdf':
    file_list: List = glob.glob(
        os.path.join(g.PDF_IMAGE_DIR, f"*{g.IMAGE_FILE_EXTN}")
    )
    logger.info(f"there are {len(file_list)} pdf image files in the {g.PDF_IMAGE_DIR} directory for conversion to base64")
elif config['content_info']['content_type'] == 'slide_deck':
    file_list: List = glob.glob(
        os.path.join(g.IMAGE_DIR, f"*{g.IMAGE_FILE_EXTN}")
    )
    logger.info(f"there are {len(file_list)} files in the {g.IMAGE_DIR} directory for conversion to base64")

# Encode every image; each call writes one .b64 file and yields its path.
b64_image_file_list = [encode_image_to_base64(image_path) for image_path in file_list]
logger.info(f"base64 conversion done, there are {len(b64_image_file_list)} base64 encoded files")

### Download text files from S3 

In [None]:
# Download the extracted PDF page texts. Only the 'pdf' content type has text
# files; for anything else an error is logged and nothing is downloaded.
if config['content_info']['content_type'] == 'pdf':
    # The result is stored in `text_files` so that the list of downloaded
    # image files held in `image_files` is not overwritten with text files.
    text_files: List = download_image_files_from_s3(bucket_name, g.BUCKET_PDF_TEXT_PREFIX, g.PDF_TEXT_DIR, g.TEXT_FILE_EXTN)
    logger.info(f"downloaded {len(text_files)} text files from s3")
else:
    logger.error(f"No text files extracted from the content given")

In [None]:
# Prompt sent with every image to obtain its detailed text description
# (passed to get_img_desc from process_image_data).
prompt = """
Please provide a detailed description of the image. Describe the overall layout and design of the image. Identify and describe any tables, charts, or other visual elements present, including the specific data or information contained within them. Provide as much detail as possible about the content and format of the image. Your response should be extremely detailed and data oriented. Give the description for all four portions of the image, the upper right, upper left, lower right and lower left and include all key points data in each if possible. Be completely accurate.
"""

logger.info(f"prompt used to get image description: {prompt}")

### Hybrid Search: Extract Entities from the image, and prefilter the image description with those entities
---

The purpose of using hybrid search is to optimize the RAG workflow so that it retrieves the right image description for a specific question. Because of the surrounding embeddings in the vector DB, retrieval may return images (whole, or split into parts) that do not contain the information the question asks for; hybrid search helps reduce this. In this case, we extract the entities from an image description (including the file name, to be precise) and then extract the entities from the question being asked, to get the most accurate response possible.

In [None]:
# Prompt sent with every image to obtain the list of entities it contains
# (passed to get_img_desc from process_image_data; the answer is stored as the
# document's `entities` and is prepended to the image description).
entity_extraction_prompt = """
Please provide a detailed description of the entities present in the image. Entities, are specific pieces of information or objects within a text that carry particular significance. These can be real-world entities like names of people, places, organizations, or dates. Refer to the types of entities: Named entities: These include names of people, organizations, locations, and dates. You can have specific identifiers within this, such as person names or person occupations.

Custom entities: These are entities specific to a particular application or domain, such as product names, medical terms, or technical jargon.

Temporal entities: These are entities related to time, such as dates, times, and durations.

Product entities: Names of products might be grouped together into product entities.

Data entities: Names of the data and metrics present. This includes names of metrics in charts, graphs and tables, and throughout the image.

Now based on the image, create a list of these entities. Your response should be accurate. Do not make up an answer.
"""

logger.info(f"prompt used to extract entities from the image: {entity_extraction_prompt}")

### Part 1: Loop through b64 images to 1/get image desc from Claude3, 2/get embedding from Titan text. Call OSI pipeline API to ingest embedding.

In [None]:
def get_img_txt_embeddings(bedrock: botocore.client, prompt_data: str) -> np.ndarray:
    """Return the embedding of ``prompt_data`` from the configured Titan model.

    Args:
        bedrock: Bedrock runtime client used for the ``invoke_model`` call.
        prompt_data: Text to embed; sent as ``inputText``.

    Returns:
        The value stored under ``embedding`` in the model response, as decoded
        from JSON. ``None`` when that key is absent or when any exception is
        raised while calling the model or reading the response; the exception
        is logged and not re-raised.
    """
    request_body = json.dumps({"inputText": prompt_data})
    embedding = None
    try:
        # Model id and content negotiation headers all come from the config.
        model_response = bedrock.invoke_model(
            body=request_body,
            modelId=config['bedrock_model_info']['titan_model_id'],
            accept=config['encoding_info']['accept_encoding'],
            contentType=config['encoding_info']['content_encoding'],
        )
        embedding = json.loads(model_response['body'].read()).get('embedding')
    except Exception as e:
        logger.error(f"exception={e}")
    return embedding

In [None]:
# function to get the image description and store the embeddings of that text in the image index
def process_image_data(i: int,
                       file_path: str,
                       osi_endpoint,
                       total: int) -> Dict:
    """Describe one Base64 image file, embed the description and ingest it.

    Steps: extract entities and a detailed description from the image with
    get_img_desc, embed ``entities + description`` with
    get_img_txt_embeddings, write a JSON sidecar file to the configured
    ``json_img_dir`` and POST the document (text, metadata and vector) to the
    ingestion pipeline.

    Args:
        i: Index of the file within its chunk (not used by this function).
        file_path: Path of the ``.b64`` file to process. Its stem, combined
            with the configured image extension, must contain ``page_<n>``.
        osi_endpoint: URL of the ingestion pipeline to POST to.
        total: Number of files in the chunk (not used by this function).

    Returns:
        The dict written to the sidecar file, or ``None`` when any step
        failed. Failures are logged and never raised. A document is only
        ingested when an embedding was obtained, and a non-success HTTP
        status from the pipeline is treated as a failure.
    """
    bedrock = boto3.client(service_name="bedrock-runtime", region_name=region, endpoint_url=endpoint_url)
    json_data = None
    try:
        content_type = config['content_info']['content_type']
        if content_type not in ('slide_deck', 'pdf'):
            # Fail before spending any model calls on an unsupported type.
            raise ValueError(f"unsupported content type: {content_type}")

        image_file_extn: str = config['content_info']['image_extn']
        bucket_img_prefix: str = os.path.join(config['pdf_dir_info']['bucket_prefix'],
                                              config['pdf_dir_info']['bucket_img_prefix'])
        logger.info(f"going to convert {file_path} into embeddings")
        # first, get the entities from the image to prefilter the image description with the entities
        entities_extracted = get_img_desc(file_path, entity_extraction_prompt)
        # get the image description and prefilter the image description with the entities extracted from the image
        content_description = entities_extracted + get_img_desc(file_path, prompt)
        print(f"file_path: {file_path}, image description (prefiltered with entities extracted): {content_description}")

        embedding = get_img_txt_embeddings(bedrock, content_description)
        if embedding is None:
            # get_img_txt_embeddings swallows its errors and returns None;
            # ingesting a document without a vector would make it unsearchable.
            raise ValueError("no embedding was returned for the image description")

        # Both supported content types use the same object naming scheme.
        obj_name = f"{Path(file_path).stem}{image_file_extn}"
        input_image_s3 = f"s3://{bucket_name}/{bucket_img_prefix}/{obj_name}"

        page_match = re.search(r"page_(\d+)_?", obj_name)
        if page_match is None:
            raise ValueError(f"cannot find a page number in file name {obj_name}")
        page_number = page_match.group(1)

        data = json.dumps([{
            "file_path": input_image_s3,
            "file_text": content_description,
            "page_number": page_number,
            "metadata": {
                "filename": obj_name,
                "entities": entities_extracted
            },
            "vector_embedding": embedding
        }])
        json_data = {
            "file_type": image_file_extn,
            "file_name": obj_name,
            "text": content_description,
            "entities": entities_extracted,
            "page_number": page_number
        }
        image_dir: str = config['pdf_dir_info']['json_img_dir']
        os.makedirs(image_dir, exist_ok=True)
        fpath = os.path.join(image_dir, f"{Path(file_path).stem}.json")
        print(f"json_file_path: {fpath}")
        Path(fpath).write_text(json.dumps(json_data, default=str, indent=2))

        logger.info("Ingesting data into pipeline")
        r = requests.request(
            method='POST',
            url=osi_endpoint,
            data=data,
            auth=AWSSigV4('osis'))
        # Turn a 4xx/5xx answer into an exception so it is reported as an error.
        r.raise_for_status()
        logger.info(f"image desc: {r.text}")
    except Exception as e:
        logger.error(f"Error processing image {file_path}: {e}")
        json_data = None
    return json_data

In [None]:
@ray.remote
def async_process_image_data(i: int, file_path: str, osi_endpoint, total: int):
    """Ray remote wrapper around process_image_data.

    Forwards all four arguments unchanged to process_image_data and returns
    its result.
    """
    # NOTE(review): logging is presumably configured here because the task runs
    # in a Ray worker process that has not run the notebook's logging setup — confirm.
    logging.basicConfig(format='[%(asctime)s] p%(process)s {%(filename)s:%(lineno)d} %(levelname)s - %(message)s', level=logging.INFO)
    # This local logger is not used in this function.
    logger = logging.getLogger(__name__)
    return process_image_data(i, file_path, osi_endpoint, total)

In [None]:
import time

# Number of images whose processing failed.
erroneous_page_count: int = 0
# Number of images processed in parallel per chunk.
n: int = config['parallel_inference_count']
image_chunks = [b64_image_file_list[i:i + n] for i in range(0, len(b64_image_file_list), n)]
total_chunks = len(image_chunks)
for chunk_index, image_chunk in enumerate(image_chunks):
    chunk_number = chunk_index + 1  # 1-based, used consistently in all log lines
    try:
        st = time.perf_counter()
        logger.info(f"------ getting text description for chunk {chunk_number}/{total_chunks} -----")
        # Process every file of the chunk in parallel and wait for all of them
        logger.info(f"getting inference for list {chunk_number}/{total_chunks}, size of list={len(image_chunk)} ")
        results = ray.get([async_process_image_data.remote(index, file_path, osi_img_endpoint, len(image_chunk)) for index, file_path in enumerate(image_chunk)])
        # process_image_data reports a failure by returning None instead of
        # raising, so failed images have to be counted from the results.
        erroneous_page_count += sum(result is None for result in results)
        elapsed_time = time.perf_counter() - st
        logger.info(f"------ completed chunk={chunk_number}/{total_chunks} completed in {elapsed_time} ------ ")
    except Exception as e:
        # The whole chunk failed (e.g. a Ray error): count all of its images.
        logger.error(f"Error processing chunk {chunk_index}: {e}")
        erroneous_page_count += len(image_chunk)

logger.info(f"Number of erroneous pdf pages that are not processed: {erroneous_page_count}")

### Part 2: Loop through text files to 1/extract entities from the text with `nltk`, 2/get embedding from Titan text. Call OSI pipeline API to ingest embedding.

In [None]:
# Build the list of extracted text files: every entry of the PDF text directory,
# prefixed with that directory so the paths can be opened directly.
pdf_txt_file_list = [
    os.path.join(g.PDF_TEXT_DIR, entry_name)
    for entry_name in os.listdir(g.PDF_TEXT_DIR)
]
print(pdf_txt_file_list)

#### Entities extraction from PDF texts using `nltk`

In [None]:
from nltk import pos_tag, word_tokenize
from nltk.tree import Tree
from nltk.chunk import ne_chunk

def get_continuous_chunks(text):
    """Return the distinct named entities that nltk finds in ``text``.

    The text is tokenized, POS-tagged and passed through ``ne_chunk``. Every
    subtree of the result is one named entity; its tokens are joined with a
    single space.

    Args:
        text: Text extracted from a pdf file.

    Returns:
        List of entity strings in order of first appearance, without
        duplicates. A repeated entity is simply skipped and never affects the
        entities that follow it.
    """
    chunked = ne_chunk(pos_tag(word_tokenize(text)))
    entities = []
    seen = set()
    for node in chunked:
        # Plain (token, tag) tuples are not entities; only subtrees are.
        if not isinstance(node, Tree):
            continue
        named_entity = " ".join(token for token, _pos in node.leaves())
        if named_entity not in seen:
            seen.add(named_entity)
            entities.append(named_entity)
    return entities

In [None]:
def process_text_data(txt_file: str, txt_page_index: int):
    """Embed one extracted PDF text file and send it to the text pipeline.

    Reads the file, extracts its entities with get_continuous_chunks, embeds
    the full text with get_text_embedding, writes a JSON sidecar file to
    ``g.JSON_TEXT_DIR`` and POSTs the document to ``osi_text_endpoint``.

    Args:
        txt_file: Path of the text file to process.
        txt_page_index: Running index of the file. It is used as the page
            number only when the file name contains no ``text_<n>`` part, and
            it labels the response log line.

    Returns:
        None. A non-success HTTP status from the pipeline is logged as an
        error and not raised, so a caller's loop continues with the next file.
    """
    with open(txt_file, 'r') as file:
        extracted_pdf_text = file.read()

    # Extract entities from text and flatten them into one comma separated string
    entities_str = ", ".join(get_continuous_chunks(extracted_pdf_text))
    logger.info(f"entities extracted from {txt_file}: {entities_str}")

    file_stem = Path(txt_file).stem
    obj_name = f"{file_stem}{g.TEXT_FILE_EXTN}"
    input_text_s3 = f"s3://{bucket_name}/{g.BUCKET_PDF_TEXT_PREFIX}/{obj_name}"

    # Take the page number from the file name so that the ingested document
    # and the sidecar file agree; the running index depends on the order in
    # which the files were listed and need not match the real page.
    page_match = re.search(r"text_(\d+)_?", obj_name)
    page_number = page_match.group(1) if page_match else str(txt_page_index)

    embedding = get_text_embedding(bedrock, extracted_pdf_text)
    data = json.dumps([{
        "file_path": input_text_s3,
        "file_text": extracted_pdf_text,
        "page_number": page_number,
        "metadata": {
            "filename": obj_name,
            "entities": entities_str
        },
        "vector_embedding": embedding
    }])
    json_data = {
        "file_type": g.TEXT_FILE_EXTN,
        "file_name": file_stem,
        "text": extracted_pdf_text,
        "page_number": page_number,
        "entities": entities_str
    }
    os.makedirs(g.JSON_TEXT_DIR, exist_ok=True)
    fpath = os.path.join(g.JSON_TEXT_DIR, f"{file_stem}.json")
    print(f"json_file_path: {fpath}")
    Path(fpath).write_text(json.dumps(json_data, default=str, indent=2))

    logger.info("Ingesting data into pipeline")
    r = requests.request(
        method='POST',
        url=osi_text_endpoint,
        data=data,
        auth=AWSSigV4('osis'))

    if r.ok:
        logger.info(f"Response: {txt_page_index} - {r.text}")
    else:
        # Make rejected documents visible instead of logging them like a success.
        logger.error(f"Ingestion failed for {txt_file}: HTTP {r.status_code} - {r.text}")

In [None]:
# Running counter, starting at 1, that is passed to process_text_data for each
# file; it follows the order of pdf_txt_file_list.
txt_page_index: int = 1
os.makedirs(g.JSON_TEXT_DIR, exist_ok=True)
# Files are processed one after another.
for txt_file in pdf_txt_file_list:
    logger.info(f"going to convert {txt_file} into embeddings")
    process_text_data(txt_file, txt_page_index)
    txt_page_index += 1