# Creating a vectorstore with Amazon Bedrock multimodal-embeddings

This notebook gives a step-by-step tutorial for ingesting data into [OpenSearch Serverless](https://aws.amazon.com/opensearch-service/features/serverless/). The resulting vector embeddings will be used by the Agent to search for similar images in the provided vectorstore.

This notebook is required only if you would like the agent to be able to take the `/image_look_up` action; otherwise you can directly run the `Step2_langgraph_agent.ipynb` notebook.

### Environment setup

This has been tested with the `conda_python3` Jupyter Notebook kernel on an `ml.t3.medium` instance.

### Prerequisite

Ensure you have an AWS account with permission to:

- Create security policy, access policy, collection, index, index mapping on OpenSearchServerless

- BatchGetCollection on OpenSearchServerless

#### Install the requirements

In [None]:
!pip install -q opensearch-py --quiet
!pip install -q requests_aws4auth --quiet

#### Download the dataset locally

In [None]:
# Clone the fashion image dataset repository into the current working directory.
!git clone https://github.com/orbitalsonic/Fashion-Dataset-Images-Western-Dress.git

In [None]:
# Only keep 10 images to save time
import os
import shutil

current_dir = os.getcwd()
image_extensions = ('.jpg', '.jpeg', '.png')

relative_path = "Fashion-Dataset-Images-Western-Dress/WesternDress_Images"
image_folder = os.path.join(current_dir, relative_path)

# Match extensions case-insensitively so files such as "IMG.JPG" are counted
# (and pruned) too, instead of being skipped by the filter and left on disk.
image_files = sorted(
    f for f in os.listdir(image_folder) if f.lower().endswith(image_extensions)
)
images_to_keep = image_files[:10]

# The list is sorted and directory entries are unique, so everything after
# the first 10 names is surplus; slicing avoids a list lookup per file.
for image in image_files[10:]:
    os.remove(os.path.join(image_folder, image))

### Add all the dependencies/imports

In [None]:
import os
import boto3
from opensearchpy import AWSV4SignerAuth, OpenSearch, RequestsHttpConnection
from dependencies.opensearch_utils import OpensearchIngestion

# Create the boto3 session used throughout the notebook and print the ARN of
# the identity that STS reports for its credentials.
boto3_session = boto3.Session()
sts_client = boto3_session.client('sts')
caller_identity = sts_client.get_caller_identity()
identity_arn = caller_identity['Arn']
print("Current IAM Role ARN:", identity_arn)

In [None]:
# Build the SigV4 signer for OpenSearch Serverless requests from the
# session's region and credentials.
service = 'aoss'
region = boto3_session.region_name
credentials = boto3_session.get_credentials()
AWSAUTH = AWSV4SignerAuth(credentials, region, service)

# create a client for OSS
client = boto3.client('opensearchserverless')

####  Load parameters for Opensearch Serverless collection and embedding setup

In [None]:
# Read the collection, host, index and embedding-size settings from SSM
# Parameter Store into a plain {name: value} dictionary.
ssm_client = boto3.client('ssm')

required_parameters = [
    'AOSSCollectionName', 'AOSSEmbeddingSize', 'AOSSHost', 'AOSSIndexName'
]
response = ssm_client.get_parameters(Names=required_parameters)

# get_parameters does not raise for unknown names; it reports them under
# 'InvalidParameters'. Fail here with a clear message rather than with a
# KeyError when a later cell looks the value up.
missing_parameters = response.get('InvalidParameters', [])
if missing_parameters:
    raise ValueError(
        f"Missing SSM parameters: {', '.join(sorted(missing_parameters))}"
    )

param_dict = {
    parameter['Name']: parameter['Value']
    for parameter in response['Parameters']
}
param_dict

#### Initialize an Opensearch client

In [None]:
# Create the client with SSL/TLS enabled.
OSSclient = OpenSearch(
    hosts=[{'host': param_dict['AOSSHost'], 'port': 443}],
    http_auth=AWSAUTH,
    use_ssl=True,
    verify_certs=True,
    connection_class=RequestsHttpConnection,
    pool_maxsize=20,
    timeout=3000,
)

#### Create OpensearchIngestion class
The `OpensearchIngestion` class (defined in `dependencies/opensearch_utils.py`) contains helper functions for document processing and ingestion into the index.

In [None]:
# Hand the OpenSearch client and the boto3 session to the ingestion helper.
oss_instance = OpensearchIngestion(client=OSSclient, session=boto3_session)

#### Ingest the images

In [None]:
# Folder (relative to the working directory) containing the images to ingest.
dataset_path = "Fashion-Dataset-Images-Western-Dress/WesternDress_Images/"

In [None]:
# Embed every file in the dataset folder and index it, collecting the path of
# each image that could not be ingested so the final summary is complete.
failed = []
for image_name in os.listdir(dataset_path):
    image = os.path.join(dataset_path, image_name)
    try:
        (data, embedding) = oss_instance.create_titan_multimodal_embeddings(image_path=image)
        body = {
            "vector_field": embedding["embedding"],
            "image_b64": data["inputImage"],
        }
        # Ingest the images one by one.
        status = oss_instance.client.index(
            index=param_dict['AOSSIndexName'],
            body=body,
        )
    except Exception as e:
        # Best effort: one bad image (embedding or indexing error) must not
        # abort the run, but it has to show up in the failure report.
        print(f"Exception thrown in image {image}: {e}")
        failed.append(image)
        continue
    if status["result"] != "created":
        failed.append(image)

print(f"Ingestion Complete. Failed ingestion for the following: {failed}")

##### Clean up will be done together with all other agent assets