## Create an Index in an OpenSearch Serverless Collection

In [None]:
# Replace [COLLECTION_ID] with the ID of your collection; this value is passed
# as the `host` of the OpenSearch client created in a later cell.
host = '[COLLECTION_ID].us-east-1.aoss.amazonaws.com'  # serverless collection endpoint, without https://

In [None]:
!pip install opensearch-py

In [None]:
from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth
import boto3

region = 'us-east-1'  # e.g. us-east-1; must match the region embedded in `host`

service = 'aoss'  # service name handed to the SigV4 signer
credentials = boto3.Session().get_credentials()
auth = AWSV4SignerAuth(credentials, region, service)

# Create an OpenSearch client whose requests are signed by `auth`
client = OpenSearch(
    hosts=[{'host': host, 'port': 443}],
    http_auth=auth,
    use_ssl=True,
    verify_certs=True,
    connection_class=RequestsHttpConnection,
    pool_maxsize=20,
)

# Name of the vector index used by the rest of the notebook
index_name = 'x-ray-image-embedding-vector-index'

# Must equal the length of the vectors stored in the "embedding" field
embedding_dim = 384

index_body = {
    "settings": {
        "index": {
            "knn": True  # enable k-NN search on this index
        }
    },
    "mappings": {  # how each document field is stored
        "properties": {
            "embedding": {
                "type": "knn_vector",  # the field that holds the embedding vector
                "dimension": embedding_dim,
                "method": {
                    "name": "hnsw",
                    "space_type": "l2",
                    "engine": "nmslib",
                    "parameters": {
                        "ef_construction": 128,
                        "m": 24
                    }
                }
            }
        }
    }
}

# Creating an index that already exists raises an error, so check first.
# This keeps the cell safe to re-run (Restart & Run All).
if client.indices.exists(index=index_name):
    print(f"\nIndex '{index_name}' already exists; skipping creation.")
else:
    response = client.indices.create(index=index_name, body=index_body)

    print('\nCreating index:')
    print(response)

## Ingest Multimodal Embeddings into OpenSearch Index

In [None]:
import json
import base64

# Lazily-created Bedrock runtime client shared by all embedding calls, so that
# ingesting many images does not build a new client for every request.
_bedrock_runtime_client = None


def _get_bedrock_runtime_client():
    """Return the shared 'bedrock-runtime' client, creating it on first use."""
    global _bedrock_runtime_client
    if _bedrock_runtime_client is None:
        _bedrock_runtime_client = boto3.client(service_name='bedrock-runtime')
    return _bedrock_runtime_client


# calls Amazon Bedrock to get a vector from either an image, text, or both
def get_multimodal_vector(input_image_base64=None, input_text=None,
                          embedding_length=384, bedrock_client=None):
    """Embed an image, a text, or both with the Titan multimodal embedding model.

    Args:
        input_image_base64: Base64-encoded image as a str, or None.
        input_text: Text to embed, or None.
        embedding_length: Value sent as ``outputEmbeddingLength``. Defaults to
            384; it should match the ``dimension`` of the k-NN index the
            vector is stored in or compared against.
        bedrock_client: Optional object exposing ``invoke_model``. When omitted,
            a shared ``bedrock-runtime`` boto3 client is created once and reused.

    Returns:
        The value of the ``embedding`` key of the model response.

    Raises:
        ValueError: If neither an image nor a text is supplied.
    """
    # Fail fast instead of sending a request that carries nothing to embed.
    if not input_text and not input_image_base64:
        raise ValueError("Provide input_image_base64, input_text, or both.")

    request_body = {}
    if input_text:
        request_body["inputText"] = input_text
    if input_image_base64:
        request_body["inputImage"] = input_image_base64
    request_body["embeddingConfig"] = {"outputEmbeddingLength": embedding_length}

    bedrock = bedrock_client if bedrock_client is not None else _get_bedrock_runtime_client()
    response = bedrock.invoke_model(
        body=json.dumps(request_body),
        modelId="amazon.titan-embed-image-v1",
        accept="application/json",
        contentType="application/json"
    )
    # The response body is a stream; read it fully before decoding the JSON.
    response_body = json.loads(response.get('body').read())
    return response_body.get("embedding")

# creates a vector from an image file path
def get_vector_from_file(file_path, label=None):
    """Embed the image stored at ``file_path``, optionally together with ``label``.

    The file is read as bytes, base64-encoded, and handed to
    ``get_multimodal_vector`` along with ``label`` as the text input.
    Returns whatever ``get_multimodal_vector`` returns.
    """
    with open(file_path, "rb") as handle:
        raw_bytes = handle.read()
    # The embedding call takes the image as a base64 string, not raw bytes
    encoded_image = base64.b64encode(raw_bytes).decode('utf8')
    return get_multimodal_vector(input_image_base64=encoded_image, input_text=label)

In [None]:
import csv

def _captions_from_csv(csv_path):
    """Build a {first-column value: caption} mapping from one label CSV.

    The first row is the header. For every following row, the caption is the
    space-separated list of header names whose cell equals '1'; the first
    column is used as the key.
    """
    captions = {}
    # newline='' is what the csv module expects for files it reads
    with open(csv_path, newline='') as csv_file:
        reader = csv.reader(csv_file)
        categories = next(reader, None)
        if categories is None:  # empty file: nothing to add
            return captions
        for row in reader:
            # csv.reader yields [] for blank lines; skip them so a blank line
            # cannot overwrite the caption of the previous row's key.
            if not row:
                continue
            key = row[0]
            positives = [
                category
                for category, cell in zip(categories[1:], row[1:])
                if cell == '1'
            ]
            captions[key] = ' '.join(positives).strip()
    return captions


# Processed in this order; a key present in several files keeps the value
# from the last file that contains it.
label_csv_paths = [
    './miccai2023_nih-cxr-lt_labels_train.csv',
    './miccai2023_nih-cxr-lt_labels_test.csv',
    './miccai2023_nih-cxr-lt_labels_val.csv',
]

res = {}
for csv_path in label_csv_paths:
    res.update(_captions_from_csv(csv_path))

# Open the output only after all CSVs were read, so a missing or broken input
# file does not leave behind an empty labels.json.
with open("./labels.json", "w") as output:
    output.write(json.dumps(res))

In [None]:
# Read back the filename -> caption mapping from labels.json
with open('labels.json') as labels_file:
    labels = json.load(labels_file)

# Sanity check: show the container type and how many entries were loaded
print("Type:", type(labels))
print(len(labels))

The step below takes about 10 minutes; you can save time by decreasing `sample_size`.

In [None]:
# Vectorize dataset and load it to OpenSearch
from PIL import Image
import os

image_folder = 'images/images'

# os.listdir() returns entries in arbitrary order; sorting makes the sampled
# subset the same on every run and every machine.
image_files = sorted(
    f for f in os.listdir(image_folder)
    if os.path.isfile(os.path.join(image_folder, f))
)

# Number of images to embed and index; lower it to shorten the run
sample_size = 1000

for image_file in image_files[:sample_size]:
    # Construct the full path to the image file
    image_path = os.path.join(image_folder, image_file)

    # Images without an entry in `labels` are embedded with an empty caption
    label = labels.get(image_file, '')

    img_embedding = get_vector_from_file(image_path, label)

    image_document = {
        'filename': image_file,
        'diagnoses': label,
        'embedding': img_embedding
    }

    response = client.index(
        index=index_name,
        body=image_document
    )

    print(f"Inserted: {image_file}")

print("All images inserted into OpenSearch.")

## Query the OpenSearch index using an image embedding

In [None]:
# Embed a single image (no text label) to use as the query vector
query_embedding = get_vector_from_file('images/images/00000001_000.png')

In [None]:
# k-NN clause: ask for the 3 stored vectors closest to the query embedding
knn_clause = {"embedding": {"vector": query_embedding, "k": 3}}

query_body = {
    "query": {"knn": knn_clause},
    # Leave out the full _source (it includes the embedding vector) and
    # return only the two descriptive fields.
    "_source": False,
    "fields": ["filename", "diagnoses"],
}

results = client.search(index=index_name, body=query_body)

In [None]:
# Dump the raw search response to inspect its full structure
print(results)

In [None]:
# For every returned neighbour, show its score and then its stored diagnoses
for neighbour in results['hits']['hits']:
    print(neighbour['_score'])
    print(neighbour['fields']['diagnoses'])

## [Optional] Next Step

For the experiment above, we used the off-the-shelf version of the Titan multimodal embedding model to create the image embeddings when querying OpenSearch for similar images. How do you think the results would change if we used a fine-tuned model instead? Give it a try!

## Cleanup

In [None]:
# delete the index
# Pass the index by keyword, like every other client call in this notebook;
# the positional form is not accepted by all opensearch-py versions.
delete_response = client.indices.delete(index=index_name)

print('\nDeleting index:')
print(delete_response)