In [30]:
import boto3
import json
import os
import base64
import logging

In [31]:
# global constants
LISTINGS_FILE: str = os.path.join("listings", "metadata", "listings_0.json")
LANGUAGE_TO_FILTER: str = "en_US"
IMAGE_ID_TO_FNAME_MAPPING_FILE: str = "images.csv"
ABO_S3_BUCKET: str = "amazon-berkeley-objects"
ABO_S3_PREFIX:str = "images/original"
ABO_S3_BUCKET_PREFIX: str = f"s3://{ABO_S3_BUCKET}/{ABO_S3_PREFIX}"
IMAGE_DATASET_FNAME: str = f"aob_{LANGUAGE_TO_FILTER}.csv"
DATA_DIR: str = "data"
IMAGES_DIR: str = os.path.join(DATA_DIR, "images", LANGUAGE_TO_FILTER)
B64_ENCODED_IMAGES_DIR: str = os.path.join(DATA_DIR, "b64_images", LANGUAGE_TO_FILTER)
VECTOR_DB_DIR: str = os.path.join(DATA_DIR, "vectordb", LANGUAGE_TO_FILTER)
SUCCESSFULLY_EMBEDDED_DIR: str = os.path.join(DATA_DIR, "successfully_embedded", LANGUAGE_TO_FILTER)
IMAGE_DATA_W_SUCCESSFUL_EMBEDDINGS_FPATH: str = os.path.join(SUCCESSFULLY_EMBEDDED_DIR, "data.csv")
os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(IMAGES_DIR, exist_ok=True)
os.makedirs(B64_ENCODED_IMAGES_DIR, exist_ok=True)
os.makedirs(VECTOR_DB_DIR, exist_ok=True)
os.makedirs(SUCCESSFULLY_EMBEDDED_DIR, exist_ok=True)
FMC_URL: str = "https://bedrock-runtime.us-east-1.amazonaws.com"
FMC_MODEL_ID: str = "amazon.titan-embed-image-v1"
CLAUDE_V2_MODEL_ID: str  = "anthropic.claude-v2"
ACCEPT_ENCODING: str = "application/json"
CONTENT_ENCODING: str = "application/json"
VECTORDB_INDEX_FILE: str = f"aob_{LANGUAGE_TO_FILTER}_index"
VECTOR_DB_INDEX_FPATH: str = os.path.join(VECTOR_DB_DIR, VECTORDB_INDEX_FILE)
K: int = 4
N: int = 10000
MAX_IMAGE_HEIGHT: int = 2048
MAX_IMAGE_WIDTH: int = 2048

In [32]:
logging.basicConfig(format='[%(asctime)s] p%(process)s {%(filename)s:%(lineno)d} %(levelname)s - %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)

In [40]:
session = boto3.session.Session(profile_name="aws_sol")

In [41]:
FMC_URL="https://bedrock-runtime.us-east-1.amazonaws.com"
bedrock = session.client(
    service_name="bedrock-runtime", region_name="us-east-1", endpoint_url=FMC_URL
)

[2024-04-09 10:56:07,932] p44920 {credentials.py:1278} INFO - Found credentials in shared credentials file: ~/.aws/credentials


In [42]:
bedrock

<botocore.client.BedrockRuntime at 0x144686e60>

In [43]:
from pandas.core.series import Series
import numpy as np
import numpy
from typing import Dict
def get_embeddings(text: str, image: str) -> numpy.ndarray:
   
    # You can specify either text or image or both
    body = json.dumps(
        {
            "inputText": text,
            "inputImage": image
        }
    )
        
    modelId = FMC_MODEL_ID
    accept = ACCEPT_ENCODING
    contentType = CONTENT_ENCODING

    try:
        response = bedrock.invoke_model(
            body=body, modelId=modelId, accept=accept, contentType=contentType
        )
        response_body = json.loads(response.get("body").read())        
        embeddings = np.array([response_body.get("embedding")]).astype(np.float32)        
    except Exception as e:
        logger.error(f"exception while encoding text={text}, image(truncated)={image[:10]}, exception={e}")
        embeddings = None
    return embeddings

In [44]:
import base64
# convert all the downloaded files into base64 encoding
def encode_image_to_base64(image_file_path: str):
    with open(image_file_path, "rb") as image_file:
        b64_image = base64.b64encode(image_file.read()).decode('utf8')
    return b64_image

In [56]:
#enc_img = encode_image_to_base64('/Users/akshayranganath/Downloads/g2.jpeg')
enc_img = encode_image_to_base64('/Users/akshayranganath/Downloads/mj-panda.jpeg')

In [57]:
embeddings = get_embeddings(image=enc_img, text='A picture of a panda playing soccer')

In [58]:
import chromadb
chroma_client = chromadb.Client()

In [59]:
#collection = chroma_client.create_collection(name="my_collection")

UniqueConstraintError: Collection my_collection already exists

In [60]:
collection.add(
    embeddings=embeddings,
    documents=["panda-soccer"],
    metadatas=[{"source": "/Users/akshayranganath/Downloads/mj-panda.jpeg"}],
    ids=["id2"]
)

In [61]:
collection.count()

2

In [63]:
collection.query(query_embeddings=embeddings, n_results=2, )



{'ids': [['id2', 'id1']],
 'distances': [[0.0, 0.7329332232475281]],
 'metadatas': [[{'source': '/Users/akshayranganath/Downloads/mj-panda.jpeg'},
   {'source': '/Users/akshayranganath/Downloads/g2.jpeg'}]],
 'embeddings': None,
 'documents': [['panda-soccer', 'greeting-card1']],
 'uris': None,
 'data': None}