# Data preparation

***This notebook works well with the `Data Science 3.0 Python 3` kernel and `ml.t3.medium` instance type.***

Data download and data preparation.

In [1]:
import sys
#!{sys.executable} -m pip install -r requirements.txt

In [2]:
import os
import json
import boto3
import logging
import pandas as pd
from PIL import Image
from globals import *
from typing import List
from download_images import download_images


ModuleNotFoundError: No module named 'globals'

In [3]:
LISTINGS_FILE: str = os.path.join("listings", "metadata", "listings_0.json")
LANGUAGE_TO_FILTER: str = "en_US"
IMAGE_ID_TO_FNAME_MAPPING_FILE: str = "images.csv"
ABO_S3_BUCKET: str = "amazon-berkeley-objects"
ABO_S3_PREFIX:str = "images/original"
ABO_S3_BUCKET_PREFIX: str = f"s3://{ABO_S3_BUCKET}/{ABO_S3_PREFIX}"
IMAGE_DATASET_FNAME: str = f"aob_{LANGUAGE_TO_FILTER}.csv"
DATA_DIR: str = "data"
IMAGES_DIR: str = os.path.join(DATA_DIR, "images", LANGUAGE_TO_FILTER)
B64_ENCODED_IMAGES_DIR: str = os.path.join(DATA_DIR, "b64_images", LANGUAGE_TO_FILTER)
VECTOR_DB_DIR: str = os.path.join(DATA_DIR, "vectordb", LANGUAGE_TO_FILTER)
SUCCESSFULLY_EMBEDDED_DIR: str = os.path.join(DATA_DIR, "successfully_embedded", LANGUAGE_TO_FILTER)
IMAGE_DATA_W_SUCCESSFUL_EMBEDDINGS_FPATH: str = os.path.join(SUCCESSFULLY_EMBEDDED_DIR, "data.csv")
os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(IMAGES_DIR, exist_ok=True)
os.makedirs(B64_ENCODED_IMAGES_DIR, exist_ok=True)
os.makedirs(VECTOR_DB_DIR, exist_ok=True)
os.makedirs(SUCCESSFULLY_EMBEDDED_DIR, exist_ok=True)
FMC_URL: str = "https://bedrock-runtime.us-east-1.amazonaws.com"
FMC_MODEL_ID: str = "amazon.titan-embed-image-v1"
CLAUDE_V2_MODEL_ID: str  = "anthropic.claude-v2"
ACCEPT_ENCODING: str = "application/json"
CONTENT_ENCODING: str = "application/json"
VECTORDB_INDEX_FILE: str = f"aob_{LANGUAGE_TO_FILTER}_index"
VECTOR_DB_INDEX_FPATH: str = os.path.join(VECTOR_DB_DIR, VECTORDB_INDEX_FILE)
K: int = 4
N: int = 10000
MAX_IMAGE_HEIGHT: int = 2048
MAX_IMAGE_WIDTH: int = 2048

In [4]:
!rm -rf listings
!rm -f abo-listings.tar
!rm -f images.csv.gz images.csv


In [7]:
!curl --remote-name https://amazon-berkeley-objects.s3.us-east-1.amazonaws.com/archives/abo-listings.tar


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 83.4M  100 83.4M    0     0  6509k      0  0:00:13  0:00:13 --:--:-- 11.0M


In [8]:
!tar xvf abo-listings.tar


x LICENSE-CC-BY-4.0.txt
x listings/
x listings/README.md
x listings/metadata/
x listings/metadata/listings_7.json.gz
x listings/metadata/listings_4.json.gz
x listings/metadata/listings_2.json.gz
x listings/metadata/listings_c.json.gz
x listings/metadata/listings_6.json.gz
x listings/metadata/listings_0.json.gz
x listings/metadata/listings_9.json.gz
x listings/metadata/listings_e.json.gz
x listings/metadata/listings_1.json.gz
x listings/metadata/listings_5.json.gz
x listings/metadata/listings_3.json.gz
x listings/metadata/listings_d.json.gz
x listings/metadata/listings_f.json.gz
x listings/metadata/listings_8.json.gz
x listings/metadata/listings_a.json.gz
x listings/metadata/listings_b.json.gz


In [9]:
!gzip -d listings/metadata/listings_0.json.gz


In [10]:
!aws s3 cp s3://amazon-berkeley-objects/images/metadata/images.csv.gz .


download: s3://amazon-berkeley-objects/images/metadata/images.csv.gz to ./images.csv.gz


In [11]:
!gzip -d images.csv.gz

In [14]:
# read all listing data available
listing = []

with open(LISTINGS_FILE, 'r') as json_file:
    listing = list(map(json.loads, list(json_file)))
    print(f"there are {len(listing)} listings in {LISTINGS_FILE}")

# id to file name mapping
id_to_fname_mapping = pd.read_csv(IMAGE_ID_TO_FNAME_MAPPING_FILE)
print(f"id_to_fname_mapping shape={id_to_fname_mapping.shape}")


there are 9232 listings in listings/metadata/listings_0.json
id_to_fname_mapping shape=(398212, 4)


In [15]:
# filter for language of interest
listing_filtered  = []
for l in listing:
    brand = l.get('brand')
    if brand is not None:
        for b in brand:
            if b['language_tag'] == LANGUAGE_TO_FILTER:
                listing_filtered.append(l)
print(f"there are {len(listing_filtered)} listings for {LANGUAGE_TO_FILTER} in {LISTINGS_FILE}")


there are 1549 listings for en_US in listings/metadata/listings_0.json


In [17]:
# create a dataset of images and descriptions
image_data_list = []
for l in listing_filtered:
    main_image_id = l.get('main_image_id')
    if main_image_id is None:
        continue
    bullet_point = l.get('bullet_point')
    tags = [] 
    if bullet_point is not None:
        for b in bullet_point:
            lt = b.get('language_tag')
            v = b.get('value')
            if lt == LANGUAGE_TO_FILTER:
                tags.append(v)
    description = ". ".join(tags)
    # logger.info(f"{main_image_id}, {description}")
 
    image_data_list.append(dict(image_id=main_image_id, description=description))
    
# create a dataframe so that we can join with the image path data
image_data = pd.merge(left=pd.DataFrame(image_data_list),
                      right=id_to_fname_mapping,
                      on="image_id",
                      how="left")
# image_data.path = image_data.path.map(lambda x: f"{ABO_S3_BUCKET_PREFIX}/{x}")
image_data.to_csv(IMAGE_DATASET_FNAME, index=False)


In [20]:
%%time
import sys
import subprocess

import os
import boto3
import asyncio
import logging
import pandas as pd
from typing import Dict, Generator
# convert all the downloaded files into base64 encoding
import glob
import base64


# download the files from s3 and convert them into base64 encoded images
def download_image_file(row_tuple: Dict, s3_bucket: str, s3_prefix: str, local_images_dir: str):
    s3 = boto3.client('s3')
    _, row = row_tuple
    # print(f"row type={type(row)}")
    path = row.get('path')
    if path is None:
        return
    local_path = os.path.join(local_images_dir, os.path.basename(path))
    key = f"{s3_prefix}/{path}"
    print(f"going to download {s3_bucket}/{key} to {local_path}")
    with open(local_path, 'wb') as f:
        s3.download_fileobj(s3_bucket, key, f)

async def adownload_image_file(row_tuple: Dict, s3_bucket: str, s3_prefix: str, local_images_dir: str):
    return await asyncio.to_thread(download_image_file, row_tuple, s3_bucket, s3_prefix, local_images_dir)

async def adownload_all_image_files(rows: Generator, s3_bucket: str, s3_prefix: str, local_images_dir: str):
    return  await asyncio.gather(*[adownload_image_file(r, s3_bucket, s3_prefix, local_images_dir) for r in rows])


def download_images(image_count: int, image_data_fname: str, s3_bucket: str, s3_prefix: str, local_images_dir: str):
    image_data = pd.read_csv(image_data_fname)
    image_count = len(image_data) if image_count > len(image_data) else image_count
    _ = asyncio.run(adownload_all_image_files(image_data.sample(n=image_count).iterrows(), s3_bucket, s3_prefix, local_images_dir))

download_images(10000, "aob_en_US.csv", "amazon-berkeley-objects", "images/original", "data/images/en_US")



image_file_list = glob.glob(os.path.join(IMAGES_DIR, "*.*"))
print(f"there are {len(image_file_list)} in {IMAGES_DIR}")

RuntimeError: asyncio.run() cannot be called from a running event loop

In [17]:
def resize_image_if_needed(image_file_path: str):
    image = Image.open(image_file_path)    
    if (image.size[0] * image.size[1]) > (MAX_IMAGE_HEIGHT * MAX_IMAGE_WIDTH):
        logger.info(f"{image_file_path} has dimensions {image.size} which is larger than {MAX_IMAGE_HEIGHT}x{MAX_IMAGE_WIDTH} combined, will scale this image")
        image.thumbnail((MAX_IMAGE_HEIGHT, MAX_IMAGE_WIDTH))
        image.save(image_file_path)

def encode_image_to_base64(image_file_path: str):
    with open(image_file_path, "rb") as image_file:
        b64_image = base64.b64encode(image_file.read()).decode('utf8')
        b64_image_path = os.path.join(B64_ENCODED_IMAGES_DIR, f"{os.path.basename(image_file_path)}.b64")
        with open(b64_image_path, "wb") as b64_image_file:
            b64_image_file.write(bytes(b64_image, 'utf-8'))

_ = list(map(resize_image_if_needed, image_file_list))
_ = list(map(encode_image_to_base64, image_file_list))




[2023-11-30 16:41:36,340] p21 {1841016947.py:4} INFO - data/images/en_US/d1583fae.jpg has dimensions (2560, 2560) which is larger than 2048x2048 combined, will scale this image
[2023-11-30 16:41:36,615] p21 {1841016947.py:4} INFO - data/images/en_US/f2ca58fb.jpg has dimensions (2560, 2560) which is larger than 2048x2048 combined, will scale this image
[2023-11-30 16:41:36,902] p21 {1841016947.py:4} INFO - data/images/en_US/06935e1d.jpg has dimensions (2560, 2560) which is larger than 2048x2048 combined, will scale this image
[2023-11-30 16:41:37,235] p21 {1841016947.py:4} INFO - data/images/en_US/66c1272f.jpg has dimensions (2560, 1654) which is larger than 2048x2048 combined, will scale this image
[2023-11-30 16:41:37,457] p21 {1841016947.py:4} INFO - data/images/en_US/7df4ebd1.jpg has dimensions (2560, 1714) which is larger than 2048x2048 combined, will scale this image
[2023-11-30 16:41:37,651] p21 {1841016947.py:4} INFO - data/images/en_US/2a1a61ad.jpg has dimensions (2560, 2108) w