# Data preparation

***This notebook works well with the `Data Science 3.0 Python 3` kernel and `ml.t3.medium` instance type.***

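Data download and data preparation: this notebook downloads the Amazon Berkeley Objects (ABO) listings metadata and the image-ID-to-file-name mapping, keeps the listings tagged `en_US`, downloads their main product images, shrinks oversized images, and writes a base64-encoded copy of every image.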

In [2]:
# Install the pinned packages from requirements.txt. Running pip through
# sys.executable installs into the same interpreter that backs this kernel.
import sys
!{sys.executable} -m pip install -r requirements.txt

Collecting pandas==2.1.3 (from -r requirements.txt (line 2))
  Obtaining dependency information for pandas==2.1.3 from https://files.pythonhosted.org/packages/1b/fa/4e5d054549faf1524230ffcd57ca98bb7350a4ed62ef722daabde4cb7632/pandas-2.1.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Downloading pandas-2.1.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting boto3==1.29.5 (from -r requirements.txt (line 3))
  Obtaining dependency information for boto3==1.29.5 from https://files.pythonhosted.org/packages/f2/23/c5545cb57abfc3a9782287f2845a26286f6f9f7bcec36f13569567f950fe/boto3-1.29.5-py3-none-any.whl.metadata
  Downloading boto3-1.29.5-py3-none-any.whl.metadata (6.7 kB)
Collecting pillow==10.1.0 (from -r requirements.txt (line 4))
  Obtaining dependency information for pillow==10.1.0 from https://files.pythonhosted.org/packages/e5/b9/5c6ad3241f1ccca4b781dfeddbab2dac4480f95aedc351a0e60c9f4c8aa9/Pillow-10.1.0-cp310-cp310-manylinux

In [3]:
import os
import json
import boto3
import logging
import pandas as pd
from PIL import Image
from globals import *
from typing import List
from download_images import download_images

# Log records carry timestamp, process id, source file:line and level.
_log_format = '[%(asctime)s] p%(process)s {%(filename)s:%(lineno)d} %(levelname)s - %(message)s'
logging.basicConfig(level=logging.INFO, format=_log_format)

# Notebook-wide logger used by the cells below.
logger = logging.getLogger(__name__)

In [4]:
# global constants
# Print a syntax-highlighted copy of globals.py so the constants brought in by
# `from globals import *` are visible in the notebook itself.
!pygmentize globals.py

[34mimport[39;49;00m [04m[36mos[39;49;00m[37m[39;49;00m
[37m[39;49;00m
[37m# global constants[39;49;00m[37m[39;49;00m
LISTINGS_FILE: [36mstr[39;49;00m = os.path.join([33m"[39;49;00m[33mlistings[39;49;00m[33m"[39;49;00m, [33m"[39;49;00m[33mmetadata[39;49;00m[33m"[39;49;00m, [33m"[39;49;00m[33mlistings_0.json[39;49;00m[33m"[39;49;00m)[37m[39;49;00m
LANGUAGE_TO_FILTER: [36mstr[39;49;00m = [33m"[39;49;00m[33men_US[39;49;00m[33m"[39;49;00m[37m[39;49;00m
IMAGE_ID_TO_FNAME_MAPPING_FILE: [36mstr[39;49;00m = [33m"[39;49;00m[33mimages.csv[39;49;00m[33m"[39;49;00m[37m[39;49;00m
ABO_S3_BUCKET: [36mstr[39;49;00m = [33m"[39;49;00m[33mamazon-berkeley-objects[39;49;00m[33m"[39;49;00m[37m[39;49;00m
ABO_S3_PREFIX:[36mstr[39;49;00m = [33m"[39;49;00m[33mimages/original[39;49;00m[33m"[39;49;00m[37m[39;49;00m
ABO_S3_BUCKET_PREFIX: [36mstr[39;49;00m = [33mf[39;49;00m[33m"[39;49;00m[33ms3://[39;49;00m[33m{[39;49;00mABO_S3_BUC

In [7]:
# Remove artifacts of any previous run: the extracted listings directory, the
# downloaded archive, and the image mapping CSV (compressed and uncompressed).
# The -f flags keep these commands quiet when the files do not exist yet.
!rm -rf listings
!rm -f abo-listings.tar
!rm -f images.csv.gz images.csv


In [8]:
# Download the ABO listings archive (abo-listings.tar) into the working directory.
!wget https://amazon-berkeley-objects.s3.us-east-1.amazonaws.com/archives/abo-listings.tar


--2023-11-30 16:36:47--  https://amazon-berkeley-objects.s3.us-east-1.amazonaws.com/archives/abo-listings.tar
Resolving amazon-berkeley-objects.s3.us-east-1.amazonaws.com (amazon-berkeley-objects.s3.us-east-1.amazonaws.com)... 16.182.38.18, 52.216.176.62, 52.217.166.202, ...
Connecting to amazon-berkeley-objects.s3.us-east-1.amazonaws.com (amazon-berkeley-objects.s3.us-east-1.amazonaws.com)|16.182.38.18|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 87480320 (83M) [application/x-tar]
Saving to: ‘abo-listings.tar’


2023-11-30 16:36:50 (26.9 MB/s) - ‘abo-listings.tar’ saved [87480320/87480320]



In [9]:
!tar xvf abo-listings.tar


LICENSE-CC-BY-4.0.txt
tar: LICENSE-CC-BY-4.0.txt: Cannot change ownership to uid 1808267, gid 100: Invalid argument
listings/
listings/README.md
tar: listings/README.md: Cannot change ownership to uid 1808267, gid 100: Invalid argument
listings/metadata/
listings/metadata/listings_7.json.gz
tar: listings/metadata/listings_7.json.gz: Cannot change ownership to uid 1808267, gid 100: Invalid argument
listings/metadata/listings_4.json.gz
tar: listings/metadata/listings_4.json.gz: Cannot change ownership to uid 1808267, gid 100: Invalid argument
listings/metadata/listings_2.json.gz
tar: listings/metadata/listings_2.json.gz: Cannot change ownership to uid 1808267, gid 100: Invalid argument
listings/metadata/listings_c.json.gz
tar: listings/metadata/listings_c.json.gz: Cannot change ownership to uid 1808267, gid 100: Invalid argument
listings/metadata/listings_6.json.gz
tar: listings/metadata/listings_6.json.gz: Cannot change ownership to uid 1808267, gid 100: Invalid argument
listings/metada

In [10]:
# Decompress listings_0.json.gz in place; the resulting listings_0.json is the
# file LISTINGS_FILE points to. The other listings_*.json.gz shards stay compressed.
!gzip -d listings/metadata/listings_0.json.gz


In [11]:
# Copy the compressed image metadata CSV from the ABO bucket into the working
# directory using the AWS CLI.
!aws s3 cp s3://amazon-berkeley-objects/images/metadata/images.csv.gz .


download: s3://amazon-berkeley-objects/images/metadata/images.csv.gz to ./images.csv.gz


In [12]:
# Decompress to images.csv, the file named by IMAGE_ID_TO_FNAME_MAPPING_FILE.
!gzip -d images.csv.gz

In [13]:
# Load every listing record found in the metadata file.
listing: List = []

# Each line of the file is parsed as its own JSON document.
with open(LISTINGS_FILE, 'r') as listings_fp:
    listing = [json.loads(raw_line) for raw_line in listings_fp]
    logger.info(f"there are {len(listing)} listings in {LISTINGS_FILE}")

# Lookup table used later to join image ids with their file information.
id_to_fname_mapping = pd.read_csv(IMAGE_ID_TO_FNAME_MAPPING_FILE)
logger.info(f"id_to_fname_mapping shape={id_to_fname_mapping.shape}")


[2023-11-30 16:38:36,717] p21 {426480771.py:6} INFO - there are 9232 listings in listings/metadata/listings_0.json
[2023-11-30 16:38:37,436] p21 {426480771.py:10} INFO - id_to_fname_mapping shape=(398212, 4)


In [14]:
# filter for language of interest
# A listing qualifies when at least one of its `brand` entries carries the
# target language tag. any() stops at the first match, so a listing with
# several matching brand entries is kept exactly once (appending inside the
# inner loop would add it once per matching entry). Entries without a
# `language_tag` key are simply treated as non-matching.
listing_filtered: List = [
    item
    for item in listing
    if any(
        brand_entry.get('language_tag') == LANGUAGE_TO_FILTER
        for brand_entry in (item.get('brand') or [])
    )
]
logger.info(f"there are {len(listing_filtered)} listings for {LANGUAGE_TO_FILTER} in {LISTINGS_FILE}")


[2023-11-30 16:38:43,796] p21 {2851739776.py:9} INFO - there are 1549 listings for en_US in listings/metadata/listings_0.json


In [15]:
# create a dataset of images and descriptions
# One record per listing that has a main image: the image id plus a
# description built by joining that listing's bullet points in the target
# language with ". ".
image_data_list: List = []
for item in listing_filtered:
    main_image_id = item.get('main_image_id')
    if main_image_id is None:
        # nothing to download for this listing
        continue

    # Bullet points without a value are skipped: a None in the list would make
    # the str.join below raise TypeError.
    tags: List = [
        point.get('value')
        for point in (item.get('bullet_point') or [])
        if point.get('language_tag') == LANGUAGE_TO_FILTER
        and point.get('value') is not None
    ]
    description = ". ".join(tags)

    image_data_list.append(dict(image_id=main_image_id, description=description))

# create a dataframe so that we can join with the image path data
# (left join: every collected record is kept even when its image_id has no
# row in the mapping table)
image_data = pd.merge(left=pd.DataFrame(image_data_list),
                      right=id_to_fname_mapping,
                      on="image_id",
                      how="left")
# image_data.path = image_data.path.map(lambda x: f"{ABO_S3_BUCKET_PREFIX}/{x}")
image_data.to_csv(IMAGE_DATASET_FNAME, index=False)


In [16]:
%%time
import sys
import subprocess

module_name:str = "download_images" # os.path.join(os.getcwd(), "download_images")
fn_name:str = "download_images"
cmd = f"from {module_name} import {fn_name}; {fn_name}({N}, \"{IMAGE_DATASET_FNAME}\", \"{ABO_S3_BUCKET}\", \"{ABO_S3_PREFIX}\", \"{IMAGES_DIR}\")"
logger.info(f"going to run the following as script -> \"{cmd}\"")
    
ret: int = subprocess.check_call([sys.executable, "-c", cmd])
logger.info(f"{fn_name} returned with exit code={ret}")
# convert all the downloaded files into base64 encoding
import glob
import base64

image_file_list = glob.glob(os.path.join(IMAGES_DIR, "*.*"))
logger.info(f"there are {len(image_file_list)} in {IMAGES_DIR}")

[2023-11-30 16:39:40,491] p21 {<timed exec>:7} INFO - going to run the following as script -> "from download_images import download_images; download_images(10000, "aob_en_US.csv", "amazon-berkeley-objects", "images/original", "data/images/en_US")"
[2023-11-30 16:41:02,931] p21 {<timed exec>:10} INFO - download_images returned with exit code=0
[2023-11-30 16:41:03,059] p21 {<timed exec>:16} INFO - there are 1489 in data/images/en_US


CPU times: user 16.2 ms, sys: 801 µs, total: 17 ms
Wall time: 1min 22s


In [17]:
def resize_image_if_needed(image_file_path: str) -> None:
    """Shrink an image in place when its pixel count exceeds the configured maximum.

    The image is rescaled only if width * height is larger than
    MAX_IMAGE_HEIGHT * MAX_IMAGE_WIDTH. In that case it is reduced with
    ``Image.thumbnail`` (which keeps the aspect ratio) and saved back over the
    original file.

    Args:
        image_file_path: Path of the image file to check and, if needed, overwrite.
    """
    # Image.open is lazy and keeps the file handle open; the context manager
    # guarantees it is released even for images that need no resizing.
    with Image.open(image_file_path) as image:
        width, height = image.size  # Pillow reports size as (width, height)
        if (width * height) > (MAX_IMAGE_HEIGHT * MAX_IMAGE_WIDTH):
            logger.info(f"{image_file_path} has dimensions {image.size} which is larger than {MAX_IMAGE_HEIGHT}x{MAX_IMAGE_WIDTH} combined, will scale this image")
            # thumbnail() takes (width, height), in that order
            image.thumbnail((MAX_IMAGE_WIDTH, MAX_IMAGE_HEIGHT))
            image.save(image_file_path)

def encode_image_to_base64(image_file_path: str) -> None:
    """Write a base64-encoded copy of a file into B64_ENCODED_IMAGES_DIR.

    The output file is named ``<original file name>.b64`` and holds the
    base64 text of the input file's bytes.

    Args:
        image_file_path: Path of the file to encode.
    """
    # Read and encode first so the input handle is closed before writing.
    with open(image_file_path, "rb") as image_file:
        # b64encode already returns ASCII bytes, ready to be written as-is
        b64_image = base64.b64encode(image_file.read())

    # Make sure the destination exists; a missing directory would otherwise
    # fail the open() below with FileNotFoundError.
    if B64_ENCODED_IMAGES_DIR:
        os.makedirs(B64_ENCODED_IMAGES_DIR, exist_ok=True)

    b64_image_path = os.path.join(B64_ENCODED_IMAGES_DIR, f"{os.path.basename(image_file_path)}.b64")
    with open(b64_image_path, "wb") as b64_image_file:
        b64_image_file.write(b64_image)

# Resize every oversized image first, then base64-encode all images, so the
# encoded copies reflect the resized files. Plain loops are used because both
# helpers are called only for their side effects; this avoids building
# throwaway lists and rebinding `_`, which IPython uses for the last output.
for image_path in image_file_list:
    resize_image_if_needed(image_path)

for image_path in image_file_list:
    encode_image_to_base64(image_path)




[2023-11-30 16:41:36,340] p21 {1841016947.py:4} INFO - data/images/en_US/d1583fae.jpg has dimensions (2560, 2560) which is larger than 2048x2048 combined, will scale this image
[2023-11-30 16:41:36,615] p21 {1841016947.py:4} INFO - data/images/en_US/f2ca58fb.jpg has dimensions (2560, 2560) which is larger than 2048x2048 combined, will scale this image
[2023-11-30 16:41:36,902] p21 {1841016947.py:4} INFO - data/images/en_US/06935e1d.jpg has dimensions (2560, 2560) which is larger than 2048x2048 combined, will scale this image
[2023-11-30 16:41:37,235] p21 {1841016947.py:4} INFO - data/images/en_US/66c1272f.jpg has dimensions (2560, 1654) which is larger than 2048x2048 combined, will scale this image
[2023-11-30 16:41:37,457] p21 {1841016947.py:4} INFO - data/images/en_US/7df4ebd1.jpg has dimensions (2560, 1714) which is larger than 2048x2048 combined, will scale this image
[2023-11-30 16:41:37,651] p21 {1841016947.py:4} INFO - data/images/en_US/2a1a61ad.jpg has dimensions (2560, 2108) w