# Data preparation

***This notebook works well with the `Data Science 3.0 Python 3` kernel and `ml.t3.medium` instance type.***

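This notebook downloads the listings archive and the image metadata, filters the listings to the language of interest, builds a dataset of image ids and descriptions, and then downloads, resizes and base64-encodes the images.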

In [None]:
import sys
# Install the packages listed in requirements.txt by running pip as a module of the
# interpreter that backs this kernel (sys.executable), rather than whichever `pip`
# happens to be first on the shell PATH.
!{sys.executable} -m pip install -r requirements.txt

In [None]:
import os
import json
import boto3
import logging
import pandas as pd
from PIL import Image
from globals import *
from typing import List
from download_images import download_images

# Log line layout: timestamp, process id, {source file:line number}, level, message.
LOG_FORMAT = '[%(asctime)s] p%(process)s {%(filename)s:%(lineno)d} %(levelname)s - %(message)s'

# Configure the root logger once at INFO level, then take a logger named after this module.
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)
logger = logging.getLogger(__name__)

In [None]:
# global constants
# Print globals.py (syntax-highlighted by pygmentize) so the names brought in by
# `from globals import *` can be inspected directly in the notebook.
!pygmentize globals.py

In [None]:
# Remove artifacts left by a previous run so the download and extraction cells below
# start from scratch: the extracted `listings` directory, the listings archive, and the
# image metadata file in both its compressed and decompressed form.
!rm -rf listings
!rm -f abo-listings.tar
!rm -f images.csv.gz images.csv


In [None]:
# Download the listings archive over HTTPS into the current working directory.
!wget https://amazon-berkeley-objects.s3.us-east-1.amazonaws.com/archives/abo-listings.tar


In [None]:
# Extract the archive into the current directory; `v` prints each extracted path.
!tar xvf abo-listings.tar


In [None]:
# Decompress one listings file; `gzip -d` replaces the .gz file with
# listings/metadata/listings_0.json.
# NOTE(review): presumably this is the file LISTINGS_FILE points to -- confirm in globals.py.
!gzip -d listings/metadata/listings_0.json.gz


In [None]:
# Copy the compressed image metadata from the amazon-berkeley-objects S3 bucket into the
# current working directory.
# NOTE(review): unlike the wget download of the listings archive, this goes through the
# AWS CLI, which presumably needs usable AWS credentials in the environment -- confirm.
!aws s3 cp s3://amazon-berkeley-objects/images/metadata/images.csv.gz .

In [None]:
# Decompress the image metadata; `gzip -d` replaces images.csv.gz with images.csv.
# NOTE(review): presumably IMAGE_ID_TO_FNAME_MAPPING_FILE refers to images.csv -- confirm
# in globals.py.
!gzip -d images.csv.gz

In [None]:
# read all listing data available
# LISTINGS_FILE is parsed as JSON Lines: every non-blank line is one JSON document.
listing: List = []

# The encoding is pinned to UTF-8 (the standard encoding for JSON) because the records
# carry language-tagged text; relying on the platform default could mis-decode them.
with open(LISTINGS_FILE, 'r', encoding='utf-8') as json_file:
    # Blank lines are skipped because json.loads() raises on them.
    listing = [json.loads(line) for line in json_file if line.strip()]
logger.info(f"there are {len(listing)} listings in {LISTINGS_FILE}")

# id to file name mapping
id_to_fname_mapping = pd.read_csv(IMAGE_ID_TO_FNAME_MAPPING_FILE)
logger.info(f"id_to_fname_mapping shape={id_to_fname_mapping.shape}")


In [None]:
# filter for language of interest
# A listing is kept when at least one of its `brand` entries is tagged with
# LANGUAGE_TO_FILTER. any() stops at the first match, so a listing with several matching
# brand entries is kept exactly once instead of being appended once per matching entry.
# Listings with no `brand` (missing or None) are dropped; .get() tolerates brand entries
# that lack a `language_tag` key.
listing_filtered: List = [
    item
    for item in listing
    if any(
        brand_entry.get('language_tag') == LANGUAGE_TO_FILTER
        for brand_entry in (item.get('brand') or [])
    )
]
logger.info(f"there are {len(listing_filtered)} listings for {LANGUAGE_TO_FILTER} in {LISTINGS_FILE}")


In [None]:
# create a dataset of images and descriptions
# One record per filtered listing that has a main image: the image id plus a description
# made by joining the listing's bullet points in the language of interest with ". ".
image_data_list: List = []
for item in listing_filtered:
    main_image_id = item.get('main_image_id')
    if main_image_id is None:
        # no main image to attach the description to
        continue
    # Bullet points without a value are left out: str.join() raises TypeError on None.
    tags: List = [
        bullet['value']
        for bullet in (item.get('bullet_point') or [])
        if bullet.get('language_tag') == LANGUAGE_TO_FILTER and bullet.get('value') is not None
    ]
    description = ". ".join(tags)
    image_data_list.append(dict(image_id=main_image_id, description=description))

# create a dataframe so that we can join with the image path data
# The columns are named explicitly so that an empty record list still yields a frame with
# an `image_id` column to merge on. The left join keeps every record; rows whose image_id
# is not in the mapping get NaN in the mapping's columns.
image_data = pd.merge(left=pd.DataFrame(image_data_list, columns=["image_id", "description"]),
                      right=id_to_fname_mapping,
                      on="image_id",
                      how="left")
# image_data.path = image_data.path.map(lambda x: f"{ABO_S3_BUCKET_PREFIX}/{x}")
image_data.to_csv(IMAGE_DATASET_FNAME, index=False)


In [None]:
%%time
import sys
import subprocess

# Run download_images(N, IMAGE_DATASET_FNAME, ABO_S3_BUCKET, ABO_S3_PREFIX, IMAGES_DIR)
# in a separate Python process started with the kernel's own interpreter.
# NOTE(review): download_images is also imported at the top of the notebook; running it in
# a child interpreter is presumably deliberate -- confirm.
module_name: str = "download_images"
fn_name: str = "download_images"

# The string arguments are embedded as Python literals via repr(), so a quote or backslash
# in any of them cannot break (or inject into) the generated one-line script.
quoted_args: str = ", ".join(
    repr(str(arg)) for arg in (IMAGE_DATASET_FNAME, ABO_S3_BUCKET, ABO_S3_PREFIX, IMAGES_DIR)
)
cmd = f"from {module_name} import {fn_name}; {fn_name}({N}, {quoted_args})"
logger.info(f"going to run the following as script -> \"{cmd}\"")

# check_call raises CalledProcessError on a non-zero exit status, so reaching the next
# line means the child process exited with 0.
ret: int = subprocess.check_call([sys.executable, "-c", cmd])
logger.info(f"{fn_name} returned with exit code={ret}")
# list the downloaded image files; they are resized and base64-encoded in the next cell
import glob
import base64

# Collect every path directly under IMAGES_DIR that matches the "*.*" pattern
# (i.e. names containing a dot) and report how many were found.
image_glob_pattern = os.path.join(IMAGES_DIR, "*.*")
image_file_list = glob.glob(image_glob_pattern)
logger.info(f"there are {len(image_file_list)} in {IMAGES_DIR}")

In [None]:
def resize_image_if_needed(image_file_path: str):
    """Shrink an image file in place when its pixel count exceeds the configured maximum.

    The image is considered too large when width * height is greater than
    MAX_IMAGE_WIDTH * MAX_IMAGE_HEIGHT. In that case it is scaled down with
    ``Image.thumbnail`` (which preserves the aspect ratio) and written back to the same
    path. Images at or below the limit are left untouched.

    Args:
        image_file_path: path of the image file to check and, if needed, overwrite.
    """
    # The context manager closes the underlying file handle even when decoding or saving
    # fails.
    with Image.open(image_file_path) as image:
        width, height = image.size
        if (width * height) <= (MAX_IMAGE_WIDTH * MAX_IMAGE_HEIGHT):
            return
        logger.info(f"{image_file_path} has dimensions {image.size} which is larger than {MAX_IMAGE_HEIGHT}x{MAX_IMAGE_WIDTH} combined, will scale this image")
        # Pillow sizes are (width, height); passing (height, width) would apply each limit
        # to the wrong axis whenever the two maxima differ.
        image.thumbnail((MAX_IMAGE_WIDTH, MAX_IMAGE_HEIGHT))
        image.save(image_file_path)

def encode_image_to_base64(image_file_path: str, output_dir: str = None):
    """Write the base64 encoding of a file to ``<output_dir>/<file name>.b64``.

    Args:
        image_file_path: path of the file to encode; it is read as raw bytes.
        output_dir: directory that receives the ``.b64`` file. Defaults to
            B64_ENCODED_IMAGES_DIR when omitted. The directory is created if it does
            not exist yet.
    """
    if output_dir is None:
        output_dir = B64_ENCODED_IMAGES_DIR
    # Without this the open() below fails with FileNotFoundError on a fresh checkout.
    # An empty string means "current directory", which needs no creation.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(image_file_path, "rb") as image_file:
        # b64encode already returns ASCII bytes, so they are written out as-is.
        b64_image = base64.b64encode(image_file.read())

    b64_image_path = os.path.join(output_dir, f"{os.path.basename(image_file_path)}.b64")
    with open(b64_image_path, "wb") as b64_image_file:
        b64_image_file.write(b64_image)

# Two full passes, in this order: every image is resized (when oversized) before any image
# is encoded, so the .b64 files are produced from the resized files.
_ = [resize_image_if_needed(image_path) for image_path in image_file_list]
_ = [encode_image_to_base64(image_path) for image_path in image_file_list]


