# Resizing images

To make the dataset more manageable, we downsize the images to a width of 300 pixels while preserving the original aspect ratio. We also create a CSV file recording the resizing factor applied to each customer photo so that we can update the bounding box coordinates.

Resizing customer images to width 300 is likely to have a bigger impact on small items such as belts or bags. However, for the purpose of this project our scope is limited to large items such as dresses and tops, and we assume that a better-quality set of images would be provided in order to scale to more clothing categories.

In [1]:
import os
import shutil
import pandas as pd
from PIL import Image
from tqdm import tqdm_notebook as tqdm
%matplotlib inline

pd.options.mode.chained_assignment = None  #disables .loc assignment warning

## Creating customer and retrieval dataframes

Files are separated into 3 classes: retrieval, train and test. Each of these classes has a JSON file for each of the 11 clothing categories. The function below merges the JSON files of all categories within each class; train and test are then merged together as well so we can do a custom data split.

In [2]:
def format_labels(path_labels, store_images=False):
    """Merge the per-category label JSONs into a retrieval or customer dataframe.

    Parameters
    ----------
    path_labels : str
        Directory holding the label ``.json`` files. Files are selected by
        their *file name*: names containing "retrieval" form the retrieval
        set, names containing "train" or "test" form the customer set. The
        category is the text between the last "_" of the name and ".json".
    store_images : bool, default False
        ``False`` builds the retrieval set. ``True`` builds the customer set
        and additionally expands the ``bbox`` dictionary column into the
        columns ``height``, ``left``, ``top`` and ``width``.

    Returns
    -------
    pandas.DataFrame
        All selected files stacked under a fresh 0..n-1 index, with two extra
        columns: ``category`` and ``id`` (``"<product>_<category>"``, the key
        used for pair matching). For the customer set the ``bbox`` column is
        replaced by the four coordinate columns, stored as strings.

    Raises
    ------
    FileNotFoundError
        If ``path_labels`` contains no label file for the requested set.
    """
    # Match on the file name only: matching on the full path would let a
    # directory called e.g. "train_labels" turn every file into a customer file.
    # Sorting makes the row order reproducible (os.listdir order is arbitrary).
    json_names = sorted(
        name for name in os.listdir(path_labels) if name.endswith(".json")
    )
    if store_images:
        selected = [name for name in json_names if "train" in name or "test" in name]
    else:
        selected = [name for name in json_names if "retrieval" in name]
    if not selected:
        wanted = "train/test" if store_images else "retrieval"
        raise FileNotFoundError(
            "no {} label files found in {!r}".format(wanted, path_labels)
        )

    # Each file is read exactly once; its row count drives the category labels.
    frames = [pd.read_json(os.path.join(path_labels, name)) for name in selected]
    categories = [name.split("_")[-1].split(".json")[0] for name in selected]

    files_df = pd.concat(frames, ignore_index=True)
    # Rows keep file order after concat, so repeating each category once per
    # row of its file labels every row without any chained assignment.
    files_df["category"] = [
        category
        for category, frame in zip(categories, frames)
        for _ in range(len(frame))
    ]

    # Key for pair matching between customer and retrieval sets.
    files_df["id"] = files_df["product"].astype(str) + "_" + files_df["category"]

    if store_images:
        # bbox is a dictionary per row whose key order is not guaranteed, so
        # read the four coordinates by name. Values are kept as strings to
        # match the format produced by earlier versions of this function.
        bbox_keys = ["height", "left", "top", "width"]
        bboxes = pd.DataFrame(
            [[str(box[key]) for key in bbox_keys] for box in files_df["bbox"]],
            columns=bbox_keys,
            index=files_df.index,
        )
        files_df = pd.concat([files_df.drop(columns=["bbox"]), bboxes], axis=1)
    return files_df

In [3]:
# Directory containing the label JSON files, relative to this notebook.
path_labels = "../../labels"
# store_images=True selects the "train"/"test" files (customer set) and expands the bbox column.
customer_df = format_labels(path_labels, store_images=True)
# The default (store_images=False) selects the "retrieval" files.
retrieval_df = format_labels(path_labels)

In [4]:
# Distinct "photo" values of the customer set; image_resize compares the numeric
# part of each image file name against this list to decide which images to resize.
customer_list = customer_df["photo"].unique().tolist()

## Resizing photos
~ 5h runtime

In [5]:
def image_resize(dataset_path, output_path, customer_list, width=300,
                 ratios_csv="../aspect_ratios.csv"):
    """Copy every image to ``output_path``, shrinking customer photos to ``width``.

    Every readable file in ``dataset_path`` is converted to RGB and saved under
    the same name in ``output_path``. Images whose file name (the part before
    the first ".") is an integer contained in ``customer_list`` are first
    resized to ``width`` pixels wide, keeping their aspect ratio; all other
    images keep their original size.

    A CSV with the columns ``ratio`` and ``img`` is written to ``ratios_csv``.
    It holds the scaling factor applied to each resized customer photo, so the
    bounding box coordinates can be updated in a later notebook, and the string
    "corrupted" for every file that could not be opened or saved.

    Parameters
    ----------
    dataset_path : str
        Directory containing the original images.
    output_path : str
        Directory receiving the processed images; created if missing.
    customer_list : iterable of int
        Photo ids of the customer images.
    width : int, default 300
        Target width in pixels for customer photos.
    ratios_csv : str, default "../aspect_ratios.csv"
        Destination of the ratio CSV.
    """
    customer_ids = set(customer_list)  # O(1) lookups instead of a list scan per image
    # Without this, a missing output directory makes every save fail with
    # OSError and every image would be reported as "corrupted".
    os.makedirs(output_path, exist_ok=True)

    records = []  # rows of the ratio CSV, turned into a dataframe once at the end
    for img_ in tqdm(os.listdir(dataset_path)):
        # File names that are not plain integers cannot be customer photos.
        try:
            is_customer = int(img_.split(".")[0]) in customer_ids
        except ValueError:
            is_customer = False

        try:
            with Image.open(os.path.join(dataset_path, img_)) as source:
                # JPEG cannot store an alpha channel, see
                # https://stackoverflow.com/questions/48248405/cannot-write-mode-rgba-as-jpeg
                img_object = source.convert("RGB")

            width_percent = None
            if is_customer:
                width_percent = width / float(img_object.size[0])
                # max(1, ...) keeps extremely wide images from collapsing to height 0
                height_size = max(1, int(img_object.size[1] * width_percent))
                # LANCZOS is the filter formerly exposed as Image.ANTIALIAS
                img_object = img_object.resize((width, height_size), Image.LANCZOS)

            img_object.save(os.path.join(output_path, img_))
        except OSError:  # unreadable / corrupted images, or a failed write
            records.append({"ratio": "corrupted", "img": img_})
        else:
            # Record the ratio only once the resized image is safely on disk,
            # so an image never ends up with both a ratio and a "corrupted" row.
            if width_percent is not None:
                records.append({"ratio": width_percent, "img": img_})

    pd.DataFrame(records, columns=["ratio", "img"]).to_csv(ratios_csv, index=False)

In [6]:
%%time
image_resize("../../photos", "../../photos_resized", customer_list)

HBox(children=(IntProgress(value=0, max=383169), HTML(value='')))




Wall time: 1h 2min 47s


The aspect_ratios.csv file will be used in the notebook 01-data-wrangling to update the bounding box coordinates.