# Raw data processing

* ### Unzip the file

In [1]:
import os
import dill
import pandas as pd
import numpy as np

In [2]:
# Extract the raw dataset archive, unless the extracted folder is already there.
from zipfile import ZipFile

dataset_already_extracted = os.path.exists('raw_data/dataset-original')
if not dataset_already_extracted:
    with ZipFile('raw_data/dataset-original.zip', 'r') as archive:
        archive.extractall('raw_data')

In [3]:
# Locate the extracted dataset under the current working directory and list
# its top-level entries (the per-class folders, plus any stray files).
cwd = os.getcwd()
dataset_relpath = 'raw_data/dataset-original'
origin_imgdir = os.path.join(cwd, dataset_relpath)
subfolders = os.listdir(origin_imgdir)

In [4]:
# Build the full path of every class folder. Entries whose name contains a
# dot are skipped (presumably stray files rather than class folders).
origin_subdirs = []
for entry in subfolders:
    if "." in entry:
        continue
    origin_subdirs.append(os.path.join(origin_imgdir, entry))
origin_subdirs

['/Users/loaner/Documents/github/trash-classifier/data/raw_data/dataset-original/paper',
 '/Users/loaner/Documents/github/trash-classifier/data/raw_data/dataset-original/metal',
 '/Users/loaner/Documents/github/trash-classifier/data/raw_data/dataset-original/cardboard',
 '/Users/loaner/Documents/github/trash-classifier/data/raw_data/dataset-original/trash',
 '/Users/loaner/Documents/github/trash-classifier/data/raw_data/dataset-original/glass',
 '/Users/loaner/Documents/github/trash-classifier/data/raw_data/dataset-original/plastic']

In [5]:
# Count how many images each class folder contains.
# Every suffix carries its leading dot (the previous "png" entry matched any
# name that merely ended in those letters) and names are lower-cased first so
# that e.g. ".JPG" files are counted as well.
image_exts = (".jpg", ".jpeg", ".png")

cat_counts = {}
for cat in os.listdir(origin_imgdir):
    # entries with a dot in their name are not treated as class folders
    if "." in cat:
        continue
    class_dir = os.path.join(origin_imgdir, cat)
    cat_counts[cat] = sum(
        1 for img in os.listdir(class_dir) if img.lower().endswith(image_exts)
    )

cat_counts

{'paper': 594,
 'metal': 410,
 'cardboard': 403,
 'trash': 137,
 'glass': 501,
 'plastic': 482}

* ### Resize photos and resave

In [6]:
# Output shape used when the images are resized: (new_size_l, new_size_w).
new_size_l = 384
new_size_w = 512

# make folder for resized images, with one sub-folder per class
resize_imgdir = os.path.join(cwd, 'raw_data/resized')
# NOTE: the spelling of `reseized_subdirs` is kept as-is because the resize
# step looks the destination folders up under this name.
reseized_subdirs = []

# makedirs(exist_ok=True) replaces the separate "exists?" check followed by
# mkdir: it is safe to re-run and also creates missing parent folders.
os.makedirs(resize_imgdir, exist_ok=True)
for s in subfolders:
    # same filter as for the source folders, so both lists stay index-aligned
    if "." in s:
        continue
    resize_subfolder = os.path.join(resize_imgdir, s)
    os.makedirs(resize_subfolder, exist_ok=True)
    reseized_subdirs.append(resize_subfolder)
reseized_subdirs

['/Users/loaner/Documents/github/trash-classifier/data/raw_data/resized/paper',
 '/Users/loaner/Documents/github/trash-classifier/data/raw_data/resized/metal',
 '/Users/loaner/Documents/github/trash-classifier/data/raw_data/resized/cardboard',
 '/Users/loaner/Documents/github/trash-classifier/data/raw_data/resized/trash',
 '/Users/loaner/Documents/github/trash-classifier/data/raw_data/resized/glass',
 '/Users/loaner/Documents/github/trash-classifier/data/raw_data/resized/plastic']

In [22]:
from skimage.io import imread, imsave
from skimage.transform import resize

def resize_save(i):
    """Resize every image of one class folder and save it to the output folder.

    Parameters
    ----------
    i : int
        Index into ``origin_subdirs`` (source folder) and ``reseized_subdirs``
        (destination folder). Both lists are expected to be index-aligned.

    Returns
    -------
    None
        The resized images are written to the destination folder under their
        original file names.
    """
    # Local import so this function does not depend on an extra import cell.
    from skimage import img_as_ubyte

    original_path = origin_subdirs[i]
    dest_path = reseized_subdirs[i]
    # Suffixes include the dot so that only real extensions match; names are
    # lower-cased before the check so that e.g. ".JPG" is accepted too.
    image_exts = ('.jpg', '.jpeg', '.png')

    for file in os.listdir(original_path):
        if not file.lower().endswith(image_exts):
            continue
        img = imread(os.path.join(original_path, file))

        # rotate all pictures to same orientation (taller than wide -> rotate)
        height, width = img.shape[:2]
        if height > width:
            img = np.rot90(img)

        # resizing
        resized_img = resize(img, (new_size_l, new_size_w))
        # resize() returns a float image; convert it back to 8-bit explicitly
        # instead of relying on imsave's implicit (lossy, warning-prone)
        # float -> uint8 conversion.
        imsave(os.path.join(dest_path, file), img_as_ubyte(resized_img))

In [23]:
# multiprocessing image resize: each class folder is handled by one task
from multiprocessing import Pool

# Never start more workers than there are folders or CPUs (and at least one).
n_workers = max(1, min(len(origin_subdirs), os.cpu_count() or 1))

# The context manager shuts the pool down once map() has returned, so no idle
# worker processes are left behind after this cell has run.
with Pool(processes=n_workers) as p:
    resize_results = p.map(resize_save, range(len(origin_subdirs)))

resize_results











































































[None, None, None, None, None, None]

* ### Challenge

The dataset is clearly imbalanced: the "trash" category has far fewer images (137) than the other categories (403–594). Open question: how should we deal with this imbalanced data?