# Prepare data for augmentation

Create the following directory structure:

data/
    train/
        apple/
            apple/
                apple_0.png
                apple_1.png
                ...
        banana/
            banana/
                banana_0.png
                banana_1.png
                ...
    validation/
        apple/
            apple/
                apple_0.png
                apple_1.png
                ...
        banana/
            banana/
                banana_0.png
                banana_1.png


In [None]:
import glob
import os
import shutil

In [None]:
# Group the raw image paths of one set by class label.
# The label is the part of the file name before the first underscore,
# e.g. "apple_0.png" -> "apple".

data_dir = ".\\data"
set_name = "train"
data_paths = glob.glob(os.path.join(data_dir, set_name, '*.png'))

data_path_dict = {}

for data_path in data_paths:
    fname = os.path.basename(data_path)
    label = fname.partition("_")[0]
    # setdefault creates the per-label list on first sight of a label.
    data_path_dict.setdefault(label, []).append(data_path)

In [None]:
# Keep the labels found while grouping (a view of the dictionary's keys)
# so the following cells can iterate over them.
class_labels = data_path_dict.keys()
print(class_labels)

In [None]:
# Move every image into its class folder <data_dir>/<set_name>/<label>/<label>/.
# The label level is doubled so that <data_dir>/<set_name>/<label>/, which is
# later passed to flow_from_directory, contains exactly one sub-directory.
for key, class_paths in data_path_dict.items():
    target_path = os.path.join(data_dir, set_name, key, key)
    # Create the folder once per class; exist_ok removes the separate
    # exists() check and makes re-running the cell harmless.
    os.makedirs(target_path, exist_ok=True)
    for data_path in class_paths:
        fname = os.path.basename(data_path)
        shutil.move(data_path, os.path.join(target_path, fname))

# Data augmentation

All images are resized to (150, 150).
All augmented images are stored in the folder `.\data\train_aug`.

In [None]:
from keras.preprocessing.image import ImageDataGenerator

In [None]:
target_path = os.path.join(data_dir, "train_aug")

# os.mkdir raises FileExistsError when the folder is already present, so a
# second run cannot silently pile more files onto an earlier run's output.
os.mkdir(target_path)

# The augmentation settings are identical for every class, so the generator
# is configured once instead of once per loop iteration.
datagen = ImageDataGenerator(
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    rescale=1./255,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest')

# Number of batches drawn per class.
# NOTE(review): 11 preserves the previous behaviour (the old counter loop
# fetched a batch before testing count == 10); confirm 10 was not intended.
num_batches = 11

for key in class_labels:
    # Each class lives in <data_dir>/<set_name>/<key>/<key>/, so <key>/ is the
    # root directory handed to flow_from_directory.
    source_path = os.path.join(data_dir, set_name, key)

    datagen_iter = datagen.flow_from_directory(
        source_path,
        target_size=(150, 150),
        batch_size=100,
        class_mode=None,
        shuffle=False,
        save_to_dir=target_path,
        save_prefix=key)

    # Drawing a batch is what writes the augmented images to save_to_dir;
    # the returned arrays themselves are not needed here.
    for count in range(num_batches):
        next(datagen_iter)
        

In [None]:
# Tally the augmented samples per class; the class is the part of the
# file name before the first underscore.
data_paths = glob.glob(os.path.join(data_dir, "train_aug", '*.png'))

class_count_dict = {}

for data_path in data_paths:
    fname = os.path.basename(data_path)
    label = fname.partition("_")[0]
    # get(..., 0) starts a label at zero the first time it is seen.
    class_count_dict[label] = class_count_dict.get(label, 0) + 1

In [None]:
# Print one left-aligned row per class with its sample count.
row_template = "Class: {:<15} sample counts: {:<15}"
for name, count in class_count_dict.items():
    print(row_template.format(name, count))

# Transform images to feature and label vectors

In [None]:
import _pickle as pickle
import glob
import os
import numpy as np
import cv2

In [None]:
class data_manager(object):
    """Load one image set from disk into feature and one-hot label arrays.

    Images are read from ``<data_dir>/<set_name>/*.png``. The class of an
    image is the part of its file name before the first underscore; files
    whose prefix is not in ``class_labels`` are ignored.

    After construction:
        X: the resized images stacked along a new first axis.
        Y: one one-hot row per image, in the same order as ``X``.
    """

    def __init__(self, data_dir, class_labels, image_size, set_name):
        """Store the configuration and immediately load the set.

        Args:
            data_dir: Root data directory.
            class_labels: Iterable of class names; the position of a name
                is its index in the one-hot vector.
            image_size: Edge length images are resized to (square output).
            set_name: Sub-directory of ``data_dir`` holding the ``.png`` files.

        Raises:
            ValueError: If no image with a known label is found.
            IOError: If a matching image file cannot be read.
        """
        self.data_dir = data_dir
        # Materialise as a list so any iterable (e.g. dict keys) supports
        # the .index() lookup used by compute_label.
        self.class_labels = list(class_labels)
        self.num_class = len(self.class_labels)
        self.image_size = image_size
        self.set_name = set_name
        self.load_set()

    def compute_label(self, label):
        """Return the one-hot vector (length ``num_class``) for ``label``.

        Raises:
            ValueError: If ``label`` is not one of ``class_labels``.
        """
        one_hot = np.zeros(self.num_class)
        one_hot[self.class_labels.index(label)] = 1.0
        return one_hot

    def compute_feature(self, image):
        """Resize ``image`` to ``(image_size, image_size)``.

        Pixel values are returned as produced by ``cv2.resize``; no
        rescaling or normalisation is applied here.
        """
        return cv2.resize(image, (self.image_size, self.image_size))

    def load_set(self):
        """Read, shuffle and vectorise the set into ``self.X`` and ``self.Y``.

        Raises:
            ValueError: If no image with a known label is found.
            IOError: If a matching image file cannot be read.
        """
        pattern = os.path.join(self.data_dir, self.set_name, '*.png')
        # glob order is filesystem dependent; sorting first makes the shuffle
        # reproducible when the NumPy random seed is fixed.
        data_paths = sorted(glob.glob(pattern))

        known_labels = set(self.class_labels)
        features = []
        labels = []

        for i in np.random.permutation(len(data_paths)):
            data_path = data_paths[i]
            img_label = os.path.basename(data_path).split("_")[0]
            if img_label not in known_labels:
                continue

            img = cv2.imread(data_path)
            # cv2.imread signals failure by returning None rather than
            # raising; fail here with the offending path instead of an
            # obscure error inside cv2.resize.
            if img is None:
                raise IOError("Could not read image: {!r}".format(data_path))

            features.append(self.compute_feature(img))
            labels.append(self.compute_label(img_label))

        if not features:
            # Same exception type np.concatenate raised on an empty list,
            # but with a message that points at the search pattern.
            raise ValueError(
                "No images with a known class label matched {!r}".format(pattern))

        self.X = np.stack(features)
        self.Y = np.stack(labels)
        

In [None]:
# Dataset root, the classes to load (a label's position in this list is its
# one-hot index), and the edge length images are resized to.
data_dir = ".\\data"
CLASS_LABELS = [
    'apple', 'banana', 'nectarine', 'plum', 'peach',
    'watermelon', 'pear', 'mango', 'grape', 'orange',
    'strawberry', 'pineapple', 'radish', 'carrot', 'potato',
    'tomato', 'bellpepper', 'broccoli', 'cabbage', 'cauliflower',
    'celery', 'eggplant', 'garlic', 'spinach', 'ginger',
]
image_size = 90

In [None]:
# Vectorise the augmented training images and report the array shapes
# (features first, then labels).
dm_train = data_manager(data_dir, CLASS_LABELS, image_size, 'train_aug')
for train_array in (dm_train.X, dm_train.Y):
    print(train_array.shape)

In [None]:
# Persist the training features and labels as .npy files.
for out_path, train_array in (
    (".\\data\\X_train.npy", dm_train.X),
    (".\\data\\Y_train.npy", dm_train.Y),
):
    np.save(out_path, train_array)

In [None]:
# Vectorise the held-out images and report the array shapes
# (features first, then labels).
# NOTE(review): this reads <data_dir>/val/*.png - confirm that folder name,
# since the layout notes at the top of the file call the split `validation/`.
dm_test = data_manager(data_dir, CLASS_LABELS, image_size, 'val')
for test_array in (dm_test.X, dm_test.Y):
    print(test_array.shape)

In [None]:
# Persist the test features and labels as .npy files.
for out_path, test_array in (
    (".\\data\\X_test.npy", dm_test.X),
    (".\\data\\Y_test.npy", dm_test.Y),
):
    np.save(out_path, test_array)