# Split the dataset into train and test sets

In [1]:
import os
import numpy as np
from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import train_test_split
from keras.preprocessing.image import ImageDataGenerator, img_to_array, array_to_img, load_img, save_img

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Seed for the NumPy RandomState that drives the train/test split in
# FilterDataset.spliting_data, so repeated runs produce the same split.
seed_value = 1000

class FilterDataset:
    """Split a per-label image dataset into train and test folders.

    The source images are read from
    ``<root_path>/split-test-plates-crop-original/fine/char-10/<label>/``
    and the split is written to
    ``<root_path>/split-test-plates-crop-original/fine/char-split-10/``
    under ``train/<label>/`` and ``test/<label>/``.

    Typical use is ``loading_data()``, then ``spliting_data()``, then
    ``saving_data()``.
    """

    def __init__(self, root_path="../../dataset"):
        """Resolve the input/output paths and create the output folders.

        Args:
            root_path: Directory that contains the
                ``split-test-plates-crop-original`` folder. Defaults to the
                location the original script used.
        """
        path_type = os.path.join(root_path, "split-test-plates-crop-original")
        path = os.path.join(path_type, "fine")
        self.__path_char = os.path.join(path, "char-10")

        path_split = os.path.join(path, "char-split-10")
        path_train = os.path.join(path_split, "train")
        path_test = os.path.join(path_split, "test")

        # Order matters: saving_data zips this list with
        # [train_group, test_group] produced by spliting_data.
        self.__path_save = [path_train, path_test]

        # Start empty so calling the pipeline steps out of order is a no-op
        # instead of an AttributeError on a name-mangled attribute.
        self.__label_images = []
        self.__splited_characters = [[], []]

        # makedirs also creates path_split (and any missing parents) and is
        # safe to call when the folders already exist.
        for path_g in self.__path_save:
            os.makedirs(path_g, exist_ok=True)

    def loading_data(self):
        """Load every image of every label folder as a grayscale array.

        Labels are the sub-directory names of the source folder. Both the
        labels and the image file names are processed in sorted order so
        that the seeded split in ``spliting_data`` is reproducible
        regardless of the order ``os.listdir`` happens to return.
        """
        print('[info] Loading data...')
        path = self.__path_char

        # Only directories are labels; stray files next to them are ignored.
        label_sorted = sorted(
            name for name in os.listdir(path)
            if os.path.isdir(os.path.join(path, name))
        )

        label_images = []
        for label in tqdm(label_sorted):
            path_label = os.path.join(path, label)
            images = []
            for img_name in sorted(os.listdir(path_label)):
                img_path = os.path.join(path_label, img_name)
                if not os.path.isfile(img_path):
                    continue
                img = load_img(img_path, color_mode='grayscale')
                images.append(img_to_array(img))
            label_images.append([label, images])
        self.__label_images = label_images

    def spliting_data(self, test_size=0.2, seed=None):
        """Split the loaded images of each label into train and test parts.

        Args:
            test_size: Forwarded to ``train_test_split``. Defaults to 0.2.
            seed: Seed for the shared ``numpy.random.RandomState``. When
                ``None`` the module-level ``seed_value`` is used.

        A label with fewer than two images cannot be split; all of its
        images are kept in the train part and a warning is printed.
        """
        print('[info] Spliting...')
        if seed is None:
            seed = seed_value
        # One RandomState shared by all labels: each label gets a different
        # shuffle, yet the overall sequence is reproducible.
        rand_state = np.random.RandomState(seed)

        train_group = []
        test_group = []
        for label, image_group in self.__label_images:
            if len(image_group) < 2:
                print('[warn] label %r has %d image(s); kept in train only'
                      % (label, len(image_group)))
                train_images, test_images = list(image_group), []
            else:
                train_images, test_images = train_test_split(
                    image_group, test_size=test_size, random_state=rand_state)
            train_group.append([label, train_images])
            test_group.append([label, test_images])
        print('character splited')
        self.__splited_characters = [train_group, test_group]

    def saving_data(self):
        """Write the split images to ``train/<label>/`` and ``test/<label>/``.

        Images are numbered from 1 inside each label folder and saved as
        ``<n>.jpg``. Existing files with the same name are overwritten.
        """
        print('[info] saving...')
        for saved_path, splited_char in zip(self.__path_save, self.__splited_characters):
            for label, images in splited_char:
                path = os.path.join(saved_path, label)
                os.makedirs(path, exist_ok=True)

                for name, image in enumerate(images, start=1):
                    save_img(os.path.join(path, str(name) + ".jpg"), image)
        print('saved')
                
# Script entry point: run the full pipeline (load -> split -> save).
if __name__ == '__main__':
    dataset = FilterDataset()
    pipeline = (
        dataset.loading_data,
        dataset.spliting_data,
        dataset.saving_data,
    )
    for step in pipeline:
        step()

[info] Loading data...


HBox(children=(IntProgress(value=0, max=36), HTML(value='')))


[info] Spliting...
character splited
[info] saving...
saved
