In [9]:
import os
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

# Load Data
In this section, we will load the data into memory. Several intermediate data structures are kept here for later reuse.

## Image Classification Data Load
The following code will load data from the directory structure below:

 \\- A
 
 \\- B
 
 \\- C
 
 Here, the folder names (A, B, C) are the class labels, and each folder contains the image files for that class.

In [14]:
from scipy import ndimage
def load_file(path):
    """Read one image file and return its pixels as a float array.

    Parameters
    ----------
    path : str
        Location of the image file on disk.

    Returns
    -------
    numpy.ndarray
        The decoded image, cast to ``float``.

    NOTE(review): ``scipy.ndimage.imread`` was deprecated in SciPy 1.0 and
    removed in SciPy 1.2, so this call needs an older SciPy -- confirm the
    environment pin before re-running the notebook.
    """
    pixels = ndimage.imread(path)
    return pixels.astype(float)
def load_files(folder_path):
    """Load every readable image in ``folder_path`` into a single array.

    Files that ``load_file`` cannot read are skipped (best effort), but the
    skip is no longer silent: one summary warning reports how many files were
    dropped and what the first failure was.

    Parameters
    ----------
    folder_path : str
        Directory whose entries are passed to ``load_file`` one by one.

    Returns
    -------
    numpy.ndarray
        ``np.array`` built from the successfully loaded images, in sorted
        file-name order.

    Raises
    ------
    OSError
        If ``folder_path`` cannot be listed.
    """
    import warnings  # local import keeps this cell self-contained

    images = []
    skipped = []
    # Sorted so the row order does not depend on the file system's listing order.
    file_names = sorted(os.listdir(folder_path))
    for file_name in file_names:
        file_path = os.path.join(folder_path, file_name)
        try:
            images.append(load_file(file_path))
        except Exception as error:
            # `Exception` (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate; the failure is recorded instead of hidden.
            skipped.append((file_path, error))
    if skipped:
        first_path, first_error = skipped[0]
        warnings.warn(
            "skipped %d of %d files in %s; first failure: %s (%r)"
            % (len(skipped), len(file_names), folder_path, first_path, first_error),
            RuntimeWarning,
            stacklevel=2,
        )
    return np.array(images)
def load_classification_data(data_path):
    """Load a labelled image data set from a one-folder-per-class layout.

    Every sub-directory of ``data_path`` is treated as one class: its name is
    the label and its contents are loaded with ``load_files``.

    Parameters
    ----------
    data_path : str
        Root directory containing one sub-directory per label.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The images of all classes concatenated along axis 0, and a 1-D array
        holding the label string for each image (same length).

    Raises
    ------
    ValueError
        If no class folder yields any image.
    """
    import warnings  # local import keeps this cell self-contained

    arr = []
    arr_label = []
    # Sorted so the class order is reproducible across file systems.
    for label in sorted(os.listdir(data_path)):
        label_path = os.path.join(data_path, label)
        if not os.path.isdir(label_path):
            # Stray files (e.g. ".DS_Store") are not classes.
            continue
        print(label_path)
        try:
            images = load_files(label_path)
        except OSError as error:
            # Skip an unreadable folder outright. The original fell through
            # and re-appended the previous class's images under this label.
            warnings.warn(
                "could not read class folder %s: %r" % (label_path, error),
                RuntimeWarning,
                stacklevel=2,
            )
            continue
        count = len(images)
        if count == 0:
            # An empty array has a different rank and would break concatenate.
            continue
        arr.append(images)
        # One label string per image; works for labels of any length, unlike
        # `list(label * count)`, which splits the text into characters.
        arr_label.append(np.array([label] * count))
    if not arr:
        raise ValueError("no images found under %r" % (data_path,))
    result_data = np.concatenate(arr, axis=0)
    result_label = np.concatenate(arr_label, axis=0)
    return result_data, result_label

In [34]:
# Root folder holding one sub-folder per class. Set the NOTMNIST_DIR
# environment variable to point somewhere else without editing the notebook;
# the default keeps the original location.
DATA_DIR = os.environ.get(
    "NOTMNIST_DIR", "/Users/andyxie/workspace/data/notMNIST_small/"
)
data, label = load_classification_data(DATA_DIR)

/Users/andyxie/workspace/data/notMNIST_small/A
/Users/andyxie/workspace/data/notMNIST_small/B
/Users/andyxie/workspace/data/notMNIST_small/C
/Users/andyxie/workspace/data/notMNIST_small/D
/Users/andyxie/workspace/data/notMNIST_small/E
/Users/andyxie/workspace/data/notMNIST_small/F
/Users/andyxie/workspace/data/notMNIST_small/G
/Users/andyxie/workspace/data/notMNIST_small/H
/Users/andyxie/workspace/data/notMNIST_small/I
/Users/andyxie/workspace/data/notMNIST_small/J


In [19]:
# Show the element type of the loaded array.
data.dtype

dtype('float64')

# Casting

In [35]:
# Cast the float64 array (see the dtype check above) down to float32.
data = data.astype(np.float32)

# Normalization
In this section, we will normalize the data. This step is crucial because it speeds up learning.

In [46]:
# Centre on the mid-point of the pixel range and divide by the half-range,
# so a value of 0 maps to -1 and a value of pixel_depth maps to +1.
pixel_depth = 255
half_depth = pixel_depth/2
data_1 = (data - half_depth)/half_depth
# Display the extremes as a sanity check on the rescaling.
data_1.max(), data_1.min()

(1.0, -1.0)

# One Hot

In [47]:
import pandas as pd
# One indicator column per distinct label. The dtype is pinned to uint8
# because pandas >= 2.0 returns bool dummies by default, which would change
# the encoded array shown below.
label_1 = pd.get_dummies(label, dtype=np.uint8).values
label_1[0]

array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=uint8)

# Shuffle Data

In [49]:
from sklearn.utils import shuffle
# Fixed seed so a fresh "Restart & Run All" reproduces the same ordering.
SHUFFLE_SEED = 0
# Images and labels are permuted together, so each row keeps its label.
data_2, label_2 = shuffle(data_1, label_1, random_state=SHUFFLE_SEED)

# Separate train, test, validation data

In [52]:
from sklearn.model_selection import train_test_split
data_final = data_2
label_final = label_2
# Fixed seed so the three subsets are identical on every re-run.
SPLIT_SEED = 0
# First hold out 10% of all samples as the test set ...
X_temp, X_test, y_temp, y_test = train_test_split(
    data_final, label_final, test_size=0.1, random_state=SPLIT_SEED)
# ... then take 33% of the remaining 90% for validation
# (about 30% of all samples), leaving about 60% for training.
X_train, X_validate, y_train, y_validate = train_test_split(
    X_temp, y_temp, test_size=0.33, random_state=SPLIT_SEED)

# Save Data

In [53]:
# np.save stores a single array. The six splits have different shapes, so
# they are packed explicitly into a 1-D object array. Passing the raw tuple
# relies on implicit ragged-array creation, which NumPy >= 1.24 rejects with
# a ValueError.
splits = (X_train, y_train, X_validate, y_validate, X_test, y_test)
bundle = np.empty(len(splits), dtype=object)
for position, split in enumerate(splits):
    # Element-wise assignment stores each array as-is, with no broadcasting.
    bundle[position] = split
# Object arrays are pickled on disk: read back with
# np.load(path, allow_pickle=True).
np.save(
    "/Users/andyxie/workspace/input/data_all_flat.npy",
    bundle
)