In [3]:
import os
import sys
import numpy as np
#import cPickle as pickle
import pickle
from natsort import natsorted
from PIL import Image
import cv2
import matplotlib.pyplot as plt

In [4]:
# --- Input locations -------------------------------------------------------
# Directory that is listed for the training .jpg files.
DIR_TRAIN_DATA_IMG = "imageTrain10"
# Presumably the test-image directory; not used in the visible cells.
DIR_TEST_DATA_IMG = "imageTest10"
# CSV read with one header row skipped; its first column is used as the sort
# key and the remaining columns are kept as the labels.
FILE_TRAIN_DATA_OUTPUT = "train10.csv"

# --- Image geometry --------------------------------------------------------
# Side of the centred square crop taken before resizing; a value <= 0
# disables cropping.
CROP_SIZE = 200
# Side length, in pixels, of the square image after resizing.
IMAGE_SIZE = 128
# Channel count used when sizing the flattened-image buffers.
IMAGE_NUM_CHANNELS = 1

# --- Labels ----------------------------------------------------------------
# Number of rows in the label batch array (label values per image).
NUM_LABELS = 37

In [5]:
def image_process(img_array):
    """Blur, invert, centre-crop, resize and flatten one image.

    Steps, in order: 17x17 Gaussian blur, bitwise inversion, a centred
    ``CROP_SIZE`` x ``CROP_SIZE`` crop (skipped when ``CROP_SIZE <= 0``),
    a resize to ``IMAGE_SIZE`` x ``IMAGE_SIZE`` if the size differs, and
    finally flattening to one dimension.

    Parameters
    ----------
    img_array : numpy.ndarray
        Image array as returned by ``cv2.imread``.

    Returns
    -------
    numpy.ndarray
        The processed pixels as a 1-D array.

    Raises
    ------
    ValueError
        If ``img_array`` is ``None``, which is what ``cv2.imread`` returns
        for a file it cannot read.
    """
    if img_array is None:
        # Fail with the exception type the loading loop catches, instead of
        # an opaque cv2.error from the blur call below.
        raise ValueError("image_process received None (unreadable image file?)")

    img_array = cv2.GaussianBlur(img_array, (17, 17), 0)
    img_array = cv2.bitwise_not(img_array)
    image = Image.fromarray(img_array)

    if CROP_SIZE > 0:
        width, height = image.size
        # Floor division keeps the crop box on integer pixel coordinates
        # (plain "/" produces floats under Python 3).
        left = (width - CROP_SIZE) // 2
        top = (height - CROP_SIZE) // 2
        image = image.crop((left, top, left + CROP_SIZE, top + CROP_SIZE))

    if image.size != (IMAGE_SIZE, IMAGE_SIZE):
        # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter.
        image = image.resize((IMAGE_SIZE, IMAGE_SIZE), Image.LANCZOS)

    return np.array(image).reshape(-1)


In [6]:
# Read the label table: comma-separated, one header row skipped, float32 values.
label_data = np.genfromtxt(
    FILE_TRAIN_DATA_OUTPUT,
    dtype=np.float32,
    delimiter=',',
    skip_header=1,
)

# Zero-column starting arrays; the loading loop appends one column per image.
train_current_batch_label = np.zeros(shape=(NUM_LABELS, 0), dtype=np.float32)
train_current_batch = np.zeros(
    shape=(IMAGE_SIZE * IMAGE_SIZE * IMAGE_NUM_CHANNELS, 0),
    dtype=np.uint8,
)

In [7]:
# Row order that sorts the label table by its first column.
sorted_idx = np.argsort(label_data[:, 0])
sorted_idx

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [8]:
# Reorder the rows by the sort index, then drop the first (key) column.
# Not idempotent: running this cell again strips another column.
rows_in_key_order = label_data[sorted_idx]
label_data = rows_in_key_order[:, 1:]

In [9]:
# Switch to one column per image so the loop can take label_data[:, i].
# Not idempotent: running this cell again transposes back.
label_data = np.transpose(label_data)

In [10]:
# Keep only the entries of the training directory whose name ends in '.jpg'.
train_names = [
    entry
    for entry in os.listdir(DIR_TRAIN_DATA_IMG)
    if entry.endswith('.jpg')
]

In [11]:
# Show the listing as returned by os.listdir; it is not in numeric order here.
train_names

['100053.jpg',
 '100078.jpg',
 '100134.jpg',
 '100123.jpg',
 '100128.jpg',
 '100008.jpg',
 '100143.jpg',
 '100090.jpg',
 '100122.jpg',
 '100023.jpg']

In [16]:
# Count the files, then put the names in natural order so that position i in
# train_names is used with column i of label_data by the loading loop.
num_trains = len(train_names)
train_names = natsorted(train_names)
train_names

['100008.jpg',
 '100023.jpg',
 '100053.jpg',
 '100078.jpg',
 '100090.jpg',
 '100122.jpg',
 '100123.jpg',
 '100128.jpg',
 '100134.jpg',
 '100143.jpg']

In [17]:
# Float32 column vector, one entry per flattened pixel, that the loading loop
# adds every processed image into.
data_mean = np.zeros(
    shape=(IMAGE_SIZE * IMAGE_SIZE * IMAGE_NUM_CHANNELS, 1),
    dtype=np.float32,
)

In [18]:
# Random visiting order for the training images (indices into train_names).
# NOTE(review): no seed is set in the visible cells, so this order -- and the
# row order of everything built from it -- changes on every run; confirm
# whether a fixed seed is wanted.
train_order = np.random.permutation(num_trains)
train_order

array([4, 8, 3, 5, 1, 2, 7, 9, 6, 0])

In [19]:
# Load and preprocess every training image in the shuffled order, pairing each
# one with the label column at the same index.  Columns are collected in lists
# and stacked once after the loop, instead of re-copying the growing arrays on
# every iteration.
image_columns = []
label_columns = []
for i in train_order:
    image_file_name = train_names[i]
    image_path = os.path.join(DIR_TRAIN_DATA_IMG, image_file_name)
    try:
        image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
        if image is None:
            # cv2.imread reports an unreadable file by returning None rather
            # than raising, so turn that into an explicit error here.
            raise ValueError("cv2.imread could not read {}".format(image_path))
        image = image_process(image)
    except ValueError as err:
        # Raise instead of sys.exit(1): the failure stops the cell with a
        # traceback that names the offending file.
        raise RuntimeError(
            "problem with train image {}".format(image_file_name)
        ) from err
    image = image.reshape(-1, 1)
    data_mean += image  # running per-pixel sum over all loaded images
    image_columns.append(image)
    label_columns.append(label_data[:, i].reshape(-1, 1))

# Append the new columns to whatever the batch arrays already hold.
train_current_batch = np.hstack([train_current_batch] + image_columns)
train_current_batch_label = np.hstack([train_current_batch_label] + label_columns)

In [20]:
# The batch holds one image per column; transpose to one flattened image per row.
train_image = np.transpose(train_current_batch)
train_image

array([[228, 222, 216, ..., 247, 247, 248],
       [251, 250, 249, ..., 250, 250, 248],
       [243, 245, 246, ..., 244, 245, 247],
       ..., 
       [251, 251, 252, ..., 251, 252, 253],
       [245, 246, 248, ..., 252, 252, 251],
       [250, 251, 251, ..., 250, 250, 250]], dtype=uint8)

In [22]:
# The label batch holds one image per column; transpose to one label row per image.
train_values = np.transpose(train_current_batch_label)

In [23]:
# Write the image matrix to disk.  The directory is created if it is missing,
# and the with-block closes the file even when pickle.dump raises.
os.makedirs("./pickles", exist_ok=True)
with open("./pickles/X.pickle", "wb") as pickle_out:
    pickle.dump(train_image, pickle_out)

In [25]:
# Write the label matrix to disk.  The directory is created if it is missing,
# and the with-block closes the file even when pickle.dump raises.
os.makedirs("./pickles", exist_ok=True)
with open("./pickles/y.pickle", "wb") as pickle_out:
    pickle.dump(train_values, pickle_out)