# Generate more data

In [58]:
import numpy as np
import time

## Read input data and generate more data
Generate more data:
- Vertical reflection
- Single pixel translations (up, down, left, right) with wrapping

This results in (1 original) + (1 reflection) + (4 orig trans) + (4 refl trans) = 10x the data.

In [59]:
# Read the raw bacon images. The context manager closes the file even when
# the read fails.
with open("../processor/data/matrix/bacon.dat", "rb") as data_bacon_file:
    bacon_data = np.fromfile(data_bacon_file, dtype=np.uint8)
# 100 images, 4 channels (R,G,B,L), 64x64 pixels
# (reshape raises ValueError if the file does not hold exactly that many bytes)
bacon_data = bacon_data.reshape((100, 4, 64, 64))

In [60]:
# Read the raw kevin images. The context manager closes the file even when
# the read fails.
with open("../processor/data/matrix/kevin.dat", "rb") as data_kevin_file:
    kevin_data = np.fromfile(data_kevin_file, dtype=np.uint8)
# 100 images, 4 channels (R,G,B,L), 64x64 pixels
# (reshape raises ValueError if the file does not hold exactly that many bytes)
kevin_data = kevin_data.reshape((100, 4, 64, 64))

In [61]:
# Read the raw "not" images. The context manager closes the file even when
# the read fails.
with open("../processor/data/matrix/not.dat", "rb") as data_not_file:
    not_data = np.fromfile(data_not_file, dtype=np.uint8)
# 200 images, 4 channels (R,G,B,L), 64x64 pixels
# (reshape raises ValueError if the file does not hold exactly that many bytes)
not_data = not_data.reshape((200, 4, 64, 64))

In [5]:
# Reflection about vertical axis
def reflect_about_vertical(data):
    """Mirror every image left-to-right (reflection about the vertical axis).

    Args:
        data: Array of images whose last axis runs along the pixels of a
            row, e.g. ``(n_images, n_channels, height, width)``.

    Returns:
        A new ``np.uint8`` array with the same shape as ``data`` in which
        every row is reversed. ``data`` itself is left untouched.
    """
    # Reversing the last axis flips every row of every channel in one step.
    # The slice is only a view, so np.array copies it into a fresh C-ordered
    # uint8 array that does not alias the input.
    mirrored_view = np.asarray(data)[..., ::-1]
    return np.array(mirrored_view, dtype=np.uint8, order="C")

In [62]:
# Mirror each of the three source sets left-to-right.
refl_bacon_data, refl_kevin_data, refl_not_data = (
    reflect_about_vertical(images)
    for images in (bacon_data, kevin_data, not_data)
)

In [63]:
# Translation with wrapping
def translate(data, direction=None):
    """Shift every image by one pixel in ``direction``, wrapping at the edge.

    Pixels pushed off one side of an image re-enter on the opposite side.

    Args:
        data: Array of images whose last two axes are (row, column), e.g.
            ``(n_images, n_channels, height, width)``.
        direction: One of ``"up"``, ``"down"``, ``"left"`` or ``"right"``.

    Returns:
        A new ``np.uint8`` array with the same shape as ``data``, or ``None``
        when ``direction`` is missing or not one of the four known values.
    """
    if direction not in ("up", "down", "left", "right"):
        return None

    # (shift, axis) arguments for np.roll. Axis -2 indexes rows and axis -1
    # indexes columns; a negative shift moves content towards index 0,
    # i.e. up or left.
    roll_args = {
        "up": (-1, -2),
        "down": (1, -2),
        "left": (-1, -1),
        "right": (1, -1),
    }
    shift, axis = roll_args[direction]
    return np.asarray(np.roll(data, shift, axis=axis), dtype=np.uint8)

In [None]:
# Hmm. This is a bit repetitive... but oh well

In [64]:
# One shifted copy per direction (up, down, left, right), for the original
# bacon images and for their mirrored counterparts.
orig_trans_bacon_data = [
    translate(bacon_data, heading) for heading in ("up", "down", "left", "right")
]
refl_trans_bacon_data = [
    translate(refl_bacon_data, heading) for heading in ("up", "down", "left", "right")
]

In [65]:
# One shifted copy per direction (up, down, left, right), for the original
# kevin images and for their mirrored counterparts.
orig_trans_kevin_data = [
    translate(kevin_data, heading) for heading in ("up", "down", "left", "right")
]
refl_trans_kevin_data = [
    translate(refl_kevin_data, heading) for heading in ("up", "down", "left", "right")
]

In [66]:
# One shifted copy per direction (up, down, left, right), for the original
# "not" images and for their mirrored counterparts.
orig_trans_not_data = [
    translate(not_data, heading) for heading in ("up", "down", "left", "right")
]
refl_trans_not_data = [
    translate(refl_not_data, heading) for heading in ("up", "down", "left", "right")
]

In [67]:
# Stack the ten variants of each set (original, reflection, 4 translated
# originals, 4 translated reflections) along a new leading axis.
all_bacon_data = np.asarray(
    [bacon_data, refl_bacon_data, *orig_trans_bacon_data, *refl_trans_bacon_data]
)
all_kevin_data = np.asarray(
    [kevin_data, refl_kevin_data, *orig_trans_kevin_data, *refl_trans_kevin_data]
)
all_not_data = np.asarray(
    [not_data, refl_not_data, *orig_trans_not_data, *refl_trans_not_data]
)

## Save generated data
So that we don't have to do this over and over.

In [68]:
# Dump each augmented set as raw bytes (no header, so shape and dtype must
# be supplied again when the file is read back).
for augmented_set, target_path in (
    (all_bacon_data, "generated_data/all_bacon_data.dat"),
    (all_kevin_data, "generated_data/all_kevin_data.dat"),
    (all_not_data, "generated_data/all_not_data.dat"),
):
    augmented_set.tofile(target_path)

In [23]:
# Scratch: two constant-valued uint8 vectors of length 10.
x = np.full(10, 3, dtype=np.uint8)
y = np.full(10, 7, dtype=np.uint8)

In [27]:
# Scratch: show a single-precision float copy of y.
np.array(y, dtype=np.float32)

array([ 7.,  7.,  7.,  7.,  7.,  7.,  7.,  7.,  7.,  7.], dtype=float32)

In [21]:
# Scratch: join the three raw image sets end to end along the image axis.
x = np.concatenate([bacon_data, kevin_data, not_data], axis=0)

In [22]:
# Scratch: number of images in the combined array.
x.shape[0]

400

## Prepare data splits for training
Write the splits so that we have a static set of train/validation/test and don't have to do this all the time.

In [69]:
def shuffle_in_unison_inplace(a, b):
    """Shuffle two arrays with one shared random permutation.

    Despite the name, nothing is modified in place: both results are new
    arrays built by fancy indexing, and ``a`` and ``b`` stay unchanged. The
    permutation is drawn from NumPy's global random state.

    Args:
        a: Array indexed along its first axis.
        b: Array with the same length as ``a``.

    Returns:
        Tuple ``(a_shuffled, b_shuffled)`` in which element ``i`` of both
        outputs comes from the same original position.

    Raises:
        ValueError: If ``a`` and ``b`` differ in length. Without this check
            a longer ``b`` would silently lose its tail.
    """
    if len(a) != len(b):
        raise ValueError(
            "a and b must have the same length, got {} and {}".format(len(a), len(b))
        )
    p = np.random.permutation(len(a))
    return a[p], b[p]

def load_and_prepare_dataset():
    """Load the augmented image sets and split them 80:10:10.

    Reads the three raw ``uint8`` files under ``generated_data/``, views each
    as a stack of 4 x 64 x 64 images, labels them (0 = not, 1 = bacon,
    2 = kevin), shuffles images and labels together, and slices the result
    into test (first tenth), validation (second tenth) and train (the rest).
    Progress and elapsed time are printed along the way.

    Returns:
        Tuple ``(X_train, y_train, X_val, y_val, X_test, y_test)``. The ``X``
        arrays are ``float32`` with shape ``(n, 4, 64, 64)``; the ``y``
        arrays are ``uint8`` class labels.
    """
    t_start = time.time()

    # Read in data. np.fromfile is given the path itself, so NumPy opens and
    # closes each file and no handle is left dangling.
    image_shape = (-1, 4, 64, 64)
    bacon_data = np.fromfile(
        "generated_data/all_bacon_data.dat", dtype=np.uint8
    ).reshape(image_shape)
    kevin_data = np.fromfile(
        "generated_data/all_kevin_data.dat", dtype=np.uint8
    ).reshape(image_shape)
    not_data = np.fromfile(
        "generated_data/all_not_data.dat", dtype=np.uint8
    ).reshape(image_shape)

    print("-- Files read")

    # Map classes to numbers
    y_not = np.full(len(not_data), 0, dtype=np.uint8)
    y_bacon = np.full(len(bacon_data), 1, dtype=np.uint8)
    y_kevin = np.full(len(kevin_data), 2, dtype=np.uint8)

    # Coalesce the data (images and labels in the same class order)
    X = np.concatenate((not_data, bacon_data, kevin_data))
    y = np.concatenate((y_not, y_bacon, y_kevin))
    print("-- Data coalesced")

    # Cast X data as floating point (single precision)
    X = X.astype(np.float32)

    # Randomize the order
    X, y = shuffle_in_unison_inplace(X, y)
    print("-- Data shuffled")

    # Break into 80:10:10 train:validation:test. One tenth (rounded down)
    # goes to test, the next tenth to validation, everything else to train.
    holdout = len(X) // 10
    X_test, X_val, X_train = X[:holdout], X[holdout:2 * holdout], X[2 * holdout:]
    y_test, y_val, y_train = y[:holdout], y[holdout:2 * holdout], y[2 * holdout:]
    print("-- Data split")

    print("Time to load and prepare data: {}".format(time.time() - t_start))
    return X_train, y_train, X_val, y_val, X_test, y_test

In [42]:
# Scratch: class labels for the bacon (1) and kevin (2) images, joined into
# a single label vector.
y_bacon = np.full(len(bacon_data), 1, dtype=np.uint8)
y_kevin = np.full(len(kevin_data), 2, dtype=np.uint8)
y = np.concatenate([y_bacon, y_kevin])

In [53]:
# Sanity check: count the images in the saved augmented bacon set.
# Images are 4 x 64 x 64 everywhere else in this notebook, so use that shape
# here as well; a 128 x 128 reshape would regroup the bytes into the wrong
# number of mis-shaped images.
# NOTE: this rebinds the global bacon_data to the augmented set.
bacon_data = np.fromfile("generated_data/all_bacon_data.dat", dtype=np.uint8)
bacon_data = bacon_data.reshape((-1, 4, 64, 64))
len(bacon_data)

1000

In [43]:
# Scratch: number of labels in the combined label vector.
y.shape[0]

200

In [70]:
X_train, y_train, X_val, y_val, X_test, y_test = load_and_prepare_dataset()

# Save sets to disk as raw bytes. The X splits are float32 and the y splits
# are uint8, so the reader must pass the matching dtype to np.fromfile.
for split_array, split_path in (
    (X_train, "generated_data/X_train.dat"),
    (X_val, "generated_data/X_val.dat"),
    (X_test, "generated_data/X_test.dat"),
    (y_train, "generated_data/y_train.dat"),
    (y_val, "generated_data/y_val.dat"),
    (y_test, "generated_data/y_test.dat"),
):
    split_array.tofile(split_path)

-- Files read
-- Data coalesced
-- Data shuffled
-- Data split
Time to load and prepare data: 0.34552812576293945


In [73]:
# Scratch: how many whole groups of 400 samples fit in the training split.
y_train.shape[0] // 400

8