In [1]:
import tensorflow as tf
import keras

Using TensorFlow backend.


## Keras for Neural Networks
- Digit recognition using the MNIST dataset
    - MNIST: Modified National Institute of Standards and Technology
    - One of the most used datasets for advanced ML techniques
- Goal: use MNIST dataset and NNs to classify handwritten numbers as digits
- Create three different styles of NN

In [2]:
# import dataset
from keras.datasets import mnist

# model building imports
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D
from keras.layers import LSTM, Input, TimeDistributed
from keras.models import Model
from keras.optimizers import RMSprop

# import backend
from keras import backend as K

In [3]:
# Load MNIST; each split is returned as an (images, labels) pair
train_split, test_split = mnist.load_data()
X_train, y_train = train_split
X_test, y_test = test_split

# Bare last expression so the notebook echoes the raw training array
X_train

array([[[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]],

       [[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]],

       [[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]],

       ...,

       [[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]],

       [[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 

- Data are not images, instead values of pixels
- Higher dimensional structure than usual featureset matrices
    - Set of clouds; each cloud represents an image
    - Cloud has columns of values representing the darkness of pixels
- Pixel darkness is unlikely to have a simple relationship with the digit represented
- Need to find meaningful patterns within clouds to create models from
- Perfect task for NNs:
    - Multiple layers allow clouds full of values to be transformed into meaningful vectors
    - Output will be labels predicting the digit each cloud represents
    
### Multi Layer Perceptron
- Set of perceptron models organized into layers, one feeding into the next
- Need to reshape data into flat vectors for each digit
- Need to convert outcome to a matrix of binary variables rather than the digit

In [4]:
# Report the dimensions of the raw image arrays, one line per split
print('X_train shape: {}'.format(X_train.shape))
print('X_test shape: {}'.format(X_test.shape))

X_train shape: (60000, 28, 28)
X_test shape: (10000, 28, 28)


In [5]:
# Prepare MNIST for the multi layer perceptron: flat float vectors in the
# 0-1 range as features and one-hot encoded digits as targets.

# Start from the raw arrays so this cell can be re-run safely. Without the
# reload, a second run would divide the pixels by 255 again and would one-hot
# encode labels that are already one-hot encoded.
(X_train, y_train), (X_test, y_test) = mnist.load_data()

num_classes = 10  # digits 0-9

# Flatten each 28*28 image into a vector of length 784. The sample count is
# read from the data and -1 lets NumPy infer the flattened length, so no
# dataset sizes are hard-coded.
X_train = X_train.reshape(X_train.shape[0], -1)
X_test = X_test.reshape(X_test.shape[0], -1)

# Convert to float32 for type consistency and scale pixel values from the
# 0-255 range down to 0-1
X_train = X_train.astype('float32') / 255
X_test = X_test.astype('float32') / 255

# print sample sizes
print('{} train samples\n{} test samples'.format(X_train.shape[0],
                                                 X_test.shape[0]))

# Convert class vectors to binary class matrices
# Instead of one column with 10 values, create 10 binary columns
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

60000 train samples
10000 test samples


- Create model using dense layers and dropouts
- Dropouts drop a certain portion of perceptrons to combat overfitting
- Activation function: [ReLU (Rectified Linear Unit)](https://en.wikipedia.org/wiki/Rectifier_(neural_networks))

In [6]:
# Fully connected MLP described as a list of layers: two hidden ReLU layers of
# 64 units, each followed by 10% dropout, then a softmax output layer.
model = Sequential([
    # only the first layer needs to declare the input shape
    Dense(64, activation='relu', input_shape=(784,)),
    Dropout(0.1),  # dropout layers removing features to reduce overfitting
    Dense(64, activation='relu'),
    Dropout(0.1),
    # last layer has one unit per class
    Dense(10, activation='softmax'),
])

# Print the layer-by-layer overview (output shapes and parameter counts)
model.summary()

# Configure training: loss, optimizer and the metric to report
model.compile(
    optimizer=RMSprop(),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 64)                50240     
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 64)                4160      
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 10)                650       
Total params: 55,050
Trainable params: 55,050
Non-trainable params: 0
_________________________________________________________________


- Now have model to use for predicting digits from pixel data using epochs
- **Epoch**: one forward pass and one backward pass of all training data
    - Multiple epochs are used because updating the weights with a single epoch (pass) is not enough to train the NN
- **Batch size**: number of samples to use in one forward/backward pass (one iteration; an epoch is made up of many such batches)
    - Higher batch size requires more memory
- Setting layer width to 64 perceptrons is arbitrary; however...
    - Layer widths that are powers of two ($2^x$) parallelize more efficiently
- Number of parameters in a dense layer is (input width + 1) × layer width, e.g. $(784 + 1) \times 64 = 50{,}240$ for the first layer
    - Reflects the number of weights (plus one bias per unit) created in that layer

In [7]:
# Training options gathered in one place; the test split doubles as the
# validation data reported after every epoch.
fit_options = dict(
    batch_size=128,
    epochs=10,
    verbose=1,
    validation_data=(X_test, y_test),
)
history = model.fit(X_train, y_train, **fit_options)

# evaluate() returns the loss followed by the compiled metrics (accuracy here)
score = model.evaluate(X_test, y_test, verbose=0)
print('test loss: {}\ntest accuracy: {}'.format(*score))

Train on 60000 samples, validate on 10000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
test loss: 0.08942102362994338
test accuracy: 0.9748


**Result**: Each epoch trained in ~1 second and returned accuracy in the .97 range

### Convolutional Neural Networks
**Note**: the complexity of CNNs is computationally demanding, may take hours to run
- **Convolution** takes data and creates overlapping subsegments testing for a given feature in a set of spaces upon which the model is developed
    1. Define shape of input data
        - Can be any number of dimensions
        - Using 2d here since images are in two dimensions
    2. Create kernels (or tiles)
        - Kernels are little windows that look over subsets of data of a given size
        - Using 3x3 kernels which run overlapping over the 28x28 input looking for features
        - This is a convolutional layer, searching for a subpattern over the whole image
        - Can chain multiple convolutional layers together, using two here
    3. Create pooling layer
        - Downsampling technique which reduces sample size and simplifies later processes
        - For each value generated by convolution layers
            - Pooling layer looks over the grid in non-overlapping segments and takes the maximum value of outputs
            - Approximate/relative location matters more than exact location
    4. Flatten data back out to put into dense layers (as with MLP model)

In [8]:
import time

# perf_counter() is a monotonic clock intended for measuring elapsed time, so
# the reported runtime cannot be skewed by system clock adjustments. The timer
# covers data loading, training and evaluation.
start_time = time.perf_counter()

# input image dimensions from data
img_rows, img_cols = 28, 28
num_classes = 10

# Reload the raw arrays. mnist.load_data() returns the dataset's predefined
# train/test split; it does not shuffle the samples.
(X_train, y_train), (X_test, y_test) = mnist.load_data()

# Add the single grayscale channel on whichever axis the backend expects
if K.image_data_format() == 'channels_first':
    X_train = X_train.reshape(X_train.shape[0], 1, img_rows, img_cols)
    X_test = X_test.reshape(X_test.shape[0], 1, img_rows, img_cols)
    input_shape = (1, img_rows, img_cols)
else:
    X_train = X_train.reshape(X_train.shape[0], img_rows, img_cols, 1)
    X_test = X_test.reshape(X_test.shape[0], img_rows, img_cols, 1)
    input_shape = (img_rows, img_cols, 1)

# Convert to float32 and scale pixel values from 0-255 down to 0-1
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')
X_train /= 255
X_test /= 255
print('X_train shape: {}'.format(X_train.shape))

# convert class vectors to binary class matrices
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

# instantiate and build model
model = Sequential()

# add first convolutional layer, specifying shape
model.add(Conv2D(32,
                 kernel_size=(3,3),
                 activation='relu',
                 input_shape=input_shape
                ))
# second convolutional layer, then non-overlapping 2x2 max pooling
model.add(Conv2D(64, (3,3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2,2)))
model.add(Dropout(0.25))
# flatten the feature maps so they can feed the dense layers
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
# output layer: one softmax unit per class
model.add(Dense(num_classes, activation='softmax'))

# compile model
model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=keras.optimizers.Adadelta(),
              metrics=['accuracy'])

# fit model
model.fit(X_train,
          y_train,
          batch_size=128,
          epochs=10,
          verbose=1,
          validation_data=(X_test, y_test))

# evaluate() returns the loss followed by the compiled metrics (accuracy here)
score = model.evaluate(X_test, y_test, verbose=0)
print('test loss: {}'.format(score[0]))
print('test accuracy: {}'.format(score[1]))
print('runtime: {} seconds'.format((time.perf_counter() - start_time)))

X_train shape: (60000, 28, 28, 1)
Train on 60000 samples, validate on 10000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
test loss: 0.030302373815378633
test accuracy: 0.9911
runtime: 1159.7456641197205 seconds


### Hierarchical Recurrent Neural Networks
- Feedforward: data flows in one direction until it reaches end of NN
- **Recurrent NNs** let data cycle through network, ignoring this directional logic
- Caveats of RNNs:
    - Requires abandoning sequential model building, can get much more complicated
    - Recurrent layers are often used with time distribution
    - Time distribution: handles extra dimension created through LSTM layer as a time dimension
    - LSTM: long short-term memory, unit of an RNN composed of a cell and input/output/forget gates

In [9]:
# Wall-clock timestamp taken before loading, training and evaluation
start_time = time.time()

# training params
num_classes = 10
batch_size = 64
epochs = 3

# embedding dimensions (LSTM units used for the row and column encoders)
row_hidden = 32
col_hidden = 32

# reload the raw data as train/test sets
(X_train, y_train), (X_test, y_test) = mnist.load_data()

# reshape to 4D (samples, 28, 28, 1) for the Hierarchical RNN, cast to
# float32 and scale pixel values from 0-255 down to 0-1
X_train = X_train.reshape(X_train.shape[0], 28, 28, 1).astype('float32') / 255
X_test = X_test.reshape(X_test.shape[0], 28, 28, 1).astype('float32') / 255
print('X_train shape: {}\nX_test shape: {}'.format(X_train.shape, X_test.shape))

# convert class vectors to binary class matrices
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

# per-sample dimensions (everything after the leading sample axis)
row, col, pixel = X_train.shape[1:]

# 4D input
X = Input(shape=(row, col, pixel))

# encode rows of pixels using TimeDistributed Wrapper
encoded_rows = TimeDistributed(LSTM(row_hidden))(X)

# encode columns of encoded rows
encoded_columns = LSTM(col_hidden)(encoded_rows)

# softmax prediction over the digit classes, wired into a functional model
prediction = Dense(num_classes, activation='softmax')(encoded_columns)
model = Model(X, prediction)
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

# train, validating on the test split after every epoch
model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    validation_data=(X_test, y_test)
)

# evaluate() returns the loss followed by the compiled metrics (accuracy here)
scores = model.evaluate(X_test, y_test, verbose=0)
print('test loss: {}\ntest accuracy: {}'.format(*scores))
print('runtime: {} seconds'.format(time.time() - start_time))

X_train shape: (60000, 28, 28, 1)
X_test shape: (10000, 28, 28, 1)
Train on 60000 samples, validate on 10000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
test loss: 0.1701113304913044
test accuracy: 0.9471
runtime: 330.780207157135 seconds
