In [317]:
import numpy as np
import matplotlib.pyplot as plt
from keras.models import Sequential, Model
from keras.layers import Dense, Flatten, Input, Dropout, Conv1D
from keras.optimizers import SGD, Adam
from keras.datasets import mnist
from keras import regularizers
from keras import metrics
from keras.utils import to_categorical

In [231]:
# Fetch the MNIST train/test split that ships with Keras
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Rescale both image arrays by dividing every value by 255
x_train, x_test = x_train / 255, x_test / 255

In [234]:
# Report the shape of every array, one per line
print(
    'The shape of x_train is {}'.format(x_train.shape),
    'The shape of y_train is {}'.format(y_train.shape),
    'The shape of x_test is {}'.format(x_test.shape),
    'The shape of y_test is {}'.format(y_test.shape),
    sep='\n',
)

The shape of x_train is (60000, 28, 28)
The shape of y_train is (60000,)
The shape of x_test is (10000, 28, 28)
The shape of y_test is (10000,)


In [235]:
# Flatten every image into a single feature vector. The row count is read from
# the array itself and -1 lets NumPy infer the flattened length, so the cell no
# longer depends on the hard-coded sample counts (60000 / 10000).
x_train = np.reshape(x_train, (x_train.shape[0], -1))
x_test = np.reshape(x_test, (x_test.shape[0], -1))

In [236]:
# One-hot encode the integer labels. num_classes is pinned to 10 so both splits
# always get the same width instead of each inferring it from its own labels.
# The ndim guards keep the cell safe to re-run: labels that are already 2-D are
# left alone rather than being encoded a second time.
if y_train.ndim == 1:
    y_train = to_categorical(y_train, num_classes=10)
if y_test.ndim == 1:
    y_test = to_categorical(y_test, num_classes=10)
print('The shape of y_train is {}'.format(y_train.shape))
print('The shape of y_test is {}'.format(y_test.shape))

The shape of y_train is (60000, 10)
The shape of y_test is (10000, 10)


## A Simple and Small Neural Network
 - hidden layer of $100$ units, tanh activation
 - stochastic gradient descent with momentum optimizer
 - no regularization

In [240]:
# One tanh hidden layer of 100 units feeding a 10-way softmax output
model = Sequential([
    Dense(100, activation='tanh', input_dim=784),
    Dense(10, activation='softmax'),
])

# SGD with momentum
sgd = SGD(lr=0.001, decay=1e-6, momentum=0.9)
model.compile(
    optimizer=sgd,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [241]:
# Print the layer-by-layer summary of the simple model
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_183 (Dense)            (None, 100)               78500     
_________________________________________________________________
dense_184 (Dense)            (None, 10)                1010      
Total params: 79,510
Trainable params: 79,510
Non-trainable params: 0
_________________________________________________________________


In [242]:
# Train for 10 epochs using mini-batches of 100 samples
model.fit(x=x_train, y=y_train, batch_size=100, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x23916ad79b0>

In [245]:
# Evaluate on the test split; the bare name on the last line displays the result
score = model.evaluate(x=x_test, y=y_test, batch_size=10)
score



[0.2814977415269241, 0.921399993300438]

$92.14\%$ accuracy! With such a simple model! Let us see if we can do better!

## A Deeper Neural Network
- 5 hidden layers, 200 units each. relu activation.
- dropout on input ($20\%$) and output ($20\%$)
- Adam optimizer

In [246]:
def model2(input_shape=(784,)):
    """Build (without compiling) a 5 x 200 relu network with dropout.

    Dropout of 20% is applied to the inputs and again after the last hidden
    layer, ahead of the 10-unit softmax output.

    Args:
        input_shape: shape tuple forwarded to ``Input(shape=...)``.

    Returns:
        The assembled Keras ``Model``, named 'MODEL2'.
    """
    inputs = Input(shape=input_shape)

    # 20% dropout applied directly to the inputs
    hidden = Dropout(0.2)(inputs)

    # five identical hidden layers: 200 units, relu activation
    for _ in range(5):
        hidden = Dense(200, activation='relu')(hidden)

    # 20% dropout ahead of the classifier
    hidden = Dropout(0.2)(hidden)

    # 10-way softmax output
    outputs = Dense(10, activation='softmax')(hidden)

    return Model(inputs=inputs, outputs=outputs, name='MODEL2')

In [247]:
# NOTE: this rebinds the name `model2` from the builder function to the model it
# returns, so the builder is no longer reachable under that name afterwards.
model2 = model2()

# Adam with its default settings
adam = Adam()
model2.compile(
    optimizer=adam,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model2.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_23 (InputLayer)        (None, 784)               0         
_________________________________________________________________
dropout_43 (Dropout)         (None, 784)               0         
_________________________________________________________________
dense_185 (Dense)            (None, 200)               157000    
_________________________________________________________________
dense_186 (Dense)            (None, 200)               40200     
_________________________________________________________________
dense_187 (Dense)            (None, 200)               40200     
_________________________________________________________________
dense_188 (Dense)            (None, 200)               40200     
_________________________________________________________________
dense_189 (Dense)            (None, 200)               40200     
__________

In [248]:
# Train for 10 epochs using mini-batches of 100 samples
model2.fit(x=x_train, y=y_train, batch_size=100, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2391797e0b8>

In [249]:
# Evaluate on the test split; the bare name on the last line displays the result
score = model2.evaluate(x=x_test, y=y_test, batch_size=10)
score



[0.07398390667223975, 0.9812999963760376]

$98.13\%$ Accuracy! Let's just try an ordinary deep neural network.

### 3 x 500 Neural Network

In [293]:
def model3(input_shape=(784,)):
    """Build (without compiling) a plain 3 x 500 tanh network.

    No dropout or other regularization is used; the three hidden layers feed
    a 10-unit softmax output.

    Args:
        input_shape: shape tuple forwarded to ``Input(shape=...)``.

    Returns:
        The assembled Keras ``Model``, named 'MODEL3'.
    """
    # input placeholder
    X_input = Input(shape=input_shape)

    # 3 wide layers of 500 units, tanh activations
    X = Dense(500, activation='tanh')(X_input)
    X = Dense(500, activation='tanh')(X)
    X = Dense(500, activation='tanh')(X)

    # classification output layer
    Y = Dense(10, activation='softmax')(X)

    # build the model; the name previously read 'MODEL2' (copy-paste slip),
    # which made this network indistinguishable from model2 by name
    model = Model(inputs=X_input, outputs=Y, name='MODEL3')

    return model

In [294]:
# NOTE: this rebinds the name `model3` from the builder function to the model it
# returns, so the builder is no longer reachable under that name afterwards.
model3 = model3()

# Adam with its default settings
adam = Adam()
model3.compile(
    optimizer=adam,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model3.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_37 (InputLayer)        (None, 784)               0         
_________________________________________________________________
dense_221 (Dense)            (None, 500)               392500    
_________________________________________________________________
dense_222 (Dense)            (None, 500)               250500    
_________________________________________________________________
dense_223 (Dense)            (None, 500)               250500    
_________________________________________________________________
dense_224 (Dense)            (None, 10)                5010      
Total params: 898,510
Trainable params: 898,510
Non-trainable params: 0
_________________________________________________________________


In [295]:
# Train for 10 epochs using mini-batches of 100 samples
model3.fit(x=x_train, y=y_train, batch_size=100, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x239745a5dd8>

In [296]:
# Evaluate on the test split; the bare name on the last line displays the result
score = model3.evaluate(x=x_test, y=y_test, batch_size=10)
score



[0.07896114289291836, 0.9770999962687492]

Train accuracy 99.35%, test accuracy 97.71%, so the network isn't generalizing well. Let's try something bigger, with regularization.

## A Big, Deep, Network

In [321]:
def model4(input_shape=(784,)):
    """Build (without compiling) a larger, deeper network with dropout.

    Layout: 20% input dropout, one 1000-unit relu layer with 20% dropout,
    three 200-unit relu layers with 10% dropout, three 100-unit relu layers
    with 10% dropout, a 50-unit sigmoid layer and a 10-unit softmax output.

    Args:
        input_shape: shape tuple forwarded to ``Input(shape=...)``.

    Returns:
        The assembled Keras ``Model``, named 'MODEL4'.
    """
    inputs = Input(shape=input_shape)

    # 20% dropout on the raw inputs
    hidden = Dropout(0.2)(inputs)

    # one wide 1000-unit relu layer, followed by 20% dropout
    hidden = Dense(1000, activation='relu')(hidden)
    hidden = Dropout(0.2)(hidden)

    # two groups of three relu layers (200 units, then 100 units),
    # each group followed by 10% dropout
    for units in (200, 100):
        for _ in range(3):
            hidden = Dense(units, activation='relu')(hidden)
        hidden = Dropout(0.1)(hidden)

    # 50-unit sigmoid layer feeding the 10-way softmax classifier
    hidden = Dense(50, activation='sigmoid')(hidden)
    outputs = Dense(10, activation='softmax')(hidden)

    return Model(inputs=inputs, outputs=outputs, name='MODEL4')

In [322]:
# NOTE: this rebinds the name `model4` from the builder function to the model it
# returns, so the builder is no longer reachable under that name afterwards.
model4 = model4()

# Adam with its default settings
adam = Adam()
model4.compile(
    optimizer=adam,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model4.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_42 (InputLayer)        (None, 784)               0         
_________________________________________________________________
dropout_49 (Dropout)         (None, 784)               0         
_________________________________________________________________
dense_234 (Dense)            (None, 1000)              785000    
_________________________________________________________________
dropout_50 (Dropout)         (None, 1000)              0         
_________________________________________________________________
dense_235 (Dense)            (None, 200)               200200    
_________________________________________________________________
dense_236 (Dense)            (None, 200)               40200     
_________________________________________________________________
dense_237 (Dense)            (None, 200)               40200     
__________

In [323]:
# Train for 10 epochs using mini-batches of 100 samples
model4.fit(x=x_train, y=y_train, batch_size=100, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2397933f5f8>

In [324]:
# Evaluate on the test split; the bare name on the last line displays the result
score = model4.evaluate(x=x_test, y=y_test, batch_size=10)
score



[0.06813931174157187, 0.9827999965548515]

Test accuracy of 98.28% versus train accuracy of 98.23%. This network generalized better than the previous model. Is that due to the dropout, or to the different network architecture? Let's remove the dropout and see what happens.

In [326]:
def model5(input_shape=(784,)):
    """Build (without compiling) the model4 dense stack with no dropout.

    Layout: one 1000-unit relu layer, three 200-unit relu layers, three
    100-unit relu layers, a 50-unit sigmoid layer and a 10-unit softmax
    output. There are no Dropout layers, so the effect of dropout can be
    isolated by comparing against model4.

    Args:
        input_shape: shape tuple forwarded to ``Input(shape=...)``.

    Returns:
        The assembled Keras ``Model``, named 'MODEL5'.
    """
    X_input = Input(shape=input_shape)
    X = Dense(1000, activation='relu')(X_input)
    X = Dense(200, activation='relu')(X)
    X = Dense(200, activation='relu')(X)
    X = Dense(200, activation='relu')(X)
    X = Dense(100, activation='relu')(X)
    X = Dense(100, activation='relu')(X)
    X = Dense(100, activation='relu')(X)
    X = Dense(50, activation='sigmoid')(X)
    Y = Dense(10, activation='softmax')(X)

    # the name previously read 'MODEL4' (copy-paste slip), which made this
    # network indistinguishable from model4 by name
    model = Model(inputs=X_input, outputs=Y, name='MODEL5')

    return model

In [328]:
# NOTE: this rebinds the name `model5` from the builder function to the model it
# returns, so the builder is no longer reachable under that name afterwards.
model5 = model5()

# Adam with its default settings
adam = Adam()
model5.compile(
    optimizer=adam,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model5.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_43 (InputLayer)        (None, 784)               0         
_________________________________________________________________
dense_243 (Dense)            (None, 1000)              785000    
_________________________________________________________________
dense_244 (Dense)            (None, 200)               200200    
_________________________________________________________________
dense_245 (Dense)            (None, 200)               40200     
_________________________________________________________________
dense_246 (Dense)            (None, 200)               40200     
_________________________________________________________________
dense_247 (Dense)            (None, 100)               20100     
_________________________________________________________________
dense_248 (Dense)            (None, 100)               10100     
__________

In [329]:
# Train for 10 epochs using mini-batches of 100 samples
model5.fit(x=x_train, y=y_train, batch_size=100, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2397bb43f98>

In [331]:
# Evaluate on the test split; the bare name on the last line displays the result
score = model5.evaluate(x=x_test, y=y_test, batch_size=10)
score



[0.09015802885312588, 0.9791999959945679]

99.2% accuracy on the training data, 97.9% accuracy on the test data. Looks like dropout does provide value, after all.