# Import dependencies

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD, Adam
from keras.datasets import mnist
from keras.utils import np_utils

# Load the MNIST Dataset

In [None]:
# Settings
nb_classes = 10        # number of label classes passed to to_categorical
number_of_data = 5000  # number of leading samples kept from each split
image_size = 784       # 28 x 28 pixels flattened into a single vector

# Load MNIST dataset
(X_train, y_train), (X_test, y_test) = mnist.load_data()

# Slice first so that only the retained samples are converted, and let
# reshape(-1, ...) infer the row count instead of hard-coding the split sizes.
X_train = X_train[:number_of_data].reshape(-1, image_size).astype('float32')
X_test = X_test[:number_of_data].reshape(-1, image_size).astype('float32')

# Rescale the pixel values (in place, on the float copies)
X_train /= 255
X_test /= 255

# One-hot encode the labels of the retained samples
Y_Train = np_utils.to_categorical(y_train[:number_of_data], nb_classes)
Y_Test = np_utils.to_categorical(y_test[:number_of_data], nb_classes)

# Visualize MNIST dataset
import matplotlib.pyplot as plt
%matplotlib inline

# Draw the first `n` training digits in the top row of a 2 x n subplot grid.
n = 10  # how many digits we will display
plt.figure(figsize=(10, 4))
for i in range(n):
    # turn the flat vector back into a 28 x 28 image
    digit = X_train[i].reshape(28, 28)
    ax = plt.subplot(2, n, i + 1)
    plt.imshow(digit)
    plt.gray()
    # hide both axes so only the digit is shown
    for axis in (ax.get_xaxis(), ax.get_yaxis()):
        axis.set_visible(False)
plt.show()

# Single-layer neural network
<img align="left" width="40%" src="images/nn-1-layer.png"/>

## Build

In [None]:
# One softmax Dense layer mapping the 784 inputs directly to the 10 classes.
output_layer = Dense(units=10, input_dim=784, kernel_initializer='normal', activation='softmax')
model = Sequential([output_layer])

# Plain SGD with categorical cross-entropy; accuracy is reported as a metric.
sgd = SGD(lr=0.05)
model.compile(optimizer=sgd, loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

## Train

In [None]:
# Training settings
epochs = 10
batch_size = 128

# Fit on the training subset; `history` holds whatever `fit` returns.
history = model.fit(
    X_train,
    Y_Train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
)

## Evaluate

In [None]:
# Score the model on the test subset: entry 0 is the loss, entry 1 the accuracy metric.
evaluation = model.evaluate(X_test, Y_Test, verbose=1)
test_loss, test_accuracy = evaluation[0], evaluation[1]
print('Summary: Loss over the test dataset: %.2f, Accuracy: %.2f' % (test_loss, test_accuracy))

# Multi-layer neural network
<img align="left" width="40%" src="images/nn-deep-layer.png"/>

## Build

In [None]:
# 784 -> 625 (sigmoid) -> 10 (softmax) fully connected network.
model = Sequential([
    Dense(units=625, input_dim=784, kernel_initializer='normal', activation='sigmoid'),
    Dense(units=10, input_dim=625, kernel_initializer='normal', activation='softmax'),
])

# Plain SGD with categorical cross-entropy; accuracy is reported as a metric.
sgd = SGD(lr=0.1)
model.compile(optimizer=sgd, loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

## Train

In [None]:
# Training settings
epochs = 10
batch_size = 128

# Fit on the training subset; `history` holds whatever `fit` returns.
history = model.fit(
    X_train,
    Y_Train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
)

## Evaluate

In [None]:
# Score the model on the test subset: entry 0 is the loss, entry 1 the accuracy metric.
evaluation = model.evaluate(X_test, Y_Test, verbose=1)
test_loss, test_accuracy = evaluation[0], evaluation[1]
print('Summary: Loss over the test dataset: %.2f, Accuracy: %.2f' % (test_loss, test_accuracy))

# Hyperparameters

## Preliminaries

In [None]:
def default_model():
  """Return a new, uncompiled 784-625-10 network.

  The hidden layer uses a sigmoid activation and the output layer a softmax;
  both layers use the 'normal' kernel initializer.
  """
  hidden = Dense(units=625, input_dim=784, kernel_initializer='normal', activation='sigmoid')
  output = Dense(units=10, input_dim=625, kernel_initializer='normal', activation='softmax')
  return Sequential([hidden, output])

def default_optim():
  """Return a new Adam optimizer configured with the notebook's baseline settings."""
  adam_settings = dict(lr=0.01, beta_1=0.9, beta_2=0.999)
  return Adam(**adam_settings)

def train_model(model, optim=None, epochs=10, batch_size=128):
  """Compile `model` and fit it on the notebook's training subset.

  Args:
    model: Keras model to compile and train; it is modified in place.
    optim: Optimizer instance to compile with. When omitted (None), a new
      one is created with `default_optim()` on every call.
    epochs: Forwarded to `model.fit` as `epochs`.
    batch_size: Forwarded to `model.fit` as `batch_size`.

  Returns:
    The object returned by `model.fit`.
  """
  # A default of `default_optim()` in the signature would be evaluated only
  # once, at definition time, so every call relying on the default would
  # share a single optimizer instance across different models.
  if optim is None:
    optim = default_optim()
  model.compile(optimizer=optim, loss='categorical_crossentropy', metrics=['accuracy'])
  return model.fit(X_train, Y_Train, epochs=epochs, batch_size=batch_size, verbose=1)
  
def eval_model(model):
  """Evaluate `model` on the test subset and print its loss and accuracy."""
  scores = model.evaluate(X_test, Y_Test, verbose=1)
  test_loss, test_accuracy = scores[0], scores[1]
  print('Summary: Loss over the test dataset: %.4f, Accuracy: %.4f' % (test_loss, test_accuracy))
  
def train_and_eval_model(model, optim=None, epochs=5, batch_size=128):
  """Train `model` with `train_model`, then report test results with `eval_model`.

  Args:
    model: Keras model to train and evaluate; it is modified in place.
    optim: Optimizer instance to train with. When omitted (None), a new one
      is created with `default_optim()` on every call.
    epochs: Number of epochs handed to `train_model`.
    batch_size: Batch size handed to `train_model`.
  """
  # Resolve the default here rather than in the signature: a signature default
  # of `default_optim()` is built once at definition time and would then be
  # shared by every model trained without an explicit optimizer.
  if optim is None:
    optim = default_optim()
  train_model(model, optim, epochs, batch_size)
  eval_model(model)

## Optimizers

### SGD
<img align="left" width="40%" src="images/sgd.jpeg"/>
Source: Andrew Ng's Machine Learning Lecture

In [None]:
from keras.optimizers import SGD

# SGD with momentum and the Nesterov update both switched off.
optim = SGD(lr=0.01, momentum=0.0, nesterov=False)

# Train and evaluate a fresh default network with this optimizer.
sgd_model = default_model()
train_and_eval_model(sgd_model, optim)

### RMSprop

In [None]:
from keras.optimizers import RMSprop

# RMSprop with lr=0.001 and rho=0.9.
optim = RMSprop(lr=0.001, rho=0.9)

# Train and evaluate a fresh default network with this optimizer.
rmsprop_model = default_model()
train_and_eval_model(rmsprop_model, optim)

### Adam

In [None]:
from keras.optimizers import Adam

# Adam with lr=0.01, beta_1=0.9 and beta_2=0.999.
optim = Adam(lr=0.01, beta_1=0.9, beta_2=0.999)

# Train and evaluate a fresh default network with this optimizer.
adam_model = default_model()
train_and_eval_model(adam_model, optim)

## Learning Rate
<img align="left" width="30%" src="images/lr.jpg"/>
Source: Andrew Ng's Machine Learning Lecture

In [None]:
from keras.optimizers import SGD

# Step size handed to SGD below.
# Change this to 0.1 and 0.001 to see the difference
LEARNING_RATE = 0.01

# SGD without momentum, using the learning rate chosen above.
optim = SGD(lr=LEARNING_RATE, momentum=0.0, nesterov=False)

# Train and evaluate a fresh default network with this optimizer.
lr_model = default_model()
train_and_eval_model(lr_model, optim)

## Initializations

### RandomNormal

In [None]:
from keras.initializers import RandomNormal

# Kernel weights drawn from a normal distribution with mean 0 and stddev 0.05.
init = RandomNormal(mean=0.0, stddev=0.05)

# Single softmax layer whose kernel uses the initializer above.
softmax_layer = Dense(units=10, input_dim=784, kernel_initializer=init, activation='softmax')
model = Sequential([softmax_layer])

# Train with a fresh default optimizer and report test results.
optimizer = default_optim()
train_and_eval_model(model, optimizer)

### He Normal

Normal distribution centered on 0 with variance = 2 / (number of input nodes), i.e. standard deviation = sqrt(2 / number of input nodes).

He, Kaiming, et al. "Delving deep into rectifiers: Surpassing human-level performance on imagenet classification." Proceedings of the IEEE international conference on computer vision. 2015.


In [None]:
from keras.initializers import he_normal

# He-normal kernel initializer with its default arguments.
init = he_normal()

# Single softmax layer whose kernel uses the initializer above.
softmax_layer = Dense(units=10, input_dim=784, kernel_initializer=init, activation='softmax')
model = Sequential([softmax_layer])

# Train with a fresh default optimizer and report test results.
optimizer = default_optim()
train_and_eval_model(model, optimizer)

### Glorot Uniform

Uniform distribution on [-limit, limit], where limit = sqrt(6 / (number of input nodes + number of output nodes)).

Glorot, Xavier, and Yoshua Bengio. "Understanding the difficulty of training deep feedforward neural networks." Proceedings of the Thirteenth International Conference on Artificial Intelligence and Statistics. 2010.

In [None]:
from keras.initializers import glorot_uniform

# Glorot-uniform kernel initializer with its default arguments.
init = glorot_uniform()

# Single softmax layer whose kernel uses the initializer above.
softmax_layer = Dense(units=10, input_dim=784, kernel_initializer=init, activation='softmax')
model = Sequential([softmax_layer])

# Train with a fresh default optimizer and report test results.
optimizer = default_optim()
train_and_eval_model(model, optimizer)

## Activations

### Sigmoid
<img align="left" width="30%" src="images/sigmoid.png"/>

In [None]:
from keras.layers import Activation

# 784 -> 625 -> 10 network with the hidden activation as a separate sigmoid layer.
model = Sequential([
    Dense(units=625, input_dim=784, kernel_initializer='normal'),
    Activation('sigmoid'),
    Dense(units=10, input_dim=625, kernel_initializer='normal', activation='softmax'),
])

# Train with a fresh default optimizer and report test results.
optimizer = default_optim()
train_and_eval_model(model, optimizer)

### Tanh
<img align="left" width="30%" src="images/tanh.png"/>

In [None]:
from keras.layers import Activation

# 784 -> 625 -> 10 network with the hidden activation as a separate tanh layer.
model = Sequential([
    Dense(units=625, input_dim=784, kernel_initializer='normal'),
    Activation('tanh'),
    Dense(units=10, input_dim=625, kernel_initializer='normal', activation='softmax'),
])

# Train with a fresh default optimizer and report test results.
optimizer = default_optim()
train_and_eval_model(model, optimizer)

### ReLU
<img align="left" width="30%" src="images/relu.png"/>

In [None]:
from keras.layers import Activation

# 784 -> 625 -> 10 network with the hidden activation as a separate ReLU layer.
model = Sequential([
    Dense(units=625, input_dim=784, kernel_initializer='normal'),
    Activation('relu'),
    Dense(units=10, input_dim=625, kernel_initializer='normal', activation='softmax'),
])

# Train with a fresh default optimizer and report test results.
optimizer = default_optim()
train_and_eval_model(model, optimizer)