##In-Depth Look into Convolutional Neural Networks



In [None]:
# imports
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers import Dense, Activation, LocallyConnected1D, InputLayer, Flatten, LocallyConnected2D, Conv2D, Dropout
from keras.optimizers import SGD
from keras import utils as np_utils

In [None]:
# Image geometry and label-space constants shared by the cells below.
img_h, img_w = 16, 16
img_size_flat = img_w * img_h  # 256 pixels in one flattened 16x16 image
n_classes = 10                 # number of target classes

In [None]:
# Parse the train and test text files into NumPy arrays (one row per sample).
train_set = np.genfromtxt(fname="./mnist_train.txt")
test_set = np.genfromtxt(fname="./mnist_test.txt")

In [None]:
# Report the (rows, columns) shape of each raw array.
print("training dataset shape: ", np.shape(train_set))
print("test dataset shape: ", np.shape(test_set))

training dataset shape:  (7291, 257)
test dataset shape:  (2007, 257)


In [None]:
# Column 0 of every row is the class label; all remaining columns are features.
train_labels, test_labels = train_set[:, 0], test_set[:, 0]
print("training labels shape: ", train_labels.shape)
print("testing labels shape: ", test_labels.shape)

# Drop the label column to obtain the feature matrices.
train_data, test_data = train_set[:, 1:], test_set[:, 1:]
print("training data shape: ", train_data.shape)
print("testing data shape: ", test_data.shape)

training labels shape:  (7291,)
testing labels shape:  (2007,)
training data shape:  (7291, 256)
testing data shape:  (2007, 256)


In [None]:
# reshape train and test datasets to (samples, img_h, img_w, 1) so they match
# the InputLayer shape used by the models
train_data = train_data.reshape((train_data.shape[0], img_h, img_w, 1))
test_data = test_data.reshape((test_data.shape[0], img_h, img_w, 1))
print("reshaped training data: ", train_data.shape)
print("reshaped testing data: ", test_data.shape)

# use one hot encoding for outputs
# Pin the width to n_classes: when left to infer it, to_categorical sizes each
# matrix from the largest label present, so a split that happens to miss the
# highest class would get fewer columns than the 10-unit output layer expects.
train_labels = np_utils.to_categorical(train_labels, num_classes=n_classes)
test_labels = np_utils.to_categorical(test_labels, num_classes=n_classes)
print("one hot encoded training labels shape: ", train_labels.shape)
print("one hot encoded testing labels shape: ", test_labels.shape)

reshaped training data:  (7291, 16, 16, 1)
reshaped testing data:  (2007, 16, 16, 1)
one hot encoded training labels shape:  (7291, 10)
one hot encoded testing labels shape:  (2007, 10)


###Task 1: Neural Network Design

In [None]:
def define_fully_connected_model(lr_val, momentum_val, img_h, img_w, initializer):
  """Build and compile the fully connected classifier.

  The network flattens an (img_h, img_w, 1) input and passes it through
  Dense(24, relu) -> Dense(128, relu) -> Dense(256, tanh) before a
  10-unit softmax output layer.

  Args:
    lr_val: Learning rate for the SGD optimizer.
    momentum_val: Momentum for the SGD optimizer.
    img_h: Input image height.
    img_w: Input image width.
    initializer: Kernel initializer for the hidden Dense layers (the
      output layer keeps the Keras default).

  Returns:
    The compiled Sequential model (categorical cross-entropy loss,
    accuracy metric).
  """
  model = Sequential()
  model.add(InputLayer(input_shape=(img_h, img_w, 1)))
  model.add(Flatten())
  model.add(Dense(24, activation='relu', kernel_initializer=initializer))
  model.add(Dense(128, activation='relu', kernel_initializer=initializer))
  model.add(Dense(256, activation='tanh', kernel_initializer=initializer))
  model.add(Dense(10, activation='softmax'))
  # compile model
  # `learning_rate` is the supported keyword; the `lr` alias is deprecated
  # and rejected by newer Keras releases.
  opt = SGD(learning_rate=lr_val, momentum=momentum_val)
  model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
  return model

The above model is a fully connected model with 3 hidden layers. The learning rate, momentum, image dimensions and initializer choice are passed in as hyperparameters. The hidden layers use the relu, relu and tanh activation functions respectively. The output of the network has ten units, one for each target class. The target labels are passed as one hot encoded vectors.

In [None]:
def locally_connected_model(lr_val, momentum_val, img_h, img_w, initializer):
  """Build and compile the locally connected classifier.

  Architecture: three LocallyConnected2D layers with 3x3 kernels
  (64 filters/relu, 32 filters/relu, 32 filters/tanh), Flatten,
  Dense(64, relu) and a 10-unit softmax output.

  Args:
    lr_val: Learning rate for the SGD optimizer.
    momentum_val: Momentum for the SGD optimizer.
    img_h: Input image height.
    img_w: Input image width.
    initializer: Kernel initializer for the locally connected layers and
      the hidden Dense layer (the output layer keeps the Keras default).

  Returns:
    The compiled Sequential model (categorical cross-entropy loss,
    accuracy metric).
  """
  model = Sequential()
  model.add(InputLayer(input_shape=(img_h, img_w, 1)))
  model.add(LocallyConnected2D(64, (3,3), activation="relu", kernel_initializer=initializer))
  model.add(LocallyConnected2D(32, (3,3), activation="relu", kernel_initializer=initializer))
  model.add(LocallyConnected2D(32, (3,3), activation="tanh", kernel_initializer=initializer))
  model.add(Flatten())
  model.add(Dense(64, activation='relu', kernel_initializer=initializer))
  model.add(Dense(10, activation='softmax'))
  # compile model
  # `learning_rate` is the supported keyword; the `lr` alias is deprecated
  # and rejected by newer Keras releases.
  opt = SGD(learning_rate=lr_val, momentum=momentum_val)
  model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
  return model

The above model is a locally connected model with 3 hidden locally connected layers. There is one hidden dense layer with 64 units. The learning rate, momentum, image dimensions and initializer choice are passed in as hyperparameters. The hidden layers use the relu, relu, tanh and relu activation functions respectively. In this locally connected architecture, each filter is applied only to a local portion of the image; the weights are not shared across the entire image, as they would be in a convolutional layer, but belong only to that local neighborhood. The output of the network has ten units, one for each target class. The target labels are passed as one hot encoded vectors.

In [None]:
def conv_model(lr_val, momentum_val, img_h, img_w, initializer):
  """Build and compile the convolutional classifier.

  Architecture: three 3x3 Conv2D layers (64, 32 and 32 filters, relu),
  Flatten, Dense(64, relu) and a 10-unit softmax output.

  Args:
    lr_val: Learning rate for the SGD optimizer.
    momentum_val: Momentum for the SGD optimizer.
    img_h: Input image height.
    img_w: Input image width.
    initializer: Kernel initializer for the Conv2D layers and the hidden
      Dense layer (the output layer keeps the Keras default).

  Returns:
    The compiled Sequential model (categorical cross-entropy loss,
    accuracy metric).
  """
  model = Sequential()
  model.add(InputLayer(input_shape=(img_h, img_w, 1)))
  model.add(Conv2D(64, (3, 3), activation='relu', kernel_initializer=initializer))
  model.add(Conv2D(32, (3, 3), activation='relu', kernel_initializer=initializer))
  model.add(Conv2D(32, (3, 3), activation='relu', kernel_initializer=initializer))
  model.add(Flatten())
  model.add(Dense(64, activation='relu', kernel_initializer=initializer))
  model.add(Dense(10, activation='softmax'))
  # compile model
  # `learning_rate` is the supported keyword; the `lr` alias is deprecated
  # and rejected by newer Keras releases.
  opt = SGD(learning_rate=lr_val, momentum=momentum_val)
  model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
  return model

The above model is a convolutional network with 2D 3x3 kernels that will be convolved across the whole image. This network has 3 convolutional layers and 1 hidden dense layer. The output layer has 10 units, one for each target class.

###Task 2: Techniques for Optimization

#### Experiment 1: Parameter Initialization

In [None]:
# Experiment 1: hyper-parameters held fixed while the initializer varies
epochs, batch_size = 15, 64
lr, momentum = 0.01, 0.9

###### Model 1: Fully Connected Model

The three initializers used to set the model weights are 'Zeros', 'HeUniform' and 'TruncatedNormal'. The comparison of training loss(*loss*), training accuracy (*accuracy*), validation loss(*val_loss*) and validation accuracy(*val_accuracy*) for each experiment with varying initializers can be seen in the training process below. The ranking of initializers best to worst (when analyzing training loss and accuracy) is: 1) HeUniform 2)TruncatedNormal 3)Zeros.

In [None]:
# 'Zeros' initializer --> slow loss convergence and poor accuracy
initializer = 'Zeros'
fc_model = define_fully_connected_model(
  lr_val=lr,
  momentum_val=momentum,
  img_h=img_h,
  img_w=img_w,
  initializer=initializer,
)
# the test split is passed as the per-epoch validation set
fc_model.fit(
  x=train_data,
  y=train_labels,
  batch_size=batch_size,
  epochs=epochs,
  validation_data=(test_data, test_labels),
)


Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7f2e6a6c7c10>

In [None]:
# 'HeUniform' initializer --> fast loss convergence and good accuracy
initializer = 'HeUniform'
fc_model = define_fully_connected_model(
  lr_val=lr,
  momentum_val=momentum,
  img_h=img_h,
  img_w=img_w,
  initializer=initializer,
)
# the test split is passed as the per-epoch validation set
fc_model.fit(
  x=train_data,
  y=train_labels,
  batch_size=batch_size,
  epochs=epochs,
  validation_data=(test_data, test_labels),
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7f2e6a798a90>

In [None]:
# 'TruncatedNormal' initializer --> fast loss convergence and good
# (but worse than previous case) accuracy
initializer = 'TruncatedNormal'
fc_model = define_fully_connected_model(
  lr_val=lr,
  momentum_val=momentum,
  img_h=img_h,
  img_w=img_w,
  initializer=initializer,
)
# the test split is passed as the per-epoch validation set
fc_model.fit(
  x=train_data,
  y=train_labels,
  batch_size=batch_size,
  epochs=epochs,
  validation_data=(test_data, test_labels),
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7f2e6a2d0490>

###### Model 2: Locally Connected Model

The three initializers used to set the model weights are 'Zeros', 'RandomNormal' and 'HeUniform'. The comparison of training loss(*loss*), training accuracy (*accuracy*), validation loss(*val_loss*) and validation accuracy(*val_accuracy*) for each experiment with varying initializers can be seen in the training process below. The ranking of initializers best to worst (when analyzing training loss and accuracy) is: 1) RandomNormal 2)HeUniform 3)Zeros.

In [None]:
# 'Zeros' initializer --> slow loss convergence and poor accuracy
initializer = 'Zeros'
loc_con_model = locally_connected_model(
  lr_val=lr,
  momentum_val=momentum,
  img_h=img_h,
  img_w=img_w,
  initializer=initializer,
)
# the test split is passed as the per-epoch validation set
loc_con_model.fit(
  x=train_data,
  y=train_labels,
  batch_size=batch_size,
  epochs=epochs,
  validation_data=(test_data, test_labels),
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7f2e60263990>

In [None]:
# 'RandomNormal' initializer --> fast loss convergence and good accuracy
initializer = 'RandomNormal'
loc_con_model = locally_connected_model(
  lr_val=lr,
  momentum_val=momentum,
  img_h=img_h,
  img_w=img_w,
  initializer=initializer,
)
# the test split is passed as the per-epoch validation set
loc_con_model.fit(
  x=train_data,
  y=train_labels,
  batch_size=batch_size,
  epochs=epochs,
  validation_data=(test_data, test_labels),
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7f2e00c8e910>

In [None]:
# 'HeUniform' initializer --> fast loss convergence and good
# (but worse than previous case) accuracy
initializer = 'HeUniform'
loc_con_model = locally_connected_model(
  lr_val=lr,
  momentum_val=momentum,
  img_h=img_h,
  img_w=img_w,
  initializer=initializer,
)
# the test split is passed as the per-epoch validation set
loc_con_model.fit(
  x=train_data,
  y=train_labels,
  batch_size=batch_size,
  epochs=epochs,
  validation_data=(test_data, test_labels),
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7f2dde306210>

###### Model 3: Convolutional Model

The three initializers used to set the model weights are 'Zeros', 'HeUniform' and 'HeNormal'. The comparison of training loss (*loss*), training accuracy (*accuracy*), validation loss (*val_loss*) and validation accuracy (*val_accuracy*) for each experiment with varying initializers can be seen in the training process below. The ranking of initializers best to worst (when analyzing training loss and accuracy) is: 1) HeUniform 2) HeNormal 3) Zeros.

In [None]:
# define initializer --> slow loss convergence and poor accuracy
initializer = 'Zeros'
# Keep the trained network under its own name so the `conv_model` factory
# function is not overwritten and stays callable in later cells.
cnn_model = conv_model(lr, momentum, img_h, img_w, initializer)
cnn_model.fit(train_data, train_labels, batch_size=batch_size, epochs=epochs, validation_data=(test_data, test_labels))

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7f2dbbedf750>

In [None]:
# redefine model to avoid errors
# (an earlier cell may have rebound the name `conv_model` to a model instance,
# so the factory function is declared again before it is called)

def conv_model(lr_val, momentum_val, img_h, img_w, initializer):
  """Build and compile the convolutional classifier.

  Architecture: three 3x3 Conv2D layers (64, 32 and 32 filters, relu),
  Flatten, Dense(64, relu) and a 10-unit softmax output.

  Args:
    lr_val: Learning rate for the SGD optimizer.
    momentum_val: Momentum for the SGD optimizer.
    img_h: Input image height.
    img_w: Input image width.
    initializer: Kernel initializer for the Conv2D layers and the hidden
      Dense layer (the output layer keeps the Keras default).

  Returns:
    The compiled Sequential model (categorical cross-entropy loss,
    accuracy metric).
  """
  model = Sequential()
  model.add(InputLayer(input_shape=(img_h, img_w, 1)))
  model.add(Conv2D(64, (3, 3), activation='relu', kernel_initializer=initializer))
  model.add(Conv2D(32, (3, 3), activation='relu', kernel_initializer=initializer))
  model.add(Conv2D(32, (3, 3), activation='relu', kernel_initializer=initializer))
  model.add(Flatten())
  model.add(Dense(64, activation='relu', kernel_initializer=initializer))
  model.add(Dense(10, activation='softmax'))
  # compile model
  # `learning_rate` is the supported keyword; the `lr` alias is deprecated
  # and rejected by newer Keras releases.
  opt = SGD(learning_rate=lr_val, momentum=momentum_val)
  model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
  return model

In [None]:
# define initializer --> fast loss convergence and good accuracy
initializer = 'HeUniform'
# Keep the trained network under its own name so the `conv_model` factory
# function is not overwritten and stays callable in later cells.
cnn_model = conv_model(lr, momentum, img_h, img_w, initializer)
cnn_model.fit(train_data, train_labels, batch_size=batch_size, epochs=epochs, validation_data=(test_data, test_labels))

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7f2dbbdf6a50>

In [None]:
# redefine model to avoid errors
# (an earlier cell may have rebound the name `conv_model` to a model instance,
# so the factory function is declared again before it is called)

def conv_model(lr_val, momentum_val, img_h, img_w, initializer):
  """Build and compile the convolutional classifier.

  Architecture: three 3x3 Conv2D layers (64, 32 and 32 filters, relu),
  Flatten, Dense(64, relu) and a 10-unit softmax output.

  Args:
    lr_val: Learning rate for the SGD optimizer.
    momentum_val: Momentum for the SGD optimizer.
    img_h: Input image height.
    img_w: Input image width.
    initializer: Kernel initializer for the Conv2D layers and the hidden
      Dense layer (the output layer keeps the Keras default).

  Returns:
    The compiled Sequential model (categorical cross-entropy loss,
    accuracy metric).
  """
  model = Sequential()
  model.add(InputLayer(input_shape=(img_h, img_w, 1)))
  model.add(Conv2D(64, (3, 3), activation='relu', kernel_initializer=initializer))
  model.add(Conv2D(32, (3, 3), activation='relu', kernel_initializer=initializer))
  model.add(Conv2D(32, (3, 3), activation='relu', kernel_initializer=initializer))
  model.add(Flatten())
  model.add(Dense(64, activation='relu', kernel_initializer=initializer))
  model.add(Dense(10, activation='softmax'))
  # compile model
  # `learning_rate` is the supported keyword; the `lr` alias is deprecated
  # and rejected by newer Keras releases.
  opt = SGD(learning_rate=lr_val, momentum=momentum_val)
  model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
  return model

In [None]:
# define initializer --> fast loss convergence and good(but worse than previous case) accuracy
initializer = 'HeNormal'
# Keep the trained network under its own name so the `conv_model` factory
# function is not overwritten and stays callable in later cells.
cnn_model = conv_model(lr, momentum, img_h, img_w, initializer)
cnn_model.fit(train_data, train_labels, batch_size=batch_size, epochs=epochs, validation_data=(test_data, test_labels))

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7f2dbb42dad0>

In [None]:
# redefine model to avoid errors
# (an earlier cell may have rebound the name `conv_model` to a model instance,
# so the factory function is declared again before it is called)

def conv_model(lr_val, momentum_val, img_h, img_w, initializer):
  """Build and compile the convolutional classifier.

  Architecture: three 3x3 Conv2D layers (64, 32 and 32 filters, relu),
  Flatten, Dense(64, relu) and a 10-unit softmax output.

  Args:
    lr_val: Learning rate for the SGD optimizer.
    momentum_val: Momentum for the SGD optimizer.
    img_h: Input image height.
    img_w: Input image width.
    initializer: Kernel initializer for the Conv2D layers and the hidden
      Dense layer (the output layer keeps the Keras default).

  Returns:
    The compiled Sequential model (categorical cross-entropy loss,
    accuracy metric).
  """
  model = Sequential()
  model.add(InputLayer(input_shape=(img_h, img_w, 1)))
  model.add(Conv2D(64, (3, 3), activation='relu', kernel_initializer=initializer))
  model.add(Conv2D(32, (3, 3), activation='relu', kernel_initializer=initializer))
  model.add(Conv2D(32, (3, 3), activation='relu', kernel_initializer=initializer))
  model.add(Flatten())
  model.add(Dense(64, activation='relu', kernel_initializer=initializer))
  model.add(Dense(10, activation='softmax'))
  # compile model
  # `learning_rate` is the supported keyword; the `lr` alias is deprecated
  # and rejected by newer Keras releases.
  opt = SGD(learning_rate=lr_val, momentum=momentum_val)
  model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
  return model

#### Experiment 2: Varying Learning Rates

In [None]:
# Experiment 2: hyper-parameters held fixed while the learning rate varies.
# `lr` is deliberately not assigned here; each run below sets its own value.
epochs, batch_size = 15, 64
initializer, momentum = 'HeUniform', 0.9

###### Model 1: Fully Connected Model

In [None]:
# very small learning rate --> slow loss convergence and poor accuracy
lr = 1e-6
fc_model = define_fully_connected_model(
  lr_val=lr,
  momentum_val=momentum,
  img_h=img_h,
  img_w=img_w,
  initializer=initializer,
)
# the test split is passed as the per-epoch validation set
fc_model.fit(
  x=train_data,
  y=train_labels,
  batch_size=batch_size,
  epochs=epochs,
  validation_data=(test_data, test_labels),
)


Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7f2dbb261590>

In [None]:
# moderate learning rate --> fast loss convergence and good accuracy
lr = 0.01
fc_model = define_fully_connected_model(
  lr_val=lr,
  momentum_val=momentum,
  img_h=img_h,
  img_w=img_w,
  initializer=initializer,
)
# the test split is passed as the per-epoch validation set
fc_model.fit(
  x=train_data,
  y=train_labels,
  batch_size=batch_size,
  epochs=epochs,
  validation_data=(test_data, test_labels),
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7f2dbb13be10>

In [None]:
# larger learning rate --> fast loss convergence and good
# (but worse than previous case) accuracy
lr = 0.09
fc_model = define_fully_connected_model(
  lr_val=lr,
  momentum_val=momentum,
  img_h=img_h,
  img_w=img_w,
  initializer=initializer,
)
# the test split is passed as the per-epoch validation set
fc_model.fit(
  x=train_data,
  y=train_labels,
  batch_size=batch_size,
  epochs=epochs,
  validation_data=(test_data, test_labels),
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7f2dbafb39d0>

###### Model 2: Locally Connected Model

In [None]:
# set best initializer for Model 2
# (RandomNormal ranked first for the locally connected model in Experiment 1)
initializer = "RandomNormal"

In [None]:
# very small learning rate --> slow loss convergence and poor accuracy
lr = 1e-6
loc_con_model = locally_connected_model(
  lr_val=lr,
  momentum_val=momentum,
  img_h=img_h,
  img_w=img_w,
  initializer=initializer,
)
# the test split is passed as the per-epoch validation set
loc_con_model.fit(
  x=train_data,
  y=train_labels,
  batch_size=batch_size,
  epochs=epochs,
  validation_data=(test_data, test_labels),
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7f2d44a4bcd0>

In [None]:
# moderate learning rate --> fast loss convergence and good accuracy
lr = 0.01
loc_con_model = locally_connected_model(
  lr_val=lr,
  momentum_val=momentum,
  img_h=img_h,
  img_w=img_w,
  initializer=initializer,
)
# the test split is passed as the per-epoch validation set
loc_con_model.fit(
  x=train_data,
  y=train_labels,
  batch_size=batch_size,
  epochs=epochs,
  validation_data=(test_data, test_labels),
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7f2d40a1ddd0>

In [None]:
# larger learning rate --> fast loss convergence and good
# (but worse than previous case) accuracy
lr = 0.20
loc_con_model = locally_connected_model(
  lr_val=lr,
  momentum_val=momentum,
  img_h=img_h,
  img_w=img_w,
  initializer=initializer,
)
# the test split is passed as the per-epoch validation set
loc_con_model.fit(
  x=train_data,
  y=train_labels,
  batch_size=batch_size,
  epochs=epochs,
  validation_data=(test_data, test_labels),
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7f2d40eee210>

###### Model 3: Convolutional Model

In [None]:
# set best initializer for Model 3
# (HeUniform ranked first for the convolutional model in Experiment 1)
initializer = "HeUniform"

In [None]:
# very small learning rate --> slow loss convergence and poor accuracy
lr = 1e-6
# Keep the trained network under its own name so the `conv_model` factory
# function is not overwritten and stays callable in later cells.
cnn_model = conv_model(lr, momentum, img_h, img_w, initializer)
cnn_model.fit(train_data, train_labels, batch_size=batch_size, epochs=epochs, validation_data=(test_data, test_labels))

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7f2d3b76d410>

In [None]:
# redefine model to avoid errors
# (an earlier cell may have rebound the name `conv_model` to a model instance,
# so the factory function is declared again before it is called)

def conv_model(lr_val, momentum_val, img_h, img_w, initializer):
  """Build and compile the convolutional classifier.

  Architecture: three 3x3 Conv2D layers (64, 32 and 32 filters, relu),
  Flatten, Dense(64, relu) and a 10-unit softmax output.

  Args:
    lr_val: Learning rate for the SGD optimizer.
    momentum_val: Momentum for the SGD optimizer.
    img_h: Input image height.
    img_w: Input image width.
    initializer: Kernel initializer for the Conv2D layers and the hidden
      Dense layer (the output layer keeps the Keras default).

  Returns:
    The compiled Sequential model (categorical cross-entropy loss,
    accuracy metric).
  """
  model = Sequential()
  model.add(InputLayer(input_shape=(img_h, img_w, 1)))
  model.add(Conv2D(64, (3, 3), activation='relu', kernel_initializer=initializer))
  model.add(Conv2D(32, (3, 3), activation='relu', kernel_initializer=initializer))
  model.add(Conv2D(32, (3, 3), activation='relu', kernel_initializer=initializer))
  model.add(Flatten())
  model.add(Dense(64, activation='relu', kernel_initializer=initializer))
  model.add(Dense(10, activation='softmax'))
  # compile model
  # `learning_rate` is the supported keyword; the `lr` alias is deprecated
  # and rejected by newer Keras releases.
  opt = SGD(learning_rate=lr_val, momentum=momentum_val)
  model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
  return model

In [None]:
# moderate learning rate --> fast loss convergence and good accuracy
lr = 0.01
# Keep the trained network under its own name so the `conv_model` factory
# function is not overwritten and stays callable in later cells.
cnn_model = conv_model(lr, momentum, img_h, img_w, initializer)
cnn_model.fit(train_data, train_labels, batch_size=batch_size, epochs=epochs, validation_data=(test_data, test_labels))

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7f2d3ae15c90>

In [None]:
# redefine model to avoid errors
# (an earlier cell may have rebound the name `conv_model` to a model instance,
# so the factory function is declared again before it is called)

def conv_model(lr_val, momentum_val, img_h, img_w, initializer):
  """Build and compile the convolutional classifier.

  Architecture: three 3x3 Conv2D layers (64, 32 and 32 filters, relu),
  Flatten, Dense(64, relu) and a 10-unit softmax output.

  Args:
    lr_val: Learning rate for the SGD optimizer.
    momentum_val: Momentum for the SGD optimizer.
    img_h: Input image height.
    img_w: Input image width.
    initializer: Kernel initializer for the Conv2D layers and the hidden
      Dense layer (the output layer keeps the Keras default).

  Returns:
    The compiled Sequential model (categorical cross-entropy loss,
    accuracy metric).
  """
  model = Sequential()
  model.add(InputLayer(input_shape=(img_h, img_w, 1)))
  model.add(Conv2D(64, (3, 3), activation='relu', kernel_initializer=initializer))
  model.add(Conv2D(32, (3, 3), activation='relu', kernel_initializer=initializer))
  model.add(Conv2D(32, (3, 3), activation='relu', kernel_initializer=initializer))
  model.add(Flatten())
  model.add(Dense(64, activation='relu', kernel_initializer=initializer))
  model.add(Dense(10, activation='softmax'))
  # compile model
  # `learning_rate` is the supported keyword; the `lr` alias is deprecated
  # and rejected by newer Keras releases.
  opt = SGD(learning_rate=lr_val, momentum=momentum_val)
  model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
  return model

In [None]:
# larger learning rate --> fast loss convergence and good(but worse than previous case) accuracy
lr = 0.092
# Keep the trained network under its own name so the `conv_model` factory
# function is not overwritten and stays callable in later cells.
cnn_model = conv_model(lr, momentum, img_h, img_w, initializer)
cnn_model.fit(train_data, train_labels, batch_size=batch_size, epochs=epochs, validation_data=(test_data, test_labels))

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7f2d3ac77b10>

In [None]:
# redefine model to avoid errors
# (an earlier cell may have rebound the name `conv_model` to a model instance,
# so the factory function is declared again before it is called)

def conv_model(lr_val, momentum_val, img_h, img_w, initializer):
  """Build and compile the convolutional classifier.

  Architecture: three 3x3 Conv2D layers (64, 32 and 32 filters, relu),
  Flatten, Dense(64, relu) and a 10-unit softmax output.

  Args:
    lr_val: Learning rate for the SGD optimizer.
    momentum_val: Momentum for the SGD optimizer.
    img_h: Input image height.
    img_w: Input image width.
    initializer: Kernel initializer for the Conv2D layers and the hidden
      Dense layer (the output layer keeps the Keras default).

  Returns:
    The compiled Sequential model (categorical cross-entropy loss,
    accuracy metric).
  """
  model = Sequential()
  model.add(InputLayer(input_shape=(img_h, img_w, 1)))
  model.add(Conv2D(64, (3, 3), activation='relu', kernel_initializer=initializer))
  model.add(Conv2D(32, (3, 3), activation='relu', kernel_initializer=initializer))
  model.add(Conv2D(32, (3, 3), activation='relu', kernel_initializer=initializer))
  model.add(Flatten())
  model.add(Dense(64, activation='relu', kernel_initializer=initializer))
  model.add(Dense(10, activation='softmax'))
  # compile model
  # `learning_rate` is the supported keyword; the `lr` alias is deprecated
  # and rejected by newer Keras releases.
  opt = SGD(learning_rate=lr_val, momentum=momentum_val)
  model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
  return model

#### Experiment 3: Effect of Batch Size on Batch Normalization for Convolutional Network

Increasing the batch size brings the statistics sampled from each batch closer to those of the training population, which improves batch normalization. With a batch size that is too small, the network is chasing a moving target. This change in the distribution of inputs to layers in the network is referred to by the technical name “internal covariate shift.”

In [None]:
# Experiment 3: hyper-parameters held fixed (best CNN values) while the
# batch size varies
epochs, lr = 15, 0.01
initializer, momentum = 'HeUniform', 0.9

In [None]:
# ineffective batch size
batch_size = 1
# Keep the trained network under its own name so the `conv_model` factory
# function is not overwritten and stays callable in later cells.
cnn_model = conv_model(lr, momentum, img_h, img_w, initializer)
cnn_model.fit(train_data, train_labels, batch_size=batch_size, epochs=epochs, validation_data=(test_data, test_labels))

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7f2d3aab1bd0>

In [None]:
# redefine model to avoid errors
# (an earlier cell may have rebound the name `conv_model` to a model instance,
# so the factory function is declared again before it is called)

def conv_model(lr_val, momentum_val, img_h, img_w, initializer):
  """Build and compile the convolutional classifier.

  Architecture: three 3x3 Conv2D layers (64, 32 and 32 filters, relu),
  Flatten, Dense(64, relu) and a 10-unit softmax output.

  Args:
    lr_val: Learning rate for the SGD optimizer.
    momentum_val: Momentum for the SGD optimizer.
    img_h: Input image height.
    img_w: Input image width.
    initializer: Kernel initializer for the Conv2D layers and the hidden
      Dense layer (the output layer keeps the Keras default).

  Returns:
    The compiled Sequential model (categorical cross-entropy loss,
    accuracy metric).
  """
  model = Sequential()
  model.add(InputLayer(input_shape=(img_h, img_w, 1)))
  model.add(Conv2D(64, (3, 3), activation='relu', kernel_initializer=initializer))
  model.add(Conv2D(32, (3, 3), activation='relu', kernel_initializer=initializer))
  model.add(Conv2D(32, (3, 3), activation='relu', kernel_initializer=initializer))
  model.add(Flatten())
  model.add(Dense(64, activation='relu', kernel_initializer=initializer))
  model.add(Dense(10, activation='softmax'))
  # compile model
  # `learning_rate` is the supported keyword; the `lr` alias is deprecated
  # and rejected by newer Keras releases.
  opt = SGD(learning_rate=lr_val, momentum=momentum_val)
  model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
  return model

In [None]:
# effective batch size
batch_size = 128
# Keep the trained network under its own name so the `conv_model` factory
# function is not overwritten and stays callable in later cells.
cnn_model = conv_model(lr, momentum, img_h, img_w, initializer)
cnn_model.fit(train_data, train_labels, batch_size=batch_size, epochs=epochs, validation_data=(test_data, test_labels))

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7f2d3a95b9d0>

#### Experiment 4: Effect of Momentum on Convolutional Network Training

Different momentum values are contrasted on the convolutional neural network. To exaggerate the differences the values 0.01, 0.50 and 0.99 are used. With a higher momentum, a marginally better performance was observed. 

In [None]:
# Experiment 4: hyper-parameters held fixed (best CNN values) while the
# momentum varies
epochs, lr = 15, 0.01
batch_size, initializer = 128, 'HeUniform'

In [None]:
# redefine model to avoid errors
# (an earlier cell may have rebound the name `conv_model` to a model instance,
# so the factory function is declared again before it is called)

def conv_model(lr_val, momentum_val, img_h, img_w, initializer):
  """Build and compile the convolutional classifier.

  Architecture: three 3x3 Conv2D layers (64, 32 and 32 filters, relu),
  Flatten, Dense(64, relu) and a 10-unit softmax output.

  Args:
    lr_val: Learning rate for the SGD optimizer.
    momentum_val: Momentum for the SGD optimizer.
    img_h: Input image height.
    img_w: Input image width.
    initializer: Kernel initializer for the Conv2D layers and the hidden
      Dense layer (the output layer keeps the Keras default).

  Returns:
    The compiled Sequential model (categorical cross-entropy loss,
    accuracy metric).
  """
  model = Sequential()
  model.add(InputLayer(input_shape=(img_h, img_w, 1)))
  model.add(Conv2D(64, (3, 3), activation='relu', kernel_initializer=initializer))
  model.add(Conv2D(32, (3, 3), activation='relu', kernel_initializer=initializer))
  model.add(Conv2D(32, (3, 3), activation='relu', kernel_initializer=initializer))
  model.add(Flatten())
  model.add(Dense(64, activation='relu', kernel_initializer=initializer))
  model.add(Dense(10, activation='softmax'))
  # compile model
  # `learning_rate` is the supported keyword; the `lr` alias is deprecated
  # and rejected by newer Keras releases.
  opt = SGD(learning_rate=lr_val, momentum=momentum_val)
  model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
  return model

In [None]:
# momentum value 1
momentum = 0.01
# Keep the trained network under its own name so the `conv_model` factory
# function is not overwritten and stays callable in later cells.
cnn_model = conv_model(lr, momentum, img_h, img_w, initializer)
cnn_model.fit(train_data, train_labels, batch_size=batch_size, epochs=epochs, validation_data=(test_data, test_labels))

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7f2d36720b10>

In [None]:
# redefine model to avoid errors
# (an earlier cell may have rebound the name `conv_model` to a model instance,
# so the factory function is declared again before it is called)

def conv_model(lr_val, momentum_val, img_h, img_w, initializer):
  """Build and compile the convolutional classifier.

  Architecture: three 3x3 Conv2D layers (64, 32 and 32 filters, relu),
  Flatten, Dense(64, relu) and a 10-unit softmax output.

  Args:
    lr_val: Learning rate for the SGD optimizer.
    momentum_val: Momentum for the SGD optimizer.
    img_h: Input image height.
    img_w: Input image width.
    initializer: Kernel initializer for the Conv2D layers and the hidden
      Dense layer (the output layer keeps the Keras default).

  Returns:
    The compiled Sequential model (categorical cross-entropy loss,
    accuracy metric).
  """
  model = Sequential()
  model.add(InputLayer(input_shape=(img_h, img_w, 1)))
  model.add(Conv2D(64, (3, 3), activation='relu', kernel_initializer=initializer))
  model.add(Conv2D(32, (3, 3), activation='relu', kernel_initializer=initializer))
  model.add(Conv2D(32, (3, 3), activation='relu', kernel_initializer=initializer))
  model.add(Flatten())
  model.add(Dense(64, activation='relu', kernel_initializer=initializer))
  model.add(Dense(10, activation='softmax'))
  # compile model
  # `learning_rate` is the supported keyword; the `lr` alias is deprecated
  # and rejected by newer Keras releases.
  opt = SGD(learning_rate=lr_val, momentum=momentum_val)
  model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
  return model

In [None]:
# momentum value 2
momentum = 0.50
# Keep the trained network under its own name so the `conv_model` factory
# function is not overwritten and stays callable in later cells.
cnn_model = conv_model(lr, momentum, img_h, img_w, initializer)
cnn_model.fit(train_data, train_labels, batch_size=batch_size, epochs=epochs, validation_data=(test_data, test_labels))

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7f2d365cea50>

In [None]:
# redefine model to avoid errors
# (an earlier cell may have rebound the name `conv_model` to a model instance,
# so the factory function is declared again before it is called)

def conv_model(lr_val, momentum_val, img_h, img_w, initializer):
  """Build and compile the convolutional classifier.

  Architecture: three 3x3 Conv2D layers (64, 32 and 32 filters, relu),
  Flatten, Dense(64, relu) and a 10-unit softmax output.

  Args:
    lr_val: Learning rate for the SGD optimizer.
    momentum_val: Momentum for the SGD optimizer.
    img_h: Input image height.
    img_w: Input image width.
    initializer: Kernel initializer for the Conv2D layers and the hidden
      Dense layer (the output layer keeps the Keras default).

  Returns:
    The compiled Sequential model (categorical cross-entropy loss,
    accuracy metric).
  """
  model = Sequential()
  model.add(InputLayer(input_shape=(img_h, img_w, 1)))
  model.add(Conv2D(64, (3, 3), activation='relu', kernel_initializer=initializer))
  model.add(Conv2D(32, (3, 3), activation='relu', kernel_initializer=initializer))
  model.add(Conv2D(32, (3, 3), activation='relu', kernel_initializer=initializer))
  model.add(Flatten())
  model.add(Dense(64, activation='relu', kernel_initializer=initializer))
  model.add(Dense(10, activation='softmax'))
  # compile model
  # `learning_rate` is the supported keyword; the `lr` alias is deprecated
  # and rejected by newer Keras releases.
  opt = SGD(learning_rate=lr_val, momentum=momentum_val)
  model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
  return model

In [None]:
# momentum value 3
momentum = 0.99
# Keep the trained network under its own name so the `conv_model` factory
# function is not overwritten and stays callable in later cells.
cnn_model = conv_model(lr, momentum, img_h, img_w, initializer)
cnn_model.fit(train_data, train_labels, batch_size=batch_size, epochs=epochs, validation_data=(test_data, test_labels))

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7f2d364b30d0>

###Task 3: Techniques for Improving Generalization

#### Method 1: Ensemble Learning

In this framework, 20 weak learners are used and their learned values are combined to make one strong learner. In this way the final strong model is resilient and performs well.

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

In [None]:
# Reload the raw text files so the ensemble works on flat feature rows rather
# than the reshaped arrays used by the Keras models.
ensemble_train_set = np.genfromtxt(fname="./mnist_train.txt")
ensemble_test_set = np.genfromtxt(fname="./mnist_test.txt")

# column 0 -> labels
ensemble_train_labels, ensemble_test_labels = ensemble_train_set[:, 0], ensemble_test_set[:, 0]
print("ensemble training labels shape: ", ensemble_train_labels.shape)
print("ensemble testing labels shape: ", ensemble_test_labels.shape)

# remaining columns -> features
ensemble_train_data, ensemble_test_data = ensemble_train_set[:, 1:], ensemble_test_set[:, 1:]
print("ensemble training data shape: ", ensemble_train_data.shape)
print("ensemble testing data shape: ", ensemble_test_data.shape)

ensemble training labels shape:  (7291,)
ensemble testing labels shape:  (2007,)
ensemble training data shape:  (7291, 256)
ensemble testing data shape:  (2007, 256)


In [None]:
# Bagging: 20 decision trees, each fit on a sample half the size of the
# training set and using all of the features
bg = BaggingClassifier(
  DecisionTreeClassifier(),
  n_estimators=20,
  max_samples=0.5,
  max_features=1.0,
)
bg.fit(X=ensemble_train_data, y=ensemble_train_labels)

BaggingClassifier(base_estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                        class_weight=None,
                                                        criterion='gini',
                                                        max_depth=None,
                                                        max_features=None,
                                                        max_leaf_nodes=None,
                                                        min_impurity_decrease=0.0,
                                                        min_impurity_split=None,
                                                        min_samples_leaf=1,
                                                        min_samples_split=2,
                                                        min_weight_fraction_leaf=0.0,
                                                        presort='deprecated',
                                                        random_state=None,


In [None]:
# mean accuracy of the bagged ensemble on the training split
bg.score(X=ensemble_train_data, y=ensemble_train_labels)

0.9916335207790427

In [None]:
# mean accuracy of the bagged ensemble on the held-out test split
bg.score(X=ensemble_test_data, y=ensemble_test_labels)

0.8824115595416044

#### Method 2: Dropout

In [None]:
# Hyper-parameters for the dropout comparison (best values found for the
# fully connected network)
epochs, batch_size = 15, 128
lr, momentum = 0.01, 0.9
initializer = 'HeUniform'

In [None]:
# redefine fully connected network WITHOUT dropout
# reduce number of neurons to reduce overfitting

def define_fully_connected_model(lr_val, momentum_val, img_h, img_w, initializer):
  """Build and compile the reduced fully connected classifier (no dropout).

  The network flattens an (img_h, img_w, 1) input and passes it through
  Dense(10, relu) -> Dense(32, tanh) before a 10-unit softmax output.

  Args:
    lr_val: Learning rate for the SGD optimizer.
    momentum_val: Momentum for the SGD optimizer.
    img_h: Input image height.
    img_w: Input image width.
    initializer: Kernel initializer for the hidden Dense layers (the
      output layer keeps the Keras default).

  Returns:
    The compiled Sequential model (categorical cross-entropy loss,
    accuracy metric).
  """
  model = Sequential()
  model.add(InputLayer(input_shape=(img_h, img_w, 1)))
  model.add(Flatten())
  model.add(Dense(10, activation='relu', kernel_initializer=initializer))
  model.add(Dense(32, activation='tanh', kernel_initializer=initializer))
  model.add(Dense(10, activation='softmax'))
  # compile model
  # `learning_rate` is the supported keyword; the `lr` alias is deprecated
  # and rejected by newer Keras releases.
  opt = SGD(learning_rate=lr_val, momentum=momentum_val)
  model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
  return model

In [None]:
# baseline run: fully connected network without dropout
fc_model = define_fully_connected_model(
  lr_val=lr,
  momentum_val=momentum,
  img_h=img_h,
  img_w=img_w,
  initializer=initializer,
)
# the test split is passed as the per-epoch validation set
fc_model.fit(
  x=train_data,
  y=train_labels,
  batch_size=batch_size,
  epochs=epochs,
  validation_data=(test_data, test_labels),
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7f2d3a9bdb90>

Add Dropout Layers to Fully Connected Network

In [None]:
# re-define fully connected network WITH dropout layer
# Effective case with dropout rate = 0.2

def define_fully_connected_model(lr_val, momentum_val, img_h, img_w, initializer):
  """Build and compile the reduced fully connected classifier with Dropout(0.2).

  The network flattens an (img_h, img_w, 1) input and passes it through
  Dense(10, relu) -> Dropout(0.2) -> Dense(32, tanh) before a 10-unit
  softmax output.

  Args:
    lr_val: Learning rate for the SGD optimizer.
    momentum_val: Momentum for the SGD optimizer.
    img_h: Input image height.
    img_w: Input image width.
    initializer: Kernel initializer for the hidden Dense layers (the
      output layer keeps the Keras default).

  Returns:
    The compiled Sequential model (categorical cross-entropy loss,
    accuracy metric).
  """
  model = Sequential()
  model.add(InputLayer(input_shape=(img_h, img_w, 1)))
  model.add(Flatten())
  model.add(Dense(10, activation='relu', kernel_initializer=initializer))
  model.add(Dropout(0.2)) ## added dropout layer ****
  model.add(Dense(32, activation='tanh', kernel_initializer=initializer))
  model.add(Dense(10, activation='softmax'))
  # compile model
  # `learning_rate` is the supported keyword; the `lr` alias is deprecated
  # and rejected by newer Keras releases.
  opt = SGD(learning_rate=lr_val, momentum=momentum_val)
  model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
  return model

In [None]:
# run the fully connected network WITH dropout
# (effective case, dropout rate = 0.2)
fc_model = define_fully_connected_model(
  lr_val=lr,
  momentum_val=momentum,
  img_h=img_h,
  img_w=img_w,
  initializer=initializer,
)
# the test split is passed as the per-epoch validation set
fc_model.fit(
  x=train_data,
  y=train_labels,
  batch_size=batch_size,
  epochs=epochs,
  validation_data=(test_data, test_labels),
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7f2d3ae0f5d0>

In [None]:
# define fully connected model WITH dropout layer
# Ineffective case with dropout rate = 0.80 --> TOO HIGH DROPOUT RATE!

def define_fully_connected_model(lr_val, momentum_val, img_h, img_w, initializer):
  """Build and compile the reduced fully connected classifier with Dropout(0.80).

  The network flattens an (img_h, img_w, 1) input and passes it through
  Dense(10, relu) -> Dropout(0.80) -> Dense(32, tanh) before a 10-unit
  softmax output.

  Args:
    lr_val: Learning rate for the SGD optimizer.
    momentum_val: Momentum for the SGD optimizer.
    img_h: Input image height.
    img_w: Input image width.
    initializer: Kernel initializer for the hidden Dense layers (the
      output layer keeps the Keras default).

  Returns:
    The compiled Sequential model (categorical cross-entropy loss,
    accuracy metric).
  """
  model = Sequential()
  model.add(InputLayer(input_shape=(img_h, img_w, 1)))
  model.add(Flatten())
  model.add(Dense(10, activation='relu', kernel_initializer=initializer))
  model.add(Dropout(0.80)) ## added dropout layer ****
  model.add(Dense(32, activation='tanh', kernel_initializer=initializer))
  model.add(Dense(10, activation='softmax'))
  # compile model
  # `learning_rate` is the supported keyword; the `lr` alias is deprecated
  # and rejected by newer Keras releases.
  opt = SGD(learning_rate=lr_val, momentum=momentum_val)
  model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
  return model

In [None]:
# run the FC model WITH high dropout
# (ineffective case, dropout rate = 0.80 --> TOO HIGH DROPOUT RATE!)
fc_model = define_fully_connected_model(
  lr_val=lr,
  momentum_val=momentum,
  img_h=img_h,
  img_w=img_w,
  initializer=initializer,
)
# the test split is passed as the per-epoch validation set
fc_model.fit(
  x=train_data,
  y=train_labels,
  batch_size=batch_size,
  epochs=epochs,
  validation_data=(test_data, test_labels),
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7f2d40a5af90>