In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras import utils as tf_utils
from sklearn.model_selection import train_test_split

from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D
from keras.layers import Activation, Dropout, Flatten, Dense, Lambda
from keras.preprocessing.image import ImageDataGenerator
from keras.layers.normalization import BatchNormalization
from keras import regularizers


#from keras.models import Sequential
#from keras.layers import Dense, Dropout, Flatten
#from keras.callbacks import ModelCheckpoint


# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory
for folder, _subfolders, files_in_folder in os.walk('/kaggle/input'):
    for file_name in files_in_folder:
        print(os.path.join(folder, file_name))

# Any results you write to the current directory are saved as output.

# Loading the data...

In [None]:
# Read the competition's training and test CSV files into DataFrames
mnist_train_complete = pd.read_csv("/kaggle/input/digit-recognizer/train.csv")
mnist_test_complete = pd.read_csv("/kaggle/input/digit-recognizer/test.csv")

# Preview the first five training rows
mnist_train_complete.head(n=5)

In [None]:
# Split the training frame into labels (first column) and pixel values (remaining columns).
# The test frame only contains pixel values. Everything is converted to int32.
labels_column = mnist_train_complete.iloc[:, 0]
pixel_columns = mnist_train_complete.iloc[:, 1:]

train_y = labels_column.values.astype('int32')

# Each row of pixel values is reshaped into one 28 x 28 image; -1 lets numpy
# infer the number of images from the data
train_x = pixel_columns.values.astype('int32').reshape(-1, 28, 28)
test_x = mnist_test_complete.values.astype('int32').reshape(-1, 28, 28)

# Visualizing some digit images

In [None]:
# Show four training digits side by side, each titled with its label.
# The previous `plt.subplot(330 + i+1)` call was meant to address a 3x3 grid, but for
# i >= 10 the three-digit code overflows (341..344) and is read as a 3x4 grid, so it only
# worked by accident. An explicit 1 x N grid states the layout directly.
sample_indices = range(10, 14)
_, digit_axes = plt.subplots(1, len(sample_indices), figsize=(10, 3))
for digit_ax, i in zip(digit_axes, sample_indices):
    digit_ax.imshow(train_x[i], cmap='gray')
    digit_ax.set_title(str(train_y[i]))
    digit_ax.axis('off')

In [None]:
def visualize_detail(img, ax):
    """Draw `img` in grayscale on `ax` and write every pixel's value on top of it.

    Parameters
    ----------
    img : 2-D array
        Image to display; it must provide `.shape` and `.max()`.
    ax : matplotlib Axes (or compatible object)
        Receives one `imshow` call and one `annotate` call per pixel.
    """
    ax.imshow(img, cmap='gray')
    n_rows, n_cols = img.shape
    # Values below this cutoff get white text, all others black text
    cutoff = img.max() / 2.5
    for row in range(n_rows):
        for col in range(n_cols):
            value = img[row][col]
            text_color = 'white' if value < cutoff else 'black'
            # xy is (horizontal, vertical), i.e. (column, row)
            ax.annotate(str(round(value, 2)), xy=(col, row),
                        horizontalalignment='center',
                        verticalalignment='center',
                        color=text_color)
# Inspect one training digit pixel by pixel on a large canvas
fig, ax = plt.subplots(figsize=(12, 12))
visualize_detail(train_x[10], ax)

# Preprocessing

In [None]:
# Scale pixel intensities to [0, 1].
# The statistics come from the TRAINING set only and are applied to both sets, so training
# and test images go through exactly the same transformation (previously the test set was
# scaled and shifted with its own statistics).
# The float32 cast keeps the division result in float32 regardless of numpy's promotion rules.
pixel_max = np.float32(np.max(train_x))
train_x = train_x.astype('float32') / pixel_max
test_x = test_x.astype('float32') / pixel_max

# Center the normalized data around zero by subtracting the mean.
# (The earlier code subtracted np.std(...) here, which shifts the data but does not center it.)
mean = np.mean(train_x)
train_x -= mean
test_x -= mean

In [None]:
# creating the training and validation sets (80 % / 20 %, fixed seed for reproducibility)
splitted_train_X, splitted_test_X, splitted_train_y, splitted_test_y = train_test_split(train_x, train_y, test_size=0.2, random_state=81)

# one-hot encoding the labels of the training and validation sets (10 digit classes)
ohe_splitted_train_y = tf_utils.to_categorical(splitted_train_y, 10)
ohe_splitted_test_y = tf_utils.to_categorical(splitted_test_y, 10)

# print the first ten one-hot encoded training labels
# (previously the raw integer labels were printed under this heading)
print('One-hot labels:')
print(ohe_splitted_train_y[:10])

# Solution 1. Model using fully connected NNs

In [None]:
# Solution 1 network: flatten each image, pass it through two 512-unit ReLU layers
# (each followed by 20 % dropout) and finish with a 10-way softmax
model = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(input_shape=splitted_train_X.shape[1:]),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(10, activation='softmax'),
])

# summary of model
# model.summary()

In [None]:
# Configure training: cross-entropy on the one-hot labels, RMSprop optimizer,
# and accuracy reported as an additional metric
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Baseline: score the compiled but not yet trained network on the held-out split.
# evaluate() returns [loss, accuracy]; with ten classes the value is expected to be
# near chance level.
score = model.evaluate(splitted_test_X, ohe_splitted_test_y, verbose=0)
accuracy = 100 * score[1]

# '%.4f' prints four decimals, like the report printed after training;
# the previous '%4f' only set a minimum field width
print('Test accuracy: %.4f%%' % accuracy)

In [None]:
# Save the weights to disk whenever the monitored validation quantity improves
checkpointer = ModelCheckpoint(
    filepath='mnist.model.best.hdf5',
    save_best_only=True,
    verbose=1,
)

# Train for 10 epochs; 20 % of the training split is set aside by Keras for validation
hist = model.fit(
    x=splitted_train_X,
    y=ohe_splitted_train_y,
    epochs=10,
    batch_size=128,
    shuffle=True,
    validation_split=0.2,
    callbacks=[checkpointer],
    verbose=1,
)

In [None]:
# Restore the weights written by the ModelCheckpoint callback during training.
# NOTE(review): the callback was created without a `monitor` argument, so "best"
# presumably means lowest validation loss (the Keras default) rather than highest
# validation accuracy — confirm
model.load_weights('mnist.model.best.hdf5')

In [None]:
# Score the trained network on the held-out split; evaluate() returns [loss, accuracy]
score = model.evaluate(x=splitted_test_X, y=ohe_splitted_test_y, verbose=0)
accuracy = score[1] * 100

# report the hold-out accuracy as a percentage
print('Test accuracy: %.4f%%' % accuracy)

## Making predictions using Solution 1

In [None]:
# predict() returns one row of class scores per test image;
# the index of the largest score in each row is the predicted digit
class_scores = model.predict(test_x)
predictions = list(np.argmax(class_scores, axis=1))

In [None]:
# Build the submission file: start from the sample submission, swap its
# placeholder 'Label' column for our predictions and write the result to disk
submission = pd.read_csv('/kaggle/input/digit-recognizer/sample_submission.csv')
submission = submission.drop(columns=['Label'])
submission['Label'] = predictions
submission.to_csv('submission.csv', index=False)

The prediction obtained by this solution yielded a score of 0.97185. <br>
**Note**: By centring the normalized training (and testing) data around zero, my score moved forward by 83 places on the leaderboard (previous score was 0.96800).

# Solution 2. Model using Convolutional NNs

In this solution I replace the fully connected neural network from Solution 1 with a convolutional neural network.

In [None]:
# The convolutional layers need an explicit channel axis, so a trailing axis of
# length 1 is appended to every image of both splits
extended_splitted_train_X = np.expand_dims(splitted_train_X, axis=-1)
extended_splitted_test_X = np.expand_dims(splitted_test_X, axis=-1)
extended_splitted_test_X.shape

In [None]:
# Solution 2 network: three convolution + max-pooling stages followed by a small
# dense classifier with a 10-way softmax output
model = Sequential([
    Conv2D(filters=32, kernel_size=3, padding='same', activation='relu',
           input_shape=extended_splitted_train_X.shape[1:]),
    MaxPooling2D(pool_size=2),
    Conv2D(filters=32, kernel_size=3, padding='same', activation='relu'),
    MaxPooling2D(pool_size=2),
    Conv2D(filters=32, kernel_size=3, padding='same', activation='relu'),
    MaxPooling2D(pool_size=2),
    # turn the 3D feature maps into one 1D feature vector per image
    Flatten(),
    Dense(64),
    Activation('relu'),
    Dropout(0.2),
    Dense(10, activation='softmax'),
])

# model.summary()

In [None]:
# Same training configuration as the dense network:
# categorical cross-entropy loss, RMSprop optimizer, accuracy metric
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Baseline: score the compiled but not yet trained CNN on the held-out split.
# evaluate() returns [loss, accuracy]; with ten classes the value is expected to be
# near chance level.
score = model.evaluate(extended_splitted_test_X, ohe_splitted_test_y, verbose=0)
accuracy = 100 * score[1]

# '%.4f' prints four decimals, like the report printed after training;
# the previous '%4f' only set a minimum field width
print('Test accuracy: %.4f%%' % accuracy)

In [None]:
# Train the CNN for 10 epochs, validating after each epoch on the held-out split.
# `validation_split=0.2` was removed: Keras ignores it whenever `validation_data` is
# given, so it only suggested a second validation set that was never used.
# The ModelCheckpoint callback is left out here because passing it raised
# "AttributeError: 'Sequential' object has no attribute '_in_multi_worker_mode'".
# NOTE(review): presumably because the callback is imported from tensorflow.keras while
# this model is built with the standalone keras package — confirm.
model.fit(extended_splitted_train_X, ohe_splitted_train_y, batch_size=128, epochs=10,
          verbose=1, validation_data=(extended_splitted_test_X, ohe_splitted_test_y),
          shuffle=True)

In [None]:
# Hold-out performance of the trained CNN; evaluate() returns [loss, accuracy]
score = model.evaluate(x=extended_splitted_test_X, y=ohe_splitted_test_y, verbose=0)
accuracy = score[1] * 100

# show the accuracy as a percentage with four decimals
print('Test accuracy: %.4f%%' % accuracy)

## Making predictions using Solution 2

In [None]:
# give the test images the trailing channel axis the CNN expects
extended_test_x = np.expand_dims(test_x, axis=-1)

# one row of class scores per image -> index of the largest score = predicted digit
class_scores = model.predict(extended_test_x)
predictions = list(np.argmax(class_scores, axis=1))

# write the submission: sample file with its 'Label' column replaced by our predictions
submission = pd.read_csv('/kaggle/input/digit-recognizer/sample_submission.csv')
submission = submission.drop(columns=['Label'])
submission['Label'] = predictions
submission.to_csv('submission.csv', index=False)

The prediction obtained by this solution yielded a score of 0.98714.

# Solution 3. Model using Convolutional NNs with data augmentation

In this solution I make use of the model architecture of solution number 2 but I also implement data augmentation for the training.

## Augmenting an image

In [None]:
# define a data augmentator for our images: small random rotations, shifts,
# shears and zooms; pixels uncovered by a transformation are filled with the nearest value
image_augmentator = ImageDataGenerator(
    rotation_range=10,
    width_shift_range=0.1,
    height_shift_range=0.1,
    shear_range=0.2,
    zoom_range=0.1,
    fill_mode='nearest')

# define size of batch
batch_size = 32

# Only the training images are augmented.
train_batches = image_augmentator.flow(extended_splitted_train_X, ohe_splitted_train_y, batch_size=batch_size)

# Validation images must stay untouched: randomly distorting them makes the validation
# metrics noisy and not comparable with model.evaluate() on the same arrays. A plain
# generator (no transformations, no shuffling) just serves them in batches.
val_batches = ImageDataGenerator().flow(extended_splitted_test_X, ohe_splitted_test_y,
                                        batch_size=batch_size, shuffle=False)

Let's look at an example of data augmentation:

In [None]:
# Apply one fixed, hand-picked transformation to a single training digit
example_img = np.expand_dims(train_x[10], axis=-1)
transf_params = dict(theta=15., tx=0.1, ty=0.1, shear=0.2)
augmented_image = image_augmentator.apply_transform(example_img, transf_params)

# drop the channel axis again so the result can be drawn as a 2D image
twoDim_image = augmented_image[..., 0]

fig, ax = plt.subplots(figsize=(12, 12))
visualize_detail(twoDim_image, ax)

In [None]:
# Solution 3 network: two stacked 16-filter convolutions before the first pooling step,
# light dropout, a third convolution with pooling, then a small dense classifier
model = Sequential([
    Conv2D(filters=16, kernel_size=3, padding='same', activation='relu',
           input_shape=extended_splitted_train_X.shape[1:]),
    Conv2D(filters=16, kernel_size=3, padding='same', activation='relu'),
    MaxPooling2D(pool_size=2),
    Dropout(0.1),
    Conv2D(filters=16, kernel_size=3, padding='same', activation='relu'),
    MaxPooling2D(pool_size=2),
    # turn the 3D feature maps into one 1D feature vector per image
    Flatten(),
    Dense(64),
    Activation('relu'),
    Dropout(0.2),
    Dense(10, activation='softmax'),
])

# model.summary()

In [None]:
# Training setup for Solution 3: cross-entropy loss, RMSprop, accuracy as metric
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# `steps_per_epoch` and `validation_steps` count BATCHES, not samples. len(iterator) is the
# number of batches needed for exactly one pass over the data. The earlier call passed
# `.n` (the sample count), so its single "epoch" silently looped over the training set
# `batch_size` times and reported metrics only once, at the very end.
# To keep the amount of training unchanged, that many real epochs are run instead;
# lower `passes_over_data` for a quicker run.
passes_over_data = train_batches.batch_size
model.fit_generator(generator=train_batches, steps_per_epoch=len(train_batches),
                    epochs=passes_over_data,
                    validation_data=val_batches, validation_steps=len(val_batches))

In [None]:
# Evaluate the augmented-data CNN on the (non-augmented) held-out images;
# evaluate() returns [loss, accuracy]
score = model.evaluate(x=extended_splitted_test_X, y=ohe_splitted_test_y, verbose=0)
accuracy = score[1] * 100

# accuracy in percent, four decimals
print('Test accuracy: %.4f%%' % accuracy)

## Making predictions using Solution 3

In [None]:
# append the channel axis to the test images
extended_test_x = np.expand_dims(test_x, axis=-1)

# most likely digit per image = position of the largest class score in its row
class_scores = model.predict(extended_test_x)
predictions = list(np.argmax(class_scores, axis=1))

# assemble and save the submission file
submission = pd.read_csv('/kaggle/input/digit-recognizer/sample_submission.csv')
submission = submission.drop(columns=['Label'])
submission['Label'] = predictions
submission.to_csv('submission.csv', index=False)

The prediction obtained by this solution yielded a score of 0.98771.
I advanced 43 places on the leaderboard.

# Solution 4. Model using Convolutional NNs with data augmentation and batch normalization

In [None]:
# define a Convolutional NNs model (solution number 4)
#
# A bare `BatchNormalization(...)` expression only creates a layer object and discards it;
# a layer becomes part of the network only when it is passed to model.add(). Every batch
# normalization layer below is therefore added explicitly.
# The default axis (-1) is used because the images carry their channel on the LAST axis;
# `axis=1` would normalize along the image rows instead of per feature map.
model = Sequential()
model.add(Conv2D(filters=16, kernel_size=3, padding='same', activation='relu', input_shape=extended_splitted_train_X.shape[1:]))
model.add(BatchNormalization())
model.add(Conv2D(filters=16, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling2D(pool_size=2))
model.add(Dropout(0.1))
model.add(BatchNormalization())
model.add(Conv2D(filters=16, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling2D(pool_size=2))

# Converts our 3D feature maps to 1D features vectors
model.add(Flatten())
model.add(BatchNormalization())
model.add(Dense(64))
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(BatchNormalization())
model.add(Dense(10, activation='softmax'))

# model.summary()

In [None]:
# Training setup for Solution 4: cross-entropy loss, RMSprop, accuracy as metric
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# `steps_per_epoch` and `validation_steps` count BATCHES, not samples. len(iterator) is the
# number of batches needed for exactly one pass over the data. The earlier call passed
# `.n` (the sample count), so its single "epoch" silently looped over the training set
# `batch_size` times and reported metrics only once, at the very end.
# To keep the amount of training unchanged, that many real epochs are run instead;
# lower `passes_over_data` for a quicker run.
passes_over_data = train_batches.batch_size
model.fit_generator(generator=train_batches, steps_per_epoch=len(train_batches),
                    epochs=passes_over_data,
                    validation_data=val_batches, validation_steps=len(val_batches))

In [None]:
# Hold-out score of the Solution 4 network; evaluate() returns [loss, accuracy]
score = model.evaluate(x=extended_splitted_test_X, y=ohe_splitted_test_y, verbose=0)
accuracy = score[1] * 100

# print the accuracy as a percentage
print('Test accuracy: %.4f%%' % accuracy)

Let's start all over again, but with more filters on each convolutional layer.

In [None]:
# define a Convolutional NNs model (same architecture as in solution number 4,
# but with 32 instead of 16 filters per convolutional layer)
#
# A bare `BatchNormalization(...)` expression only creates a layer object and discards it;
# a layer becomes part of the network only when it is passed to model.add(). Every batch
# normalization layer below is therefore added explicitly.
# The default axis (-1) is used because the images carry their channel on the LAST axis;
# `axis=1` would normalize along the image rows instead of per feature map.
model = Sequential()
model.add(Conv2D(filters=32, kernel_size=3, padding='same', activation='relu', input_shape=extended_splitted_train_X.shape[1:]))
model.add(BatchNormalization())
model.add(Conv2D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling2D(pool_size=2))
model.add(Dropout(0.1))
model.add(BatchNormalization())
model.add(Conv2D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling2D(pool_size=2))

# Converts our 3D feature maps to 1D features vectors
model.add(Flatten())
model.add(BatchNormalization())
model.add(Dense(64))
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(BatchNormalization())
model.add(Dense(10, activation='softmax'))

# model.summary()

In [None]:
# Training setup for the wider network: cross-entropy loss, RMSprop, accuracy as metric
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# add another dimension (the channel axis) to the complete training data
final_train_x = train_x[..., tf.newaxis]
# one hot encode the complete training labels
final_train_y = tf_utils.to_categorical(train_y, 10)

# augmented batches drawn from the complete training set (no hold-out split this time)
batches = image_augmentator.flow(final_train_x, final_train_y, batch_size=64)

# `steps_per_epoch` counts BATCHES, not samples: len(batches) is the number of batches in
# exactly one pass over the data. The earlier call passed `batches.n` (the sample count),
# so its single "epoch" silently looped over the data `batch_size` times. To keep the
# amount of training unchanged, that many real epochs are run instead; lower
# `passes_over_data` for a quicker run.
passes_over_data = batches.batch_size
history = model.fit_generator(generator=batches, steps_per_epoch=len(batches),
                              epochs=passes_over_data)

In [None]:
# test images with the trailing channel axis the CNN expects
extended_test_x = np.expand_dims(test_x, axis=-1)

# pick, for every image, the digit with the largest class score
class_scores = model.predict(extended_test_x)
predictions = list(np.argmax(class_scores, axis=1))

# create the submission file from the sample submission
submission = pd.read_csv('/kaggle/input/digit-recognizer/sample_submission.csv')
submission = submission.drop(columns=['Label'])
submission['Label'] = predictions
submission.to_csv('submission.csv', index=False)

With this last attempt I got a score of 0.98257, which is actually not an improvement over my previous score (0.98771).
:(

# Solution 5. Adding a Lambda layer to my last CNN

I will try again, but this time I will add a Lambda layer at the input of my NN. This layer will center the input data around zero mean and scale it to unit variance (I got this from [Poonam Ligade's notebook](https://www.kaggle.com/poonaml/deep-neural-network-keras-way/notebook)). This means I have to take the original data again (not the preprocessed data) and add another dimension to it. The Lambda layer applies a "standardize" function (defined a few cells below) that performs this preprocessing on each image (i.e., as mentioned before, it centers the data around zero mean and scales it to unit variance).

In [None]:
# Solution 5 needs the ORIGINAL pixel values, without the preprocessing applied for the
# earlier solutions, so fresh arrays are built straight from the Kaggle DataFrames.
# Separate `_sol5` names avoid confusion with variables from other sections of the notebook.
train_y_sol5 = mnist_train_complete.iloc[:, 0].values.astype('int32')

# every row of pixel values becomes one 28 x 28 image with a single channel;
# -1 lets numpy infer the number of images
train_x_sol5 = mnist_train_complete.iloc[:, 1:].values.astype('int32').reshape(-1, 28, 28, 1)
test_x_sol5 = mnist_test_complete.values.astype('int32').reshape(-1, 28, 28, 1)

In [None]:
# hold-out split: 80 % of the images for training, 20 % for validation (fixed seed)
(s5_train_x, s5_test_x,
 s5_train_y, s5_test_y) = train_test_split(train_x_sol5, train_y_sol5,
                                           test_size=0.2, random_state=81)

# one-hot encode the target labels of both parts (10 digit classes)
ohe_s5_train_y, ohe_s5_test_y = (
    tf_utils.to_categorical(labels, 10) for labels in (s5_train_y, s5_test_y)
)

In [None]:
# Training batches reuse the image_augmentator created previously, but with a batch size
# of 64 (the previous batch size was 32).
train_batches_sol5 = image_augmentator.flow(s5_train_x, ohe_s5_train_y, batch_size=64)

# Validation images must stay untouched: randomly distorting them makes the validation
# metrics noisy and not comparable with model.evaluate() on the same arrays. A plain
# generator (no transformations, no shuffling) just serves them in batches.
val_batches_sol5 = ImageDataGenerator().flow(s5_test_x, ohe_s5_test_y, batch_size=64, shuffle=False)

In [None]:
# Statistics for the new preprocessing: mean and standard deviation over all pixels
# of the raw training images, stored as float32 scalars
mean_px = np.float32(train_x_sol5.mean())
std_px = np.float32(train_x_sol5.std())


def standardize(x):
    """Shift `x` by the training pixel mean and scale it by the training pixel std.

    This is the function the Lambda layer applies to its input.
    """
    centered = x - mean_px
    return centered / std_px

In [None]:
# define a Convolutional NNs model (same architecture as in solution number 4, plus a
# Lambda layer that standardizes the raw pixels and L2 regularization on the first
# convolutional layer)
#
# A bare `BatchNormalization(...)` expression only creates a layer object and discards it;
# a layer becomes part of the network only when it is passed to model.add(). Every batch
# normalization layer below is therefore added explicitly.
# The default axis (-1) is used because the input shape (28, 28, 1) has its channel on the
# LAST axis; `axis=1` would normalize along the image rows instead of per feature map.
model = Sequential()
model.add(Lambda(standardize, input_shape=(28,28,1)))
model.add(Conv2D(filters=32, kernel_size=3, padding='same', activation='relu',
                 kernel_regularizer=regularizers.l2(0.01)))

model.add(BatchNormalization())
model.add(Conv2D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling2D(pool_size=2))

model.add(BatchNormalization())
model.add(Conv2D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling2D(pool_size=2))

# Converts our 3D feature maps to 1D features vectors
model.add(Flatten())
model.add(BatchNormalization())
model.add(Dense(64))
model.add(Activation('relu'))
model.add(Dropout(0.1))
model.add(BatchNormalization())
model.add(Dense(10, activation='softmax'))

# model.summary()

In [None]:
# Training setup for Solution 5: cross-entropy loss, RMSprop, accuracy as metric
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# `steps_per_epoch` and `validation_steps` count BATCHES, not samples. len(iterator) is the
# number of batches needed for exactly one pass over the data. The earlier call passed
# `.n` (the sample count), so its single "epoch" silently looped over the training set
# `batch_size` times and reported metrics only once, at the very end.
# To keep the amount of training unchanged, that many real epochs are run instead;
# lower `passes_over_data` for a quicker run.
passes_over_data = train_batches_sol5.batch_size
model.fit_generator(generator=train_batches_sol5, steps_per_epoch=len(train_batches_sol5),
                    epochs=passes_over_data,
                    validation_data=val_batches_sol5, validation_steps=len(val_batches_sol5))

In [None]:
# Hold-out score of the Solution 5 network on the raw (unscaled) validation images;
# evaluate() returns [loss, accuracy]
score = model.evaluate(x=s5_test_x, y=ohe_s5_test_y, verbose=0)
accuracy = score[1] * 100

# print the accuracy as a percentage (value noted by the author for one run: 97.9167)
print('Test accuracy: %.4f%%' % accuracy)

In [None]:
# Set the learning rate for the final fit on the complete training set.
# The attribute was previously misspelled `lerning_rate`, which merely created an unused
# attribute on the optimizer and left the real learning rate untouched.
model.optimizer.learning_rate = 0.01

# plain generator: this second training step uses no data augmentation
gen = ImageDataGenerator()
batches = gen.flow(train_x_sol5, tf_utils.to_categorical(train_y_sol5, 10), batch_size=64)

# `steps_per_epoch` counts BATCHES, not samples: len(batches) is the number of batches in
# exactly one pass over the data. The earlier call passed `batches.n` (the sample count)
# with epochs=3, i.e. 3 * batch_size passes over the data. That amount of training is kept
# here as real epochs; lower `passes_over_data` for a quicker run.
passes_over_data = 3 * batches.batch_size
history = model.fit_generator(generator=batches, steps_per_epoch=len(batches),
                              epochs=passes_over_data)

In [None]:
# The Solution 5 network standardizes raw pixel values itself (Lambda layer), so it must
# be fed the untouched test images in `test_x_sol5` (already reshaped with a channel axis).
# The earlier code used `test_x`, which had already been rescaled and shifted for the
# previous solutions and would therefore be preprocessed twice.
class_scores = model.predict(test_x_sol5)

# index of the largest class score in each row = predicted digit
predictions = np.argmax(class_scores, axis=1)

# prepare submission: sample file with its 'Label' column replaced by our predictions
submission = pd.read_csv('/kaggle/input/digit-recognizer/sample_submission.csv')
submission = submission.drop(columns=['Label'])
submission['Label'] = predictions
submission.to_csv('submission.csv', index=False)

# Conclusion

## Changes made on my last Neural Network

Instead of preprocessing the complete set of images by scaling them and trying to center them around zero (using the max value and the mean value of the complete dataset, as done in my [small preprocessing section](#Preprocessing)), I applied zero-mean, unit-variance standardization to each image by defining a "standardize" function that is executed by a Lambda layer placed at the beginning (top) of my neural network. I also added L2 regularization ([Ridge regression](https://towardsdatascience.com/ridge-regression-for-better-usage-2f19b3a202db)) to the first convolutional layer of my neural network in an attempt to penalize more strongly those features of the images that do not help the algorithm improve during training. Additionally, I removed the dropout layer from the convolutional part of my architecture and kept only the dropout layer between the two Dense layers at the end of the architecture (lowering its dropout rate to 10%, compared to the same dropout layer in my previous solutions).
Finally, I trained this last model in two steps: the first step was a training run on the train/validation split with data augmentation, and the second step was a training run on the complete training dataset provided by Kaggle, without data augmentation and with the learning rate of the model set to 0.01.