In [0]:
'''Trains a simple convnet on the MNIST dataset.

Gets to 99.25% test accuracy after 12 epochs
(there is still a lot of margin for parameter tuning).
16 seconds per epoch on a GRID K520 GPU.
'''

from __future__ import print_function
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D, DepthwiseConv2D
from keras import backend as K

# Training hyper-parameters.
batch_size = 128
num_classes = 10
epochs = 12

# Height and width of each input image.
img_rows, img_cols = 28, 28

# Load the MNIST images and labels, already split into train and test sets.
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Put the single channel axis where the active backend expects it, then
# reshape both splits to (samples,) + input_shape.
if K.image_data_format() == 'channels_first':
    input_shape = (1, img_rows, img_cols)
else:
    input_shape = (img_rows, img_cols, 1)
x_train = x_train.reshape((x_train.shape[0],) + input_shape)
x_test = x_test.reshape((x_test.shape[0],) + input_shape)

# Cast to float32 and divide by 255 (presumably the maximum pixel value,
# which would scale the inputs into [0, 1] -- confirm against the dataset).
x_train = x_train.astype('float32') / 255
x_test = x_test.astype('float32') / 255
print('x_train shape:', x_train.shape)
print(x_train.shape[0], 'train samples')
print(x_test.shape[0], 'test samples')

# One-hot encode the integer class labels to match the softmax output and
# the categorical cross-entropy loss used below.
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

# Network: a 3x3 depthwise convolution, a 64-filter 7x7 convolution,
# 2x2 max-pooling, then a dropout-regularised dense classifier head.
model = Sequential([
    DepthwiseConv2D(kernel_size=(3, 3),
                    activation='relu',
                    input_shape=input_shape),
    Conv2D(64, (7, 7), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax'),
])
model.summary()
model.compile(
    optimizer=keras.optimizers.Adadelta(),
    loss=keras.losses.categorical_crossentropy,
    metrics=['accuracy'],
)

# NOTE: the test split is also passed as validation data, so the per-epoch
# validation numbers are measured on the same samples as the final score.
model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    validation_data=(x_test, y_test),
)

# evaluate() returns [loss, accuracy] given the single 'accuracy' metric.
score = model.evaluate(x_test, y_test, verbose=0)
test_loss = score[0]
test_accuracy = score[1]
print('Test loss:', test_loss)
print('Test accuracy:', test_accuracy)


x_train shape: (60000, 28, 28, 1)
60000 train samples
10000 test samples
Model: "sequential_25"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
depthwise_conv2d_8 (Depthwis (None, 26, 26, 1)         10        
_________________________________________________________________
conv2d_30 (Conv2D)           (None, 20, 20, 64)        3200      
_________________________________________________________________
max_pooling2d_23 (MaxPooling (None, 10, 10, 64)        0         
_________________________________________________________________
dropout_45 (Dropout)         (None, 10, 10, 64)        0         
_________________________________________________________________
flatten_23 (Flatten)         (None, 6400)              0         
_________________________________________________________________
dense_45 (Dense)             (None, 128)               819328    
______________________________________________

To reduce the inference time, I have taken the steps below:
--------------------------------------------------------------------------------------- 
I have compared both versions of the code on Google Colab. Below are the comparison and an explanation of the changes I have made to meet the requirements.

Comparison of the Original Code and Updated Code:

**Original Code (Which I got from Sudesh):**

Total params: 1,199,882

Trainable params: 1,199,882

Non-trainable params: 0

Test loss: 0.035234447981064565

Test accuracy: 0.991


**Updated Code:**

Total params: 823,828

Trainable params: 823,828

Non-trainable params: 0

Test loss: 0.025193355914763925

Test accuracy: 0.9921



1. I have replaced the first Conv2D layer with a DepthwiseConv2D layer (with a 3X3 kernel).

=> **Reason:** This type of convolution layer (i.e., DepthwiseConv2D) is widely used for the following two reasons –

They have fewer parameters to adjust than standard convolution layers, which reduces overfitting.

They are computationally cheaper because they perform fewer computations, which makes them faster.


2. In the second layer (Conv2D), I have changed the kernel size to 7X7.

=> **Reason:** A larger kernel can overlook fine features and skip essential details in the images, whereas a smaller kernel can capture more information, which may lead to more confusion. Here, I tested the model with both 5X5 and 7X7 kernels: the model with the 7X7 kernel performed only slightly worse than the model with the 5X5 kernel, but with far fewer parameters (the larger kernel shrinks the feature map that is flattened into the dense layer). So, I decided to keep the 7X7 kernel.

These changes have made the training faster. Many more changes still need to be implemented to make the code run more efficiently, but this is a first step in that direction.