In [1]:
import tensorflow
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
# stuff that is related to our CNN model
from tensorflow.keras.layers import Flatten, Conv2D, MaxPooling2D
from matplotlib import pyplot as plt
from tensorflow.keras.callbacks import ModelCheckpoint 
from tensorflow.keras.callbacks import TensorBoard
import os 
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix
import numpy

In [2]:
# Load the MNIST digits: images and labels for training, plus a second
# images/labels pair that this notebook uses as the validation set.
# On first use Keras downloads the mnist.npz archive.
(X_train, y_train), (X_valid, y_valid) = mnist.load_data()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz


In [3]:
# Note what is happening here - the data are pre-processed in a completely
# different way than for the dense network: the images are NOT flattened, they
# keep their shape as pictures.
# Target shape is (samples, 28, 28, 1): the two 28s are the picture size and
# the trailing 1 means a black-and-white image (for RGB we would use 3).
# -1 lets NumPy infer the number of samples instead of hard-coding
# 60000 / 10000, so the cell also works on a subset of the data.
X_train = X_train.reshape(-1, 28, 28, 1).astype('float32')
X_valid = X_valid.reshape(-1, 28, 28, 1).astype('float32')

# "normalisation" of the input pixels - as we did before
# NOTE: this cell overwrites X_* and y_* in place, so it must be run exactly
# once after the loading cell; running it again would rescale the pixels and
# one-hot encode the labels a second time.
X_train /= 255
X_valid /= 255

# no change here either: the final classifier needs one-hot encoded labels
n_classes = 10
y_train = to_categorical(y_train, n_classes)
y_valid = to_categorical(y_valid, n_classes)

In [4]:
# A small CNN: two convolutional layers, max-pooling, then a dense classifier.
# The layers are handed to Sequential as a list, which stacks them in order.
# The best way to learn is to code - please consult the Keras Conv2D
# documentation for the meaning of each argument (number of kernels/filters,
# their size, strides, activation and input shape).
model = Sequential([
    # first conv layer: 32 filters of size 3x3, stride 1; as the first layer
    # it also declares the input shape (28x28 pixels, 1 channel)
    Conv2D(filters=32, kernel_size=3, strides=(1, 1), activation='sigmoid',
           input_shape=(28, 28, 1)),
    # second conv layer (ReLU this time); no need to worry about the shape now
    Conv2D(filters=32, kernel_size=(3, 3), strides=(1, 1), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(rate=0.15),
    Flatten(),
    # the model is shallow, so we probably do not need any batch norm
    Dense(units=128, activation='relu'),
    Dropout(rate=0.15),
    # output layer: one softmax probability per digit class
    Dense(units=10, activation='softmax'),
])

In [5]:
# Print the layer-by-layer summary (output shapes and parameter counts)
# to verify the architecture before training.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 26, 26, 32)        320       
                                                                 
 conv2d_1 (Conv2D)           (None, 24, 24, 32)        9248      
                                                                 
 max_pooling2d (MaxPooling2D  (None, 12, 12, 32)       0         
 )                                                               
                                                                 
 dropout (Dropout)           (None, 12, 12, 32)        0         
                                                                 
 flatten (Flatten)           (None, 4608)              0         
                                                                 
 dense (Dense)               (None, 128)               589952    
                                                        

In [6]:
# Where the artefacts of this experiment are stored: model_output/<arch>
output_dir = 'model_output/'
arch = 'test_1'
output_path = output_dir + arch
# exist_ok=True creates the directory only when it is missing, without a
# separate exists() check (no gap between checking and creating).
os.makedirs(output_path, exist_ok=True)

# Checkpoint callback that stores only the weights (not the full model);
# {epoch:02d} in the file name is replaced by the zero-padded epoch number.
modelcheckpoint = ModelCheckpoint(output_path + '/weights.{epoch:02d}.hdf5',
                                  save_weights_only=True)
# TensorBoard callback writing its logs to logs/<arch>
tb = TensorBoard(log_dir='logs/' + arch)
# NOTE: the fit() calls in the training cells do not pass these callbacks;
# add callbacks=[modelcheckpoint, tb] to a fit() call to activate them.

In [7]:
# The initial training runs without callbacks; afterwards we will add them to
# save the best working point and to use TensorBoard for the analysis.
model.compile(
    optimizer='nadam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [8]:
# Short baseline run: 2 epochs, mini-batches of 128 samples, with the
# validation set passed in for monitoring.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=128,
    epochs=2,
    verbose=1,
    validation_data=(X_valid, y_valid),
)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f78be2df1f0>

In this laboratory class we learned about the CNN model - this kind of network is built slightly differently from the previous models. The new element is the pooling layer - thanks to it we can reduce the spatial size of the representation. There are many pooling functions; in our case we used MaxPooling2D from the Keras library. The other thing we did was look for suitable values for the kernel size. We figured that sizes from (3,3) to (5,5) are optimal, but we only tested this empirically (just by changing those numbers and seeing what happens). Now I will change those numbers and try to notice some trends.

In [9]:
# Variant 1: identical to the baseline except that the second convolution
# uses a (1,1) kernel.
model1 = Sequential([
    # first conv layer: 32 filters of size 3x3 on the 28x28x1 input
    Conv2D(filters=32, kernel_size=3, strides=(1, 1), activation='sigmoid',
           input_shape=(28, 28, 1)),
    # the layer under test: 1x1 kernel
    Conv2D(filters=32, kernel_size=(1, 1), strides=(1, 1), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(rate=0.15),
    Flatten(),
    Dense(units=128, activation='relu'),
    Dropout(rate=0.15),
    # output layer: one softmax probability per digit class
    Dense(units=10, activation='softmax'),
])

model1.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_2 (Conv2D)           (None, 26, 26, 32)        320       
                                                                 
 conv2d_3 (Conv2D)           (None, 26, 26, 32)        1056      
                                                                 
 max_pooling2d_1 (MaxPooling  (None, 13, 13, 32)       0         
 2D)                                                             
                                                                 
 dropout_2 (Dropout)         (None, 13, 13, 32)        0         
                                                                 
 flatten_1 (Flatten)         (None, 5408)              0         
                                                                 
 dense_2 (Dense)             (None, 128)               692352    
                                                      

In [10]:
# Same loss, optimizer and metric as the baseline, so the runs are comparable.
model1.compile(
    optimizer='nadam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [11]:
# Same training settings as the baseline run (2 epochs, batch size 128).
model1.fit(
    x=X_train,
    y=y_train,
    batch_size=128,
    epochs=2,
    verbose=1,
    validation_data=(X_valid, y_valid),
)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f78a9c11150>

We can see slightly worse performance. The difference would probably be easier to see with more epochs, but this is what we get with the fit settings as they stand. Now let's check a different kernel size.

In [12]:
# Variant 2: identical to the baseline except that the second convolution
# uses a (5,5) kernel.
model2 = Sequential([
    # first conv layer: 32 filters of size 3x3 on the 28x28x1 input
    Conv2D(filters=32, kernel_size=3, strides=(1, 1), activation='sigmoid',
           input_shape=(28, 28, 1)),
    # the layer under test: 5x5 kernel
    Conv2D(filters=32, kernel_size=(5, 5), strides=(1, 1), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(rate=0.15),
    Flatten(),
    Dense(units=128, activation='relu'),
    Dropout(rate=0.15),
    # output layer: one softmax probability per digit class
    Dense(units=10, activation='softmax'),
])

model2.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_4 (Conv2D)           (None, 26, 26, 32)        320       
                                                                 
 conv2d_5 (Conv2D)           (None, 22, 22, 32)        25632     
                                                                 
 max_pooling2d_2 (MaxPooling  (None, 11, 11, 32)       0         
 2D)                                                             
                                                                 
 dropout_4 (Dropout)         (None, 11, 11, 32)        0         
                                                                 
 flatten_2 (Flatten)         (None, 3872)              0         
                                                                 
 dense_4 (Dense)             (None, 128)               495744    
                                                      

In [13]:
# Same loss, optimizer and metric as the baseline, so the runs are comparable.
model2.compile(
    optimizer='nadam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [14]:
# Same training settings as the baseline run (2 epochs, batch size 128).
model2.fit(
    x=X_train,
    y=y_train,
    batch_size=128,
    epochs=2,
    verbose=1,
    validation_data=(X_valid, y_valid),
)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f78a9980a90>

It seems that the performance dropped sharply. It is possible that this kernel size, combined with the other parameters of this model, causes this result - maybe the kernel is simply too big in this case. Now I will see what happens with a much bigger kernel size.

In [15]:
# Variant 3: identical to the baseline except that the second convolution
# uses a much larger (10,10) kernel.
model3 = Sequential([
    # first conv layer: 32 filters of size 3x3 on the 28x28x1 input
    Conv2D(filters=32, kernel_size=3, strides=(1, 1), activation='sigmoid',
           input_shape=(28, 28, 1)),
    # the layer under test: 10x10 kernel
    Conv2D(filters=32, kernel_size=(10, 10), strides=(1, 1), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(rate=0.15),
    Flatten(),
    Dense(units=128, activation='relu'),
    Dropout(rate=0.15),
    # output layer: one softmax probability per digit class
    Dense(units=10, activation='softmax'),
])

model3.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_6 (Conv2D)           (None, 26, 26, 32)        320       
                                                                 
 conv2d_7 (Conv2D)           (None, 17, 17, 32)        102432    
                                                                 
 max_pooling2d_3 (MaxPooling  (None, 8, 8, 32)         0         
 2D)                                                             
                                                                 
 dropout_6 (Dropout)         (None, 8, 8, 32)          0         
                                                                 
 flatten_3 (Flatten)         (None, 2048)              0         
                                                                 
 dense_6 (Dense)             (None, 128)               262272    
                                                      

In [16]:
# Same loss, optimizer and metric as the baseline, so the runs are comparable.
model3.compile(
    optimizer='nadam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [17]:
# Same training settings as the baseline run (2 epochs, batch size 128).
model3.fit(
    x=X_train,
    y=y_train,
    batch_size=128,
    epochs=2,
    verbose=1,
    validation_data=(X_valid, y_valid),
)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f78aa562530>

As we can see, the performance is similar to the previous one. It looks like we crossed some kind of size limit, below which the model worked well. Of course, I have to underline that this holds for this particular setup.

In conclusion, the kernel size in a CNN model is really important. For smaller examples we should use (3,3) and for bigger ones (5,5) - in those cases the model should work very well. We should also avoid even numbers in the kernel size, because an even-sized kernel has no central element to align with the pixel being computed. We should also not use (1,1), because the kernel then sees no neighbours around the central pixel and so does no spatial feature extraction. So the thing to remember is this: use a (3,3) or (5,5) kernel size (and, of course, choose between those sizes wisely).