# Training algorithms example: MNIST dataset

#### Loading the data from MNIST dataset

In [1]:
from tensorflow.keras.datasets import mnist

(train_images, train_labels), (test_images, test_labels) = mnist.load_data()

print(train_images.shape, test_images.shape)
print(len(train_labels), len(test_labels))

train_images = train_images.reshape((60000, 28 * 28))
train_images = train_images.astype('float32') / 255
test_images = test_images.reshape((10000, 28 * 28))
test_images = test_images.astype('float32') / 255

from tensorflow.keras.utils import to_categorical
train_labels = to_categorical(train_labels)
test_labels = to_categorical(test_labels)

(60000, 28, 28) (10000, 28, 28)
60000 10000


#### Create the model based on a DNN

In [2]:
from tensorflow.keras import models
from tensorflow.keras import layers

network = models.Sequential()
network.add(layers.Dense(512, activation='relu', input_shape=(28 * 28,)))
network.add(layers.Dense(256, activation='relu'))
network.add(layers.Dense(10, activation='softmax'))

network.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 512)               401920    
                                                                 
 dense_1 (Dense)             (None, 256)               131328    
                                                                 
 dense_2 (Dense)             (None, 10)                2570      
                                                                 
Total params: 535,818
Trainable params: 535,818
Non-trainable params: 0
_________________________________________________________________


#### Train with SGD

In [5]:
from tensorflow.keras import optimizers

alg_optimizer = optimizers.SGD(learning_rate=0.01, momentum=0.0)
# Optimizers are used with compile and fit method and are used to change the attributes of the ML/DL model such as weights and learning rate in order to reduce the losses. Optimizers help to get results faster.

# SGD -> Stochastic gradient descent, performs a parameter update for each training and label example
# There are three variants of gradient descent, which differ in how much data we use to compute the gradient of the objective function. Depending on the amount of data, we make a trade-off between the accuracy of the parameter update and the time it takes to perform an update.

# lr -> learning rate, is the most important hyperparameter as it controls how much to change the model in response to the estimated error each time the model weights are updated.
# The default learning rate value is 0.001, the recommended value for starting.

network.compile(optimizer= alg_optimizer,
                loss='categorical_crossentropy',
                metrics=['accuracy'])

network.fit(train_images, train_labels, epochs=5, batch_size=128)

test_loss, test_acc = network.evaluate(test_images, test_labels, verbose = 0)
print(test_loss, test_acc)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
0.2450302541255951 0.9311000108718872


In [7]:
# Train with SGD with momentum

algo1 = optimizers.SGD(learning_rate=0.01, momentum=0.9)
# Momentum is method which helps accelerate gradients vectors in the right directions, thus leading to faster converging.

network.compile(optimizer= algo1,
                loss='categorical_crossentropy',
                metrics=['accuracy'])

network.fit(train_images, train_labels, epochs=5, batch_size=128)

test_loss, test_acc = network.evaluate(test_images, test_labels, verbose = 0)
print(test_loss, test_acc)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
0.06510666012763977 0.9807999730110168


#### Train with RMSprop

In [10]:
algo2 = optimizers.RMSprop(learning_rate=0.01, rho=0.99)

network.compile(optimizer= algo2,
                loss='categorical_crossentropy',
                metrics=['accuracy'])

network.fit(train_images, train_labels, epochs=5, batch_size=128)

test_loss, test_acc = network.evaluate(test_images, test_labels, verbose = 0)
print(test_loss, test_acc)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
0.13997267186641693 0.9634000062942505


#### Train with Adam

In [12]:
algo3 = optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999)
network.compile(optimizer= algo3,
                loss='categorical_crossentropy',
                metrics=['accuracy'])

network.fit(train_images, train_labels, epochs=5, batch_size=128)

test_loss, test_acc = network.evaluate(test_images, test_labels, verbose = 0)
print(test_loss, test_acc)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
0.11560291796922684 0.9722999930381775


#### Batch normalization
Is a technique for training very deep neural networks that normalizes the contributions to a layer for every mini-batch. This has the impact of settling the learning process and drastically decreasing the number of training epochs required to train deep neural networks.

In [14]:
from keras.layers import BatchNormalization

network_bn = models.Sequential()

network_bn.add(layers.Dense(512, activation='relu', input_shape=(28 * 28,)))
network_bn.add(BatchNormalization())
network_bn.add(layers.Dense(256, activation='relu'))
network_bn.add(BatchNormalization())
network_bn.add(layers.Dense(10, activation='softmax'))
# Apply a batch normalization for each layer.

network_bn.summary()

network_bn.compile(optimizer= algo3,
                loss='categorical_crossentropy',
                metrics=['accuracy'])

network_bn.fit(train_images, train_labels, epochs=5, batch_size=128)

test_loss, test_acc = network_bn.evaluate(test_images, test_labels, verbose = 0)
print(test_loss, test_acc)

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_6 (Dense)             (None, 512)               401920    
                                                                 
 batch_normalization_2 (Batc  (None, 512)              2048      
 hNormalization)                                                 
                                                                 
 dense_7 (Dense)             (None, 256)               131328    
                                                                 
 batch_normalization_3 (Batc  (None, 256)              1024      
 hNormalization)                                                 
                                                                 
 dense_8 (Dense)             (None, 10)                2570      
                                                                 
Total params: 538,890
Trainable params: 537,354
Non-tr

KeyError: in user code:

    File "C:\Users\anaca\Documents\GitHub\SIB-ML-Portfolio\venv\lib\site-packages\keras\engine\training.py", line 1249, in train_function  *
        return step_function(self, iterator)
    File "C:\Users\anaca\Documents\GitHub\SIB-ML-Portfolio\venv\lib\site-packages\keras\engine\training.py", line 1233, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\anaca\Documents\GitHub\SIB-ML-Portfolio\venv\lib\site-packages\keras\engine\training.py", line 1222, in run_step  **
        outputs = model.train_step(data)
    File "C:\Users\anaca\Documents\GitHub\SIB-ML-Portfolio\venv\lib\site-packages\keras\engine\training.py", line 1027, in train_step
        self.optimizer.minimize(loss, self.trainable_variables, tape=tape)
    File "C:\Users\anaca\Documents\GitHub\SIB-ML-Portfolio\venv\lib\site-packages\keras\optimizers\optimizer_experimental\optimizer.py", line 527, in minimize
        self.apply_gradients(grads_and_vars)
    File "C:\Users\anaca\Documents\GitHub\SIB-ML-Portfolio\venv\lib\site-packages\keras\optimizers\optimizer_experimental\optimizer.py", line 1140, in apply_gradients
        return super().apply_gradients(grads_and_vars, name=name)
    File "C:\Users\anaca\Documents\GitHub\SIB-ML-Portfolio\venv\lib\site-packages\keras\optimizers\optimizer_experimental\optimizer.py", line 634, in apply_gradients
        iteration = self._internal_apply_gradients(grads_and_vars)
    File "C:\Users\anaca\Documents\GitHub\SIB-ML-Portfolio\venv\lib\site-packages\keras\optimizers\optimizer_experimental\optimizer.py", line 1166, in _internal_apply_gradients
        return tf.__internal__.distribute.interim.maybe_merge_call(
    File "C:\Users\anaca\Documents\GitHub\SIB-ML-Portfolio\venv\lib\site-packages\keras\optimizers\optimizer_experimental\optimizer.py", line 1216, in _distributed_apply_gradients_fn
        distribution.extended.update(
    File "C:\Users\anaca\Documents\GitHub\SIB-ML-Portfolio\venv\lib\site-packages\keras\optimizers\optimizer_experimental\optimizer.py", line 1213, in apply_grad_to_update_var  **
        return self._update_step(grad, var)
    File "C:\Users\anaca\Documents\GitHub\SIB-ML-Portfolio\venv\lib\site-packages\keras\optimizers\optimizer_experimental\optimizer.py", line 216, in _update_step
        raise KeyError(

    KeyError: 'The optimizer cannot recognize variable dense_6/kernel:0. This usually means you are trying to call the optimizer to update different parts of the model separately. Please call `optimizer.build(variables)` with the full list of trainable variables before the training loop or use legacy optimizer `tf.keras.optimizers.legacy.{self.__class__.__name__}.'
