In [18]:
import tensorflow_datasets as tfds
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

### How many training/test images are there?
- 60,000 training images
- 10,000 test images

### What's the image shape?
- shape = (28, 28, 1)

### What range are pixel values in?
- Pixel values range from 0 to 255 (grayscale values).

In [19]:
def data_pipeline(train_ds, test_ds, batch_size):
    """Prepare the raw train and test splits for training.

    Both splits go through exactly the same steps:
      1. pull the 'image' and 'label' entries out of each feature dict,
      2. flatten the image into a single vector,
      3. cast to float32 and rescale with ``x / 128 - 1``,
      4. one-hot encode the label with depth 10,
      5. shuffle with a buffer of 1024 elements, then batch and prefetch.

    Args:
        train_ds: training dataset yielding dicts with 'image' and 'label'.
        test_ds: test dataset with the same structure.
        batch_size: number of examples per batch, applied to BOTH splits.

    Returns:
        Tuple ``(train_ds, test_ds)`` of the prepared datasets.
    """

    def _prepare(ds):
        """Apply the shared preprocessing steps to a single split."""
        # extract image and label from the feature dict
        ds = ds.map(lambda feature_dict: (feature_dict['image'], feature_dict['label']))
        # flatten the image (28, 28, 1 for MNIST) into one vector
        ds = ds.map(lambda image, label: (tf.reshape(image, (-1,)), label))
        # rescale; for inputs in 0..255 this gives values in roughly [-1, 1)
        ds = ds.map(lambda image, label: ((tf.cast(image, tf.float32) / 128) - 1, label))
        # one-hot encode the label
        ds = ds.map(lambda image, label: (image, tf.one_hot(label, depth=10)))
        # shuffle within a 1024-element buffer, then group into batches of `batch_size`
        ds = ds.shuffle(1024).batch(batch_size)
        # keep up to 4 batches prepared ahead of time so the consumer does not wait on preprocessing
        return ds.prefetch(4)

    return _prepare(train_ds), _prepare(test_ds)

In [20]:
#Model Creation via Subclassing from tf.keras.Model
class MLP_Model(tf.keras.Model):
  """Multi-layer perceptron: sigmoid Dense hidden layers followed by a softmax Dense output layer."""

  def __init__(self, layer_sizes, output_size = 10):
    """Build the layers.

    Args:
      layer_sizes: iterable with the number of units of each hidden layer, e.g. [256, 256].
      output_size: number of output units; defaults to 10, one per digit class.
    """
    super().__init__()
    # one sigmoid-activated Dense layer per requested size, in the given order
    self.mlp_layers = [
      tf.keras.layers.Dense(units = n_units, activation = 'sigmoid')
      for n_units in layer_sizes
    ]
    # softmax output layer producing one probability per class
    self.output_layer = tf.keras.layers.Dense(units = output_size, activation = 'softmax')

  def call(self, x):
    """Forward pass: feed x through every hidden layer in order, then through the output layer."""
    hidden = x
    for dense in self.mlp_layers:
      hidden = dense(hidden)
    return self.output_layer(hidden)


In [21]:
def training_loop(n_epochs, model, ds_train, ds_test, loss_function, optimizer):
    """Train `model` for `n_epochs` epochs and evaluate it on `ds_test` after each epoch.

    Args:
        n_epochs: number of passes over `ds_train`.
        model: callable model exposing `trainable_variables`.
        ds_train: iterable of (input, target) batches used for the gradient updates.
        ds_test: iterable of (input, target) batches used for evaluation only.
        loss_function: callable `loss_function(target, prediction)` returning a scalar loss.
        optimizer: optimizer providing `apply_gradients`.

    Returns:
        Tuple `(train_losses, test_losses, test_accuracies)`; each is a list with one
        scalar tensor per epoch (mean over the batches of that epoch).
    """
    train_losses = []
    test_losses = []
    test_accuracies = []
    for _ in range(n_epochs):
        epoch_losses = []
        for x, target in ds_train:
            # Only the forward pass and the loss have to be recorded on the tape;
            # the gradient itself is computed after leaving the context.
            with tf.GradientTape() as tape:
                pred = model(x)
                loss = loss_function(target, pred)
            # Differentiate w.r.t. the trainable variables only: non-trainable
            # variables would yield None gradients that the optimizer cannot apply.
            trainable = model.trainable_variables
            gradients = tape.gradient(loss, trainable)
            optimizer.apply_gradients(zip(gradients, trainable))
            epoch_losses.append(loss.numpy())
        train_losses.append(tf.reduce_mean(epoch_losses))

        test_accuracy_aggregator = []
        test_loss_aggregator = []

        for x, target in ds_test:
            prediction = model(x)
            sample_test_loss = loss_function(target, prediction)
            # A prediction counts as correct when its arg-max matches the arg-max of the
            # one-hot target; the mean over the batch is the batch accuracy.
            sample_test_accuracy = np.mean(
                np.argmax(target, axis=1) == np.argmax(prediction, axis=1)
            )
            test_loss_aggregator.append(sample_test_loss.numpy())
            test_accuracy_aggregator.append(sample_test_accuracy)

        test_losses.append(tf.reduce_mean(test_loss_aggregator))
        test_accuracies.append(tf.reduce_mean(test_accuracy_aggregator))

    return train_losses, test_losses, test_accuracies


In [22]:
def main(n_epochs = 10, lay_sizes = [256,256], learn_rate = 0.01, batch_size = 4):
  """Run one MNIST experiment: load data, train an MLP and plot the learning curves.

  Args:
    n_epochs: number of training epochs.
    lay_sizes: units per hidden layer, forwarded to MLP_Model (only read, never modified here).
    learn_rate: learning rate of the SGD optimizer.
    batch_size: batch size forwarded to data_pipeline.

  Shows a figure with the per-epoch training loss, test loss and test accuracy.
  """
  # load the data
  train_ds = tfds.load('mnist', split = 'train')
  test_ds = tfds.load('mnist', split = 'test')

  # initialize the model, the loss function and the optimizer
  model = MLP_Model(layer_sizes = lay_sizes)
  cce = tf.keras.losses.CategoricalCrossentropy()
  optimizer = tf.keras.optimizers.legacy.SGD(learning_rate = learn_rate)

  # organize the data
  train_dataset, test_dataset = data_pipeline(train_ds, test_ds, batch_size)
  # Just for showcasing only a subset is used; generally you should train on the whole data.
  # The datasets are already batched at this point, so take() counts batches, not images.
  train_dataset = train_dataset.take(1000)
  test_dataset = test_dataset.take(100)

  # the training loop; returns one value per epoch for each curve
  train_losses, test_losses, test_accuracies = training_loop(n_epochs, model, train_dataset, test_dataset, cce, optimizer)

  # visualize the results
  _fig, ax = plt.subplots()
  line1, = ax.plot(train_losses)
  line2, = ax.plot(test_losses)
  line3, = ax.plot(test_accuracies)
  # each point on the curves is one epoch, not one optimizer step
  ax.set_xlabel("Epoch")
  ax.set_ylabel("Loss/Accuracy")
  ax.legend((line1,line2, line3),("training","test", "test accuracy"))
  plt.show()

In [None]:
# Every entry holds the keyword overrides for one call of main(); the runs are
# executed in this order and each one shows its own plot.
experiment_overrides = [
    {},                         # the initial model
    {"learn_rate": 1},          # high learning rate
    {"lay_sizes": [32, 32]},    # low layer size
    {"batch_size": 1},          # low batch size
    {"n_epochs": 3},            # smaller number of epochs
]

for overrides in experiment_overrides:
    main(**overrides)

### 3 Adjusting the hyperparameters of your models
  - 1. If the learning rate is too high, the model is no longer able to assign the images to the correct digit classes. The updates become so large that they 'overshoot' the minimum of the loss.
  - 2. Even a model with two layers, each of which only consists of 32 units, produces somewhat decent results.
  - 3. With a batch_size of 1, you can see in the plot that the curves fluctuate noticeably, which is one of the reasons why you should choose a larger batch size.
  - 4. If the number of epochs is too low, the model does not see the training data often enough, which is why the losses are still relatively high.