# Digit Classification Task

In this experiment, we carry out a digit classification task based on the MNIST dataset.
We begin by importing the relevant modules and the MNIST dataset.

In [4]:
#Import Relevant Tensorflow functions
import tensorflow as tf
# Switch on eager execution through the TF1 compatibility API.
# NOTE(review): presumably only required on TF 1.x (eager should already be the
# default on TF 2.x) - confirm against the installed TensorFlow version.
tf.compat.v1.enable_eager_execution()
import tensorflow_datasets as tfds

In [5]:
# Import relevant standard-library helpers
from functools import partial
import time
import os, sys
# Import hessianlearn repository
# The repository location is read from the HESSIANLEARN_PATH environment variable;
# when it is unset, the relative path "../../" is appended to sys.path instead.
sys.path.append( os.environ.get('HESSIANLEARN_PATH', "../../"))
from hessianlearn import *
# Explicit imports of the hessianlearn names used later in this notebook.
from hessianlearn.hessianlearn.data.data import Data
from hessianlearn.hessianlearn.problem.problem import ClassificationProblem,  AutoencoderProblem
from hessianlearn.hessianlearn.problem.regularization import L2Regularization
from hessianlearn.hessianlearn.model.model import HessianlearnModelSettings, HessianlearnModel
import numpy  as np

Load the MNIST dataset, normalize the images, and build the batched training and test input pipelines.

In [6]:
#load the mnist
# Options for tfds.load: fetch the train and test splits in one call, ask for
# supervised (image, label) elements, and also return the dataset metadata.
mnist_load_options = dict(
    split=['train', 'test'],
    shuffle_files=True,
    as_supervised=True,
    with_info=True,
)
(ds_train, ds_test), ds_info = tfds.load('mnist', **mnist_load_options)
def normalize_img(image, label):
    """Cast `image` to float32 and divide it by 255; `label` is returned untouched.

    Returns:
        A `(scaled_image, label)` tuple.
    """
    scaled_image = tf.cast(image, tf.float32) / 255.
    return scaled_image, label

#setting the train
# Training pipeline: normalize -> cache -> shuffle -> batch(128) -> prefetch.
# The shuffle buffer is sized to the full number of training examples.
ds_train = (
    ds_train
    .map(normalize_img, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .cache()
    .shuffle(ds_info.splits['train'].num_examples)
    .batch(128)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

#setting the test
# Test pipeline: normalize -> batch(128) -> cache -> prefetch (no shuffling;
# here the cache sits after batching, so whole batches are cached).
ds_test = (
    ds_test
    .map(normalize_img, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .batch(128)
    .cache()
    .prefetch(tf.data.experimental.AUTOTUNE)
)

## Train Baseline Models

We now train the baseline models (Adam, RMSprop, and SGD), each on the same network architecture.

In [6]:
#Train Model with Adam 

# Baseline CNN, assembled layer by layer: two conv -> max-pool -> leaky-ReLU
# stages, one extra max-pool, then two dense layers ending in 10 outputs.
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Conv2D(32, 5, input_shape=(28, 28, 1)))
model.add(tf.keras.layers.MaxPooling2D(pool_size=(2, 2)))
model.add(tf.keras.layers.ReLU(negative_slope=0.01))
model.add(tf.keras.layers.Conv2D(64, 5))
model.add(tf.keras.layers.MaxPooling2D(pool_size=(2, 2)))
model.add(tf.keras.layers.ReLU(negative_slope=0.01))
model.add(tf.keras.layers.MaxPooling2D(pool_size=(2, 2)))
model.add(tf.keras.layers.Flatten(input_shape=(64, 2, 2)))
model.add(tf.keras.layers.Dense(128, activation=partial(tf.nn.leaky_relu, alpha=0.01)))
model.add(tf.keras.layers.Dense(10, activation=partial(tf.nn.leaky_relu, alpha=0.01)))

# Adam with step size 0.001; labels are integer class ids and the network
# outputs are treated as logits by the loss.
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=tf.keras.optimizers.Adam(0.001),
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy()],
)

# 25 passes over the training pipeline, validating on the test pipeline.
model.fit(ds_train, validation_data=ds_test, epochs=25)

Train on 469 steps, validate on 79 steps
Epoch 1/25



Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<tensorflow.python.keras.callbacks.History at 0x7ff9abf68f28>

Wonderful, Adam showed its great qualities. Let's try the other two baseline optimizers.

In [7]:
#Train the Model with RMSProp 

# Fresh copy of the baseline CNN, assembled layer by layer: two
# conv -> max-pool -> leaky-ReLU stages, one extra max-pool, then two dense
# layers ending in 10 outputs.
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Conv2D(32, 5, input_shape=(28, 28, 1)))
model.add(tf.keras.layers.MaxPooling2D(pool_size=(2, 2)))
model.add(tf.keras.layers.ReLU(negative_slope=0.01))
model.add(tf.keras.layers.Conv2D(64, 5))
model.add(tf.keras.layers.MaxPooling2D(pool_size=(2, 2)))
model.add(tf.keras.layers.ReLU(negative_slope=0.01))
model.add(tf.keras.layers.MaxPooling2D(pool_size=(2, 2)))
model.add(tf.keras.layers.Flatten(input_shape=(64, 2, 2)))
model.add(tf.keras.layers.Dense(128, activation=partial(tf.nn.leaky_relu, alpha=0.01)))
model.add(tf.keras.layers.Dense(10, activation=partial(tf.nn.leaky_relu, alpha=0.01)))

# RMSprop with its default hyperparameters; labels are integer class ids and
# the network outputs are treated as logits by the loss.
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=tf.keras.optimizers.RMSprop(),
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy()],
)

# 25 passes over the training pipeline, validating on the test pipeline.
model.fit(ds_train, validation_data=ds_test, epochs=25)

Train on 469 steps, validate on 79 steps
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<tensorflow.python.keras.callbacks.History at 0x7ff9806dcf60>

RMSprop also does a great job. Now, we will check SGD.

In [8]:
#Train with Stochastic Gradient Descent

# Fresh copy of the baseline CNN, assembled layer by layer: two
# conv -> max-pool -> leaky-ReLU stages, one extra max-pool, then two dense
# layers ending in 10 outputs.
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Conv2D(32, 5, input_shape=(28, 28, 1)))
model.add(tf.keras.layers.MaxPooling2D(pool_size=(2, 2)))
model.add(tf.keras.layers.ReLU(negative_slope=0.01))
model.add(tf.keras.layers.Conv2D(64, 5))
model.add(tf.keras.layers.MaxPooling2D(pool_size=(2, 2)))
model.add(tf.keras.layers.ReLU(negative_slope=0.01))
model.add(tf.keras.layers.MaxPooling2D(pool_size=(2, 2)))
model.add(tf.keras.layers.Flatten(input_shape=(64, 2, 2)))
model.add(tf.keras.layers.Dense(128, activation=partial(tf.nn.leaky_relu, alpha=0.01)))
model.add(tf.keras.layers.Dense(10, activation=partial(tf.nn.leaky_relu, alpha=0.01)))

# Plain SGD with its default hyperparameters; labels are integer class ids and
# the network outputs are treated as logits by the loss.
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=tf.keras.optimizers.SGD(),
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy()],
)

# 25 passes over the training pipeline, validating on the test pipeline.
model.fit(ds_train, validation_data=ds_test, epochs=25)

Train on 469 steps, validate on 79 steps
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<tensorflow.python.keras.callbacks.History at 0x7ff95efc84a8>

Stochastic gradient descent performed slightly worse: it was slower to learn and ended up with a slightly lower score.

In [None]:
## Now, we will focus on the low rank SFN optimizer provided by hessianlearn

In [10]:
# Load MNIST as in-memory NumPy arrays; y_tr / y_t hold integer class labels.
(x_train, y_tr), (x_test, y_t) = tf.keras.datasets.mnist.load_data()


# Normalize the data
# Scale pixel values by 1/255 as float32 and add a trailing channel axis so the
# images have shape (N, 28, 28, 1), matching the network's input_shape.
x_train = np.reshape(x_train.astype('float32') / 255., (-1, 28, 28, 1))
x_test = np.reshape(x_test.astype('float32') / 255., (-1, 28, 28, 1))

# One-hot encode the labels: row k of the 10x10 identity matrix is the encoding
# of class k, so indexing it with the label vector yields an (N, 10) array.
# The dtype is spelled np.float64 because the old `np.float` alias (which meant
# the same thing) has been removed from recent NumPy releases.
one_hot_rows = np.eye(10, dtype=np.float64)
y_train = one_hot_rows[y_tr]
y_test = one_hot_rows[y_t]

In [12]:
# Batch sizes used to build the hessianlearn Data object below:
# 128 samples per gradient batch and 128 samples per Hessian batch.
settings = {}
settings['batch_size'] = 128
settings['hess_batch_size'] = 128


# Same CNN architecture as the baseline cells: two conv -> max-pool -> leaky-ReLU
# stages, one extra max-pool, then two dense layers ending in 10 outputs.
model = tf.keras.models.Sequential([
  tf.keras.layers.Conv2D(32, 5, input_shape=(28, 28, 1)),
  tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
  tf.keras.layers.ReLU(negative_slope=0.01),
  tf.keras.layers.Conv2D(64, 5),
  tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
  tf.keras.layers.ReLU(negative_slope=0.01),
  tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
  tf.keras.layers.Flatten(input_shape=(64, 2, 2)),
  tf.keras.layers.Dense(128,activation=partial(tf.nn.leaky_relu, alpha=0.01)),
  tf.keras.layers.Dense(10, activation=partial(tf.nn.leaky_relu, alpha=0.01))
])

# Instantiate the problem object that wraps the Keras model
problem = ClassificationProblem(model,dtype = tf.float32)

# Training / validation dictionaries keyed by the problem's input (x) and
# target (y_true) handles; the targets are the one-hot encoded labels.
train_data = {problem.x:x_train, problem.y_true:y_train}
validation_data = {problem.x:x_test, problem.y_true:y_test}

# Optimizer settings: rank-40 Hessian approximation, at most 20 sweeps.
HLModelSettings = HessianlearnModelSettings()
HLModelSettings['hessian_low_rank'] = 40
HLModelSettings['max_sweeps'] = 20
# Optional overrides, currently left disabled:
#HLModelSettings['globalization'] = None
#HLModelSettings['alpha'] = 5e-2
#HLModelSettings['printing_sweep_frequency'] = 10
regularization = L2Regularization(problem)

# Instantiate the data object; both batch sizes come from `settings` so the
# configuration lives in one place.
data = Data(train_data,settings['batch_size'], validation_data = validation_data,
            hessian_batch_size = settings['hess_batch_size'])
HLModel = HessianlearnModel(problem,regularization,data,settings = HLModelSettings)



Data dimension agree
################################################################################
                      Size of configuration space:  86282                       
                          Size of training data: 60000                          
                   Approximate data cardinality needed: 8628                    
################################################################################


In [13]:
# Train the network with the hessianlearn model configured in the previous cell;
# a progress table (loss, accuracy, gradient norm, ...) is printed per sweep.
HLModel.fit()

                  Using low rank SFN optimizer with fixed step                  
                                Batch size = 128                                
                            Hessian batch size = 128                            
                             Hessian low rank = 40                              
################################################################################
 sweeps    Loss     acc     ||g||   Lossval   accval   maxacc   alpha     rank   
  0.00   2.30e+00 14.062% 4.95e-01 2.31e+00 9.610% 9.610% 5.00e-02     0 
  1.04   2.16e+00 16.406% 1.13e+00 2.18e+00 15.660% 15.660% 5.00e-02    40 
  2.07   2.33e+00 10.938% 1.03e+00 2.35e+00 9.760% 15.660% 5.00e-02    40 
  3.11   2.31e+00 14.062% 1.04e+00 2.27e+00 20.320% 20.320% 5.00e-02    40 
  4.15   2.28e+00 7.812% 8.02e-01 2.22e+00 15.810% 20.320% 5.00e-02    40 
  5.01   2.17e+00 18.750% 3.12e-01 2.18e+00 17.170% 20.320% 5.00e-02    40 
  6.05   2.18e+00 18.750% 4.52e-01 2.14e+00 19.220% 20.320% 5

Our proposed method performed significantly worse than all of the simpler ones. Let's see what happens on the artificial problem.