# Exercise 8

## 8a: five hidden layers of 100 neurons each, He initialization, and the ELU activation function.

In [24]:
import tensorflow as tf
import numpy as np
from tensorflow.contrib.layers import fully_connected
import numpy as np
tf.reset_default_graph()

In [25]:
tf.test.is_gpu_available()

True

In [26]:
# load data
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.mnist.load_data()
X_train = X_train.astype(np.float32).reshape(-1, 28*28) / 255.0
X_test = X_test.astype(np.float32).reshape(-1, 28*28) / 255.0
y_train = y_train.astype(np.int32)
y_test = y_test.astype(np.int32)
X_valid, X_train = X_train[:1000], X_train[45000:]
y_valid, y_train = y_train[:1000], y_train[45000:]
# print(X_valid.shape, X_train.shape)

he_init = tf.variance_scaling_initializer()

def dnn(inputs, n_hidden_layers=5, n_neurons=100, name=None,
        activation=tf.nn.elu, initializer=he_init):
    with tf.variable_scope(name, "dnn"):
        for layer in range(n_hidden_layers):
            inputs = tf.layers.dense(inputs, n_neurons, activation=activation,
                                     kernel_initializer=initializer,
                                     name="hidden%d" % (layer + 1))
        return inputs

n_inputs = 28 * 28 # MNIST
n_outputs = 5


X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int32, shape=(None), name="y")

dnn_outputs = dnn(X)

logits = tf.layers.dense(dnn_outputs, n_outputs, kernel_initializer=he_init, name="logits")
Y_proba = tf.nn.softmax(logits, name="Y_proba")

## 8b: Adam and early stopping

In [27]:
learning_rate = 0.01

xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
loss = tf.reduce_mean(xentropy, name="loss")

optimizer = tf.train.AdamOptimizer(learning_rate)
training_op = optimizer.minimize(loss, name="training_op")

correct = tf.nn.in_top_k(logits, y, 1)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32), name="accuracy")

init = tf.global_variables_initializer()
saver = tf.train.Saver()

# digits 5-9

In [28]:
# 5-9
X_train1 = X_train[y_train >= 5]
y_train1 = y_train[y_train >= 5]-5
X_valid1 = X_valid[y_valid >= 5]
y_valid1 = y_valid[y_valid >= 5]-5
X_test1 = X_test[y_test >= 5]
y_test1 = y_test[y_test >= 5]-5

In [29]:
n_epochs = 1000
batch_size = 20

max_checks_without_progress = 20
checks_without_progress = 0
best_loss = np.infty

with tf.Session() as sess:
    init.run()

    for epoch in range(n_epochs):
        rnd_idx = np.random.permutation(len(X_train1))
        for rnd_indices in np.array_split(rnd_idx, len(X_train1) // batch_size):
            X_batch, y_batch = X_train1[rnd_indices], y_train1[rnd_indices]
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        loss_val, acc_val = sess.run([loss, accuracy], feed_dict={X: X_valid1, y: y_valid1})
        if loss_val < best_loss:
            save_path = saver.save(sess, "./my_mnist_model_5_to_9.ckpt")
            best_loss = loss_val
            checks_without_progress = 0
        else:
            checks_without_progress += 1
            if checks_without_progress > max_checks_without_progress:
                print("Early stopping!")
                break
        print("{}\tValidation loss: {:.6f}\tBest loss: {:.6f}\tAccuracy: {:.2f}%".format(
            epoch, loss_val, best_loss, acc_val * 100))

with tf.Session() as sess:
    saver.restore(sess, "./my_mnist_model_5_to_9.ckpt")
    acc_test = accuracy.eval(feed_dict={X: X_test1, y: y_test1})
    print("Final test accuracy: {:.2f}%".format(acc_test * 100))

0	Validation loss: 0.373305	Best loss: 0.373305	Accuracy: 88.57%
1	Validation loss: 0.187626	Best loss: 0.187626	Accuracy: 94.29%
2	Validation loss: 0.224016	Best loss: 0.187626	Accuracy: 92.86%
3	Validation loss: 0.340064	Best loss: 0.187626	Accuracy: 92.65%
4	Validation loss: 1.614719	Best loss: 0.187626	Accuracy: 19.18%
5	Validation loss: 1.644004	Best loss: 0.187626	Accuracy: 20.41%
6	Validation loss: 1.666405	Best loss: 0.187626	Accuracy: 24.49%
7	Validation loss: 1.624175	Best loss: 0.187626	Accuracy: 25.31%
8	Validation loss: 1.691371	Best loss: 0.187626	Accuracy: 23.88%
9	Validation loss: 1.915739	Best loss: 0.187626	Accuracy: 18.78%
10	Validation loss: 1.803772	Best loss: 0.187626	Accuracy: 19.18%
11	Validation loss: 1.664465	Best loss: 0.187626	Accuracy: 17.76%
12	Validation loss: 1.645928	Best loss: 0.187626	Accuracy: 20.41%
13	Validation loss: 1.814430	Best loss: 0.187626	Accuracy: 17.76%
14	Validation loss: 1.709177	Best loss: 0.187626	Accuracy: 20.41%
15	Validation loss: 

<span style='color:blue'>**Baseline model gets 93.13%**</span>

## c: hyperparameters, cross-validation and precision

In [31]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.exceptions import NotFittedError

class DNNClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, n_hidden_layers=5, n_neurons=100, optimizer_class=tf.train.AdamOptimizer,
                 learning_rate=0.01, batch_size=20, activation=tf.nn.elu, initializer=he_init,
                 batch_norm_momentum=None, dropout_rate=None, random_state=None):
        """Initialize the DNNClassifier by simply storing all the hyperparameters."""
        self.n_hidden_layers = n_hidden_layers
        self.n_neurons = n_neurons
        self.optimizer_class = optimizer_class
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.activation = activation
        self.initializer = initializer
        self.batch_norm_momentum = batch_norm_momentum
        self.dropout_rate = dropout_rate
        self.random_state = random_state
        self._session = None

    def _dnn(self, inputs):
        """Build the hidden layers, with support for batch normalization and dropout."""
        for layer in range(self.n_hidden_layers):
            if self.dropout_rate:
                inputs = tf.layers.dropout(inputs, self.dropout_rate, training=self._training)
            inputs = tf.layers.dense(inputs, self.n_neurons,
                                     kernel_initializer=self.initializer,
                                     name="hidden%d" % (layer + 1))
            if self.batch_norm_momentum:
                inputs = tf.layers.batch_normalization(inputs, momentum=self.batch_norm_momentum,
                                                       training=self._training)
            inputs = self.activation(inputs, name="hidden%d_out" % (layer + 1))
        return inputs

    def _build_graph(self, n_inputs, n_outputs):
        """Build the same model as earlier"""
        if self.random_state is not None:
            tf.set_random_seed(self.random_state)
            np.random.seed(self.random_state)

        X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
        y = tf.placeholder(tf.int32, shape=(None), name="y")

        if self.batch_norm_momentum or self.dropout_rate:
            self._training = tf.placeholder_with_default(False, shape=(), name='training')
        else:
            self._training = None

        dnn_outputs = self._dnn(X)

        logits = tf.layers.dense(dnn_outputs, n_outputs, kernel_initializer=he_init, name="logits")
        Y_proba = tf.nn.softmax(logits, name="Y_proba")

        xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y,
                                                                  logits=logits)
        loss = tf.reduce_mean(xentropy, name="loss")

        optimizer = self.optimizer_class(learning_rate=self.learning_rate)
        training_op = optimizer.minimize(loss)

        correct = tf.nn.in_top_k(logits, y, 1)
        accuracy = tf.reduce_mean(tf.cast(correct, tf.float32), name="accuracy")

        init = tf.global_variables_initializer()
        saver = tf.train.Saver()

        # Make the important operations available easily through instance variables
        self._X, self._y = X, y
        self._Y_proba, self._loss = Y_proba, loss
        self._training_op, self._accuracy = training_op, accuracy
        self._init, self._saver = init, saver

    def close_session(self):
        if self._session:
            self._session.close()

    def _get_model_params(self):
        """Get all variable values (used for early stopping, faster than saving to disk)"""
        with self._graph.as_default():
            gvars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
        return {gvar.op.name: value for gvar, value in zip(gvars, self._session.run(gvars))}

    def _restore_model_params(self, model_params):
        """Set all variables to the given values (for early stopping, faster than loading from disk)"""
        gvar_names = list(model_params.keys())
        assign_ops = {gvar_name: self._graph.get_operation_by_name(gvar_name + "/Assign")
                      for gvar_name in gvar_names}
        init_values = {gvar_name: assign_op.inputs[1] for gvar_name, assign_op in assign_ops.items()}
        feed_dict = {init_values[gvar_name]: model_params[gvar_name] for gvar_name in gvar_names}
        self._session.run(assign_ops, feed_dict=feed_dict)

    def fit(self, X, y, n_epochs=100, X_valid=None, y_valid=None):
        """Fit the model to the training set. If X_valid and y_valid are provided, use early stopping."""
        self.close_session()

        # infer n_inputs and n_outputs from the training set.
        n_inputs = X.shape[1]
        self.classes_ = np.unique(y)
        n_outputs = len(self.classes_)
        
        # Translate the labels vector to a vector of sorted class indices, containing
        # integers from 0 to n_outputs - 1.
        # For example, if y is equal to [8, 8, 9, 5, 7, 6, 6, 6], then the sorted class
        # labels (self.classes_) will be equal to [5, 6, 7, 8, 9], and the labels vector
        # will be translated to [3, 3, 4, 0, 2, 1, 1, 1]
        self.class_to_index_ = {label: index
                                for index, label in enumerate(self.classes_)}
        y = np.array([self.class_to_index_[label]
                      for label in y], dtype=np.int32)
        
        self._graph = tf.Graph()
        with self._graph.as_default():
            self._build_graph(n_inputs, n_outputs)
            # extra ops for batch normalization
            extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

        # needed in case of early stopping
        max_checks_without_progress = 20
        checks_without_progress = 0
        best_loss = np.infty
        best_params = None
        
        # Now train the model!
        self._session = tf.Session(graph=self._graph)
        with self._session.as_default() as sess:
            self._init.run()
            for epoch in range(n_epochs):
                rnd_idx = np.random.permutation(len(X))
                for rnd_indices in np.array_split(rnd_idx, len(X) // self.batch_size):
                    X_batch, y_batch = X[rnd_indices], y[rnd_indices]
                    feed_dict = {self._X: X_batch, self._y: y_batch}
                    if self._training is not None:
                        feed_dict[self._training] = True
                    sess.run(self._training_op, feed_dict=feed_dict)
                    if extra_update_ops:
                        sess.run(extra_update_ops, feed_dict=feed_dict)
                if X_valid is not None and y_valid is not None:
                    loss_val, acc_val = sess.run([self._loss, self._accuracy],
                                                 feed_dict={self._X: X_valid,
                                                            self._y: y_valid})
                    if loss_val < best_loss:
                        best_params = self._get_model_params()
                        best_loss = loss_val
                        checks_without_progress = 0
                    else:
                        checks_without_progress += 1
                    print("{}\tValidation loss: {:.6f}\tBest loss: {:.6f}\tAccuracy: {:.2f}%".format(
                        epoch, loss_val, best_loss, acc_val * 100))
                    if checks_without_progress > max_checks_without_progress:
                        print("Early stopping!")
                        break
                else:
                    loss_train, acc_train = sess.run([self._loss, self._accuracy],
                                                     feed_dict={self._X: X_batch,
                                                                self._y: y_batch})
                    print("{}\tLast training batch loss: {:.6f}\tAccuracy: {:.2f}%".format(
                        epoch, loss_train, acc_train * 100))
            # If we used early stopping then rollback to the best model found
            if best_params:
                self._restore_model_params(best_params)
            return self

    def predict_proba(self, X):
        if not self._session:
            raise NotFittedError("This %s instance is not fitted yet" % self.__class__.__name__)
        with self._session.as_default() as sess:
            return self._Y_proba.eval(feed_dict={self._X: X})

    def predict(self, X):
        class_indices = np.argmax(self.predict_proba(X), axis=1)
        return np.array([[self.classes_[class_index]]
                         for class_index in class_indices], np.int32)

    def save(self, path):
        self._saver.save(self._session, path)

<span style='color:blue'>**In an attempt to improve this initial score, we should searching for the best hyperparameters.**</span>

In [32]:
from sklearn.model_selection import RandomizedSearchCV

def leaky_relu(alpha=0.01):
    def parametrized_leaky_relu(z, name=None):
        return tf.maximum(alpha * z, z, name=name)
    return parametrized_leaky_relu

param_distribs = {
    "n_neurons": [100, 120,140],
    "batch_size": [50, 100, 500],
    "learning_rate": [0.01, 0.02, 0.05],
    "activation": [tf.nn.relu, tf.nn.elu, leaky_relu(alpha=0.1)],
}

rnd_search = RandomizedSearchCV(DNNClassifier(random_state=42), param_distribs, n_iter=50,
                                fit_params={"X_valid": X_valid1, "y_valid": y_valid1, "n_epochs": 1000},
                                random_state=42, verbose=2)
rnd_search.fit(X_train1, y_train1)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 50 candidates, totalling 150 fits
[CV] n_neurons=100, learning_rate=0.02, batch_size=50, activation=<function elu at 0x7ff44f822620> 
0	Validation loss: 0.313167	Best loss: 0.313167	Accuracy: 88.78%
1	Validation loss: 0.234912	Best loss: 0.234912	Accuracy: 93.88%
2	Validation loss: 0.253178	Best loss: 0.234912	Accuracy: 94.08%
3	Validation loss: 0.268187	Best loss: 0.234912	Accuracy: 94.08%
4	Validation loss: 0.295417	Best loss: 0.234912	Accuracy: 94.69%
5	Validation loss: 0.246227	Best loss: 0.234912	Accuracy: 95.10%
6	Validation loss: 0.232233	Best loss: 0.232233	Accuracy: 93.27%
7	Validation loss: 0.334749	Best loss: 0.232233	Accuracy: 93.67%
8	Validation loss: 0.276250	Best loss: 0.232233	Accuracy: 94.90%
9	Validation loss: 0.390597	Best loss: 0.232233	Accuracy: 94.29%
10	Validation loss: 2.011095	Best loss: 0.232233	Accuracy: 20.41%
11	Validation loss: 1.638257	Best loss: 0.232233	Accuracy: 17.76%
12	Validation loss: 1.707087	Best loss: 0.232233	Accurac

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    7.3s remaining:    0.0s


0	Validation loss: 0.368091	Best loss: 0.368091	Accuracy: 89.39%
1	Validation loss: 0.288692	Best loss: 0.288692	Accuracy: 92.86%
2	Validation loss: 0.273773	Best loss: 0.273773	Accuracy: 94.49%
3	Validation loss: 0.224186	Best loss: 0.224186	Accuracy: 93.88%
4	Validation loss: 0.251868	Best loss: 0.224186	Accuracy: 92.86%
5	Validation loss: 0.345850	Best loss: 0.224186	Accuracy: 90.61%
6	Validation loss: 0.419385	Best loss: 0.224186	Accuracy: 93.88%
7	Validation loss: 0.208431	Best loss: 0.208431	Accuracy: 94.29%
8	Validation loss: 0.257333	Best loss: 0.208431	Accuracy: 95.92%
9	Validation loss: 0.322313	Best loss: 0.208431	Accuracy: 93.88%
10	Validation loss: 1.817825	Best loss: 0.208431	Accuracy: 23.88%
11	Validation loss: 1.658772	Best loss: 0.208431	Accuracy: 20.41%
12	Validation loss: 1.840716	Best loss: 0.208431	Accuracy: 23.88%
13	Validation loss: 1.715040	Best loss: 0.208431	Accuracy: 18.78%
14	Validation loss: 1.733840	Best loss: 0.208431	Accuracy: 18.78%
15	Validation loss: 

[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed: 13.5min finished


0	Validation loss: 0.366333	Best loss: 0.366333	Accuracy: 87.35%
1	Validation loss: 0.184388	Best loss: 0.184388	Accuracy: 94.49%
2	Validation loss: 0.149142	Best loss: 0.149142	Accuracy: 94.49%
3	Validation loss: 0.160656	Best loss: 0.149142	Accuracy: 94.29%
4	Validation loss: 0.124243	Best loss: 0.124243	Accuracy: 96.12%
5	Validation loss: 0.125635	Best loss: 0.124243	Accuracy: 96.33%
6	Validation loss: 0.181729	Best loss: 0.124243	Accuracy: 95.51%
7	Validation loss: 0.139290	Best loss: 0.124243	Accuracy: 96.33%
8	Validation loss: 0.112242	Best loss: 0.112242	Accuracy: 96.12%
9	Validation loss: 0.084844	Best loss: 0.084844	Accuracy: 97.76%
10	Validation loss: 0.155158	Best loss: 0.084844	Accuracy: 96.33%
11	Validation loss: 0.171607	Best loss: 0.084844	Accuracy: 96.33%
12	Validation loss: 0.151524	Best loss: 0.084844	Accuracy: 96.12%
13	Validation loss: 0.137772	Best loss: 0.084844	Accuracy: 96.33%
14	Validation loss: 0.143459	Best loss: 0.084844	Accuracy: 97.35%
15	Validation loss: 

RandomizedSearchCV(cv='warn', error_score='raise-deprecating',
          estimator=DNNClassifier(activation=<function elu at 0x7ff44f822620>,
       batch_norm_momentum=None, batch_size=20, dropout_rate=None,
       initializer=<tensorflow.python.ops.init_ops.VarianceScaling object at 0x7ff3d432ec88>,
       learning_rate=0.01, n_hidden_layers=5, n_neurons=100,
       optimizer_class=<class 'tensorflow.python.training.adam.AdamOptimizer'>,
       random_state=42),
          fit_params={'X_valid': array([[0., 0., ..., 0., 0.],
       [0., 0., ..., 0., 0.],
       ...,
       [0., 0., ..., 0., 0.],
       [0., 0., ..., 0., 0.]], dtype=float32), 'y_valid': array([0, 4, ..., 1, 1], dtype=int32), 'n_epochs': 1000},
          iid='warn', n_iter=50, n_jobs=None,
          param_distributions={'n_neurons': [100, 120, 140], 'batch_size': [50, 100, 500], 'learning_rate': [0.01, 0.02, 0.05], 'activation': [<function relu at 0x7ff44f7b6950>, <function elu at 0x7ff44f822620>, <function leaky_relu.<

In [33]:
rnd_search.best_params_

{'n_neurons': 120,
 'learning_rate': 0.01,
 'batch_size': 500,
 'activation': <function tensorflow.python.ops.gen_nn_ops.relu(features, name=None)>}

In [34]:
y_pred = rnd_search.predict(X_test1)
accuracy_score(y_test1, y_pred)

0.9664678049783995

<span style='color:blue'>**This is the best model so far, so we should save it for later use**</span>

In [35]:
rnd_search.best_estimator_.save("./my_best_mnist_model_5_to_9")

## d: Batch Normalization

Without Batch Normalization

In [36]:
dnn_clf = DNNClassifier(activation= tf.nn.relu, batch_size=500, learning_rate=0.01,
                        n_neurons=120, random_state=42)
dnn_clf.fit(X_train1, y_train1, n_epochs=1000, X_valid=X_valid1, y_valid=y_valid1)
y_pred = dnn_clf.predict(X_test1)
accuracy_score(y_test1, y_pred)

0	Validation loss: 0.366333	Best loss: 0.366333	Accuracy: 87.35%
1	Validation loss: 0.184371	Best loss: 0.184371	Accuracy: 94.49%
2	Validation loss: 0.149413	Best loss: 0.149413	Accuracy: 94.49%
3	Validation loss: 0.162593	Best loss: 0.149413	Accuracy: 94.49%
4	Validation loss: 0.119762	Best loss: 0.119762	Accuracy: 96.12%
5	Validation loss: 0.134737	Best loss: 0.119762	Accuracy: 96.12%
6	Validation loss: 0.147234	Best loss: 0.119762	Accuracy: 96.53%
7	Validation loss: 0.157571	Best loss: 0.119762	Accuracy: 95.92%
8	Validation loss: 0.195341	Best loss: 0.119762	Accuracy: 95.71%
9	Validation loss: 0.156905	Best loss: 0.119762	Accuracy: 95.71%
10	Validation loss: 0.173011	Best loss: 0.119762	Accuracy: 96.12%
11	Validation loss: 0.161156	Best loss: 0.119762	Accuracy: 95.51%
12	Validation loss: 0.103851	Best loss: 0.103851	Accuracy: 97.55%
13	Validation loss: 0.123913	Best loss: 0.103851	Accuracy: 96.12%
14	Validation loss: 0.164113	Best loss: 0.103851	Accuracy: 96.94%
15	Validation loss: 

0.9687307138448878

With Batch Normalization

In [37]:
dnn_clf_bn = DNNClassifier(activation= tf.nn.relu, batch_size=500, learning_rate=0.01,
                        n_neurons=120, random_state=42,
                           batch_norm_momentum=0.95)
dnn_clf_bn.fit(X_train1, y_train1, n_epochs=1000, X_valid=X_valid1, y_valid=y_valid1)

0	Validation loss: 0.345291	Best loss: 0.345291	Accuracy: 90.61%
1	Validation loss: 0.192521	Best loss: 0.192521	Accuracy: 95.51%
2	Validation loss: 0.185130	Best loss: 0.185130	Accuracy: 95.92%
3	Validation loss: 0.139026	Best loss: 0.139026	Accuracy: 96.94%
4	Validation loss: 0.124044	Best loss: 0.124044	Accuracy: 96.33%
5	Validation loss: 0.139799	Best loss: 0.124044	Accuracy: 96.94%
6	Validation loss: 0.152070	Best loss: 0.124044	Accuracy: 96.73%
7	Validation loss: 0.155036	Best loss: 0.124044	Accuracy: 96.33%
8	Validation loss: 0.151337	Best loss: 0.124044	Accuracy: 95.92%
9	Validation loss: 0.143326	Best loss: 0.124044	Accuracy: 96.53%
10	Validation loss: 0.162083	Best loss: 0.124044	Accuracy: 96.33%
11	Validation loss: 0.144696	Best loss: 0.124044	Accuracy: 96.73%
12	Validation loss: 0.180198	Best loss: 0.124044	Accuracy: 96.12%
13	Validation loss: 0.157697	Best loss: 0.124044	Accuracy: 96.33%
14	Validation loss: 0.176703	Best loss: 0.124044	Accuracy: 95.92%
15	Validation loss: 

DNNClassifier(activation=<function relu at 0x7ff44f7b6950>,
       batch_norm_momentum=0.95, batch_size=500, dropout_rate=None,
       initializer=<tensorflow.python.ops.init_ops.VarianceScaling object at 0x7ff3d432ec88>,
       learning_rate=0.01, n_hidden_layers=5, n_neurons=120,
       optimizer_class=<class 'tensorflow.python.training.adam.AdamOptimizer'>,
       random_state=42)

New Accuracy

In [38]:
y_pred = dnn_clf_bn.predict(X_test1)
accuracy_score(y_test1, y_pred)

0.9699650277720634

<span style='color:blue'>**As we can see Batch normlaization slightly increases the accuracy but not by much. We can try and find the optimal paramters to make it more effective through a randomized search.**</span>

Hyperparameter search

In [39]:
from sklearn.model_selection import RandomizedSearchCV
# reduce parameters to minimize runtime
param_distribs = {
    "n_neurons": [100, 120, 140],
    "batch_size": [50, 100, 500],
    "learning_rate": [0.01, 0.02, 0.05],
    "activation": [tf.nn.relu, tf.nn.elu, leaky_relu(alpha=0.01), leaky_relu(alpha=0.1)],
    # you could also try exploring different numbers of hidden layers, different optimizers, etc.
    #"n_hidden_layers": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    #"optimizer_class": [tf.train.AdamOptimizer, partial(tf.train.MomentumOptimizer, momentum=0.95)],
    "batch_norm_momentum": [0.95, 0.98, 0.99],
}

rnd_search_bn = RandomizedSearchCV(DNNClassifier(random_state=42), param_distribs, n_iter=50,
                                   fit_params={"X_valid": X_valid1, "y_valid": y_valid1, "n_epochs": 1000},
                                   random_state=42, verbose=2)
rnd_search_bn.fit(X_train1, y_train1)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 50 candidates, totalling 150 fits
[CV] n_neurons=100, learning_rate=0.05, batch_size=500, batch_norm_momentum=0.98, activation=<function elu at 0x7ff44f822620> 
0	Validation loss: 11.534751	Best loss: 11.534751	Accuracy: 78.57%
1	Validation loss: 5.914383	Best loss: 5.914383	Accuracy: 80.61%
2	Validation loss: 2.357674	Best loss: 2.357674	Accuracy: 85.71%
3	Validation loss: 1.928056	Best loss: 1.928056	Accuracy: 85.31%
4	Validation loss: 0.762963	Best loss: 0.762963	Accuracy: 91.43%
5	Validation loss: 0.502139	Best loss: 0.502139	Accuracy: 92.65%
6	Validation loss: 0.389794	Best loss: 0.389794	Accuracy: 92.65%
7	Validation loss: 0.290762	Best loss: 0.290762	Accuracy: 93.67%
8	Validation loss: 0.278602	Best loss: 0.278602	Accuracy: 93.67%
9	Validation loss: 0.276627	Best loss: 0.276627	Accuracy: 93.47%
10	Validation loss: 0.248483	Best loss: 0.248483	Accuracy: 94.08%
11	Validation loss: 0.238109	Best loss: 0.238109	Accuracy: 93.47%
12	Validation loss: 0.29359

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    6.3s remaining:    0.0s


0	Validation loss: 26.070658	Best loss: 26.070658	Accuracy: 65.10%
1	Validation loss: 15.588041	Best loss: 15.588041	Accuracy: 66.53%
2	Validation loss: 3.709750	Best loss: 3.709750	Accuracy: 78.16%
3	Validation loss: 1.522752	Best loss: 1.522752	Accuracy: 85.92%
4	Validation loss: 0.741762	Best loss: 0.741762	Accuracy: 89.80%
5	Validation loss: 0.464621	Best loss: 0.464621	Accuracy: 92.04%
6	Validation loss: 0.368040	Best loss: 0.368040	Accuracy: 93.06%
7	Validation loss: 0.336106	Best loss: 0.336106	Accuracy: 93.67%
8	Validation loss: 0.241946	Best loss: 0.241946	Accuracy: 94.90%
9	Validation loss: 0.223201	Best loss: 0.223201	Accuracy: 95.10%
10	Validation loss: 0.194180	Best loss: 0.194180	Accuracy: 95.71%
11	Validation loss: 0.241386	Best loss: 0.194180	Accuracy: 94.29%
12	Validation loss: 0.208690	Best loss: 0.194180	Accuracy: 95.51%
13	Validation loss: 0.234642	Best loss: 0.194180	Accuracy: 95.31%
14	Validation loss: 0.235376	Best loss: 0.194180	Accuracy: 93.88%
15	Validation lo

[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed: 54.6min finished


0	Validation loss: 0.217724	Best loss: 0.217724	Accuracy: 92.86%
1	Validation loss: 0.129276	Best loss: 0.129276	Accuracy: 95.31%
2	Validation loss: 0.147191	Best loss: 0.129276	Accuracy: 96.12%
3	Validation loss: 0.125531	Best loss: 0.125531	Accuracy: 95.92%
4	Validation loss: 0.152719	Best loss: 0.125531	Accuracy: 95.51%
5	Validation loss: 0.141654	Best loss: 0.125531	Accuracy: 96.12%
6	Validation loss: 0.127117	Best loss: 0.125531	Accuracy: 95.92%
7	Validation loss: 0.117385	Best loss: 0.117385	Accuracy: 97.14%
8	Validation loss: 0.119377	Best loss: 0.117385	Accuracy: 95.71%
9	Validation loss: 0.114869	Best loss: 0.114869	Accuracy: 96.33%
10	Validation loss: 0.089569	Best loss: 0.089569	Accuracy: 97.76%
11	Validation loss: 0.104760	Best loss: 0.089569	Accuracy: 96.73%
12	Validation loss: 0.092636	Best loss: 0.089569	Accuracy: 96.53%
13	Validation loss: 0.126882	Best loss: 0.089569	Accuracy: 96.73%
14	Validation loss: 0.138764	Best loss: 0.089569	Accuracy: 95.92%
15	Validation loss: 

RandomizedSearchCV(cv='warn', error_score='raise-deprecating',
          estimator=DNNClassifier(activation=<function elu at 0x7ff44f822620>,
       batch_norm_momentum=None, batch_size=20, dropout_rate=None,
       initializer=<tensorflow.python.ops.init_ops.VarianceScaling object at 0x7ff3d432ec88>,
       learning_rate=0.01, n_hidden_layers=5, n_neurons=100,
       optimizer_class=<class 'tensorflow.python.training.adam.AdamOptimizer'>,
       random_state=42),
          fit_params={'X_valid': array([[0., 0., ..., 0., 0.],
       [0., 0., ..., 0., 0.],
       ...,
       [0., 0., ..., 0., 0.],
       [0., 0., ..., 0., 0.]], dtype=float32), 'y_valid': array([0, 4, ..., 1, 1], dtype=int32), 'n_epochs': 1000},
          iid='warn', n_iter=50, n_jobs=None,
          param_distributions={'n_neurons': [100, 120, 140], 'batch_size': [50, 100, 500], 'learning_rate': [0.01, 0.02, 0.05], 'activation': [<function relu at 0x7ff44f7b6950>, <function elu at 0x7ff44f822620>, <function leaky_relu.<

In [40]:
rnd_search_bn.best_params_

{'n_neurons': 140,
 'learning_rate': 0.01,
 'batch_size': 50,
 'batch_norm_momentum': 0.98,
 'activation': <function __main__.leaky_relu.<locals>.parametrized_leaky_relu(z, name=None)>}

In [41]:
y_pred = rnd_search_bn.predict(X_test1)
accuracy_score(y_test1, y_pred)

0.9709936227113762

<span style='color:blue'>**Compared to the baseline model of 93.13% accuracy, this is much better. We have reduced error by about a third.**</span>

## e: Dropout

In [42]:
dnn_clf_dropout = DNNClassifier(activation=leaky_relu(alpha=0.1), batch_size=500, learning_rate=0.01,
                                n_neurons=90, random_state=42,
                                dropout_rate=0.5)
dnn_clf_dropout.fit(X_train1, y_train1, n_epochs=1000, X_valid=X_valid1, y_valid=y_valid1)

0	Validation loss: 0.659904	Best loss: 0.659904	Accuracy: 69.80%
1	Validation loss: 0.353915	Best loss: 0.353915	Accuracy: 89.59%
2	Validation loss: 0.271649	Best loss: 0.271649	Accuracy: 89.80%
3	Validation loss: 0.242470	Best loss: 0.242470	Accuracy: 91.43%
4	Validation loss: 0.233236	Best loss: 0.233236	Accuracy: 92.24%
5	Validation loss: 0.177197	Best loss: 0.177197	Accuracy: 94.90%
6	Validation loss: 0.181688	Best loss: 0.177197	Accuracy: 93.47%
7	Validation loss: 0.169791	Best loss: 0.169791	Accuracy: 95.31%
8	Validation loss: 0.164366	Best loss: 0.164366	Accuracy: 94.08%
9	Validation loss: 0.152641	Best loss: 0.152641	Accuracy: 95.31%
10	Validation loss: 0.167314	Best loss: 0.152641	Accuracy: 94.29%
11	Validation loss: 0.145644	Best loss: 0.145644	Accuracy: 96.33%
12	Validation loss: 0.153651	Best loss: 0.145644	Accuracy: 95.51%
13	Validation loss: 0.141992	Best loss: 0.141992	Accuracy: 96.12%
14	Validation loss: 0.132663	Best loss: 0.132663	Accuracy: 96.12%
15	Validation loss: 

DNNClassifier(activation=<function leaky_relu.<locals>.parametrized_leaky_relu at 0x7ff3bfc82a60>,
       batch_norm_momentum=None, batch_size=500, dropout_rate=0.5,
       initializer=<tensorflow.python.ops.init_ops.VarianceScaling object at 0x7ff3d432ec88>,
       learning_rate=0.01, n_hidden_layers=5, n_neurons=90,
       optimizer_class=<class 'tensorflow.python.training.adam.AdamOptimizer'>,
       random_state=42)

Accuracy

In [45]:
y_pred = dnn_clf_dropout.predict(X_test1)
accuracy_score(y_test1, y_pred)

0.9672906809298498

<span style='color:blue'>**Again we will use a randomized search to find the optimal paramters for dropout**</span>

Hyperparameter search

In [46]:
from sklearn.model_selection import RandomizedSearchCV

param_distribs = {
    "n_neurons": [100, 120, 140],
    "batch_size": [50, 100, 500],
    "learning_rate": [0.01, 0.02, 0.05],
    "activation": [tf.nn.relu, tf.nn.elu, leaky_relu(alpha=0.01), leaky_relu(alpha=0.1)],
    # you could also try exploring different numbers of hidden layers, different optimizers, etc.
    #"n_hidden_layers": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    #"optimizer_class": [tf.train.AdamOptimizer, partial(tf.train.MomentumOptimizer, momentum=0.95)],
    "dropout_rate": [0.3, 0.4, 0.5],
}

rnd_search_dropout = RandomizedSearchCV(DNNClassifier(random_state=42), param_distribs, n_iter=50,
                                        fit_params={"X_valid": X_valid1, "y_valid": y_valid1, "n_epochs": 1000},
                                        random_state=42, verbose=2)
rnd_search_dropout.fit(X_train1, y_train1)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 50 candidates, totalling 150 fits
[CV] n_neurons=100, learning_rate=0.05, dropout_rate=0.5, batch_size=100, activation=<function elu at 0x7ff44f822620> 
0	Validation loss: 1.699098	Best loss: 1.699098	Accuracy: 19.18%
1	Validation loss: 1.643889	Best loss: 1.643889	Accuracy: 20.41%
2	Validation loss: 1.664865	Best loss: 1.643889	Accuracy: 17.76%
3	Validation loss: 1.708899	Best loss: 1.643889	Accuracy: 18.78%
4	Validation loss: 1.777488	Best loss: 1.643889	Accuracy: 20.41%
5	Validation loss: 1.675412	Best loss: 1.643889	Accuracy: 23.88%
6	Validation loss: 1.694554	Best loss: 1.643889	Accuracy: 17.76%
7	Validation loss: 1.669464	Best loss: 1.643889	Accuracy: 20.41%
8	Validation loss: 1.639220	Best loss: 1.639220	Accuracy: 17.76%
9	Validation loss: 1.708865	Best loss: 1.639220	Accuracy: 23.88%
10	Validation loss: 1.821535	Best loss: 1.639220	Accuracy: 17.76%
11	Validation loss: 1.663831	Best loss: 1.639220	Accuracy: 17.76%
12	Validation loss: 1.630666	Best los

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    9.0s remaining:    0.0s


0	Validation loss: 1.721754	Best loss: 1.721754	Accuracy: 17.76%
1	Validation loss: 1.707854	Best loss: 1.707854	Accuracy: 20.41%
2	Validation loss: 1.756343	Best loss: 1.707854	Accuracy: 17.76%
3	Validation loss: 1.703164	Best loss: 1.703164	Accuracy: 18.78%
4	Validation loss: 1.721026	Best loss: 1.703164	Accuracy: 23.88%
5	Validation loss: 1.728104	Best loss: 1.703164	Accuracy: 20.41%
6	Validation loss: 1.746193	Best loss: 1.703164	Accuracy: 19.18%
7	Validation loss: 1.684521	Best loss: 1.684521	Accuracy: 17.76%
8	Validation loss: 1.801070	Best loss: 1.684521	Accuracy: 17.76%
9	Validation loss: 1.637377	Best loss: 1.637377	Accuracy: 23.88%
10	Validation loss: 1.786996	Best loss: 1.637377	Accuracy: 19.18%
11	Validation loss: 1.727918	Best loss: 1.637377	Accuracy: 23.88%
12	Validation loss: 1.727419	Best loss: 1.637377	Accuracy: 20.41%
13	Validation loss: 1.635777	Best loss: 1.635777	Accuracy: 17.76%
14	Validation loss: 1.636062	Best loss: 1.635777	Accuracy: 17.76%
15	Validation loss: 

[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed: 19.8min finished


0	Validation loss: 0.459810	Best loss: 0.459810	Accuracy: 86.94%
1	Validation loss: 0.239334	Best loss: 0.239334	Accuracy: 92.45%
2	Validation loss: 0.164465	Best loss: 0.164465	Accuracy: 93.88%
3	Validation loss: 0.122840	Best loss: 0.122840	Accuracy: 96.53%
4	Validation loss: 0.116614	Best loss: 0.116614	Accuracy: 96.33%
5	Validation loss: 0.112174	Best loss: 0.112174	Accuracy: 96.12%
6	Validation loss: 0.108434	Best loss: 0.108434	Accuracy: 96.53%
7	Validation loss: 0.095364	Best loss: 0.095364	Accuracy: 97.55%
8	Validation loss: 0.116070	Best loss: 0.095364	Accuracy: 96.33%
9	Validation loss: 0.129126	Best loss: 0.095364	Accuracy: 96.12%
10	Validation loss: 0.108774	Best loss: 0.095364	Accuracy: 95.92%
11	Validation loss: 0.120267	Best loss: 0.095364	Accuracy: 96.73%
12	Validation loss: 0.111449	Best loss: 0.095364	Accuracy: 97.14%
13	Validation loss: 0.105353	Best loss: 0.095364	Accuracy: 96.73%
14	Validation loss: 0.101679	Best loss: 0.095364	Accuracy: 96.53%
15	Validation loss: 

RandomizedSearchCV(cv='warn', error_score='raise-deprecating',
          estimator=DNNClassifier(activation=<function elu at 0x7ff44f822620>,
       batch_norm_momentum=None, batch_size=20, dropout_rate=None,
       initializer=<tensorflow.python.ops.init_ops.VarianceScaling object at 0x7ff3d432ec88>,
       learning_rate=0.01, n_hidden_layers=5, n_neurons=100,
       optimizer_class=<class 'tensorflow.python.training.adam.AdamOptimizer'>,
       random_state=42),
          fit_params={'X_valid': array([[0., 0., ..., 0., 0.],
       [0., 0., ..., 0., 0.],
       ...,
       [0., 0., ..., 0., 0.],
       [0., 0., ..., 0., 0.]], dtype=float32), 'y_valid': array([0, 4, ..., 1, 1], dtype=int32), 'n_epochs': 1000},
          iid='warn', n_iter=50, n_jobs=None,
          param_distributions={'n_neurons': [100, 120, 140], 'batch_size': [50, 100, 500], 'learning_rate': [0.01, 0.02, 0.05], 'activation': [<function relu at 0x7ff44f7b6950>, <function elu at 0x7ff44f822620>, <function leaky_relu.<

In [47]:
rnd_search_dropout.best_params_

{'n_neurons': 120,
 'learning_rate': 0.01,
 'dropout_rate': 0.3,
 'batch_size': 500,
 'activation': <function tensorflow.python.ops.gen_nn_ops.relu(features, name=None)>}

In [48]:
y_pred = rnd_search_dropout.predict(X_test1)
accuracy_score(y_test1, y_pred)

0.9720222176506892

<span style='color:blue'>**Here, the optimal parameters increase the accuracy under dropout by about 0.5%.**</span>

# Exercise 9

## a: pretrained hidden layers of the previous model, freezes them, and replaces the softmax output layer with a fresh new one.

In [49]:
tf.reset_default_graph()

restore_saver = tf.train.import_meta_graph("./my_best_mnist_model_5_to_9.meta")

X = tf.get_default_graph().get_tensor_by_name("X:0")
y = tf.get_default_graph().get_tensor_by_name("y:0")
loss = tf.get_default_graph().get_tensor_by_name("loss:0")
Y_proba = tf.get_default_graph().get_tensor_by_name("Y_proba:0")
logits = Y_proba.op.inputs[0]
accuracy = tf.get_default_graph().get_tensor_by_name("accuracy:0")

In [50]:
learning_rate = 0.01

output_layer_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="logits")
optimizer = tf.train.AdamOptimizer(learning_rate, name="Adam2")
training_op = optimizer.minimize(loss, var_list=output_layer_vars)

In [51]:
correct = tf.nn.in_top_k(logits, y, 1)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32), name="accuracy")

init = tf.global_variables_initializer()
five_frozen_saver = tf.train.Saver()

# b: digits 0-4

In [53]:
# 5-9
X_train1 = X_train[y_train >= 5]
y_train1 = y_train[y_train >= 5]-5
X_valid1 = X_valid[y_valid >= 5]
y_valid1 = y_valid[y_valid >= 5]-5
X_test1 = X_test[y_test >= 5]
y_test1 = y_test[y_test >= 5]-5
X_train2_full = X_train[y_train < 5]
y_train2_full =  y_train[y_train < 5]
X_valid2_full = X_valid[y_valid < 5]
y_valid2_full = y_valid[y_valid < 5]
X_test2 = X_test[y_test < 5]
y_test2 = y_test[y_test < 5]

In [54]:
def sample_n_instances_per_class(X, y, n=100):
    Xs, ys = [], []
    for label in np.unique(y):
        idx = (y == label)
        Xc = X[idx][:n]
        yc = y[idx][:n]
        Xs.append(Xc)
        ys.append(yc)
    return np.concatenate(Xs), np.concatenate(ys)

In [55]:
X_train2, y_train2 = sample_n_instances_per_class(X_train2_full, y_train2_full, n=100)
X_valid2, y_valid2 = sample_n_instances_per_class(X_valid2_full, y_valid2_full, n=30)

 Retrain last layer after freezing all other layers

In [56]:
import time

n_epochs = 1000
batch_size = 20

max_checks_without_progress = 20
checks_without_progress = 0
best_loss = np.infty

with tf.Session() as sess:
    init.run()
    restore_saver.restore(sess, "./my_best_mnist_model_5_to_9")
    for var in output_layer_vars:
        var.initializer.run()

    t0 = time.time()
        
    for epoch in range(n_epochs):
        rnd_idx = np.random.permutation(len(X_train2))
        for rnd_indices in np.array_split(rnd_idx, len(X_train2) // batch_size):
            X_batch, y_batch = X_train2[rnd_indices], y_train2[rnd_indices]
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        loss_val, acc_val = sess.run([loss, accuracy], feed_dict={X: X_valid2, y: y_valid2})
        if loss_val < best_loss:
            save_path = five_frozen_saver.save(sess, "./my_mnist_model_0_to_4_five_frozen")
            best_loss = loss_val
            checks_without_progress = 0
        else:
            checks_without_progress += 1
            if checks_without_progress > max_checks_without_progress:
                print("Early stopping!")
                break
        print("{}\tValidation loss: {:.6f}\tBest loss: {:.6f}\tAccuracy: {:.2f}%".format(
            epoch, loss_val, best_loss, acc_val * 100))

    t1 = time.time()
    print("Total training time: {:.1f}s".format(t1 - t0))

with tf.Session() as sess:
    five_frozen_saver.restore(sess, "./my_mnist_model_0_to_4_five_frozen")
    acc_test = accuracy.eval(feed_dict={X: X_test2, y: y_test2})
    print("Final test accuracy: {:.2f}%".format(acc_test * 100))

INFO:tensorflow:Restoring parameters from ./my_best_mnist_model_5_to_9
0	Validation loss: 1.081871	Best loss: 1.081871	Accuracy: 59.33%
1	Validation loss: 0.963888	Best loss: 0.963888	Accuracy: 61.33%
2	Validation loss: 0.900080	Best loss: 0.900080	Accuracy: 62.00%
3	Validation loss: 0.874640	Best loss: 0.874640	Accuracy: 64.67%
4	Validation loss: 0.931454	Best loss: 0.874640	Accuracy: 62.00%
5	Validation loss: 0.879047	Best loss: 0.874640	Accuracy: 64.67%
6	Validation loss: 0.884200	Best loss: 0.874640	Accuracy: 63.33%
7	Validation loss: 0.866685	Best loss: 0.866685	Accuracy: 67.33%
8	Validation loss: 0.828175	Best loss: 0.828175	Accuracy: 68.00%
9	Validation loss: 0.863401	Best loss: 0.828175	Accuracy: 66.00%
10	Validation loss: 0.818827	Best loss: 0.818827	Accuracy: 68.67%
11	Validation loss: 0.823998	Best loss: 0.818827	Accuracy: 67.33%
12	Validation loss: 0.849074	Best loss: 0.818827	Accuracy: 66.00%
13	Validation loss: 0.847120	Best loss: 0.818827	Accuracy: 64.67%
14	Validation l

<span style='color:blue'>**Since we are using the model we trained previously, the training this time is MUCH faster.**</span>

## c: caching the frozen layers, and train the model again

In [57]:
hidden5_out = tf.get_default_graph().get_tensor_by_name("hidden5_out:0")

In [58]:
import time

n_epochs = 1000
batch_size = 20

max_checks_without_progress = 20
checks_without_progress = 0
best_loss = np.infty

with tf.Session() as sess:
    init.run()
    restore_saver.restore(sess, "./my_best_mnist_model_5_to_9")
    for var in output_layer_vars:
        var.initializer.run()

    t0 = time.time()
    
    hidden5_train = hidden5_out.eval(feed_dict={X: X_train2, y: y_train2})
    hidden5_valid = hidden5_out.eval(feed_dict={X: X_valid2, y: y_valid2})
        
    for epoch in range(n_epochs):
        rnd_idx = np.random.permutation(len(X_train2))
        for rnd_indices in np.array_split(rnd_idx, len(X_train2) // batch_size):
            h5_batch, y_batch = hidden5_train[rnd_indices], y_train2[rnd_indices]
            sess.run(training_op, feed_dict={hidden5_out: h5_batch, y: y_batch})
        loss_val, acc_val = sess.run([loss, accuracy], feed_dict={hidden5_out: hidden5_valid, y: y_valid2})
        if loss_val < best_loss:
            save_path = five_frozen_saver.save(sess, "./my_mnist_model_0_to_4_five_frozen")
            best_loss = loss_val
            checks_without_progress = 0
        else:
            checks_without_progress += 1
            if checks_without_progress > max_checks_without_progress:
                print("Early stopping!")
                break
        print("{}\tValidation loss: {:.6f}\tBest loss: {:.6f}\tAccuracy: {:.2f}%".format(
            epoch, loss_val, best_loss, acc_val * 100))

    t1 = time.time()
    print("Total training time: {:.1f}s".format(t1 - t0))

with tf.Session() as sess:
    five_frozen_saver.restore(sess, "./my_mnist_model_0_to_4_five_frozen")
    acc_test = accuracy.eval(feed_dict={X: X_test2, y: y_test2})
    print("Final test accuracy: {:.2f}%".format(acc_test * 100))

INFO:tensorflow:Restoring parameters from ./my_best_mnist_model_5_to_9
0	Validation loss: 0.986412	Best loss: 0.986412	Accuracy: 59.33%
1	Validation loss: 0.971104	Best loss: 0.971104	Accuracy: 60.67%
2	Validation loss: 0.880444	Best loss: 0.880444	Accuracy: 62.67%
3	Validation loss: 0.902001	Best loss: 0.880444	Accuracy: 60.67%
4	Validation loss: 0.915864	Best loss: 0.880444	Accuracy: 57.33%
5	Validation loss: 0.939066	Best loss: 0.880444	Accuracy: 63.33%
6	Validation loss: 0.892196	Best loss: 0.880444	Accuracy: 60.00%
7	Validation loss: 0.821684	Best loss: 0.821684	Accuracy: 65.33%
8	Validation loss: 0.872775	Best loss: 0.821684	Accuracy: 68.00%
9	Validation loss: 0.832862	Best loss: 0.821684	Accuracy: 64.00%
10	Validation loss: 0.840878	Best loss: 0.821684	Accuracy: 66.00%
11	Validation loss: 0.801570	Best loss: 0.801570	Accuracy: 68.67%
12	Validation loss: 0.782057	Best loss: 0.782057	Accuracy: 66.67%
13	Validation loss: 0.855162	Best loss: 0.782057	Accuracy: 64.00%
14	Validation l

<span style='color:blue'>**While this accuracy may seem bad, considering that it trained in 3 seconds, it is still impressive compared to the incredibly long time the initial model took to get a 93%**</span>

## d: reusing just four hidden layers instead of five

In [59]:
tf.reset_default_graph()

n_outputs = 5

restore_saver = tf.train.import_meta_graph("./my_best_mnist_model_5_to_9.meta")

X = tf.get_default_graph().get_tensor_by_name("X:0")
y = tf.get_default_graph().get_tensor_by_name("y:0")

hidden4_out = tf.get_default_graph().get_tensor_by_name("hidden4_out:0")
logits = tf.layers.dense(hidden4_out, n_outputs, kernel_initializer=he_init, name="new_logits")
Y_proba = tf.nn.softmax(logits)
xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
loss = tf.reduce_mean(xentropy)
correct = tf.nn.in_top_k(logits, y, 1)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32), name="accuracy")

In [60]:
learning_rate = 0.01

output_layer_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="new_logits")
optimizer = tf.train.AdamOptimizer(learning_rate, name="Adam2")
training_op = optimizer.minimize(loss, var_list=output_layer_vars)

init = tf.global_variables_initializer()
four_frozen_saver = tf.train.Saver()


In [61]:
n_epochs = 1000
batch_size = 20

max_checks_without_progress = 20
checks_without_progress = 0
best_loss = np.infty

with tf.Session() as sess:
    init.run()
    restore_saver.restore(sess, "./my_best_mnist_model_5_to_9")
        
    for epoch in range(n_epochs):
        rnd_idx = np.random.permutation(len(X_train2))
        for rnd_indices in np.array_split(rnd_idx, len(X_train2) // batch_size):
            X_batch, y_batch = X_train2[rnd_indices], y_train2[rnd_indices]
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        loss_val, acc_val = sess.run([loss, accuracy], feed_dict={X: X_valid2, y: y_valid2})
        if loss_val < best_loss:
            save_path = four_frozen_saver.save(sess, "./my_mnist_model_0_to_4_four_frozen")
            best_loss = loss_val
            checks_without_progress = 0
        else:
            checks_without_progress += 1
            if checks_without_progress > max_checks_without_progress:
                print("Early stopping!")
                break
        print("{}\tValidation loss: {:.6f}\tBest loss: {:.6f}\tAccuracy: {:.2f}%".format(
            epoch, loss_val, best_loss, acc_val * 100))

with tf.Session() as sess:
    four_frozen_saver.restore(sess, "./my_mnist_model_0_to_4_four_frozen")
    acc_test = accuracy.eval(feed_dict={X: X_test2, y: y_test2})
    print("Final test accuracy: {:.2f}%".format(acc_test * 100))

INFO:tensorflow:Restoring parameters from ./my_best_mnist_model_5_to_9
0	Validation loss: 0.974290	Best loss: 0.974290	Accuracy: 56.67%
1	Validation loss: 0.885347	Best loss: 0.885347	Accuracy: 60.67%
2	Validation loss: 0.920351	Best loss: 0.885347	Accuracy: 62.00%
3	Validation loss: 0.882951	Best loss: 0.882951	Accuracy: 66.67%
4	Validation loss: 0.858178	Best loss: 0.858178	Accuracy: 64.00%
5	Validation loss: 0.779761	Best loss: 0.779761	Accuracy: 66.67%
6	Validation loss: 0.821571	Best loss: 0.779761	Accuracy: 66.67%
7	Validation loss: 0.810910	Best loss: 0.779761	Accuracy: 68.67%
8	Validation loss: 0.765839	Best loss: 0.765839	Accuracy: 69.33%
9	Validation loss: 0.789241	Best loss: 0.765839	Accuracy: 68.00%
10	Validation loss: 0.763210	Best loss: 0.763210	Accuracy: 71.33%
11	Validation loss: 0.788085	Best loss: 0.763210	Accuracy: 66.00%
12	Validation loss: 0.760302	Best loss: 0.760302	Accuracy: 71.33%
13	Validation loss: 0.730968	Best loss: 0.730968	Accuracy: 73.33%
14	Validation l

<span style='color:blue'>**Reusing just 4 of the hidden layers instead of all 5 seems boosts our accuracy from a 75.85% to an 83.19%**</span>

## e: unfreeze the top two hidden layers and continue training

In [64]:
learning_rate = 0.01

unfrozen_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="hidden[34]|new_logits")
optimizer = tf.train.AdamOptimizer(learning_rate, name="Adam3")
training_op = optimizer.minimize(loss, var_list=unfrozen_vars)

init = tf.global_variables_initializer()
two_frozen_saver = tf.train.Saver()

In [65]:
n_epochs = 1000
batch_size = 20

max_checks_without_progress = 20
checks_without_progress = 0
best_loss = np.infty

with tf.Session() as sess:
    init.run()
    four_frozen_saver.restore(sess, "./my_mnist_model_0_to_4_four_frozen")
        
    for epoch in range(n_epochs):
        rnd_idx = np.random.permutation(len(X_train2))
        for rnd_indices in np.array_split(rnd_idx, len(X_train2) // batch_size):
            X_batch, y_batch = X_train2[rnd_indices], y_train2[rnd_indices]
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        loss_val, acc_val = sess.run([loss, accuracy], feed_dict={X: X_valid2, y: y_valid2})
        if loss_val < best_loss:
            save_path = two_frozen_saver.save(sess, "./my_mnist_model_0_to_4_two_frozen")
            best_loss = loss_val
            checks_without_progress = 0
        else:
            checks_without_progress += 1
            if checks_without_progress > max_checks_without_progress:
                print("Early stopping!")
                break
        print("{}\tValidation loss: {:.6f}\tBest loss: {:.6f}\tAccuracy: {:.2f}%".format(
            epoch, loss_val, best_loss, acc_val * 100))

with tf.Session() as sess:
    two_frozen_saver.restore(sess, "./my_mnist_model_0_to_4_two_frozen")
    acc_test = accuracy.eval(feed_dict={X: X_test2, y: y_test2})
    print("Final test accuracy: {:.2f}%".format(acc_test * 100))

INFO:tensorflow:Restoring parameters from ./my_mnist_model_0_to_4_four_frozen
0	Validation loss: 0.792504	Best loss: 0.792504	Accuracy: 79.33%
1	Validation loss: 0.696911	Best loss: 0.696911	Accuracy: 80.67%
2	Validation loss: 0.833696	Best loss: 0.696911	Accuracy: 78.67%
3	Validation loss: 0.578301	Best loss: 0.578301	Accuracy: 85.33%
4	Validation loss: 1.104046	Best loss: 0.578301	Accuracy: 80.67%
5	Validation loss: 0.829661	Best loss: 0.578301	Accuracy: 84.00%
6	Validation loss: 0.693413	Best loss: 0.578301	Accuracy: 85.33%
7	Validation loss: 1.034690	Best loss: 0.578301	Accuracy: 83.33%
8	Validation loss: 0.607756	Best loss: 0.578301	Accuracy: 89.33%
9	Validation loss: 0.723333	Best loss: 0.578301	Accuracy: 83.33%
10	Validation loss: 0.707802	Best loss: 0.578301	Accuracy: 87.33%
11	Validation loss: 0.642105	Best loss: 0.578301	Accuracy: 86.67%
12	Validation loss: 0.923372	Best loss: 0.578301	Accuracy: 83.33%
13	Validation loss: 1.050571	Best loss: 0.578301	Accuracy: 80.00%
14	Valid

<span style='color:blue'>**unfreezing just the top 2 layers has an even more positive effect on accuracy.**</span>

In [67]:
learning_rate = 0.01

optimizer = tf.train.AdamOptimizer(learning_rate, name="Adam4")
training_op = optimizer.minimize(loss)

init = tf.global_variables_initializer()
no_frozen_saver = tf.train.Saver()

In [68]:
n_epochs = 1000
batch_size = 20

max_checks_without_progress = 20
checks_without_progress = 0
best_loss = np.infty

with tf.Session() as sess:
    init.run()
    two_frozen_saver.restore(sess, "./my_mnist_model_0_to_4_two_frozen")
        
    for epoch in range(n_epochs):
        rnd_idx = np.random.permutation(len(X_train2))
        for rnd_indices in np.array_split(rnd_idx, len(X_train2) // batch_size):
            X_batch, y_batch = X_train2[rnd_indices], y_train2[rnd_indices]
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        loss_val, acc_val = sess.run([loss, accuracy], feed_dict={X: X_valid2, y: y_valid2})
        if loss_val < best_loss:
            save_path = no_frozen_saver.save(sess, "./my_mnist_model_0_to_4_no_frozen")
            best_loss = loss_val
            checks_without_progress = 0
        else:
            checks_without_progress += 1
            if checks_without_progress > max_checks_without_progress:
                print("Early stopping!")
                break
        print("{}\tValidation loss: {:.6f}\tBest loss: {:.6f}\tAccuracy: {:.2f}%".format(
            epoch, loss_val, best_loss, acc_val * 100))

with tf.Session() as sess:
    no_frozen_saver.restore(sess, "./my_mnist_model_0_to_4_no_frozen")
    acc_test = accuracy.eval(feed_dict={X: X_test2, y: y_test2})
    print("Final test accuracy: {:.2f}%".format(acc_test * 100))

INFO:tensorflow:Restoring parameters from ./my_mnist_model_0_to_4_two_frozen
0	Validation loss: 0.724321	Best loss: 0.724321	Accuracy: 86.00%
1	Validation loss: 0.307258	Best loss: 0.307258	Accuracy: 92.67%
2	Validation loss: 0.637997	Best loss: 0.307258	Accuracy: 87.33%
3	Validation loss: 0.503302	Best loss: 0.307258	Accuracy: 94.00%
4	Validation loss: 0.593974	Best loss: 0.307258	Accuracy: 93.33%
5	Validation loss: 3.333824	Best loss: 0.307258	Accuracy: 83.33%
6	Validation loss: 0.406760	Best loss: 0.307258	Accuracy: 94.00%
7	Validation loss: 0.892480	Best loss: 0.307258	Accuracy: 92.00%
8	Validation loss: 0.404881	Best loss: 0.307258	Accuracy: 93.33%
9	Validation loss: 0.784829	Best loss: 0.307258	Accuracy: 90.00%
10	Validation loss: 0.508487	Best loss: 0.307258	Accuracy: 96.00%
11	Validation loss: 0.432359	Best loss: 0.307258	Accuracy: 94.00%
12	Validation loss: 1.002157	Best loss: 0.307258	Accuracy: 88.67%
13	Validation loss: 0.551694	Best loss: 0.307258	Accuracy: 93.33%
14	Valida

In [69]:
dnn_clf_0_to_4 = DNNClassifier(n_hidden_layers=4, random_state=42)
dnn_clf_0_to_4.fit(X_train2, y_train2, n_epochs=1000, X_valid=X_valid2, y_valid=y_valid2)

0	Validation loss: 0.514959	Best loss: 0.514959	Accuracy: 87.33%
1	Validation loss: 0.353730	Best loss: 0.353730	Accuracy: 88.67%
2	Validation loss: 0.456072	Best loss: 0.353730	Accuracy: 90.67%
3	Validation loss: 0.551842	Best loss: 0.353730	Accuracy: 88.67%
4	Validation loss: 0.378907	Best loss: 0.353730	Accuracy: 90.67%
5	Validation loss: 0.812283	Best loss: 0.353730	Accuracy: 89.33%
6	Validation loss: 0.358761	Best loss: 0.353730	Accuracy: 90.00%
7	Validation loss: 1.654652	Best loss: 0.353730	Accuracy: 90.67%
8	Validation loss: 0.926710	Best loss: 0.353730	Accuracy: 93.33%
9	Validation loss: 0.661424	Best loss: 0.353730	Accuracy: 90.67%
10	Validation loss: 0.724121	Best loss: 0.353730	Accuracy: 92.67%
11	Validation loss: 0.662871	Best loss: 0.353730	Accuracy: 92.67%
12	Validation loss: 0.891136	Best loss: 0.353730	Accuracy: 91.33%
13	Validation loss: 2.142180	Best loss: 0.353730	Accuracy: 89.33%
14	Validation loss: 2.092248	Best loss: 0.353730	Accuracy: 82.00%
15	Validation loss: 

DNNClassifier(activation=<function elu at 0x7ff44f822620>,
       batch_norm_momentum=None, batch_size=20, dropout_rate=None,
       initializer=<tensorflow.python.ops.init_ops.VarianceScaling object at 0x7ff3d432ec88>,
       learning_rate=0.01, n_hidden_layers=4, n_neurons=100,
       optimizer_class=<class 'tensorflow.python.training.adam.AdamOptimizer'>,
       random_state=42)

In [70]:
y_pred = dnn_clf_0_to_4.predict(X_test2)
accuracy_score(y_test2, y_pred)

0.9404553415061296

<span style='color:blue'>**Our final accuracy has now increased significantly. And from the significance increase in accuracy caused by the unfreezing of the first few layers, we can conclude that the first few layers have little to no difference when trained on 5-9 or 0-4.**</span>

<span style='color:red'>**NOTE THAT IN THE INTEREST OF SAVING TIME AND MY RATHER LIMITED COMPUTATIONAL RESOURCES , I REDUCED THE DATASET SIZE AND ALSO REDUCED THE NUMBER OF DIFFERENT VALUES FOR EACH PARAMETER AT EVERY STEP OF THE RANDOMIZED SEARCH. HOWEVER, I CONSISTENTLY USED THE SAME PARAMETERS ACROSS THE DIFFERENT RANDOMIZED SEARCHES TO MAKE THE ACCURACY SCORE COMPARABLE. THIS ALSO ENABLES ME TO ACTUALLY QUANTIFY AND WITNESS THE EFFECT OF EACH ADDITIONAL REGULARIZATION OR NORMALIZATION TECHNIQUE EVEN WITH LESS PARAMETERS. THIS COULD POTENTIALLY HAVE PREVENTED ME FROM COMING ACROSS A RATHER USEFUL PARAMETER(COMBINATION OF PARAMTERS) THAT COULD PERHAPS SIGNIFICANTLY IMPROVE ACCURACY! PLEASE KEEP THIS MIND WHILE COMPARING THE ABOVE SCORES TO OTHER SIMILAR MODELS.**</span>