# ℓ1 and ℓ2 Regularization

Just as for simple linear models, you can use ℓ1 and ℓ2 regularization
to constrain a neural network’s connection weights.

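One way to do this using TensorFlow is to simply add the appropriate regularization
terms to your cost function. For example, assuming you have just one hidden layer
with weights weights1 and one output layer with weights weights2, the ℓ1 term is
scale * (tf.reduce_sum(tf.abs(weights1)) + tf.reduce_sum(tf.abs(weights2))), which you
add to the base loss. When there are many layers, it is more convenient to pass a
regularizer to each layer and let TensorFlow collect the loss terms, like this: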

In [0]:
from functools import partial

import tensorflow as tf

# Start from an empty default graph before building this model.
tf.reset_default_graph()

# Layer sizes of the network built below.
n_inputs = 28 * 28  # MNIST
n_hidden1 = 300
n_hidden2 = 50
n_outputs = 10

# Graph inputs: a float feature matrix with n_inputs columns and an int32
# tensor `y`, both with an unspecified batch dimension.
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int32, shape=(None), name="y")

#
# Implementation of regularization
#
scale = 0.001  # scale factor handed to the l1 regularizer

# Dense-layer factory with the ReLU activation and an l1 kernel regularizer
# pre-bound. `partial` comes from functools (it was previously used without
# being imported); individual calls can still override these keyword arguments.
my_dense_layer = partial(
    tf.layers.dense, activation=tf.nn.relu,
    kernel_regularizer=tf.contrib.layers.l1_regularizer(scale))

# Network: two hidden layers and one output layer, all created through
# my_dense_layer; the output layer overrides the activation with None.
with tf.name_scope("dnn"):
    hidden1 = my_dense_layer(inputs=X, units=n_hidden1, name="hidden1")
    hidden2 = my_dense_layer(inputs=hidden1, units=n_hidden2, name="hidden2")
    logits = my_dense_layer(inputs=hidden2, units=n_outputs,
                            activation=None, name="outputs")

# Total loss = mean cross-entropy plus every term found in the
# REGULARIZATION_LOSSES graph collection.
with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=y, logits=logits)
    base_loss = tf.reduce_mean(xentropy, name="avg_xentropy")
    reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
    loss = tf.add_n([base_loss, *reg_losses], name="loss")


#
# Rest is normal
#
with tf.name_scope("eval"):
    # A sample counts as correct when its label is the top-1 logit.
    correct = tf.nn.in_top_k(predictions=logits, targets=y, k=1)
    accuracy = tf.reduce_mean(tf.cast(correct, dtype=tf.float32),
                              name="accuracy")

learning_rate = 0.01

with tf.name_scope("train"):
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
    # Minimizes the combined loss (cross-entropy + regularization terms).
    training_op = optimizer.minimize(loss)

init = tf.global_variables_initializer()
saver = tf.train.Saver()

In [0]:

# Train the model on mini-batches, print the validation accuracy after each
# epoch, and save a checkpoint at the end.
# NOTE(review): shuffle_batch, X_train, y_train, X_valid and y_valid are not
# defined in this section; they are expected to come from elsewhere.
n_epochs = 20  # was misspelled `n_epochsn_epochs`, leaving n_epochs undefined
batch_size = 200

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        for X_batch, y_batch in shuffle_batch(X_train, y_train, batch_size):
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        accuracy_val = accuracy.eval(feed_dict={X: X_valid, y: y_valid})
        print(epoch, "Validation accuracy:", accuracy_val)

    save_path = saver.save(sess, "./my_model_final.ckpt")

This code creates a neural network with two hidden layers and one output layer, and
it also creates nodes in the graph to compute the ℓ1 regularization loss corresponding
to each layer’s weights. TensorFlow automatically adds these nodes to a special collection
containing all the regularization losses. You just need to add these regularization
losses to your overall loss, like this:
            
            reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
            loss = tf.add_n([base_loss] + reg_losses, name="loss")
            
  
  
 # Dropout 
 
 It is a fairly simple algorithm: at every training step, every neuron (including the
input neurons but excluding the output neurons) has a **probability p** of being temporarily
“dropped out,” meaning it will be entirely ignored during this training step,
but it may be active during the next step. The hyperparameter p is
called the dropout rate, and it is typically set to 50%. After training, neurons don’t get
dropped anymore.

<img src="https://miro.medium.com/max/644/1*dEi_IkVB7IpkzZ-6H0Vpsg.png">


Neurons trained with dropout cannot co-adapt with their neighboring neurons; they have to be as useful as possible on their own. They also cannot rely excessively on just a few input neurons; they must pay attention to each of their input neurons. They end up being less sensitive to slight
changes in the inputs. In the end you get a more robust network that generalizes better.

There is one small but important technical detail. Suppose p = 50%, in which case during
testing a neuron will be connected to twice as many input neurons as it was (on
average) during training. To compensate for this fact, we need to multiply each neuron’s
input connection weights by 0.5 after training. If we don’t, each neuron will get a
total input signal roughly twice as large as what the network was trained on, and it is
unlikely to perform well. More generally, we need to multiply each input connection
weight by the keep probability (1 – p) after training. Alternatively, we can divide each
neuron’s output by the keep probability during training (these alternatives are not
perfectly equivalent, but they work equally well).

The following code implements dropout regularization:

    

In [0]:
# Build the dropout model in a fresh default graph, like the other model
# cells do, so its nodes and variables are not mixed with a previously built
# model (tf.train.Saver() below would otherwise pick up those variables too).
# n_inputs, n_hidden1, n_hidden2, n_outputs and learning_rate are plain Python
# values defined earlier and are unaffected by the reset.
tf.reset_default_graph()

X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int32, shape=(None), name="y")

# Defaults to False; the dropout layers below receive it as their `training`
# argument, so True has to be fed explicitly during training steps.
training = tf.placeholder_with_default(False, shape=(), name='training')

dropout_rate = 0.5  # == 1 - keep_prob
X_drop = tf.layers.dropout(X, dropout_rate, training=training)

# Dropout is applied to the inputs and to the output of each hidden layer,
# but not to the logits.
with tf.name_scope("dnn"):
    hidden1 = tf.layers.dense(X_drop, n_hidden1, activation=tf.nn.relu)
    hidden1_drop = tf.layers.dropout(hidden1, dropout_rate, training=training)
    hidden2 = tf.layers.dense(hidden1_drop, n_hidden2, activation=tf.nn.relu)
    hidden2_drop = tf.layers.dropout(hidden2, dropout_rate, training=training)
    logits = tf.layers.dense(hidden2_drop, n_outputs)


with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    loss = tf.reduce_mean(xentropy, name="loss")

with tf.name_scope("train"):
    optimizer = tf.train.MomentumOptimizer(learning_rate, momentum=0.9)
    training_op = optimizer.minimize(loss)

with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

init = tf.global_variables_initializer()
saver = tf.train.Saver()


# Mini-batch training of the dropout model with a validation report after
# every epoch and a final checkpoint.
n_epochs = 20
batch_size = 50

with tf.Session() as sess:
    sess.run(init)
    for epoch in range(n_epochs):
        for X_batch, y_batch in shuffle_batch(X_train, y_train, batch_size):
            # `training` is fed as True only for the training step.
            sess.run(
                training_op,
                feed_dict={X: X_batch, y: y_batch, training: True},
            )
        # `training` is not fed here, so the placeholder keeps its default.
        accuracy_val = sess.run(accuracy,
                                feed_dict={X: X_valid, y: y_valid})
        print(epoch, "Validation accuracy:", accuracy_val)

    save_path = saver.save(sess, "./my_model_final.ckpt")

     
*You want to use a dropout() function that takes a training flag, such as
tf.layers.dropout() used above (or the one in tensorflow.contrib.layers), not the
one in tensorflow.nn. The former turns off (no-op) when not training, which is
what you want, while the latter does not.*

Of course, just like you did earlier for Batch Normalization, you need to set training
(is_training in tensorflow.contrib.layers) to True when training, and to False when testing.

If you observe that the model is overfitting, you can increase the dropout rate (i.e.,
reduce the keep_prob hyperparameter). Conversely, you should try decreasing the
dropout rate (i.e., increasing keep_prob) if the model underfits the training set. It can
also help to increase the dropout rate for large layers, and reduce it for small ones.

Dropout does tend to significantly slow down convergence, but it usually results in a
much better model when tuned properly. So, it is generally well worth the extra time
and effort.

# Max-Norm Regularization

Another regularization technique that is quite popular for neural networks is called
max-norm regularization: for each neuron, it constrains the weights **w** of the incoming
connections such that $\lVert \mathbf{w} \rVert_2 \le r$, where $r$ is the max-norm hyperparameter and
$\lVert \cdot \rVert_2$ is the ℓ2 norm.

Reducing r increases the amount of regularization and helps reduce overfitting. Max-norm
regularization can also help alleviate the vanishing/exploding gradients problems
(if you are not using Batch Normalization).



In [25]:
import tensorflow as tf

# Start the max-norm example from an empty default graph.
tf.reset_default_graph()

# Layer sizes.
n_inputs = 28 * 28
n_hidden1, n_hidden2 = 300, 50
n_outputs = 10

# Hyperparameters handed to the momentum optimizer further down.
learning_rate, momentum = 0.01, 0.9

# Graph inputs with an unspecified batch dimension.
X = tf.placeholder(dtype=tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(dtype=tf.int32, shape=None, name="y")

def max_norm_regularizer(threshold, axes=1, name="max_norm",
                         collection="max_norm"):
    """Build a kernel "regularizer" that registers a max-norm clipping op.

    The returned callable contributes no term to the loss. When called on a
    weights variable it creates an op that assigns
    ``tf.clip_by_norm(weights, clip_norm=threshold, axes=axes)`` back to that
    variable and adds the op to the graph collection ``collection``. The
    clipping only happens when the collected ops are run by the caller.

    Args:
        threshold: Value passed as ``clip_norm`` to ``tf.clip_by_norm``.
        axes: Value passed as ``axes`` to ``tf.clip_by_norm``.
            NOTE(review): confirm this axis matches the "incoming weights of
            each neuron" for the kernel layout in use.
        name: Name given to the assignment op.
        collection: Name of the graph collection the op is added to.

    Returns:
        A function that takes a weights variable and returns None.
    """
    def max_norm(weights):
        clip_op = tf.assign(
            weights,
            tf.clip_by_norm(weights, clip_norm=threshold, axes=axes),
            name=name,
        )
        tf.add_to_collection(collection, clip_op)
        # Falls through and returns None: there is no regularization loss term.

    return max_norm
  
  
# Then you can call this function to get a max norm regularizer
# (with the threshold you want). When you create a hidden layer, you can pass this regularizer to the kernel_regularizer argument:  


# One regularizer instance shared by both hidden layers.
max_norm_reg = max_norm_regularizer(threshold=1.0)

# Only the two hidden layers receive the max-norm regularizer; the output
# layer is a plain dense layer without activation.
with tf.name_scope("dnn"):
    hidden1 = tf.layers.dense(
        inputs=X, units=n_hidden1, activation=tf.nn.relu,
        kernel_regularizer=max_norm_reg, name="hidden1")
    hidden2 = tf.layers.dense(
        inputs=hidden1, units=n_hidden2, activation=tf.nn.relu,
        kernel_regularizer=max_norm_reg, name="hidden2")
    logits = tf.layers.dense(inputs=hidden2, units=n_outputs, name="outputs")


# The loss is the mean cross-entropy only: the regularizer above returns
# None, so it adds nothing to the loss.
with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=y, logits=logits)
    loss = tf.reduce_mean(xentropy, name="loss")

with tf.name_scope("train"):
    optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate,
                                           momentum=momentum)
    training_op = optimizer.minimize(loss)

with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(predictions=logits, targets=y, k=1)
    accuracy = tf.reduce_mean(tf.cast(correct, dtype=tf.float32))

init = tf.global_variables_initializer()
saver = tf.train.Saver()


# Mini-batch training with weight clipping after every step, a validation
# report after every epoch, and a final checkpoint.
n_epochs = 20
batch_size = 50

# All ops stored in the "max_norm" graph collection.
clip_all_weights = tf.get_collection("max_norm")

with tf.Session() as sess:
    sess.run(init)
    for epoch in range(n_epochs):
        for X_batch, y_batch in shuffle_batch(X_train, y_train, batch_size):
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
            # Run the collected clipping ops right after the training step.
            sess.run(clip_all_weights)
        acc_valid = sess.run(accuracy, feed_dict={X: X_valid, y: y_valid})
        print(epoch, "Validation accuracy:", acc_valid)

    save_path = saver.save(sess, "./my_model_final.ckpt")

0 Validation accuracy: 0.9542
1 Validation accuracy: 0.968
2 Validation accuracy: 0.974
3 Validation accuracy: 0.9786
4 Validation accuracy: 0.9784
5 Validation accuracy: 0.977
6 Validation accuracy: 0.9784
7 Validation accuracy: 0.9804
8 Validation accuracy: 0.9818
9 Validation accuracy: 0.9822
10 Validation accuracy: 0.9834
11 Validation accuracy: 0.9834
12 Validation accuracy: 0.9828
13 Validation accuracy: 0.9832
14 Validation accuracy: 0.9844
15 Validation accuracy: 0.9834
16 Validation accuracy: 0.9848
17 Validation accuracy: 0.9846
18 Validation accuracy: 0.9846
19 Validation accuracy: 0.9838
