In [153]:
import numpy as np
import tensorflow as tf

In [154]:
## Generate Guassian Input
x = tf.random.normal((1, 100), mean = 0, stddev=1)
print(tf.reduce_mean(x).numpy(), tf.math.reduce_std(x).numpy())

-0.03994225 1.0157017


### Exploding Gradients

In [155]:
## Frwd Prop : Assuming 100 layer NN with no activations
## You would find it quickly goes to nan (explodes)
for i in range(100):
    w = tf.random.normal((100, 100))
    x = tf.matmul(x, w)
print(tf.reduce_mean(x), tf.math.reduce_std(x))

tf.Tensor(nan, shape=(), dtype=float32) tf.Tensor(nan, shape=(), dtype=float32)


In [156]:
## Checking at which layer, the mean goes to nan
x = tf.random.normal((1, 100), mean = 0, stddev=1)
for i in range(100):
    w = tf.random.normal((100, 100), mean = 0, stddev=1)
    x = tf.matmul(x, w)
    if tf.math.is_nan(tf.reduce_mean(x).numpy()):
        print(i)
        break

37


Observations
* The activation outputs exploded within 37 of our network’s layers. 
* If we had changed the w std dev to be higher -> higher chance of mean exploding -> implying high weights multipled togther -> quick exploding
* In other words, we have initialized our weights large

## Vanishing Gradients

In [157]:
## Scaling weights and checking the output
x = tf.random.normal((1, 100), mean = 0, stddev=1)
for i in range(100):
    w = tf.random.normal((100, 100), mean = 0, stddev=1)*0.05
    x = tf.matmul(x, w)
print(tf.reduce_mean(x).numpy(), tf.math.reduce_std(x).numpy())

1.0689624e-32 0.0


Observations
* If we scale the weights by lower number, the activations get vanished to zero
* **Summary** : If weights are initialized too large, the network won’t learn well. The same happens when weights are initialized too small.

In [158]:
# The std dev of layer activations is very close to the square root of the number of input connections
mean, stddev = 0, 0
num_iter = 1000
for i in range(num_iter):
    w = tf.random.normal((512, 512), mean = 0, stddev=1)
    x = tf.random.normal((1, 512), mean = 0, stddev=1)
    y = tf.matmul(x, w)
    mean += tf.reduce_mean(y).numpy()
    stddev += tf.math.reduce_std(y).numpy()
print(mean/num_iter, stddev/num_iter)
print(np.sqrt(512))

-0.00431403724104166 22.624842472076416
22.627416997969522


Observations
* Recall that, our w, x both initialized to N(0, 1). 

* Layer Activation Calculation:
    * $y[0] = w[:0]\ *\ x[:,0] $
    
    * This will also be N(0, 1)
    * If we sum all this up -> we get mean = 0, variance = num_layers
    
* Ideally we want each layer’s outputs to have a standard deviation of about 1 -> we wont suffer from explosion of gradients

* Scaling the weight matrix by $\sqrt{num\ layers}$ will output each activations stddev as 1 -> no explosion of gradients

In [159]:
# Scale the weight matrix by sqrt(layers)
mean, stddev = 0, 0
num_iter = 1000
for i in range(num_iter):
    w = tf.random.normal((512, 512), mean = 0, stddev=1)*np.sqrt(1/512)
    x = tf.random.normal((1, 512), mean = 0, stddev=1)
    y = tf.matmul(x, w)
    mean += tf.reduce_mean(y).numpy()
    stddev += tf.math.reduce_std(y).numpy()
print(mean/num_iter, stddev/num_iter)   

-0.0004035784816369414 0.9976319864988327


## With activations & Scaling weights approach
* Beware, we have excluded activations completely until now!
* We find including activations, is not helping our unstable grad problem (can vanish)
* The weights did not vanish (0)

In [160]:
def sigmoid(z):
    return 1/(1 + np.exp(-z))

mean, stddev = 0, 0
num_iter = 1000
for i in range(num_iter):
    w = tf.random.normal((512, 512))*np.sqrt(1/512)
    x = tf.random.normal((1, 512))#, mean = 0, stddev=1)
    # y = sigmoid(tf.matmul(x, w))
    y = np.tanh(tf.matmul(x, w))
    mean += tf.reduce_mean(y).numpy()
    stddev += tf.math.reduce_std(y).numpy()
print(mean/num_iter, stddev/num_iter)

0.001258103971136734 0.627060652911663


In [161]:
## Xavier Init
mean, stddev = 0, 0
num_iter = 100
for i in range(num_iter):
    w = tf.random.uniform((512, 512))*np.sqrt(1/512)
    x = tf.random.normal((1, 512))
    y = np.tanh(tf.matmul(x, w))
    mean += tf.reduce_mean(y).numpy()
    stddev += tf.math.reduce_std(y).numpy()
print(mean/num_iter, stddev/num_iter)
# This is causing the activation gradients to almost vanish

0.00859007661230862 0.23150178030133248


In [162]:
# Maintain same variance frwd & back prop : We get similar results as above
def xavier(m, h):
    return tf.random.uniform((m, h))*np.sqrt(6/(m + h))

mean, stddev = 0, 0
num_iter = 100
for i in range(num_iter):
    w = xavier(512, 512)
    x = tf.random.normal((1, 512))
    y = np.tanh(tf.matmul(x, w))
    mean += tf.reduce_mean(y).numpy()
    stddev += tf.math.reduce_std(y).numpy()
print(mean/num_iter, stddev/num_iter)

-0.09331645273603499 0.30846175507642326


In [163]:
# Maintain same variance frwd & back prop : We get similar results as above
mean, stddev = 0, 0
num_iter = 100
for i in range(num_iter):
    w = tf.random.uniform((512, 512))*np.sqrt(2/512)
    x = tf.random.normal((1, 512))
    y = tf.nn.relu(tf.matmul(x, w))
    mean += tf.reduce_mean(y).numpy()
    stddev += tf.math.reduce_std(y).numpy()
print(mean/num_iter, stddev/num_iter)

0.27805046624562235 0.1972163485432975
