### Weight Initialization & The Vanishing Gradient Problem

+ gradient vanishing problem, 미분값 최대 0.25,
+ 이것을 $0.25^7 = 0.000061035 $


In [1]:
%load_ext tensorboard

In [13]:
import tensorflow as tf
from tensorflow.keras.datasets import mnist
import numpy as np
import matplotlib.pyplot as plt
import datetime

tf.keras.datasets.mnist

In [4]:
#1
(x_train, y_train), (x_test, y_test) = mnist.load_data()

#2:normalize images
x_train = x_train.astype('float32')
x_test  = x_test.astype('float32')
x_train /= 255.0 # [0, 1]
x_test  /= 255.0

#3: one-hot encoding
y_train = tf.keras.utils.to_categorical(y_train) # (60000, 10)
y_test = tf.keras.utils.to_categorical(y_test)   # (10000, 10)

In [5]:
#4: build a model
init = tf.keras.initializers.Constant(0.1)
##init = tf.keras.initializers.RandomUniform(-0.5, 0.5) # 'random_uniform’

In [6]:
# kernel_initializer='glorot_uniform',

model = tf.keras.Sequential()
model.add(tf.keras.layers.Flatten(input_shape=(28, 28)))
model.add(tf.keras.layers.Dense(units=5, activation='sigmoid', kernel_initializer=init))
model.add(tf.keras.layers.Dense(units=5, activation='sigmoid', kernel_initializer=init))
model.add(tf.keras.layers.Dense(units=5, activation='sigmoid', kernel_initializer=init))
model.add(tf.keras.layers.Dense(units=5, activation='sigmoid', kernel_initializer=init))
model.add(tf.keras.layers.Dense(units=5, activation='sigmoid', kernel_initializer=init))
model.add(tf.keras.layers.Dense(units=10,activation='softmax', kernel_initializer=init))
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten (Flatten)           (None, 784)               0         
                                                                 
 dense (Dense)               (None, 5)                 3925      
                                                                 
 dense_1 (Dense)             (None, 5)                 30        
                                                                 
 dense_2 (Dense)             (None, 5)                 30        
                                                                 
 dense_3 (Dense)             (None, 5)                 30        
                                                                 
 dense_4 (Dense)             (None, 5)                 30        
                                                                 
 dense_5 (Dense)             (None, 10)                6

In [7]:
opt = tf.keras.optimizers.RMSprop(learning_rate=0.01)
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])

In [8]:
log_dir = "./logs/3201/"
file_writer = tf.summary.create_file_writer(log_dir + "/gradient")
file_writer.set_as_default()

In [14]:
#6:  calculate averages and histograms of gradients in layers
class GradientCallback(tf.keras.callbacks.Callback):
  def __init__(self, freq=10):
  ##        super(GradientCallback, self).__init__()
      self.freq = freq

  def on_epoch_end(self, epoch, logs):
      if epoch%self.freq != 0:
          return

      with tf.GradientTape() as tape:
          y_pred = model(x_train)  # tensor, logits
          loss   = tf.keras.losses.binary_crossentropy(y_train, y_pred)
      grads = tape.gradient(loss, model.trainable_weights)

      for n in range(1, len(model.layers)):
          i2 = (n-1)*2 # weights
          i1 = i2 + 1  # biases

          bias_avg   = tf.reduce_mean(tf.abs(grads[i1]))
          weight_avg = tf.reduce_mean(tf.abs(grads[i2]))

          tf.summary.scalar("layer_%d/avg/bias"%n, data=bias_avg, step=epoch)
          tf.summary.scalar("layer_%d/avg/weight"%n, data=weight_avg, step=epoch)

          tf.summary.histogram("layer_%d/hist/bias"%n, data=grads[i1], step=epoch)
          tf.summary.histogram("layer_%d/hist/weight"%n, data=grads[i2], step=epoch)

          tf.print('epoch: {}, layer-{}, i1: {}, i2: {}'.format(epoch, n, i1, i2))

      feature_extractor = tf.keras.Model(
          inputs=model.inputs,
          outputs=[layer.output for layer in model.layers],
      )
      features = feature_extractor(x_train)
      for i, f in enumerate(features):
        print('type_i: {}, type_f: {}'.format(type(i), type(f)))
        output_avg = tf.reduce_mean(tf.abs(f))
        tf.summary.scalar("layer_%d/avg/output"%i, data=output_avg, step=epoch)
        tf.summary.histogram("layer_%d/hist/output"%i, data=f, step=epoch)

  def on_train_end(self, logs):
      tf.summary.flush()

callback1 = GradientCallback() # freq = 10
callback2 = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq= 10) #profile_batch=0

In [15]:
#7: train and evaluate the model
ret = model.fit(x_train, y_train, epochs=101, batch_size=200, validation_split=0.2,
                 verbose=1, callbacks=[callback1, callback2])

Epoch 1/101
epoch: 0, layer-2, i1: 3, i2: 2
epoch: 0, layer-3, i1: 5, i2: 4
epoch: 0, layer-4, i1: 7, i2: 6
epoch: 0, layer-5, i1: 9, i2: 8
epoch: 0, layer-6, i1: 11, i2: 10
type_i: <class 'int'>, type_f: <class 'tensorflow.python.framework.ops.EagerTensor'>
type_i: <class 'int'>, type_f: <class 'tensorflow.python.framework.ops.EagerTensor'>
type_i: <class 'int'>, type_f: <class 'tensorflow.python.framework.ops.EagerTensor'>
type_i: <class 'int'>, type_f: <class 'tensorflow.python.framework.ops.EagerTensor'>
type_i: <class 'int'>, type_f: <class 'tensorflow.python.framework.ops.EagerTensor'>
type_i: <class 'int'>, type_f: <class 'tensorflow.python.framework.ops.EagerTensor'>
type_i: <class 'int'>, type_f: <class 'tensorflow.python.framework.ops.EagerTensor'>
Epoch 2/101
Epoch 3/101
Epoch 4/101
Epoch 5/101
Epoch 6/101
Epoch 7/101
Epoch 8/101
Epoch 9/101
Epoch 10/101
Epoch 11/101
epoch: 10, layer-2, i1: 3, i2: 2
epoch: 10, layer-3, i1: 5, i2: 4
epoch: 10, layer-4, i1: 7, i2: 6
ep

In [18]:
%tensorboard --logdir ./logs/3201

Launching TensorBoard...