In [None]:
# deep learning success depends on the variance of the signal flowing into and out of each layer
# 
# the variance of a layer's outputs needs to be similar to the variance of its inputs
# use different initialization techniques depending on the 
# activation function

# sigmoid activation saturates at 0 and 1, causing vanishing gradients

# relu is often replaced with leaky relu
# the derivative of relu is 0 for negative inputs
# so gradient descent does nothing for those neurons and they "die"
# leaky relu gives relu a small slope in the negative direction

# also ELU activation is good
# SELU - scaled ELU; combined with LeCun initialization it works very well for dense sequential networks
# the network normalizes its activations automatically if the above conditions are satisfied

# generally selu > elu > leaky relu > relu > tanh > logistic

# using leaky relu

# Imported here because this is the first cell that uses keras; on a fresh
# kernel (Restart & Run All) the later import cell has not run yet.
from tensorflow import keras

# Leaky ReLU is added as its own layer right after a Dense layer that has no
# activation of its own; He initialization is the match for ReLU variants.
model = keras.models.Sequential([
    # [...] earlier layers go here
    keras.layers.Dense(10, kernel_initializer='he_normal'),
    keras.layers.LeakyReLU(alpha=.2),  # the class is LeakyReLU, not LeakyReLu
    # [...] remaining layers go here
])

In [1]:
import tensorflow as tf
from tensorflow import keras
# selu

# Dense layer configured for SELU: the activation is paired with the
# LeCun-normal kernel initializer.
layer = keras.layers.Dense(
    units=10,
    activation='selu',
    kernel_initializer='lecun_normal',
)

# in general
# none, tanh, logistic, softmax use Glorot initialization
# ReLu and variants use He initialization
# SELU uses LeCun initialization
# this helps avoid vanishing and exploding gradients at the beginning of training

# but no guarantee to stop it

In [2]:
# do batch normalization before or after activation function of each hidden layer

# it will zero center and normalize the inputs
# model will learn optimal scale and shift vectors for each hidden layer
# if you add a batch normalization layer as the first layer of the neural net, there is no need for a standard scaler anymore

# standardize input, rescale and shift

# Batch-normalized MLP for 28x28 inputs, assembled layer by layer.
# A BatchNormalization layer follows the Flatten layer and each hidden Dense layer.
model = keras.models.Sequential()
model.add(keras.layers.Flatten(input_shape=[28, 28]))
model.add(keras.layers.BatchNormalization())

# first hidden block: 300 ELU units with He-normal weights
model.add(keras.layers.Dense(300, activation='elu', kernel_initializer='he_normal'))
model.add(keras.layers.BatchNormalization())

# second hidden block: 100 ELU units with He-normal weights
model.add(keras.layers.Dense(100, activation='elu', kernel_initializer='he_normal'))
model.add(keras.layers.BatchNormalization())

# output layer: 10-way softmax
model.add(keras.layers.Dense(10, activation='softmax'))

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten (Flatten)           (None, 784)               0         
                                                                 
 batch_normalization (BatchN  (None, 784)              3136      
 ormalization)                                                   
                                                                 
 dense_1 (Dense)             (None, 300)               235500    
                                                                 
 batch_normalization_1 (Batc  (None, 300)              1200      
 hNormalization)                                                 
                                                                 
 dense_2 (Dense)             (None, 100)               30100     
                                                                 
 batch_normalization_2 (Batc  (None, 100)              4

In [None]:
# Gradient clipping reduces exploding gradients; it is mostly used for recurrent
# neural networks, because batch normalization is hard to use on RNNs.

# clipvalue=1.0 clips every component of the gradient to the range [-1.0, 1.0]
optimizer = keras.optimizers.SGD(clipvalue=1.0)
model.compile(optimizer=optimizer, loss='mse')

# Clipping component by component can change the direction of the gradient vector;
# to ensure the direction is preserved, set clipnorm instead of clipvalue.

In [None]:
# transfer learning means reusing the layers of a DNN that accomplishes a similar task
# freeze the weights of the reused layers and add new layers on top as necessary for your task

# pretend we still had our fashion mnist model

# Load the previously trained model A (expects 'my_model_A.h5' to exist on disk).
model_A = keras.models.load_model('my_model_A.h5')

# Clone A *before* reusing its layers. If B were built directly on
# model_A.layers, the two models would share those layers and training B
# would also modify A. clone_model does not carry over the trained weights,
# so they are copied explicitly.
model_A_clone = keras.models.clone_model(model_A)
model_A_clone.set_weights(model_A.get_weights())

# Build B on the clone's layers (everything except A's output layer) and add
# a new output layer for the new task.
model_B_on_A = keras.models.Sequential(model_A_clone.layers[:-1])
model_B_on_A.add(keras.layers.Dense(1, activation='sigmoid')) # want binary classifier for shirts and sandals

# to give the new layer reasonable time to learn its weights, freeze the reused layers
for layer in model_B_on_A.layers[:-1]:
    layer.trainable = False

# compile after changing `trainable` so the frozen state takes effect
model_B_on_A.compile(loss='binary_crossentropy', optimizer='sgd',
                    metrics=['accuracy'])
# transfer learning is best for very deep networks

In [None]:
# momentum based sgd

# NOTE: `learning_rate` is used instead of the deprecated `lr` alias, which
# newer Keras releases warn about or reject.
# Each assignment below overwrites `optimizer`; only the last one (Adam) remains.
optimizer = keras.optimizers.SGD(learning_rate=.001, momentum=.9)

# or faster than just momentum to use nesterov optimization

optimizer = keras.optimizers.SGD(learning_rate=.001, momentum=.9, nesterov=True)

# adagrad good for simple quadratic problems

# rmsprop is better version of adagrad for more complex problems like ann

optimizer = keras.optimizers.RMSprop(learning_rate=.001, rho=.9)

# adam optimizer is most common, combines momentum and rmsprop

optimizer = keras.optimizers.Adam(learning_rate=.001, beta_1=.9, beta_2=.999)
# nadam includes nesterov trick