In [1]:
# different weight initializers to mitigate the vanishing/exploding gradients problem
# P 432-433

# by default, Keras layers use Glorot initialization with a uniform distribution
# (whatever the activation function is)
# setting keras.layers.Dense(..., kernel_initializer="he_normal")
# switches to He initialization, the usual choice for ReLU and its variants



# he_avg_init = keras.initializers.VarianceScaling(scale=2., mode='fan_avg',distribution='uniform')
# keras.layers.Dense(10, activation="sigmoid",
# kernel_initializer=he_avg_init)

# this sets He initialization based on fanavg instead of fanin

In [2]:
# Nonsaturating Activation Functions
# ReLU does not saturate for positive values, but neurons may "die" and output only 0
# this happens when the weighted sum of a neuron's inputs is negative for every
# instance in the training set
# ReLU's gradient is zero in that case, so backpropagation can no longer update the neuron

# leaky ReLU P434
# parametric leaky ReLU P434
# randomized leaky ReLU P434 (outperform ReLU on large image datasets)

In [3]:
# Exponential linear unit (ELU)
# outperformed all the ReLU variants in its authors' experiments
# P 435-436
# the drawback is that it is slower to compute

# Scaled ELU ( SELU) P437
# how to use in keras P438

In [5]:
# Batch normalization
# 1.avoids standardscaler
# 2.avoids exploding gradients and vanishing gradients

# however, not useful in RNNs or other complex types of neural networks

import tensorflow as tf
from tensorflow import keras
import numpy as np
# Batch Normalization sits before each hidden activation: every hidden Dense
# layer is linear and bias-free, and is followed by BatchNormalization and then
# a separate ELU activation layer.
model = keras.models.Sequential()
model.add(keras.layers.Flatten(input_shape=[28, 28]))
model.add(keras.layers.BatchNormalization())
model.add(keras.layers.Dense(300, kernel_initializer="he_normal", use_bias=False))
model.add(keras.layers.BatchNormalization())
model.add(keras.layers.Activation("elu"))
model.add(keras.layers.Dense(100, kernel_initializer="he_normal", use_bias=False))
model.add(keras.layers.BatchNormalization())
model.add(keras.layers.Activation("elu"))
model.add(keras.layers.Dense(10, activation="softmax"))
# this codes adds BatchNormalization before activation functions
# P444
# better test if this works better on current dataset

In [28]:
# Gradient Clipping
# clipvalue=1.0 asks the optimizer to clip every component of the gradient
# vector to the range [-1.0, 1.0].
optimizer = keras.optimizers.SGD(clipvalue=1.0)
# NOTE(review): this cell relies on `model` from the Batch Normalization cell;
# it fails with a NameError if that cell has not been run in the current kernel.
model.compile(loss="mse", optimizer=optimizer)
# clipping by value can change the orientation of the gradient vector, so use
# it with care
# clipnorm=1.0 instead rescales the whole gradient when its norm exceeds the
# threshold, which does not change its orientation


NameError: name 'model' is not defined

In [6]:
# Transfer Learning with Keras
# Load Fashion MNIST and divide the pixel values by 255.
(X_train_full, y_train_full), (X_test, y_test) = keras.datasets.fashion_mnist.load_data()
X_train_full, X_test = X_train_full / 255.0, X_test / 255.0
# The first 5,000 training samples are held out for validation; the rest are
# used for training.
X_valid, y_valid = X_train_full[:5000], y_train_full[:5000]
X_train, y_train = X_train_full[5000:], y_train_full[5000:]

def split_dataset(X, y):
    """Split a labelled dataset into task A (8 classes) and task B (binary).

    Task B receives the samples labelled 5 or 6 (sandals or shirts); its
    target is 1.0 when the label is 6 (shirt) and 0.0 otherwise, as float32.
    Task A receives every other sample; its labels 7, 8 and 9 are shifted
    down to 5, 6 and 7 so the class indices stay contiguous.

    Returns:
        ((X_A, y_A), (X_B, y_B))
    """
    is_task_B = np.isin(y, (5, 6))
    is_task_A = np.logical_not(is_task_B)

    # Boolean indexing returns a copy, so the in-place shift leaves `y` intact.
    labels_A = y[is_task_A]
    labels_A[labels_A > 6] -= 2

    labels_B = (y[is_task_B] == 6).astype(np.float32)

    task_A = (X[is_task_A], labels_A)
    task_B = (X[is_task_B], labels_B)
    return task_A, task_B

# Split each of the train / validation / test sets into task A and task B.
(X_train_A, y_train_A), (X_train_B, y_train_B) = split_dataset(X_train, y_train)
(X_valid_A, y_valid_A), (X_valid_B, y_valid_B) = split_dataset(X_valid, y_valid)
(X_test_A, y_test_A), (X_test_B, y_test_B) = split_dataset(X_test, y_test)
# Keep only the first 200 task-B training samples.
X_train_B, y_train_B = X_train_B[:200], y_train_B[:200]

# X_train_A: all images of all items except for sandals and shirts (classes 5 and 6).
# X_train_B: a much smaller training set of just the first 200 images of sandals or shirts.

In [7]:
# Fix the NumPy and TensorFlow seeds before the layers of model A are created.
np.random.seed(42)
tf.random.set_seed(42)
# Model A: flattened 28x28 input, five SELU hidden layers, 8-unit softmax output.
model_A = keras.models.Sequential([keras.layers.Flatten(input_shape=[28, 28])])
for n_hidden in (300, 100, 50, 50, 50):
    model_A.add(keras.layers.Dense(units=n_hidden, activation="selu"))
model_A.add(keras.layers.Dense(units=8, activation="softmax"))


In [8]:
# Sparse categorical cross-entropy loss, SGD with learning rate 1e-3,
# accuracy reported as a metric.
model_A.compile(
    optimizer=keras.optimizers.SGD(learning_rate=1e-3),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

In [9]:
# NOTE(review): clear_session() resets Keras' global state; here it runs after
# model_A has already been built and compiled — TODO confirm this ordering is
# intended (it is normally called before creating models).
keras.backend.clear_session()

In [10]:

# Stop training once the monitored quantity (validation loss by default) has
# not improved for 5 consecutive epochs.
early_stopping_cb = keras.callbacks.EarlyStopping(patience=5)
# `callbacks` is documented as a list of callbacks, so the single callback is
# wrapped in a list. The result is stored as `history` (was misspelled).
history = model_A.fit(X_train_A, y_train_A, epochs=50,
                      validation_data=(X_valid_A, y_valid_A),
                      callbacks=[early_stopping_cb])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [11]:
# Persist model A to disk; a later cell reloads it from this same file.
model_A.save("my_model_A.h5")

In [12]:
# Baseline model B, trained from scratch on task B, to see how much
# model_B_on_A improves training speed and accuracy.
# Same hidden stack as model A, but with a single sigmoid output unit.
model_B = keras.models.Sequential([keras.layers.Flatten(input_shape=[28, 28])])
for n_hidden in (300, 100, 50, 50, 50):
    model_B.add(keras.layers.Dense(units=n_hidden, activation="selu"))
model_B.add(keras.layers.Dense(units=1, activation="sigmoid"))

In [13]:
# Binary cross-entropy loss for the single sigmoid output, SGD with learning
# rate 1e-3, accuracy reported as a metric.
model_B.compile(
    optimizer=keras.optimizers.SGD(learning_rate=1e-3),
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [14]:
# Stop training once the monitored quantity (validation loss by default) has
# not improved for 5 consecutive epochs.
early_stopping_cb = keras.callbacks.EarlyStopping(patience=5)
# `callbacks` is documented as a list of callbacks, so the single callback is
# wrapped in a list. The result is stored as `history` (was misspelled).
history = model_B.fit(X_train_B, y_train_B, epochs=50,
                      validation_data=(X_valid_B, y_valid_B),
                      callbacks=[early_stopping_cb])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [15]:
# Print the layers, output shapes and parameter counts of the baseline model B.
model_B.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten (Flatten)           (None, 784)               0         
                                                                 
 dense (Dense)               (None, 300)               235500    
                                                                 
 dense_1 (Dense)             (None, 100)               30100     
                                                                 
 dense_2 (Dense)             (None, 50)                5050      
                                                                 
 dense_3 (Dense)             (None, 50)                2550      
                                                                 
 dense_4 (Dense)             (None, 50)                2550      
                                                                 
 dense_5 (Dense)             (None, 1)                 5

In [16]:

# Reload model A from the file saved earlier; this rebinds `model_A` to the
# deserialized model, whose layers are then listed.
model_A = keras.models.load_model("my_model_A.h5")
model_A.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten_4 (Flatten)         (None, 784)               0         
                                                                 
 dense_8 (Dense)             (None, 300)               235500    
                                                                 
 dense_9 (Dense)             (None, 100)               30100     
                                                                 
 dense_10 (Dense)            (None, 50)                5050      
                                                                 
 dense_11 (Dense)            (None, 50)                2550      
                                                                 
 dense_12 (Dense)            (None, 50)                2550      
                                                                 
 dense_13 (Dense)            (None, 8)                

In [17]:
# Show the layer objects of model A (the last one is its output layer).
model_A.layers

[<keras.layers.reshaping.flatten.Flatten at 0x1ba95c893a0>,
 <keras.layers.core.dense.Dense at 0x1ba957d4a90>,
 <keras.layers.core.dense.Dense at 0x1ba95c89c40>,
 <keras.layers.core.dense.Dense at 0x1ba95c8e880>,
 <keras.layers.core.dense.Dense at 0x1ba95c76df0>,
 <keras.layers.core.dense.Dense at 0x1ba95c973d0>,
 <keras.layers.core.dense.Dense at 0x1ba95c8e700>]

In [18]:
# Reuse every layer of model A except the last one: its output layer has a
# different output shape (8 units) from what the binary task B needs.
# Building the new model directly from model_A.layers would make both models
# share the same layer objects, so training model_B_on_A would also change
# model_A. To avoid that, the layers are taken from a clone instead.
# clone_model() only copies the architecture, so the trained weights are copied
# over explicitly.
model_A_clone = keras.models.clone_model(model_A)
model_A_clone.set_weights(model_A.get_weights())
model_B_on_A = keras.models.Sequential(model_A_clone.layers[:-1])

In [19]:
# New output layer for the binary task B: a single sigmoid unit.
model_B_on_A.add(keras.layers.Dense(1, activation="sigmoid"))
# Independent copy of model A: clone_model() rebuilds the architecture, then
# set_weights() copies model A's current weights into it.
# NOTE(review): no later cell in this notebook reads model_A_clone.
model_A_clone = keras.models.clone_model(model_A)
model_A_clone.set_weights(model_A.get_weights())

In [20]:
# Print the layers of the transfer-learning model, including its new output layer.
model_B_on_A.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten_4 (Flatten)         (None, 784)               0         
                                                                 
 dense_8 (Dense)             (None, 300)               235500    
                                                                 
 dense_9 (Dense)             (None, 100)               30100     
                                                                 
 dense_10 (Dense)            (None, 50)                5050      
                                                                 
 dense_11 (Dense)            (None, 50)                2550      
                                                                 
 dense_12 (Dense)            (None, 50)                2550      
                                                                 
 dense_6 (Dense)             (None, 1)                

In [21]:
# Phase 1: freeze every layer except the last one, so that only the newly added
# output layer is trained during these first epochs.
for layer in model_B_on_A.layers[:-1]:
    layer.trainable = False
# The model is compiled after changing `trainable` so the change is taken into
# account.
model_B_on_A.compile(loss="binary_crossentropy",
                     optimizer="sgd", metrics=["accuracy"])
# Stored as `history` (was misspelled `histroy`), matching the fine-tuning cell.
history = model_B_on_A.fit(X_train_B, y_train_B, epochs=16,
                           validation_data=(X_valid_B, y_valid_B))

Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16


In [22]:
# Phase 2: unfreeze the reused layers and fine-tune the whole network.
for layer in model_B_on_A.layers[:-1]:
    layer.trainable = True
# Recompile after changing `trainable`; the learning rate is set explicitly
# to 1e-3 for this phase.
model_B_on_A.compile(
    optimizer=keras.optimizers.SGD(learning_rate=1e-3),
    loss="binary_crossentropy",
    metrics=["accuracy"],
)
history = model_B_on_A.fit(
    X_train_B, y_train_B,
    epochs=16,
    validation_data=(X_valid_B, y_valid_B),
)


Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16


In [23]:
# Evaluate the transfer-learning model on the task-B test set; the model was
# compiled with metrics=["accuracy"], so the result is [loss, accuracy].
check_B_on_A = model_B_on_A.evaluate(X_test_B, y_test_B)



In [24]:
# Evaluate the from-scratch baseline on the same task-B test set; the model was
# compiled with metrics=["accuracy"], so the result is [loss, accuracy].
check_B = model_B.evaluate(X_test_B, y_test_B)



In [25]:
# Difference in test accuracy between the transfer-learning model and the
# baseline (index 1 of each evaluate() result is the accuracy metric).
check_B_on_A[1] - check_B[1]
# here the transfer-learning model reaches a higher test accuracy with fewer
# epochs of training
# however, transfer learning does not always work well
# NOTE(review): it is generally reported to work poorly on small dense networks
# like this one and best on deep convolutional networks, so treat this gain
# with caution — confirm against the book section

0.01250004768371582

In [81]:
# Unsupervised Pretraining
# P451- 453

In [82]:
# faster optimizers
# P 454-
# 1.Momentum Optimization
#   adds a momentum vector, which can make it up to about 10 times faster than
#   plain Gradient Descent
#   it helps roll past local optima faster, and helps DNNs that don't use
#   Batch Normalization

# implementation
optimizer = keras.optimizers.SGD(learning_rate=0.001, momentum=0.9)
# just set the momentum hyperparameter (0.9 is a nice default)

In [26]:
# 2.Nesterov Accelerated Gradient
#   this is generally faster than vanilla momentum optimization

# implementation
optimizer = keras.optimizers.SGD(learning_rate=0.001, momentum=0.9,
                                 nesterov=True)


In [27]:
# 3.AdaGrad
#   this optimizer scales the gradient down along the steepest dimensions, so its
#   updates point more directly toward the global optimum instead of first going
#   down the steepest slope
#   although it starts out fast, its learning rate often decays so much that it
#   stops before reaching the global optimum
#   so avoiding AdaGrad when training deep neural networks is a wise choice

#   however, it can work well on Linear Regression or other simpler tasks
#   P458

In [29]:
# 4.RMSProp
#   RMSProp fixes AdaGrad's problem of slowing down too fast and never
#   converging to the global optimum
#   by accumulating only the gradients from the most recent iterations
#   it does so by using exponential decay in the first step
#   P459

# implementation
optimizer = keras.optimizers.RMSprop(learning_rate=0.001, rho=0.9)
# rho is the decay rate; rho = 0.9 works well most of the time


In [30]:
# 5.Adam (Adaptive moment estimation)
#   combines the ideas of momentum optimization and RMSProp
#   it keeps track of an exponentially decaying average of past gradients
#   (like momentum) and of past squared gradients (like RMSProp)
#   P459-460

# implementation
optimizer = keras.optimizers.Adam(learning_rate = 0.001, beta_1 =0.9, beta_2 = 0.999)
# the default learning_rate is usually good enough because Adam is an adaptive
# learning rate algorithm (it needs less tuning of learning_rate; leaving it at
# the default does most of the work)

In [31]:
# 6.AdaMax
#   more stable than Adam but depends on datasets, Adam is better most of the time
#   than AdaMax
#   P461

In [32]:
# 7.Nadam
#   Nadam is Adam + Nesterov trick, which often converges slightly faster than Adam

In [33]:
# a comparison of these optimizers
# P463

In [34]:
# Learning Rate Scheduling
# instead of using a constant learning rate (even a well-chosen one),
# changing it during training may be a better solution: e.g. starting with a
# low learning rate, increasing it and then dropping it again
# these strategies are called learning schedules
# 1. Power Scheduling
# P464

# implementation
optimizer = keras.optimizers.SGD(learning_rate=0.01, decay=1e-4)
# setting the decay implements power scheduling, controlling
# how fast the learning_rate decays
# decay is the inverse of s (the number of steps it takes to divide the learning
# rate by one more unit)
# NOTE(review): recent Keras releases no longer accept `decay` on the default
# optimizer classes — verify against the installed version

In [43]:
# 2.Exponential scheduling
# P465

# implementation
def exponential_decay_fn(epoch):
    """Return the learning rate for `epoch`: 0.01, divided by 10 every 20 epochs."""
    initial_lr = 0.01
    epochs_per_tenfold_drop = 20
    return initial_lr * 0.1 ** (epoch / epochs_per_tenfold_drop)
# If you do not want to hardcode η and s, you can create a function that returns a
# configured function:

def exponential_decay(lr0, s, decay_factor=0.1):
    """Build an exponential learning-rate schedule function.

    Args:
        lr0: Learning rate returned for epoch 0.
        s: Number of epochs over which the rate is multiplied by `decay_factor`.
        decay_factor: Multiplier applied every `s` epochs. Defaults to 0.1,
            i.e. the rate is divided by 10 every `s` epochs.

    Returns:
        A function mapping an epoch number to
        ``lr0 * decay_factor ** (epoch / s)``.
    """
    def exponential_decay_fn(epoch):
        return lr0 * decay_factor ** (epoch / s)
    return exponential_decay_fn
# Rebind exponential_decay_fn (defined above with hard-coded values) to the
# function returned by the exponential_decay factory.
exponential_decay_fn =exponential_decay(lr0=0.01, s=20)
# written this way, the schedule is evaluated as
# exponential_decay(xx, xx)(xx)
lr_scheduler = keras.callbacks.LearningRateScheduler(exponential_decay_fn)
#  history = model.fit(X_train_scaled, y_train, [...], callbacks=[lr_scheduler])
# learning_rate will be updated at the beginning of each epoch

# every call to fit() restarts counting epochs from 0, so when training is
# resumed the schedule would jump back to a large learning_rate and could hurt
# the weights; to avoid this, set fit(initial_epoch=xxx) manually

# e.g.
# def fn(x):
#     def gn(y):
#         return x+y+5
#     return gn
# fn(5)(5)


In [36]:
# 3.Piecewise constant scheduling
# P465

#implementation
def piecewise_constant_fn(epoch):
    """Return 0.01 for epochs below 5, 0.005 for epochs below 15, else 0.001."""
    # (exclusive upper bound, learning rate) pairs, checked in order.
    for upper_bound, rate in ((5, 0.01), (15, 0.005)):
        if epoch < upper_bound:
            return rate
    return 0.001
# and pass it to the callback
# to update the learning rate at each iteration rather than at each epoch,
# see Github examples In[84] !!!


In [37]:
# 4.Performance scheduling
# P465

# implementation
lr_scheduler = keras.callbacks.ReduceLROnPlateau(factor= 0.5, patience=5)
# when the monitored quantity (the validation loss by default) doesn't improve
# for 5 consecutive epochs, the learning_rate is multiplied by 0.5


In [None]:
# an alternative way to implement learning rate scheduling: pass a schedule
# object to the optimizer instead of using a callback
# example mirroring exponential_decay_fn
s = 20*len(X_train) //32 # number of steps in 20 epochs(batch_size = 32)
# positional arguments: initial learning rate, decay steps, decay rate
learning_rate = keras.optimizers.schedules.ExponentialDecay(0.01, s, 0.1)
optimizer = keras.optimizers.SGD(learning_rate)

# with this method, saved models have the learning rate and its schedule as well


In [38]:
# 5.1cycle scheduling
# P465

# Github In[96]

In [1]:
# avoiding overfitting Through Regularization
# typical ones:
# 1. Batch Normalization
# 2. early stopping

In [2]:
# 3. l1 and l2 Regularization
# use l2 regularization to constrain a neural network's connection weights
# use l1 regularization when a sparse model is wanted

# implementation
layer = keras.layers.Dense(100, activation="elu",
                           kernel_initializer="he_normal",
                           kernel_regularizer=keras.regularizers.l2(0.01))
# a regularizer is called at each step during training to compute the regularization
# loss, which is then added to the final loss

# keras.regularizers.l1(x)
# keras.regularizers.l1_l2(x,x)
# also work

# functools.partial pre-fills the arguments shared by every regularized layer,
# so only the differing arguments need to be passed for each layer.
from functools import partial
RegularizedDense = partial(keras.layers.Dense,
                           activation="elu",
                           kernel_initializer="he_normal",
                           kernel_regularizer=keras.regularizers.l2(0.01))
model = keras.models.Sequential([
    # Fixed: the module is `keras.layers`; `keras.layer` does not exist.
    keras.layers.Flatten(input_shape=[28,28]),
    RegularizedDense(300),
    RegularizedDense(100),
    # Keyword arguments passed here override the partial's defaults.
    RegularizedDense(10, activation="softmax",
                     kernel_initializer="glorot_uniform")
])

NameError: name 'keras' is not defined

In [3]:
# 4.Dropout
#   at every training step, every neuron has a probability p of being temporarily
#   dropped out, meaning it will be entirely ignored during this training step,
#   but it may be active during the next step.
# p is called the dropout rate; it has to be chosen explicitly via the `rate`
# argument (typically somewhere between 10% and 50%)


# implementation
model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=[28,28]),
    keras.layers.Dropout(rate=0.2),
    keras.layers.Dense(300, activation="elu", kernel_initializer="he_normal"),
    keras.layers.Dropout(rate=0.2),
    keras.layers.Dense(100, activation="elu", kernel_initializer="he_normal"),
    keras.layers.Dropout(rate=0.2),
    # Fixed: the keyword is `kernel_initializer` (was misspelled `kernal_initializer`).
    keras.layers.Dense(10, activation="softmax", kernel_initializer="glorot_uniform")
])

# increase the dropout rate when observing the model to be overfitting

NameError: name 'keras' is not defined

In [None]:
# 5.Monte Carlo (MC) Dropout

# implementation
# NOTE(review): X_test_scaled is not defined anywhere in this notebook —
# presumably the scaled test images (X_test is already divided by 255 in the
# data-loading cell); confirm which array is intended.
# this code makes 100 predictions over the test set; training=True ensures the
# Dropout layers are active, so every pass gives different predictions
y_probas = np.stack([model(X_test_scaled, training=True) for _ in range(100)])
# np.stack joins the passes along a new first axis,
# e.g. np.stack([[0,1,2],[3,4,5]]) = [[0,1,2],[3,4,5]] (shape (2, 3)),
# so averaging over axis=0 averages over the 100 passes
y_proba = y_probas.mean(axis=0)

# only the last expression of a cell is displayed, so the intermediate results
# are printed explicitly
# to check the predictions made when dropout is activated:
print(np.round(y_probas[:, :1], 2))
# take average
print(np.round(y_proba[:1], 2))
# standard deviation of the probability estimates
y_std = y_probas.std(axis=0)
print(np.round(y_std[:1], 2))
# finally check the accuracy: the predicted class is the one with the highest
# mean probability (assumes y_proba is (n_instances, n_classes))
y_pred = np.argmax(y_proba, axis=1)
accuracy = np.sum(y_pred == y_test)/len(y_test)
accuracy

In [None]:
# if special layers like BatchNormalization layers exist, calling the whole
# model with training=True is not appropriate;
# use this MCDropout class in place of Dropout instead
class MCDropout(keras.layers.Dropout):
    """Dropout layer that is always called in training mode (for MC Dropout)."""
    def call(self, inputs):
        # Always forward training=True to the parent Dropout.call().
        return super().call(inputs, training=True)

In [None]:
# 6.Max-Norm Regularization
# P477

# implementation
keras.layers.Dense(100, activation="elu", kernel_initializer="he_normal",
                   kernel_constraint = keras.constraints.max_norm(1.))
# after each training iteration, the object returned by max_norm is called with
# the layer's weights, and the rescaled weights it returns replace them
# !! when using convolutional layers, set the max_norm constraint's `axis`
# argument properly; this is usually axis=[0,1,2]

In [None]:
# A brief summary 
# P478