In [2]:
import numpy as np
import tensorflow as tf

### Batch Normalization

In [4]:
# Dense classifier for 28x28 inputs: Batch Normalization is applied to the
# flattened input and again after every hidden Dense layer.
model = tf.keras.Sequential([
  tf.keras.layers.Flatten(input_shape=[28, 28]),

  # The first BN layer works directly on the flattened input (input normalization)
  tf.keras.layers.BatchNormalization(),

  # Hidden blocks: a ReLU Dense layer (He-normal init) followed by its own BN layer,
  # created in the order 300 units -> BN -> 100 units -> BN
  *[
    block_layer
    for n_units in (300, 100)
    for block_layer in (
      tf.keras.layers.Dense(n_units, activation="relu", kernel_initializer="he_normal"),
      tf.keras.layers.BatchNormalization(),
    )
  ],

  # 10-way softmax output layer
  tf.keras.layers.Dense(10, activation="softmax")
])

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten_1 (Flatten)         (None, 784)               0         
                                                                 
 batch_normalization_3 (Bat  (None, 784)               3136      
 chNormalization)                                                
                                                                 
 dense_3 (Dense)             (None, 300)               235500    
                                                                 
 batch_normalization_4 (Bat  (None, 300)               1200      
 chNormalization)                                                
                                                                 
 dense_4 (Dense)             (None, 100)               30100     
                                                                 
 batch_normalization_5 (Bat  (None, 100)              

### Transfer Learning

For this example, we'll use Fashion MNIST. We'll assume that someone trained a model on 8 of the dataset's 10 classes (all except
T-shirt/top and pullover). We'll call this model A.

The problem you're trying to solve is classifying pullovers (positive) from T-shirts/tops (negative). You use model A as your base model for
transfer learning.

**Warning!**

This example is hella contrived. In reality, transfer learning does not work well on small NNs, as these networks learn patterns
that are very specific to their data and not very reusable. In practice, you'd want to use transfer learning only on DNNs that
are complex and deep.

In [5]:
# Load Fashion MNIST through Keras and unpack the (train, test) pairs.
fashion_mnist = tf.keras.datasets.fashion_mnist.load_data()
(X_train_full, y_train_full), (X_test, y_test) = fashion_mnist

# The last 5000 training examples become the validation set; the rest are kept for training.
X_valid, y_valid = X_train_full[-5000:], y_train_full[-5000:]
X_train, y_train = X_train_full[:-5000], y_train_full[:-5000]

# Divide the pixel arrays by 255 (labels are left untouched).
X_train = X_train / 255
X_valid = X_valid / 255
X_test = X_test / 255

In [6]:
# Training model A - the model we'll use as the base for the transfer learning. In reality, this model will be from a library of 
# pretrained models.

# A name's position in this list is used as its integer class id
# (see the .index() lookups below).
class_names = [
    "T-shirt/top", "Trouser", "Pullover", "Dress", "Coat",
    "Sandal", "Shirt", "Sneaker", "Bag", "Ankle boot",
]

# Task B is binary: "Pullover" is the positive class, "T-shirt/top" the negative one.
pos_class_id, neg_class_id = (
    class_names.index(label) for label in ("Pullover", "T-shirt/top")
)

def split_dataset(X, y, pos_id=None, neg_id=None):
    """Split a labelled dataset into the 8-class task A and the binary task B.

    Task B keeps only the samples labelled with the positive or the negative
    class; its labels become 1.0 for the positive class and 0.0 for the
    negative one (float32). Task A keeps every other sample and relabels the
    remaining class ids, taken in ascending order, to 0, 1, 2, ...

    Args:
        X: samples, indexable with a boolean mask along the first axis.
        y: class ids in range(10), one per sample (assumed to be a NumPy
            array, since it is compared element-wise and mask-indexed).
            It is not modified.
        pos_id: class id treated as positive for task B. Defaults to the
            notebook-level ``pos_class_id``.
        neg_id: class id treated as negative for task B. Defaults to the
            notebook-level ``neg_class_id``.

    Returns:
        ``((X_A, y_A), (X_B, y_B))``.
    """
    # Fall back to the notebook-level ids so existing two-argument calls keep working.
    if pos_id is None:
        pos_id = pos_class_id
    if neg_id is None:
        neg_id = neg_class_id

    y_for_B = (y == pos_id) | (y == neg_id)
    y_A = y[~y_for_B]  # boolean-mask indexing yields a copy, so y itself stays untouched
    y_B = (y[y_for_B] == pos_id).astype(np.float32)

    # Sort explicitly: the in-place relabeling below is only collision-free when
    # the old ids are visited in ascending order (each new id is then <= the old
    # id it replaces, so it can never match an id that still has to be remapped).
    # Iterating a set directly does not guarantee that order.
    old_class_ids = sorted(set(range(10)) - {neg_id, pos_id})
    for new_class_id, old_class_id in enumerate(old_class_ids):
        y_A[y_A == old_class_id] = new_class_id  # reorder class ids for A
    return ((X[~y_for_B], y_A), (X[y_for_B], y_B))

# Split the train / validation / test sets into their task-A and task-B parts.
(X_train_A, y_train_A), (X_train_B, y_train_B) = split_dataset(X_train, y_train)
(X_valid_A, y_valid_A), (X_valid_B, y_valid_B) = split_dataset(X_valid, y_valid)
(X_test_A, y_test_A), (X_test_B, y_test_B) = split_dataset(X_test, y_test)

# Task B only keeps its first 200 training examples.
X_train_B, y_train_B = X_train_B[:200], y_train_B[:200]

tf.random.set_seed(42)

# Model A: flatten -> three identical 100-unit ReLU layers (He-normal init) -> 8-way softmax.
model_A = tf.keras.Sequential(
    [tf.keras.layers.Flatten(input_shape=[28, 28])]
    + [
        tf.keras.layers.Dense(100, activation="relu", kernel_initializer="he_normal")
        for _ in range(3)
    ]
    + [tf.keras.layers.Dense(8, activation="softmax")]
)

model_A.compile(
    optimizer=tf.keras.optimizers.SGD(learning_rate=0.001),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)
history = model_A.fit(
    X_train_A,
    y_train_A,
    validation_data=(X_valid_A, y_valid_A),
    epochs=20,
)

# Persist the trained base model under "my_model_A".
model_A.save("my_model_A")



Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
INFO:tensorflow:Assets written to: my_model_A/assets


INFO:tensorflow:Assets written to: my_model_A/assets


In [8]:
# Model B: reuse model A's layers (minus its output layer) as a frozen base
# and put a new binary-classification output layer on top.

# clone_model() copies only the architecture, so the trained weights are copied
# over explicitly. Working on the clone avoids changing the actual model A.
model_A_clone = tf.keras.models.clone_model(model_A)
model_A_clone.set_weights(model_A.get_weights())

# Stack every cloned layer except A's output layer, freezing each one
# (making it non-trainable) as it is added.
model_B_on_A = tf.keras.Sequential()
for layer in model_A_clone.layers[:-1]:
  model_B_on_A.add(layer)
  layer.trainable = False

# New output layer for binary classification; it is the only trainable layer for now.
model_B_on_A.add(tf.keras.layers.Dense(1, activation="sigmoid"))

# Compile model B once the reused layers are frozen.
optimizer = tf.keras.optimizers.legacy.SGD(learning_rate=0.001)
model_B_on_A.compile(
    optimizer=optimizer,
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [9]:
# Warm-up phase: train for a few epochs while every layer except the output layer is frozen,
# so that the output layer's weights are no longer totally random before the lower layers are
# unfrozen. Otherwise the large gradients caused by the random output weights would wreck the
# already-tuned weights of the lower layers during fine-tuning.

history = model_B_on_A.fit(
    X_train_B,
    y_train_B,
    validation_data=(X_valid_B, y_valid_B),
    epochs=4,
)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [11]:
# Fine-tuning phase: the output layer's weights are no longer totally random,
# so the lower layers are unfrozen and training continues on all layers.

for layer in model_B_on_A.layers[:-1]:
    layer.trainable = True

# Compile again with a fresh optimizer after changing the `trainable` flags.
optimizer = tf.keras.optimizers.legacy.SGD(learning_rate=0.001)
model_B_on_A.compile(
    optimizer=optimizer,
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

history = model_B_on_A.fit(
    X_train_B,
    y_train_B,
    validation_data=(X_valid_B, y_valid_B),
    epochs=32,
)

Epoch 1/32


Epoch 2/32
Epoch 3/32
Epoch 4/32
Epoch 5/32
Epoch 6/32
Epoch 7/32
Epoch 8/32
Epoch 9/32
Epoch 10/32
Epoch 11/32
Epoch 12/32
Epoch 13/32
Epoch 14/32
Epoch 15/32
Epoch 16/32
Epoch 17/32
Epoch 18/32
Epoch 19/32
Epoch 20/32
Epoch 21/32
Epoch 22/32
Epoch 23/32
Epoch 24/32
Epoch 25/32
Epoch 26/32
Epoch 27/32
Epoch 28/32
Epoch 29/32
Epoch 30/32
Epoch 31/32
Epoch 32/32


In [14]:
# Loss and accuracy of the fine-tuned model B on the task-B test split
model_B_on_A.evaluate(x=X_test_B, y=y_test_B)



[0.1989995390176773, 0.9440000057220459]

## Learning Schedules

In [16]:
# Power scheduling: configured on the legacy SGD optimizer itself through its `decay` argument
optimizer = tf.keras.optimizers.legacy.SGD(
    learning_rate=0.01,
    decay=1e-4,
)


In [21]:
# Exponential scheduling
def exponential_decay(lr0, s):
  """Build an exponential learning-rate schedule.

  Args:
    lr0: learning rate returned for epoch 0.
    s: number of epochs over which the learning rate shrinks by a factor of 10.

  Returns:
    A function mapping an epoch number to ``lr0 * 0.1 ** (epoch / s)``.
  """
  def exponential_decay_fn(epoch):
    # Fraction of a full "divide by 10" step completed at this epoch
    decades_elapsed = epoch / s
    return lr0 * 0.1 ** decades_elapsed

  return exponential_decay_fn

# Schedule that starts at 0.01 and shrinks by a factor of 10 every 20 epochs
exponential_decay_fn = exponential_decay(lr0=0.01, s=20)

# The callback updates the optimizer's learning_rate attribute at the beginning of each epoch,
# using the value returned by the schedule function
lr_schedule = tf.keras.callbacks.LearningRateScheduler(exponential_decay_fn)

optimizer = tf.keras.optimizers.legacy.SGD()
model.compile(
    optimizer=optimizer,
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)
history = model.fit(
    X_train,
    y_train,
    callbacks=[lr_schedule],
    epochs=3,
)


Epoch 1/3
Epoch 2/3
Epoch 3/3
