<a href="https://colab.research.google.com/github/amifunny/Deep-Learning-Notebook/blob/master/Are_all_layers_created_equal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
""" 
  In this Notebook we check robustness of Deep Networks.

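  We probe how the test performance of a trained model changes when each layer, one at a time, is subjected to one of these two operations:

  --> Re-initialization (the layer's weights are reset to the values they had before training)
  --> Randomization (the layer's weights are replaced with freshly sampled random values)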

  Checkout Paper --> 'https://arxiv.org/pdf/1902.01996.pdf'

"""

In [0]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import tensorflow_datasets as tfds


In [0]:
#  Our OLD SIMPLE HANDY MNIST DATASET

# Request the train and test splits in one call, together with the dataset metadata.
raw_data, info = tfds.load(
    'mnist',
    split=['train', 'test'],
    with_info=True,
)

print(info)

In [0]:
#  The Usual Stuff
# raw_data holds the splits in the order they were requested: train first, then test.
train_data = raw_data[0]
test_data = raw_data[1]

def map_func(data):
  """Turn one dataset record into an (image, label) pair of float32 tensors.

  Args:
    data: mapping with an 'image' entry and a 'label' entry.

  Returns:
    Tuple (img, label). img is the image cast to float32, shifted and scaled
    by 127.5 (so 0 maps to -1 and 255 maps to 1) with all size-1 dimensions
    squeezed away; label is the label cast to float32.
  """
  pixels = tf.cast(data['image'], tf.float32)
  centered = (pixels - 127.5) / 127.5
  # No axis is given, so every dimension of size 1 is dropped.
  img = tf.squeeze(centered)

  label = tf.cast(data['label'], tf.float32)
  return img, label

# Training split: preprocess every record, shuffle with a 3000-element buffer,
# and keep the unbatched dataset around next to its batched view.
train_data = train_data.map(map_func).shuffle(3000)
train_batches = train_data.batch(256)

# The test split goes through exactly the same steps.
test_data = test_data.map(map_func).shuffle(3000)
test_batches = test_data.batch(256)

# Sanity check on a single batch: draw its first image and print both shapes.
for one_b in train_batches.take(1):
  images, labels = one_b
  show = plt.imshow(images[0, :, :], cmap='gray')
  plt.show()
  print(images.shape)
  print(labels.shape)

In [0]:
# Construct a simple FCN with 3 x 256 layer

def get_model(shape_tuple, hidden_units=256, num_hidden=3, num_classes=10):
  """Build a fully connected softmax classifier with the Keras functional API.

  The input is flattened, passed through `num_hidden` ReLU Dense layers of
  `hidden_units` units each, and finished with a `num_classes`-way softmax
  layer. With the default arguments this is the 3 x 256 network with a
  10-way output.

  Args:
    shape_tuple: shape of a single input example (without the batch axis).
    hidden_units: width of every hidden Dense layer.
    num_hidden: number of hidden Dense layers.
    num_classes: number of units in the softmax output layer.

  Returns:
    The constructed (uncompiled) tf.keras.Model. Its summary is printed as a
    side effect.
  """
  inputs = tf.keras.layers.Input(shape_tuple)

  out = tf.keras.layers.Flatten()(inputs)
  for _ in range(num_hidden):
    out = tf.keras.layers.Dense(hidden_units, activation='relu')(out)

  outputs = tf.keras.layers.Dense(num_classes, activation='softmax')(out)

  model = tf.keras.Model(inputs, outputs)
  # summary() prints the table itself and returns None; wrapping it in print()
  # would add a stray "None" line to the output.
  model.summary()
  return model

# Each example fed to this model is a 28 x 28 array.
model = get_model(shape_tuple=(28, 28))

In [0]:
def store_weights(model):
  """Snapshot the current weights of every layer that has something to train.

  A layer is recorded when it is flagged trainable and owns at least one
  trainable variable. The emptiness check uses truthiness, so an empty
  sequence of any type (list, tuple, ...) counts as "no variables".

  Args:
    model: object exposing `layers`; each layer must provide `trainable`,
      `trainable_variables` and `get_weights()`.

  Returns:
    Tuple (trainable_layers, initial_weights):
      trainable_layers: positions (indices into model.layers) of the recorded
        layers, in model order.
      initial_weights: dict mapping each of those positions to the value
        returned by that layer's get_weights() at call time.
  """
  initial_weights = {}
  trainable_layers = []

  for idx, layer in enumerate(model.layers):
    if layer.trainable and layer.trainable_variables:
      initial_weights[idx] = layer.get_weights()
      trainable_layers.append(idx)

  print( len( trainable_layers ) )
  print( initial_weights.keys() )

  return trainable_layers, initial_weights

# Taken before any training call, so these are the model's starting weights.
trainable_layers, initial_weights = store_weights(model=model)

In [0]:
#********************
# Notebook-level settings.
# NOTE(review): nothing in the visible cells reads these two names — confirm before removing.
batch_size, ctr = 256, 0
#********************

def training(train_data, model, epochs=6, l_rate=0.0001,
             batch_size=256, shuffle_buffer=10000, log_every=20):
  """Train `model` in place with a hand-written Adam training loop.

  The loss is SparseCategoricalCrossentropy, so labels are class indices and
  the model is expected to output one probability per class.

  Args:
    train_data: unbatched tf.data dataset; once batched, element [0] of a
      batch is fed to the model and element [1] is used as the labels.
    model: callable Keras model exposing `trainable_variables`.
    epochs: number of passes over `train_data`.
    l_rate: learning rate handed to the Adam optimizer.
    batch_size: number of examples per gradient step.
    shuffle_buffer: buffer size used when shuffling `train_data`.
    log_every: the loss is printed every `log_every` steps of an epoch.

  Returns:
    None. The model's variables are updated as a side effect.
  """
  optimizer = tf.keras.optimizers.Adam(l_rate)
  loss_fn = tf.keras.losses.SparseCategoricalCrossentropy()

  # Build the input pipeline once. A shuffled dataset is reshuffled every time
  # it is iterated (the default), so there is no need to stack an additional
  # shuffle stage onto the dataset at every epoch.
  train_batches = train_data.shuffle(shuffle_buffer).batch(batch_size)

  # NOTE(review): the GPU device is hard-coded; on a CPU-only host this relies
  # on TensorFlow falling back to an available device — confirm.
  with tf.device('/device:GPU:0'):

    for e in range(epochs):

      print("********* EPOCH :: {}".format(e))

      for step, batch in enumerate(train_batches):

        # Only the forward pass and the loss need to be recorded by the tape.
        with tf.GradientTape() as tape:
          pred = model(batch[0], training=True)
          cost = loss_fn(batch[1], pred)

        if step % log_every == 0:
          print("Loss is === >  {}".format(cost))

        # Gradients are computed after leaving the tape context, so the
        # backward pass itself is not recorded.
        grads = tape.gradient(cost, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))


In [0]:
# Seven passes over the MNIST training split; the learning rate keeps its default.
training(train_data, model, epochs=7)

In [0]:
def test_error(model, test_batches):
  """Compute and print the classification error of `model` on `test_batches`.

  Args:
    model: callable returning per-class scores; the predicted class is the
      argmax over the last axis.
    test_batches: iterable of batches where element [0] is the model input
      and element [1] holds the true class indices.

  Returns:
    Scalar tensor equal to 1 - accuracy accumulated over all batches.
  """
  # A freshly constructed metric has nothing accumulated yet, so no explicit
  # reset is required before the first update.
  acc = tf.keras.metrics.Accuracy()

  for batch in test_batches:
    pred = model(batch[0], training=False)
    pred_argmax = tf.argmax(pred, -1)
    acc.update_state(batch[1], pred_argmax)

  # Named `error` rather than `test_error` so the local does not shadow this
  # function's own name.
  error = 1.0 - acc.result()
  print("Error is ==> {}".format(error))

  return error

In [0]:
# Error of the trained model on the MNIST test batches, before any layer is perturbed.
test_error(model=model, test_batches=test_batches)

In [0]:
def check_robustness(layer, model, test_batches, initial_weights, robust_type):
  """Measure the test error after temporarily perturbing a single layer.

  The layer's current weights are saved, replaced according to `robust_type`,
  the model is evaluated with `test_error`, and the saved weights are put back
  afterwards (even if the evaluation raises).

  Args:
    layer: index of the layer, as accepted by `model.get_layer(index=...)`.
    model: Keras model whose layer is perturbed.
    test_batches: batches forwarded to `test_error`.
    initial_weights: dict mapping layer index to the weights to use for
      'initialize'.
    robust_type: 'randomize' replaces every weight array of the layer with a
      fresh GlorotUniform sample of the same shape; 'initialize' restores
      `initial_weights[layer]`.

  Returns:
    The error returned by `test_error`, converted with `.numpy()`.

  Raises:
    ValueError: if `robust_type` is neither 'randomize' nor 'initialize'.
  """
  # Reject unknown modes up front; otherwise the unperturbed error would be
  # reported as if it were a robustness result.
  if robust_type not in ('randomize', 'initialize'):
    raise ValueError(
        "robust_type must be 'randomize' or 'initialize', got {!r}".format(robust_type))

  target = model.get_layer(index=layer)

  # store the weights for again setting to model
  weight = target.get_weights()

  if robust_type == 'randomize':
    randomizer = tf.keras.initializers.GlorotUniform()
    # One fresh sample per weight array, whatever their number; converted to
    # NumPy arrays before being handed to set_weights.
    target.set_weights([np.asarray(randomizer(w.shape)) for w in weight])
  else:
    target.set_weights(initial_weights[layer])

  try:
    error = test_error(model, test_batches)
  finally:
    # again set the true weight, also when the evaluation failed
    target.set_weights(weight)

  return error.numpy()


In [0]:
# Run both perturbation modes over every recorded layer, one layer at a time.
for banner, mode in (
    ("Checking Randomization Robustness :-- ", 'randomize'),
    ("Checking Initialization Robustness :-- ", 'initialize'),
):
  print(banner)
  for i, layer_idx in enumerate(trainable_layers):
    robust_error = check_robustness(layer_idx, model, test_batches, initial_weights, mode)
    print("Layer {} :: --  {}".format(i+1, robust_error))

"""

  Observe the results: randomization completely destroys the performance of the network, which is expected.

  But re-initialization shows a much more interesting result.
  
  Layer 1 is what we call a "critical layer": re-initializing it increases the error significantly.

  For all other layers, re-initializing them one at a time has no considerable effect on test_error.
  These are called "ambient layers".

  These findings match the experiments done in the paper.
  We will find similar results for SOTA models like VGG and ResNet, i.e. some layers are "critical" and others are "ambient".

"""

In [0]:
#  Now Not so simple CIFAR-10 Dataset

# Same request as for MNIST: train and test splits plus the dataset metadata.
raw_data, info = tfds.load(
    'cifar10',
    split=['train', 'test'],
    with_info=True,
)
print(info)

#  AGAIN The Usual Stuff
cifar_train_data,cifar_test_data = raw_data[0],raw_data[1]

# map_func is deliberately not defined a second time here: the preprocessing
# function defined for MNIST earlier in this notebook does exactly the same
# work, so it is reused for CIFAR-10.

# Training split: preprocess every record, shuffle with a 3000-element buffer,
# and keep the unbatched dataset around next to its batched view.
cifar_train_data = cifar_train_data.map(map_func).shuffle(3000)
cifar_train_batches = cifar_train_data.batch(256)

# The test split goes through exactly the same steps.
cifar_test_data = cifar_test_data.map(map_func).shuffle(3000)
cifar_test_batches = cifar_test_data.batch(256)

# Sanity check on a single batch: draw its first image and print both shapes.
for one_b in cifar_train_batches.take(1):
  images, labels = one_b
  show = plt.imshow(images[0, :, :, :])
  plt.show()
  print(images.shape)
  print(labels.shape)

In [0]:
# Same architecture as before, now fed 32 x 32 x 3 inputs.
cifar_model = get_model(shape_tuple=(32, 32, 3))

In [0]:
# Rebinds the two names used in the MNIST section to the CIFAR model's layers and starting weights.
trainable_layers, initial_weights = store_weights(model=cifar_model)

In [0]:
# Twenty-five passes over the CIFAR-10 training split.
training(cifar_train_data, cifar_model, epochs=25, l_rate=0.0001)

In [0]:
# Error of the trained model on the CIFAR-10 test batches, before any layer is perturbed.
test_error(model=cifar_model, test_batches=cifar_test_batches)

In [0]:
# Run both perturbation modes over every recorded layer, one layer at a time.
for banner, mode in (
    ("Checking Randomization Robustness :-- ", 'randomize'),
    ("Checking Initialization Robustness :-- ", 'initialize'),
):
  print(banner)
  for i, layer_idx in enumerate(trainable_layers):
    robust_error = check_robustness(layer_idx, cifar_model, cifar_test_batches, initial_weights, mode)
    print("Layer {} :: --  {}".format(i+1, robust_error))


"""
  CIFAR-10 is far more complex than MNIST, and hence even with all trained weights left unchanged
  the model gives ~0.45 error on the test set.

  Again, our experiment matches the paper.
  
  The number of critical layers increases as the complexity of the task increases.

"""
