<a href="https://colab.research.google.com/github/amifunny/Deep-Learning-Notebook/blob/master/Lottery_Ticket_Hypothesis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
"""
  This notebook closely follows the paper 'https://arxiv.org/pdf/1803.03635.pdf'
  (The Lottery Ticket Hypothesis). It is based on the principles of pruning.
  The key idea is that the sub-network which survives pruning
  is said to have won the initialization lottery,
  and hence can be retrained starting from its original initial weights.
"""

In [0]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt 

from tensorflow.keras import layers
import random

from tensorflow.python.framework import tensor_shape

In [0]:
import tensorflow_datasets as tfds

# Load CIFAR-10; `raw_data` is indexed below as [train, test] and `info`
# carries the dataset metadata that is printed for reference.
raw_data, info = tfds.load(
    'cifar10',
    split=['train', 'test'],
    with_info=True,
)

print(info)

In [0]:
# Mini-batch size used when batching the train and test datasets.
batch_size = 256

In [0]:
train_data,test_data = raw_data[0],raw_data[1]

def map_func(data):
  """Convert one raw example dict into an (image, label) pair.

  Args:
    data: mapping with an 'image' and a 'label' entry.

  Returns:
    Tuple (img, label): the image cast to float32 and rescaled with
    (x - 127.5) / 127.5 (so 0..255 inputs land in -1..1), with size-1
    dimensions squeezed away, and the label cast to float32.
  """

  img = tf.squeeze( ( tf.cast( data['image'] , tf.float32 )-127.5 )/127.5 )
  label = tf.cast( data['label'] , tf.float32 )

  return img,label

train_data = train_data.map( map_func )

train_data = train_data.shuffle(3000)
train_batches = train_data.batch( batch_size , drop_remainder=True )

test_data = test_data.map( map_func )

test_data = test_data.shuffle(3000)
# Keep the final partial batch: evaluation should cover every test example,
# not only the ones that happen to fill complete batches.
test_batches = test_data.batch( batch_size , drop_remainder=False )

# Sanity check: show the first example of one training batch and the batch shapes.
for one_b in train_batches.take(1):
  print( one_b[1][0] )
  # Images were rescaled to [-1, 1] by map_func; map them back to [0, 1]
  # so imshow renders the colours instead of clipping the negative half.
  plt.imshow( one_b[0][0].numpy()*0.5 + 0.5 )
  plt.show()
  print( one_b[0].shape )
  print( one_b[1].shape )



In [0]:
# A simple Conv-6 model with ~5 million params
inputs = tf.keras.layers.Input( (32,32,3) )

# Six 3x3 'SAME' convolutions (128,128,256,256,512,512 filters) with
# max-pooling in between, followed by a small dense classifier head.
out = layers.Conv2D( 128 , (3,3) , activation='relu' , padding='SAME' )(inputs)
out = layers.Conv2D( 128 , (3,3) , activation='relu' , padding='SAME' )(out)
out = layers.MaxPool2D()(out)
out = layers.Conv2D( 256 , (3,3) , activation='relu' , padding='SAME' )(out)
out = layers.Conv2D( 256 , (3,3) , activation='relu' , padding='SAME' )(out)
out = layers.MaxPool2D()(out)
out = layers.Conv2D( 512 , (3,3) , activation='relu' , padding='SAME' )(out)
out = layers.MaxPool2D()(out)
out = layers.Conv2D( 512 , (3,3) , activation='relu' , padding='SAME' )(out)
out = layers.MaxPool2D()(out)

out = layers.Flatten()(out)
out = layers.Dense( 512 , activation = 'relu' )(out)
out = layers.Dense( 256 , activation = 'relu' )(out)
outputs = layers.Dense( 10 , activation = 'softmax' )(out)

model = tf.keras.Model( inputs , outputs )
model.summary()
# Save the init weights for pruning
model.save_weights( 'reset_init.h5' )
# `model.weights` hands back the live variables, which change as soon as the
# model is trained.  Take a value snapshot instead so the initial weights are
# still available later; each entry is a tensor, so `.numpy()` keeps working.
# Note that weights rebuilt from this snapshot are untrained until the model
# is trained again.
model_reset_w = [ tf.identity( w ) for w in model.weights ]

In [0]:
# Test the model
def test_error( model , test_batches ):
  """Compute and print the classification error of `model` on `test_batches`.

  Args:
    model: callable that maps a batch of inputs to per-class scores; the
      predicted class is the argmax over the last axis.
    test_batches: iterable of batches where element 0 holds the inputs and
      element 1 holds the labels.

  Returns:
    Scalar tensor equal to 1.0 - accuracy accumulated over all batches.
  """

  # A freshly constructed metric starts empty, so no explicit reset is needed
  # (this also avoids the `reset_states` name, deprecated in later releases).
  acc = tf.keras.metrics.Accuracy()

  for batch in test_batches:

    pred_argmax = tf.argmax( model( batch[0] ) , -1 )
    acc.update_state( batch[1] , pred_argmax )

  # Use a distinct local name so the function's own name is not shadowed.
  error = 1.0-acc.result()
  print("Error is ==> {}".format( error ) )

  return error

In [0]:
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy()
mean = tf.keras.metrics.Mean()

train_model = model

def training(l_rate = 0.001,epochs=10,batch_size = 256):
  """Train the global `train_model` on the global `train_data` with RMSprop.

  Before every epoch the test error (in percent) on `test_batches` is printed.
  During an epoch the batch loss is printed every 40 steps, and the mean
  batch loss is printed when the epoch ends.

  Every trainable variable is updated on every step, so entries that were
  set to zero beforehand are not held at zero during training.

  Args:
    l_rate: learning rate handed to the RMSprop optimizer.
    epochs: number of passes over the training data.
    batch_size: number of examples per training batch (incomplete final
      batches are dropped).
  """

  optimizer = tf.keras.optimizers.RMSprop( l_rate )

  with tf.device('/device:GPU:0'):

    for epoch_idx in range(epochs):

      print( 100*test_error( train_model , test_batches ) )

      print("********* EPOCH :: {}".format(epoch_idx))

      # Reshuffle and re-batch the training data for this epoch.
      epoch_batches = train_data.shuffle(10000).batch( batch_size , drop_remainder=True )
      mean.reset_states()

      for step, batch in enumerate(epoch_batches):

        with tf.GradientTape() as tape:
          pred = train_model( batch[0] , training=True )
          cost = loss_fn( batch[1] , pred )

        if step%40==0:
          print("Loss is === >  {}".format(cost))

        mean.update_state(cost)

        variables = train_model.trainable_variables
        grads = tape.gradient( cost , variables )
        optimizer.apply_gradients(zip(grads,variables))

      print("EPOCh cost is ===>  {}".format(mean.result()))


In [0]:
# Initial full training of the dense model with the defaults of `training`
# (learning rate 0.001, 10 epochs, batch size 256).
training()

In [0]:
# Checkpoint the trained dense weights; they are reloaded before pruning starts.
model.save_weights( 'Trained_Weights.h5' )

In [0]:
# Test error of the trained dense model, in percent (100 * (1 - accuracy)).
print( "Percent is ==> {}".format( 100*test_error( model , test_batches )) )

In [0]:
"""
  
  So now we have a trained model with a hefty ~5 million parameters,
  trained on the CIFAR-10 dataset.
  Now we will embark on our pruning journey.

  Steps to follow according to the paper:

  0. Declare and store the model initialization (done!)
  1. Get a trained model. (done!)
  2. Set weights to zero on the basis of their magnitude ranking and fine-tune.
  3. Reset the remaining weights to their initial values.
  4. Repeat steps 2 & 3 until performance degrades.

"""

In [0]:
# Pruning on the basis of weight magnitude

def pruning( prev_degree , degree ):
  """Zero the smallest-magnitude fraction of every weight tensor of `model`.

  The cumulative prune fraction is `prev_degree + degree`.  For each variable
  in `model.weights`, positions are ranked by the absolute value of the
  current weights.  The lowest-ranked fraction is set to zero in a copy of
  the matching entry of `model_reset_w`, and that copy is written back into
  the model.

  NOTE(review): surviving entries take their values from `model_reset_w`, so
  that list has to hold a snapshot of the initial weights; confirm it does
  not merely alias the live `model.weights`.

  Args:
    prev_degree: fraction already pruned before this call.
    degree: additional fraction to prune in this call.

  Returns:
    The new cumulative prune fraction.
  """

  current_degree = prev_degree + degree
  pruned_weights = []

  for pos, live_var in enumerate(model.weights):

    flat_live = tf.reshape( live_var , (1,-1) )
    # Number of entries to zero in this tensor (truncated to an integer).
    n_to_zero = tf.cast( flat_live.shape[-1]*current_degree , tf.int32 )

    # Rank positions by |w|, smallest first, and keep the first n_to_zero.
    order = tf.argsort( tf.math.abs(flat_live) ,axis=-1,direction='ASCENDING' ).numpy()
    zero_idx = order[0,:n_to_zero]

    # Work on a flattened NumPy copy of the reference weights.
    flat_reset = np.reshape( model_reset_w[pos].numpy() , (1,-1) )
    flat_reset[ 0 , zero_idx ] = 0.0

    pruned_weights.append(
      tf.reshape( tf.convert_to_tensor( flat_reset ) , live_var.shape )
    )

  model.set_weights( pruned_weights )

  return current_degree

In [0]:
# Let's do the pruning.
# `degree` is the fraction pruned per step, `prev_degree` the cumulative
# fraction pruned so far, `total_steps` the upper bound on loop iterations.
degree = 0.1
prev_degree = 0.0
total_steps = 15

# Start from the trained dense weights saved earlier.
model.load_weights( 'Trained_Weights.h5' )

In [0]:
# First pruning pass, ranked on the weights just loaded; `prev_degree`
# becomes the cumulative fraction pruned so far.
prev_degree = pruning(prev_degree,degree)

In [0]:
# Per-step results: cumulative prune degree and the test error (in percent)
# measured right after that pruning step.
prune_degree_list = []
prune_acc_list = []

# Iterative pruning is computationally expensive: every step fine-tunes for 5 epochs.

for prune_step in range(total_steps):
  
  # From the fifth iteration on, prune in smaller increments.
  # This rebinds the notebook-level `degree`.
  if prune_step==4:
    degree = 0.05

  # Stop once the cumulative prune degree exceeds 0.90.
  # NOTE(review): `prev_degree` is accumulated in floating point, so whether
  # the loop stops at 0.90 or one step later depends on rounding.
  if prev_degree>0.90:
    break

  # Fine-tune the current network (learning rate 0.0005, 5 epochs).
  training(0.0005,5)
  # Prune a further `degree` fraction; returns the new cumulative degree.
  prev_degree = pruning(prev_degree,degree)

  print("*******************************************************************")
  print("-------------------- One Step Prune -------------------------------")
  # Despite its name, `prune_acc` holds the test *error* in percent,
  # evaluated immediately after pruning (before any further fine-tuning).
  prune_acc = 100*test_error( model , test_batches )
  print( "Percent is ==> {}".format( prune_acc ) )
  prune_degree_list.append( prev_degree )
  prune_acc_list.append( prune_acc )
  print("*******************************************************************")


  # Checkpoint the pruned weights, tagged with the cumulative degree (2 decimals).
  model.save_weights('pruneW_with_degree_{:.2f}.h5'.format(prev_degree))



In [0]:
import matplotlib.pyplot as plt

# x-axis: cumulative prune degree after each step.
# y-axis: test error (in percent) recorded right after that pruning step.
plt.plot( prune_degree_list , prune_acc_list )
plt.xlabel('Prune Degree')
plt.ylabel('Prune Error')
plt.show()

In [0]:
# We get a good sparse network with up to 70% fewer parameters
# but equally good accuracy.
# According to the paper, the sub-network
# we got after pruning the weights has won the initialization lottery!!

In [0]:
# Reload the checkpoint written at a cumulative prune degree of 0.70.
model.load_weights('pruneW_with_degree_0.70.h5')
# get_weights() returns NumPy copies.  `model.weights` would only alias the
# live variables, which are overwritten as soon as the model's weights are
# set or trained again, so it could not preserve the pruned weights.
best_pruned_weights = model.get_weights()
print( "Percent is ==> {}".format( 100*test_error( model , test_batches )) )

In [0]:
# Randomize the non-zero weights: every entry that is not exactly zero is
# replaced by a draw from tf.random.normal (default mean and stddev); entries
# that are exactly zero are kept at zero.
# NOTE(review): the default normal may differ in scale from the distribution
# the layers were originally initialized with - confirm this is the intended
# control experiment.
random_pruned_weights = [
  tf.where( variable!=0 , x=tf.random.normal(variable.shape) , y=variable )
  for variable in model.weights
]

model.set_weights( random_pruned_weights )

In [0]:
# Test error (in percent) right after the surviving weights were replaced
# by random values; no useful accuracy is expected at this point.
print( "Percent is ==> {}".format( 100*test_error( model , test_batches )) )

In [0]:
# Train the randomly re-initialized sparse model (learning rate 0.00001, 10 epochs).
# NOTE(review): this learning rate is 100x smaller than the default used for
# the initial training (0.001); keep that in mind when comparing the runs.
training(0.00001,10)
# Author's conclusion: this network can't be trained from the random initialization.
# Hence the hypothesis holds:
# the final pruned network consists of weights which won the init lottery,
# and we get a network with 70% fewer parameters!