# Quantization + LTH: LeNet-300-100 for MNIST

In [1]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
import tensorflow_model_optimization as tfmot
# from tensorflow_model_optimization.sparsity import keras as sparsity
# from tensorflow.keras import datasets, layers, models

from tensorflow.keras.layers import AveragePooling2D, Conv2D, MaxPooling2D, ReLU
from tensorflow.keras import models, layers, datasets
from tensorflow.keras.layers import Dense, Flatten, Reshape, Input, InputLayer
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.initializers import RandomNormal

from sklearn.metrics import accuracy_score, precision_score, recall_score


  import pandas.util.testing as tm


In [2]:
tf.__version__

'2.2.0'

In [3]:
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=2

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=2


In [3]:
batch_size = 60
num_classes = 10
num_epochs = 100

In [4]:
# Data preprocessing and cleadning:
# input image dimensions
img_rows, img_cols = 28, 28

# Load MNIST dataset-
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.mnist.load_data()

In [5]:
if tf.keras.backend.image_data_format() == 'channels_first':
    X_train = X_train.reshape(X_train.shape[0], 1, img_rows, img_cols)
    X_test = X_test.reshape(X_test.shape[0], 1, img_rows, img_cols)
    input_shape = (1, img_rows, img_cols)
else:
    X_train = X_train.reshape(X_train.shape[0], img_rows, img_cols, 1)
    X_test = X_test.reshape(X_test.shape[0], img_rows, img_cols, 1)
    input_shape = (img_rows, img_cols, 1)

print("\n'input_shape' which will be used = {0}\n".format(input_shape))


'input_shape' which will be used = (28, 28, 1)



In [6]:
# Convert datasets to floating point types-
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')

# Normalize the training and testing datasets-
X_train /= 255.0
X_test /= 255.0

In [7]:
# convert class vectors/target to binary class matrices or one-hot encoded values-
y_train = tf.keras.utils.to_categorical(y_train, num_classes)
y_test = tf.keras.utils.to_categorical(y_test, num_classes)

In [8]:
y_train.shape, y_test.shape

((60000, 10), (10000, 10))

In [9]:
print("\nX_train.shape = {0}, y_train.shape = {1}".format(X_train.shape, y_train.shape))
print("\nX_test.shape = {0}, y_test.shape = {1}\n".format(X_test.shape, y_test.shape))


X_train.shape = (60000, 28, 28, 1), y_train.shape = (60000, 10)

X_test.shape = (10000, 28, 28, 1), y_test.shape = (10000, 10)



In [10]:
# Reshape training and testing sets-
X_train = X_train.reshape(X_train.shape[0], 784)
X_test = X_test.reshape(X_test.shape[0], 784)

In [11]:
print("\nDimensions of training and testing sets are:")
print("X_train.shape = {0}, y_train.shape = {1}".format(X_train.shape, y_train.shape))
print("X_test.shape = {0}, y_test.shape = {1}".format(X_test.shape, y_test.shape))


Dimensions of training and testing sets are:
X_train.shape = (60000, 784), y_train.shape = (60000, 10)
X_test.shape = (10000, 784), y_test.shape = (10000, 10)


### Prepare MNIST dataset for _GradientTape_ training:

In [12]:
# Create training and testing datasets-
train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test))

In [13]:
train_dataset = train_dataset.shuffle(buffer_size = 20000, reshuffle_each_iteration = True).batch(batch_size = batch_size, drop_remainder = False)

In [14]:
test_dataset = test_dataset.batch(batch_size=batch_size, drop_remainder=False)

In [15]:
# Choose an optimizer and loss function for training-
loss_fn = tf.keras.losses.CategoricalCrossentropy()
optimizer = tf.keras.optimizers.Adam(lr = 0.0012)

In [16]:
# Select metrics to measure the error & accuracy of model.
# These metrics accumulate the values over epochs and then
# print the overall result-
train_loss = tf.keras.metrics.Mean(name = 'train_loss')
train_accuracy = tf.keras.metrics.CategoricalAccuracy(name = 'train_accuracy')

test_loss = tf.keras.metrics.Mean(name = 'test_loss')
test_accuracy = tf.keras.metrics.CategoricalAccuracy(name = 'test_accuracy')

In [17]:
def lenet_nn():
	"""
	Function to define the architecture of a neural network model
	following 300 100 Dense Fully-Connected architecture for MNIST
	dataset.
    
	Output: Returns designed and compiled neural network model
	"""
    
	model = Sequential()
	model.add(InputLayer(input_shape=(784, )))
	# model.add(Flatten())
	model.add(
		Dense(
			units = 300, activation='relu',
			kernel_initializer=tf.initializers.GlorotUniform()
			)
		)

	# model.add(l.Dropout(0.2))

	model.add(
		Dense(
			units = 100, activation='relu',
			kernel_initializer=tf.initializers.GlorotUniform()
			)
		)
        
	# model.add(l.Dropout(0.1))

	model.add(
		Dense(
			units = num_classes, activation='softmax'
			)
		)
    

	# Compile pruned NN-
	model.compile(
		loss=tf.keras.losses.categorical_crossentropy,
		# optimizer='adam',
		optimizer=tf.keras.optimizers.Adam(lr = 0.0012),
		metrics=['accuracy'])
    
	return model


In [18]:
# Initialize model-
model = lenet_nn()

In [19]:
# Load weights of winning ticket-
model.load_weights("/home/arjun/Desktop/Codes/Lottery_Hypothesis-Resources/Latest_Works/LTH_Experiments/Experiment_number_5/LeNet_300_100_MNIST/LeNet_300_MNIST_Magnitude_Winning_Ticket_Distribution_91.18900266306589.h5")

In [59]:
# Count number of non-zero parameters-
winning_params = 0

for layer in model.trainable_weights:
    nonzeroparams = tf.math.count_nonzero(layer, axis = None).numpy()
    print("layer: {0} has {1} non-zero parameters".format(layer.shape, nonzeroparams))
    winning_params += nonzeroparams

print("\nTotal # of non-zero parameters = {0}\n".format(winning_params))

layer: (784, 300) has 20204 non-zero parameters
layer: (300,) has 0 non-zero parameters
layer: (300, 100) has 2577 non-zero parameters
layer: (100,) has 0 non-zero parameters
layer: (100, 10) has 314 non-zero parameters
layer: (10,) has 0 non-zero parameters

Total # of non-zero parameters = 23095



In [21]:
import tensorflow.keras.backend as K


# METHOD-1: This also counts biases

trainable_wts = np.sum([K.count_params(w) for w in model.trainable_weights])
non_trainable_wts = np.sum([K.count_params(w) for w in model.non_trainable_weights])

print("\nNumber of training weights = {0} and non-trainabel weights = {1}\n".format(
    trainable_wts, non_trainable_wts
))
print("Total number of parameters = {0}\n".format(trainable_wts + non_trainable_wts))



Number of training weights = 266610 and non-trainabel weights = 0.0

Total number of parameters = 266610.0



In [23]:
print("\n{0:.4f}% of parameters have been pruned\n".format((trainable_wts - params) / trainable_wts * 100))


91.3375% of parameters have been pruned



In [33]:
# Create mask using winning ticket-

# Instantiate a new neural network model for which, the mask is to be created,
mask_model = lenet_nn()
    
# Load weights of PRUNED model-
mask_model.set_weights(model.get_weights())
    
# For each layer, for each weight which is 0, leave it, as is.
# And for weights which survive the pruning,reinitialize it to ONE (1)-
for wts in mask_model.trainable_weights:
    wts.assign(tf.where(tf.equal(wts, 0.), 0., 1.))


In [34]:
# Count number of non-zero masks-
mask_params = 0

for layer in mask_model.trainable_weights:
    nonzeroparams = tf.math.count_nonzero(layer, axis = None).numpy()
    print("layer: {0} has {1} non-zero masks".format(layer.shape, nonzeroparams))
    mask_params += nonzeroparams

print("\nTotal # of non-zero masks = {0}\n".format(mask_params))

layer: (784, 300) has 20204 non-zero masks
layer: (300,) has 0 non-zero masks
layer: (300, 100) has 2577 non-zero masks
layer: (100,) has 0 non-zero masks
layer: (100, 10) has 314 non-zero masks
layer: (10,) has 0 non-zero masks

Total # of non-zero masks = 23095



In [60]:
if mask_params == winning_params:
    print("\nnumber of non-zero parameters and masks matches!")
else:
    print("\nERROR! number of non-zero parameters and masks DO NOT MATCH!")


number of non-zero parameters and masks matches!


### Clone and fine-tune pre-trained model with quantization aware training:

In [24]:
quantize_model = tfmot.quantization.keras.quantize_model

# q_aware stands for for quantization aware.
q_aware_model = quantize_model(model)

In [25]:
# 'quantize_model' requires recompilation-
q_aware_model.compile(
    optimizer = tf.keras.optimizers.Adam(lr = 0.0012),
    loss=tf.keras.losses.categorical_crossentropy,
    metrics=['accuracy']
)


In [26]:
# Get quantization aware model summary-
q_aware_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
quant_dense (QuantizeWrapper (None, 300)               235505    
_________________________________________________________________
quant_dense_1 (QuantizeWrapp (None, 100)               30105     
_________________________________________________________________
quant_dense_2 (QuantizeWrapp (None, 10)                1015      
Total params: 266,625
Trainable params: 266,610
Non-trainable params: 15
_________________________________________________________________


### Train winning ticket model-

In [39]:
# User input parameters for Early Stopping in manual implementation-
minimum_delta = 0.001
patience = 3

In [40]:
best_val_loss = 100
loc_patience = 0

In [41]:
# Initialize a new LeNet-300-100 model-
winning_ticket_model = lenet_nn()

# Load weights of winning ticket-
winning_ticket_model.set_weights(model.get_weights())

In [42]:
# Define 'train_one_step()' and 'test_step()' functions here-
@tf.function
def train_one_step(model, mask_model, optimizer, x, y):
    '''
    Function to compute one step of gradient descent optimization
    '''
    with tf.GradientTape() as tape:
        # Make predictions using defined model-
        y_pred = model(x)

        # Compute loss-
        loss = loss_fn(y, y_pred)
        
    # Compute gradients wrt defined loss and weights and biases-
    grads = tape.gradient(loss, model.trainable_variables)
    
    # type(grads)
    # list
    
    # List to hold element-wise multiplication between-
    # computed gradient and masks-
    grad_mask_mul = []
    
    # Perform element-wise multiplication between computed gradients and masks-
    for grad_layer, mask in zip(grads, mask_model.trainable_weights):
        grad_mask_mul.append(tf.math.multiply(grad_layer, mask))
    
    # Apply computed gradients to model's weights and biases-
    optimizer.apply_gradients(zip(grad_mask_mul, model.trainable_variables))

    # Compute accuracy-
    train_loss(loss)
    train_accuracy(y, y_pred)

    return None
    
    
@tf.function
def test_step(model, optimizer, data, labels):
    """
    Function to test model performance
    on testing dataset
    """
    
    predictions = model(data)
    t_loss = loss_fn(labels, predictions)

    test_loss(t_loss)
    test_accuracy(labels, predictions)

    return None



In [43]:
# Train model using 'GradientTape'-
    
# Initialize parameters for Early Stopping manual implementation-
# best_val_loss = 100
# loc_patience = 0
    
for epoch in range(num_epochs):
    
    if loc_patience >= patience:
        print("\n'EarlyStopping' called!\n")
        break
        
    # Reset the metrics at the start of the next epoch
    train_loss.reset_states()
    train_accuracy.reset_states()
    test_loss.reset_states()
    test_accuracy.reset_states()
            
    
    for x, y in train_dataset:
        train_one_step(winning_ticket_model, mask_model, optimizer, x, y)


    for x_t, y_t in test_dataset:
        test_step(winning_ticket_model, optimizer, x_t, y_t)

    template = 'Epoch {0}, Loss: {1:.4f}, Accuracy: {2:.4f}, Test Loss: {3:.4f}, Test Accuracy: {4:4f}'
    
    '''
    # 'i' is the index for number of pruning rounds-
    history_main[i]['accuracy'][epoch] = train_accuracy.result() * 100
    history_main[i]['loss'][epoch] = train_loss.result()
    history_main[i]['val_loss'][epoch] = test_loss.result()
    history_main[i]['val_accuracy'][epoch] = test_accuracy.result() * 100
    ''' 

    print(template.format(
        epoch + 1, train_loss.result(),
        train_accuracy.result()*100, test_loss.result(),
        test_accuracy.result()*100)
         )
    
    # Count number of non-zero parameters in each layer and in total-
    # print("layer-wise manner model, number of nonzero parameters in each layer are: \n")
    model_sum_params = 0
    
    for layer in winning_ticket_model.trainable_weights:
        # print(tf.math.count_nonzero(layer, axis = None).numpy())
        model_sum_params += tf.math.count_nonzero(layer, axis = None).numpy()
    
    print("Total number of trainable parameters = {0}\n".format(model_sum_params))

    
    # Code for manual Early Stopping:
    if np.abs(test_loss.result() < best_val_loss) >= minimum_delta:
        # update 'best_val_loss' variable to lowest loss encountered so far-
        best_val_loss = test_loss.result()
        
        # reset 'loc_patience' variable-
        loc_patience = 0
        
    else:  # there is no improvement in monitored metric 'val_loss'
        loc_patience += 1  # number of epochs without any improvement


Epoch 1, Loss: 0.1101, Accuracy: 97.2017, Test Loss: 0.0658, Test Accuracy: 97.989998
Total number of trainable parameters = 23095

Epoch 2, Loss: 0.0340, Accuracy: 99.0883, Test Loss: 0.0567, Test Accuracy: 98.199997
Total number of trainable parameters = 23095

Epoch 3, Loss: 0.0211, Accuracy: 99.4500, Test Loss: 0.0553, Test Accuracy: 98.259995
Total number of trainable parameters = 23095

Epoch 4, Loss: 0.0136, Accuracy: 99.6750, Test Loss: 0.0586, Test Accuracy: 98.229996
Total number of trainable parameters = 23095

Epoch 5, Loss: 0.0097, Accuracy: 99.7700, Test Loss: 0.0639, Test Accuracy: 98.110001
Total number of trainable parameters = 23095

Epoch 6, Loss: 0.0064, Accuracy: 99.8750, Test Loss: 0.0619, Test Accuracy: 98.299995
Total number of trainable parameters = 23095


'EarlyStopping' called!



### Train _Quantized_ winning ticket model:

In [44]:
# User input parameters for Early Stopping in manual implementation-
minimum_delta = 0.001
patience = 3

In [45]:
best_val_loss = 100
loc_patience = 0

In [41]:
# Initialize a new LeNet-300-100 model-
# winning_ticket_model = lenet_nn()

# Load weights of winning ticket-
# winning_ticket_model.set_weights(model.get_weights())

In [46]:
# Define 'train_one_step()' and 'test_step()' functions here-
@tf.function
def train_one_step(model, mask_model, optimizer, x, y):
    '''
    Function to compute one step of gradient descent optimization
    '''
    with tf.GradientTape() as tape:
        # Make predictions using defined model-
        y_pred = model(x)

        # Compute loss-
        loss = loss_fn(y, y_pred)
        
    # Compute gradients wrt defined loss and weights and biases-
    grads = tape.gradient(loss, model.trainable_variables)
    
    # type(grads)
    # list
    
    # List to hold element-wise multiplication between-
    # computed gradient and masks-
    grad_mask_mul = []
    
    # Perform element-wise multiplication between computed gradients and masks-
    for grad_layer, mask in zip(grads, mask_model.trainable_weights):
        grad_mask_mul.append(tf.math.multiply(grad_layer, mask))
    
    # Apply computed gradients to model's weights and biases-
    optimizer.apply_gradients(zip(grad_mask_mul, model.trainable_variables))

    # Compute accuracy-
    train_loss(loss)
    train_accuracy(y, y_pred)

    return None
    
    
@tf.function
def test_step(model, optimizer, data, labels):
    """
    Function to test model performance
    on testing dataset
    """
    
    predictions = model(data)
    t_loss = loss_fn(labels, predictions)

    test_loss(t_loss)
    test_accuracy(labels, predictions)

    return None



In [49]:
history_q_aware = q_aware_model.fit(
    x = X_train, y = y_train,
    batch_size = batch_size,
    epochs = 3
)


Epoch 1/3
Epoch 2/3
Epoch 3/3


In [54]:
_, baseline_model_accuracy = winning_ticket_model.evaluate(X_test, y_test, verbose=0)
_, q_aware_model_accuracy = q_aware_model.evaluate(X_test, y_test, verbose=0)


### There is minimal to no loss in test accuracy after quantization aware training, compared to the baseline:

In [55]:
print('Baseline test accuracy:', baseline_model_accuracy)
print('Quant test accuracy:', q_aware_model_accuracy)

Baseline test accuracy: 0.9829999804496765
Quant test accuracy: 0.9796000123023987


In [57]:
q_params = 0

for layer in q_aware_model.trainable_weights:
    params = tf.math.count_nonzero(layer, axis = None).numpy()
    print("layer: {0} has {1} non-zero parameter".format(layer.shape, params))
    q_params += params
    
print("\nTotal number of non-zero parameters = {0}\n".format(q_params))

layer: (300,) has 294 non-zero parameter
layer: (784, 300) has 204983 non-zero parameter
layer: (100,) has 99 non-zero parameter
layer: (300, 100) has 29100 non-zero parameter
layer: (10,) has 10 non-zero parameter
layer: (100, 10) has 990 non-zero parameter

Total number of non-zero parameters = 235476



In [61]:
winning_params

23095

In [47]:
# Train model using 'GradientTape'-
    
# Initialize parameters for Early Stopping manual implementation-
# best_val_loss = 100
# loc_patience = 0
    
for epoch in range(num_epochs):
    
    if loc_patience >= patience:
        print("\n'EarlyStopping' called!\n")
        break
        
    # Reset the metrics at the start of the next epoch
    train_loss.reset_states()
    train_accuracy.reset_states()
    test_loss.reset_states()
    test_accuracy.reset_states()
            
    
    for x, y in train_dataset:
        train_one_step(q_aware_model, mask_model, optimizer, x, y)


    for x_t, y_t in test_dataset:
        test_step(q_aware_model, optimizer, x_t, y_t)

    template = 'Epoch {0}, Loss: {1:.4f}, Accuracy: {2:.4f}, Test Loss: {3:.4f}, Test Accuracy: {4:4f}'
    
    '''
    # 'i' is the index for number of pruning rounds-
    history_main[i]['accuracy'][epoch] = train_accuracy.result() * 100
    history_main[i]['loss'][epoch] = train_loss.result()
    history_main[i]['val_loss'][epoch] = test_loss.result()
    history_main[i]['val_accuracy'][epoch] = test_accuracy.result() * 100
    ''' 

    print(template.format(
        epoch + 1, train_loss.result(),
        train_accuracy.result()*100, test_loss.result(),
        test_accuracy.result()*100)
         )
    
    # Count number of non-zero parameters in each layer and in total-
    # print("layer-wise manner model, number of nonzero parameters in each layer are: \n")
    model_sum_params = 0
    
    for layer in winning_ticket_model.trainable_weights:
        # print(tf.math.count_nonzero(layer, axis = None).numpy())
        model_sum_params += tf.math.count_nonzero(layer, axis = None).numpy()
    
    print("Total number of trainable parameters = {0}\n".format(model_sum_params))

    
    # Code for manual Early Stopping:
    if np.abs(test_loss.result() < best_val_loss) >= minimum_delta:
        # update 'best_val_loss' variable to lowest loss encountered so far-
        best_val_loss = test_loss.result()
        
        # reset 'loc_patience' variable-
        loc_patience = 0
        
    else:  # there is no improvement in monitored metric 'val_loss'
        loc_patience += 1  # number of epochs without any improvement


InvalidArgumentError:  var and grad do not have the same shape[10] [100,10]
	 [[node Adam/Adam/update_4/ResourceApplyAdam (defined at <ipython-input-37-9c297d161e54>:29) ]] [Op:__inference_train_one_step_20360]

Errors may have originated from an input operation.
Input Source operations connected to node Adam/Adam/update_4/ResourceApplyAdam:
 Mul_4 (defined at <ipython-input-37-9c297d161e54>:26)	
 sequential/quant_dense_2/BiasAdd/ReadVariableOp/resource (defined at /home/arjun/.local/lib/python3.8/site-packages/tensorflow_model_optimization/python/core/quantization/keras/quantize_wrapper.py:162)

Function call stack:
train_one_step
