# Data Augmentation: Conv-6 CNN LTH on CIFAR-10

In [1]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
# import tensorflow_model_optimization as tfmot
# from tensorflow_model_optimization.sparsity import keras as sparsity
# from tensorflow.keras import datasets, layers, models

from tensorflow.keras.layers import AveragePooling2D, Conv2D, MaxPooling2D, ReLU
from tensorflow.keras import models, layers, datasets
from tensorflow.keras.layers import Dense, Flatten, Reshape, Input, InputLayer
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.initializers import RandomNormal

from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.cluster import DBSCAN, KMeans

from tensorflow.keras.preprocessing.image import ImageDataGenerator


In [2]:
# Display the TensorFlow version this notebook was executed with-
tf.__version__

'2.2.0'

In [3]:
# Let tf.data choose parallelism/prefetch sizes at runtime.
# NOTE: in the cells visible here it is referenced only inside the disabled
# (string-quoted) tf.data augmentation pipeline.
AUTOTUNE = tf.data.experimental.AUTOTUNE

In [4]:
# Enumerate GPUs in PCI bus order and restrict this process to GPU index 2.
# NOTE(review): the index is specific to the machine the notebook was run on —
# adjust (or remove) when running elsewhere.
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES = 2

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=2


In [5]:
# Global training hyper-parameters-
batch_size = 60     # batch size used by the tf.data pipelines and by datagen.flow()
num_classes = 10    # number of classes used for the one-hot encoding of the labels
num_epochs = 100    # upper bound on epochs; the training loops may stop earlier (early stopping)

In [6]:
# Data preprocessing and cleaning:
# CIFAR-10 images are square, so rows and columns share one size-
img_rows = img_cols = 32

# Fetch the CIFAR-10 train/test splits and unpack features and labels-
train_split, test_split = tf.keras.datasets.cifar10.load_data()
X_train, y_train = train_split
X_test, y_test = test_split

In [7]:
# Report the array shapes of the freshly loaded splits-
shape_report = "X_{0}.shape = {1}, y_{0}.shape = {2}"
print(shape_report.format("train", X_train.shape, y_train.shape))
print(shape_report.format("test", X_test.shape, y_test.shape))

X_train.shape = (50000, 32, 32, 3), y_train.shape = (50000, 1)
X_test.shape = (10000, 32, 32, 3), y_test.shape = (10000, 1)


In [8]:
# Pick the per-image shape that matches the backend's channel ordering,
# then reshape both feature arrays to (num_images,) + input_shape-
if tf.keras.backend.image_data_format() == 'channels_first':
    input_shape = (3, img_rows, img_cols)
else:
    input_shape = (img_rows, img_cols, 3)

X_train = X_train.reshape((X_train.shape[0],) + input_shape)
X_test = X_test.reshape((X_test.shape[0],) + input_shape)

print("\n'input_shape' which will be used = {0}\n".format(input_shape))



'input_shape' which will be used = (32, 32, 3)



In [9]:
# Cast both feature arrays to float32 and rescale the pixel values by 1/255-
X_train = X_train.astype('float32') / 255.0
X_test = X_test.astype('float32') / 255.0

In [10]:
# One-hot encode the integer class labels of both splits-
y_train, y_test = (
    tf.keras.utils.to_categorical(labels, num_classes)
    for labels in (y_train, y_test)
)

In [11]:
# Report the shapes after preprocessing (float features, one-hot labels)-
print("\nDimensions of training and testing sets are:")
for split_name, split_features, split_labels in (
    ("train", X_train, y_train),
    ("test", X_test, y_test),
):
    print("X_{0}.shape = {1}, y_{0}.shape = {2}".format(
        split_name, split_features.shape, split_labels.shape
    ))


Dimensions of training and testing sets are:
X_train.shape = (50000, 32, 32, 3), y_train.shape = (50000, 10)
X_test.shape = (10000, 32, 32, 3), y_test.shape = (10000, 10)


### Prepare CIFAR10 dataset for _GradientTape_ training:

In [46]:
# Pair features with labels and wrap each split in a tf.data.Dataset-
train_dataset, test_dataset = (
    tf.data.Dataset.from_tensor_slices(split)
    for split in ((X_train, y_train), (X_test, y_test))
)

In [47]:
# Separate features-only and labels-only datasets for each split.
# NOTE: in the cells visible here only the '*_features' datasets are referenced,
# and only inside the disabled (string-quoted) tf.data augmentation pipeline.
train_dataset_features = tf.data.Dataset.from_tensor_slices(X_train)
train_dataset_labels = tf.data.Dataset.from_tensor_slices(y_train)
test_dataset_features = tf.data.Dataset.from_tensor_slices(X_test)
test_dataset_labels = tf.data.Dataset.from_tensor_slices(y_test)

In [48]:
# Shuffle with a 20000-element buffer (re-shuffled on every pass), then group
# into batches; the final, possibly smaller, batch is kept-
train_dataset = train_dataset.shuffle(buffer_size = 20000, reshuffle_each_iteration = True)
train_dataset = train_dataset.batch(batch_size = batch_size, drop_remainder = False)

In [49]:
# Batch the test set in its original order (no shuffling); the final, possibly
# smaller, batch is kept so that every test example is evaluated-
test_dataset = test_dataset.batch(batch_size=batch_size, drop_remainder=False)

In [None]:
# Sanity check: print the feature/label shapes of every test batch.
# With drop_remainder=False the last batch can be smaller than 'batch_size'.
for x_t, y_t in test_dataset:
    print(x_t.shape, y_t.shape)

In [51]:
# Choose an optimizer and loss function for training-
# 'loss_fn' is read as a global by 'train_one_step()' and 'test_step()';
# 'optimizer' is passed explicitly to 'train_one_step()'.
loss_fn = tf.keras.losses.CategoricalCrossentropy()

# 'learning_rate' is the documented argument name; 'lr' is only a deprecated alias-
optimizer = tf.keras.optimizers.Adam(learning_rate = 0.0003)

In [52]:
# Learning-rate value.
# NOTE: no cell visible here reads this variable; the optimizers are built with the literal 0.0003.
lr = 0.0003

In [53]:
# Select metrics to measure the error & accuracy of model.
# Each metric accumulates over every batch it is called with until
# 'reset_states()' is called; the training loops below reset all four at the
# start of each epoch, so the reported values are per-epoch averages-
train_loss = tf.keras.metrics.Mean(name = 'train_loss')
train_accuracy = tf.keras.metrics.CategoricalAccuracy(name = 'train_accuracy')

test_loss = tf.keras.metrics.Mean(name = 'test_loss')
test_accuracy = tf.keras.metrics.CategoricalAccuracy(name = 'test_accuracy')

### Data Augmentation:

Write a function to augment the images and map it over the dataset.
This returns a dataset that augments the data on the fly.

In [17]:
"""
def augment(image):
    # image,label = convert(image, label)
    # image = tf.image.convert_image_dtype(image, tf.float32) # Cast and normalize the image to [0,1]

    '''
    image = tf.reshape(image, shape = [28 ,28, 1])
    image = tf.image.resize_with_crop_or_pad(image, 34, 34) # Add 6 pixels of padding
    image = tf.image.random_crop(image, size=[28, 28, 1]) # Random crop back to 28x28
    image = tf.image.random_brightness(image, max_delta=0.5) # Random brightness
    '''
    image = tf.image.rotation_range = 90
    image = tf.image.width_shift_range = 0.1
    image = tf.image.height_shift_range = 0.1
    image = tf.image.horizontal_flip = True
    
    # rotation_range=90, width_shift_range=0.1, height_shift_range=0.1,horizontal_flip=True

    return image
    
"""

In [20]:
# num_train_examples = X_train.shape[0]
# num_train_examples

50000

In [23]:
"""
augmented_train_batches = (
    train_dataset_features
    # train_dataset
    # .take(NUM_EXAMPLES)
    .cache()
    .shuffle(num_train_examples // 4)

    # The augmentation is added here.
    .map(augment, num_parallel_calls=AUTOTUNE)
    .batch(batch_size)
    .prefetch(AUTOTUNE)
)


# Setup the validation dataset. This doesn't change whether or not you're using the augmentation.
validation_batches = (
    test_dataset_features
    # .map(convert, num_parallel_calls=AUTOTUNE)
    .batch(batch_size)
)

"""

### Data Augmentation done using _ImageDataGenerator_:

In [54]:
# Example of using 'tf.keras.preprocessing.image import ImageDataGenerator class's - flow(x, y)':

# Augmentation settings: random rotations of up to 90 degrees, horizontal and
# vertical shifts of up to 10% of the image size, and random horizontal flips.
# NOTE(review): a 90-degree rotation range is aggressive for 32x32 CIFAR-10
# images — confirm this is intended.
datagen = ImageDataGenerator(
    # featurewise_center=True,
    # featurewise_std_normalization=True,
    rotation_range = 90,
    width_shift_range = 0.1,
    height_shift_range = 0.1,
    horizontal_flip = True
)


# flow():
# Takes data & label arrays, generates batches of augmented data.

# datagen.flow(X_train, y_train, batch_size=batch_size, shuffle=True)
# The string literal below is disabled example code; being the last expression
# of the cell, it is what the cell displays as its output.
'''
for x, y in datagen.flow(X_train, y_train, batch_size=batch_size, shuffle=True):
	print(x.shape, y.shape)
'''

'\nfor x, y in datagen.flow(X_train, y_train, batch_size=batch_size, shuffle=True):\n\tprint(x.shape, y.shape)\n'

In [20]:
def conv6_cnn(input_shape = (32, 32, 3), num_classes = 10, learning_rate = 0.0003):
    """
    Build and compile the Conv-6 CNN used for the CIFAR-10 experiments.

    Conv-6 architecture (layers are added in exactly this order)-
    64, 64, pool    -- 3x3 'same' convolutions (ReLU) followed by 2x2 max-pooling
    128, 128, pool  -- 3x3 'same' convolutions (ReLU) followed by 2x2 max-pooling
    256, 256, pool  -- 3x3 'same' convolutions (ReLU) followed by 2x2 max-pooling
    256, 256, num_classes -- fully connected layers (ReLU, ReLU, softmax)

    Every layer except the output layer uses its own Glorot-normal kernel
    initializer; the output layer keeps the Keras default initializer.

    The order and shapes of the layers must stay as they are: 'load_weights()'
    and 'set_weights()' calls in this notebook copy weights between models
    created by this function position by position.

    Inputs:
    input_shape   -- shape of one input image (default: (32, 32, 3))
    num_classes   -- number of units of the softmax output layer (default: 10)
    learning_rate -- learning rate of the Adam optimizer attached by
                     'compile()' (default: 0.0003)

    Output: Returns designed and compiled neural network model
    """

    model = Sequential()

    # Three convolutional stages: two conv layers with the same number of
    # filters, then one max-pooling layer-
    is_first_layer = True

    for num_filters in (64, 128, 256):
        for _ in range(2):
            conv_kwargs = dict(
                filters = num_filters, kernel_size = (3, 3),
                activation = 'relu', kernel_initializer = tf.initializers.GlorotNormal(),
                strides = (1, 1), padding = 'same'
            )

            # Only the very first layer of the model declares the input shape-
            if is_first_layer:
                conv_kwargs['input_shape'] = input_shape
                is_first_layer = False

            model.add(Conv2D(**conv_kwargs))

        model.add(
            MaxPooling2D(
                pool_size = (2, 2),
                strides = (2, 2)
            )
        )

    model.add(Flatten())

    # Two hidden fully connected layers-
    for num_units in (256, 256):
        model.add(
            Dense(
                units = num_units, activation = 'relu',
                kernel_initializer = tf.initializers.GlorotNormal()
            )
        )

    # Output layer-
    model.add(
        Dense(
            units = num_classes, activation = 'softmax'
        )
    )

    # Compile the CNN. 'compile()' attaches its own Adam instance, which is
    # independent of any optimizer object passed to a custom training step-
    model.compile(
        loss = tf.keras.losses.categorical_crossentropy,
        optimizer = tf.keras.optimizers.Adam(learning_rate = learning_rate),
        metrics = ['accuracy']
    )

    return model


In [21]:
# Instantiate a new Conv-6 CNN model-
orig_model = conv6_cnn()

In [22]:
# Load the saved weights into 'orig_model' (the file name indicates a
# magnitude-pruned winning ticket; pruned entries are expected to be stored as 0)-
# NOTE(review): relative path — the .h5 file must be in the notebook's working directory.
orig_model.load_weights("Conv_6_CIFAR10_Magnitude_Based_Winning_Ticket_Distribution_92.55423622890814.h5")

In [23]:
# Count number of non-zero parameters (kernels and biases) of the loaded model-
winning_params = sum(
    tf.math.count_nonzero(weights, axis = None).numpy()
    for weights in orig_model.trainable_weights
)

print("\nTotal # of non-zero parameters = {0}\n".format(winning_params))


Total # of non-zero parameters = 167210



In [24]:
import tensorflow.keras.backend as K


# METHOD-1: This also counts biases

# Sum with an integer dtype: 'np.sum()' of an empty list (the model has no
# non-trainable weights) would otherwise return the float 0.0 and turn the
# printed total into a float-
trainable_wts = np.sum(
    [K.count_params(w) for w in orig_model.trainable_weights], dtype = int
)
non_trainable_wts = np.sum(
    [K.count_params(w) for w in orig_model.non_trainable_weights], dtype = int
)

print("\nNumber of training weights = {0} and non-trainable weights = {1}\n".format(
    trainable_wts, non_trainable_wts
))
print("Total number of parameters = {0}\n".format(trainable_wts + non_trainable_wts))



Number of training weights = 2262602 and non-trainabel weights = 0.0

Total number of parameters = 2262602.0



In [25]:
# Share of the trainable parameters whose value is zero in the loaded model-
pruned_fraction = (trainable_wts - winning_params) / trainable_wts
print("\n{0:.4f}% of parameters have been pruned\n".format(pruned_fraction * 100))


92.6098% of parameters have been pruned



In [26]:
# Build the binary mask of the winning ticket.

# The mask lives in a second Conv-6 model so that it has one tensor per
# trainable weight tensor, in the same order and with the same shapes-
mask_model = conv6_cnn()

# Start from the PRUNED weights-
mask_model.set_weights(orig_model.get_weights())

# Turn every tensor into a 0/1 mask: entries that survived pruning (non-zero)
# become ONE (1), pruned entries (exactly zero) stay ZERO (0)-
for wts in mask_model.trainable_weights:
    survived = tf.not_equal(wts, 0.)
    wts.assign(tf.where(survived, 1., 0.))


In [27]:
# Count number of non-zero masks, one count per mask tensor-
nonzero_per_tensor = [
    tf.math.count_nonzero(mask, axis = None).numpy()
    for mask in mask_model.trainable_weights
]
mask_params = sum(nonzero_per_tensor)

print("\nTotal # of non-zero masks = {0}\n".format(mask_params))


Total # of non-zero masks = 167210



In [28]:
# The mask must contain exactly one 1 per surviving (non-zero) parameter-
counts_match = mask_params == winning_params
print(
    "\nnumber of non-zero parameters and masks matches!"
    if counts_match
    else "\nERROR! number of non-zero parameters and masks DO NOT MATCH!"
)


number of non-zero parameters and masks matches!


### Train winning ticket using Data Augmentation:

In [61]:
# User input parameters for Early Stopping in manual implementation-
minimum_delta = 0.001   # threshold used by the early-stopping check at the end of each epoch
patience = 3            # the training loop stops once 'loc_patience' reaches this value

In [62]:
# Early Stopping state, read and updated by the training loop below-
best_val_loss = 100   # starting value that the first epoch's test loss is compared against
loc_patience = 0      # incremented by the loop's early-stopping 'else' branch, reset to 0 otherwise

In [48]:
# best_trained_weights = []

In [63]:
# Initialize a new Conv-6 CNN model-
winning_ticket_model = conv6_cnn()

# Load weights of winning ticket-
winning_ticket_model.set_weights(orig_model.get_weights())

In [64]:
# Define 'train_one_step()' and 'test_step()' functions here-
@tf.function
def train_one_step(model, mask_model, optimizer, x, y):
    '''
    Function to compute one step of masked gradient descent optimization
    on a single batch.

    The loss is computed with the global 'loss_fn'. Each gradient tensor is
    multiplied element-wise with the tensor at the same position in
    'mask_model.trainable_weights' before it is handed to the optimizer, so
    every entry whose mask is 0 receives a zero gradient. The global
    'train_loss' and 'train_accuracy' metrics are updated with this batch.

    Inputs:
    model      -- model being trained
    mask_model -- model whose trainable weights hold the masks; they must
                  line up one-to-one with 'model.trainable_variables'
    optimizer  -- optimizer used to apply the masked gradients
    x, y       -- batch of features and matching (one-hot) labels

    Output: None
    '''
    with tf.GradientTape() as tape:
        # Make predictions using defined model (explicitly in training mode)-
        y_pred = model(x, training = True)

        # Compute loss-
        loss = loss_fn(y, y_pred)

    # Compute gradients wrt defined loss and weights and biases-
    grads = tape.gradient(loss, model.trainable_variables)

    # Element-wise multiplication between computed gradients and masks-
    grad_mask_mul = [
        tf.math.multiply(grad_layer, mask)
        for grad_layer, mask in zip(grads, mask_model.trainable_weights)
    ]

    # Apply masked gradients to model's weights and biases-
    optimizer.apply_gradients(zip(grad_mask_mul, model.trainable_variables))

    # Update running loss and accuracy-
    train_loss(loss)
    train_accuracy(y, y_pred)

    return None
    
    
@tf.function
def test_step(model, optimizer, data, labels):
    """
    Function to test model performance on one batch of the
    testing dataset.

    The loss is computed with the global 'loss_fn'; the global 'test_loss'
    and 'test_accuracy' metrics are updated with this batch.

    Inputs:
    model     -- model to evaluate
    optimizer -- unused; kept so that existing calls keep working
    data      -- batch of features
    labels    -- matching (one-hot) labels

    Output: None
    """

    # Forward pass explicitly in inference mode-
    predictions = model(data, training = False)
    t_loss = loss_fn(labels, predictions)

    test_loss(t_loss)
    test_accuracy(labels, predictions)

    return None



In [59]:
# Run 'X_train.shape[0] // batch_size' augmented training steps (one epoch).
# The generator is advanced by hand and the loop is left explicitly once
# enough batches have been consumed-
augmented_batches = iter(datagen.flow(X_train, y_train, batch_size = batch_size, shuffle = True))
curr_step = 0

while True:
    # x.shape = (60, 32, 32, 3), y.shape = (60, 10)
    x, y = next(augmented_batches)

    train_one_step(winning_ticket_model, mask_model, optimizer, x, y)
    curr_step += 1

    if curr_step >= X_train.shape[0] // batch_size:
        print("\nTerminating training (datagen.flow())")
        break
 


Terminating training (datagen.flow())


In [60]:
# Evaluate on every test batch; this updates the global 'test_loss' and
# 'test_accuracy' metrics. 'optimizer' is passed only because the signature of
# 'test_step()' requires it; 'test_step()' does not use it.
for x_t, y_t in test_dataset:
    test_step(winning_ticket_model, optimizer, x_t, y_t)


In [65]:
# Train model using 'GradientTape' with on-the-fly data augmentation-

# The Early Stopping state ('best_val_loss', 'loc_patience') and parameters
# ('minimum_delta', 'patience') are initialized in the cells above.

# Number of augmented batches that make up one epoch. The inner loop is left
# manually after this many steps because 'datagen.flow()' keeps yielding batches-
steps_per_epoch = X_train.shape[0] // batch_size

template = 'Epoch {0}, Loss: {1:.4f}, Accuracy: {2:.4f}, Test Loss: {3:.4f}, Test Accuracy: {4:.4f}'

for epoch in range(num_epochs):

    if loc_patience >= patience:
        print("\n'EarlyStopping' called!\n")
        break

    # Reset the metrics at the start of the next epoch
    train_loss.reset_states()
    train_accuracy.reset_states()
    test_loss.reset_states()
    test_accuracy.reset_states()

    curr_step = 0

    for x, y in datagen.flow(X_train, y_train, batch_size = batch_size, shuffle = True):
        train_one_step(winning_ticket_model, mask_model, optimizer, x, y)
        curr_step += 1

        if curr_step >= steps_per_epoch:
            print("\nTerminating training (datagen.flow())")
            break

    for x_t, y_t in test_dataset:
        test_step(winning_ticket_model, optimizer, x_t, y_t)

    print(template.format(
        epoch + 1, train_loss.result(),
        train_accuracy.result()*100, test_loss.result(),
        test_accuracy.result()*100)
         )

    # Count the non-zero parameters over all trainable tensors; the count stays
    # constant as long as the pruned (zero) entries are not being updated-
    model_sum_params = 0

    for layer in winning_ticket_model.trainable_weights:
        model_sum_params += tf.math.count_nonzero(layer, axis = None).numpy()

    print("Total number of trainable parameters = {0}\n".format(model_sum_params))

    # Code for manual Early Stopping:
    # an epoch counts as an improvement only if the test loss is at least
    # 'minimum_delta' lower than the best test loss seen so far-
    current_val_loss = float(test_loss.result())

    if best_val_loss - current_val_loss >= minimum_delta:
        # update 'best_val_loss' variable to lowest loss encountered so far-
        best_val_loss = current_val_loss

        # reset 'loc_patience' variable-
        loc_patience = 0

    else:  # there is no (sufficient) improvement in monitored metric 'val_loss'
        loc_patience += 1  # number of epochs without any improvement



Terminating training (datagen.flow())
Epoch 1, Loss: 1.8008, Accuracy: 34.9520, Test Loss: 1.6015, Test Accuracy: 42.500000
Total number of trainable parameters = 167210


Terminating training (datagen.flow())
Epoch 2, Loss: 1.5840, Accuracy: 42.9432, Test Loss: 1.5690, Test Accuracy: 44.250000
Total number of trainable parameters = 167210


Terminating training (datagen.flow())
Epoch 3, Loss: 1.4510, Accuracy: 48.2013, Test Loss: 1.2719, Test Accuracy: 55.349998
Total number of trainable parameters = 167210


Terminating training (datagen.flow())
Epoch 4, Loss: 1.3696, Accuracy: 51.1345, Test Loss: 1.2625, Test Accuracy: 56.690002
Total number of trainable parameters = 167210


Terminating training (datagen.flow())
Epoch 5, Loss: 1.3086, Accuracy: 53.5774, Test Loss: 1.1230, Test Accuracy: 60.709999
Total number of trainable parameters = 167210


Terminating training (datagen.flow())
Epoch 6, Loss: 1.2606, Accuracy: 55.5502, Test Loss: 1.0963, Test Accuracy: 62.150002
Total number of

In [66]:
# Predict class probabilities for the test set, then keep the index of the
# most likely class for every example-
test_probabilities = winning_ticket_model.predict(X_test)
y_pred = test_probabilities.argmax(axis = -1)

In [68]:
# Convert the one-hot test labels back to integer class indices-
y_test_np = y_test.argmax(axis = 1)

In [69]:
# Display the shapes of the predicted and true class-index vectors; they must match-
y_pred.shape, y_test_np.shape

((10000,), (10000,))

In [70]:
# Test-set metrics computed from class indices; precision and recall are
# macro-averaged, i.e. the unweighted mean of the per-class values-
accuracy = accuracy_score(y_test_np, y_pred)
precision = precision_score(y_test_np, y_pred, average = 'macro')
recall = recall_score(y_test_np, y_pred, average = 'macro')

In [71]:
# Report the test-set metrics of the trained winning ticket-
metrics_report = "accuracy = {0:.4f}, precision = {1:.4f} & recall = {2:.4f}\n"

print("\nConv-6 CNN (Winning Ticket Trained model) metrics:")
print(metrics_report.format(accuracy, precision, recall))


Conv-6 CNN (Winning Ticket Trained model) metrics:
accuracy = 0.6643, precision = 0.6790 & recall = 0.6643



### Train winning ticket without Data Augmentation:

In [72]:
# User input parameters for Early Stopping in manual implementation-
minimum_delta = 0.001   # threshold used by the early-stopping check at the end of each epoch
patience = 3            # the training loop stops once 'loc_patience' reaches this value

In [73]:
# Reset the Early Stopping state before the second (no augmentation) run-
best_val_loss = 100   # starting value that the first epoch's test loss is compared against
loc_patience = 0      # incremented by the loop's early-stopping 'else' branch, reset to 0 otherwise

In [74]:
# Initialize a new Conv-6 CNN model-
winning_ticket_model = conv6_cnn()

# Load weights of winning ticket-
winning_ticket_model.set_weights(orig_model.get_weights())

# Start this run with a fresh optimizer (same learning rate as before), so that
# it does not continue from the Adam step counter left by the previous training run-
optimizer = tf.keras.optimizers.Adam(learning_rate = 0.0003)

In [75]:
# Define 'train_one_step()' and 'test_step()' functions here-
@tf.function
def train_one_step(model, mask_model, optimizer, x, y):
    '''
    Function to compute one step of masked gradient descent optimization
    on a single batch.

    NOTE(review): this repeats the earlier definition; presumably it is
    re-declared to obtain a fresh 'tf.function' for the newly created model —
    confirm before removing.

    The loss is computed with the global 'loss_fn'. Each gradient tensor is
    multiplied element-wise with the tensor at the same position in
    'mask_model.trainable_weights' before it is handed to the optimizer, so
    every entry whose mask is 0 receives a zero gradient. The global
    'train_loss' and 'train_accuracy' metrics are updated with this batch.

    Inputs:
    model      -- model being trained
    mask_model -- model whose trainable weights hold the masks; they must
                  line up one-to-one with 'model.trainable_variables'
    optimizer  -- optimizer used to apply the masked gradients
    x, y       -- batch of features and matching (one-hot) labels

    Output: None
    '''
    with tf.GradientTape() as tape:
        # Make predictions using defined model (explicitly in training mode)-
        y_pred = model(x, training = True)

        # Compute loss-
        loss = loss_fn(y, y_pred)

    # Compute gradients wrt defined loss and weights and biases-
    grads = tape.gradient(loss, model.trainable_variables)

    # Element-wise multiplication between computed gradients and masks-
    grad_mask_mul = [
        tf.math.multiply(grad_layer, mask)
        for grad_layer, mask in zip(grads, mask_model.trainable_weights)
    ]

    # Apply masked gradients to model's weights and biases-
    optimizer.apply_gradients(zip(grad_mask_mul, model.trainable_variables))

    # Update running loss and accuracy-
    train_loss(loss)
    train_accuracy(y, y_pred)

    return None
    
    
@tf.function
def test_step(model, optimizer, data, labels):
    """
    Function to test model performance on one batch of the
    testing dataset.

    The loss is computed with the global 'loss_fn'; the global 'test_loss'
    and 'test_accuracy' metrics are updated with this batch.

    Inputs:
    model     -- model to evaluate
    optimizer -- unused; kept so that existing calls keep working
    data      -- batch of features
    labels    -- matching (one-hot) labels

    Output: None
    """

    # Forward pass explicitly in inference mode-
    predictions = model(data, training = False)
    t_loss = loss_fn(labels, predictions)

    test_loss(t_loss)
    test_accuracy(labels, predictions)

    return None



In [77]:
# Train model using 'GradientTape' on the un-augmented 'train_dataset'-

# The Early Stopping state ('best_val_loss', 'loc_patience') and parameters
# ('minimum_delta', 'patience') are initialized in the cells above.

template = 'Epoch {0}, Loss: {1:.4f}, Accuracy: {2:.4f}, Test Loss: {3:.4f}, Test Accuracy: {4:.4f}'

for epoch in range(num_epochs):

    if loc_patience >= patience:
        print("\n'EarlyStopping' called!\n")
        break

    # Reset the metrics at the start of the next epoch
    train_loss.reset_states()
    train_accuracy.reset_states()
    test_loss.reset_states()
    test_accuracy.reset_states()

    # One pass over the shuffled, batched training set-
    for x, y in train_dataset:
        train_one_step(winning_ticket_model, mask_model, optimizer, x, y)

    for x_t, y_t in test_dataset:
        test_step(winning_ticket_model, optimizer, x_t, y_t)

    print(template.format(
        epoch + 1, train_loss.result(),
        train_accuracy.result()*100, test_loss.result(),
        test_accuracy.result()*100)
         )

    # Count the non-zero parameters over all trainable tensors; the count stays
    # constant as long as the pruned (zero) entries are not being updated-
    model_sum_params = 0

    for layer in winning_ticket_model.trainable_weights:
        model_sum_params += tf.math.count_nonzero(layer, axis = None).numpy()

    print("Total number of trainable parameters = {0}\n".format(model_sum_params))

    # Code for manual Early Stopping:
    # an epoch counts as an improvement only if the test loss is at least
    # 'minimum_delta' lower than the best test loss seen so far-
    current_val_loss = float(test_loss.result())

    if best_val_loss - current_val_loss >= minimum_delta:
        # update 'best_val_loss' variable to lowest loss encountered so far-
        best_val_loss = current_val_loss

        # reset 'loc_patience' variable-
        loc_patience = 0

    else:  # there is no (sufficient) improvement in monitored metric 'val_loss'
        loc_patience += 1  # number of epochs without any improvement


Epoch 1, Loss: 1.1903, Accuracy: 57.5960, Test Loss: 0.9175, Test Accuracy: 67.940002
Total number of trainable parameters = 167210

Epoch 2, Loss: 0.7849, Accuracy: 72.7580, Test Loss: 0.7680, Test Accuracy: 73.930000
Total number of trainable parameters = 167210

Epoch 3, Loss: 0.6472, Accuracy: 77.6740, Test Loss: 0.7098, Test Accuracy: 76.059998
Total number of trainable parameters = 167210

Epoch 4, Loss: 0.5587, Accuracy: 80.6160, Test Loss: 0.6894, Test Accuracy: 76.779999
Total number of trainable parameters = 167210

Epoch 5, Loss: 0.4946, Accuracy: 83.0480, Test Loss: 0.6635, Test Accuracy: 77.930000
Total number of trainable parameters = 167210

Epoch 6, Loss: 0.4456, Accuracy: 84.5440, Test Loss: 0.6789, Test Accuracy: 77.910004
Total number of trainable parameters = 167210

Epoch 7, Loss: 0.4038, Accuracy: 85.9980, Test Loss: 0.6579, Test Accuracy: 78.680000
Total number of trainable parameters = 167210

Epoch 8, Loss: 0.3704, Accuracy: 87.1760, Test Loss: 0.6524, Test Acc