# Post Training Quantization Vs. Quantization Aware Training 

## Load MNIST Data

In [1]:
import tensorflow as tf
import numpy as np

# Load MNIST dataset
mnist = tf.keras.datasets.mnist
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()

# Normalize the images to [0, 1]
train_images = train_images / 255.0
test_images = test_images / 255.0

# Expand the images to 3D (height, width, channel) for CNN
train_images = np.expand_dims(train_images, axis=-1)
test_images = np.expand_dims(test_images, axis=-1)

2024-05-08 22:07:43.510649: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-05-08 22:07:43.617139: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2024-05-08 22:07:43.617161: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2024-05-08 22:07:44.222005: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directo

## Train a Base Model 

In [2]:
# Define a simple CNN model
model = tf.keras.Sequential([
    tf.keras.layers.Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=(28, 28, 1)),
    tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(10, activation='softmax')
])

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the base model
model.fit(train_images, train_labels, epochs=2, validation_data=(test_images, test_labels))

# Evaluate the model
loss, accuracy = model.evaluate(test_images, test_labels)
print(f"Base Model Accuracy: {accuracy}")

# Save the model
model.save('mnist_original_cnn_model.h5')

# Print the size of the model file
import os
model_size = os.path.getsize('mnist_original_cnn_model.h5') / (1024 * 1024)  # Size in MB
print("Original Model size: {:.2f} MB".format(model_size))

2024-05-08 22:07:45.170964: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2024-05-08 22:07:45.171040: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory
2024-05-08 22:07:45.171075: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublasLt.so.11'; dlerror: libcublasLt.so.11: cannot open shared object file: No such file or directory
2024-05-08 22:07:45.171109: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcufft.so.10'; dlerror: libcufft.so.10: cannot open shared object file: No such file or directory
2024-05-08 22:07:45.171142: W tensorfl

Epoch 1/2
Epoch 2/2
Base Model Accuracy: 0.982200026512146
Original Model size: 7.97 MB


## Post Training Quantization - Full Integer Quantization

In [3]:
# Define the representative data generator
def representative_data_gen():
    for input_value in tf.data.Dataset.from_tensor_slices(train_images).batch(1).take(100):
        # Scale the input to UINT8 range and ensure the data is in float32 before casting to uint8 to simulate the quantization process
        input_value = tf.cast(input_value * 255, tf.float32)
        yield [input_value]

# Set up the converter for the Keras model
converter = tf.lite.TFLiteConverter.from_keras_model(model)
converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.representative_dataset = representative_data_gen
converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
converter.inference_input_type = tf.uint8
converter.inference_output_type = tf.uint8
converter.target_spec.supported_types = [tf.int8]

# Convert the model
tflite_model_quant = converter.convert()

# Save the quantized model
with open('mnist_cnn_model_quant.tflite', 'wb') as f:
    f.write(tflite_model_quant)

# Load the quantized TFLite model
interpreter = tf.lite.Interpreter(model_content=tflite_model_quant)
interpreter.allocate_tensors()

# Helper function to run inference on a set of data and return accuracy
def evaluate_tflite_model(interpreter, x_data, y_true):
    input_index = interpreter.get_input_details()[0]['index']
    output_index = interpreter.get_output_details()[0]['index']
    prediction_digits = []
    for test_image in x_data:
        # Pre-processing: scale to UINT8
        test_image = np.expand_dims(test_image * 255, axis=0).astype(np.uint8)
        interpreter.set_tensor(input_index, test_image)
        interpreter.invoke()  # Run inference
        output_data = interpreter.get_tensor(output_index)
        prediction_digits.append(np.argmax(output_data[0]))
    accurate_count = sum(prediction_digits[i] == y_true[i] for i in range(len(y_true)))
    return accurate_count / len(y_true)

# Evaluate the quantized model
accuracy_quant = evaluate_tflite_model(interpreter, test_images, test_labels)
print("Base Model Accuracy: {:.2f}%".format(accuracy * 100))
print("Test accuracy after quantization: {:.2f}%".format(accuracy_quant * 100))

# Calculate and print the quantized model size
full_integer_quant_model_size = len(tflite_model_quant) / (1024 * 1024)
print(f'Full Integer Quantized Model Size: {full_integer_quant_model_size:.2f} MB')
model_size = os.path.getsize('mnist_original_cnn_model.h5') / (1024 * 1024)  # Size in MB
print("Original Model size: {:.2f} MB".format(model_size))



INFO:tensorflow:Assets written to: /tmp/tmp02bl3ony/assets


INFO:tensorflow:Assets written to: /tmp/tmp02bl3ony/assets
2024-05-08 22:08:39.190184: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:362] Ignored output_format.
2024-05-08 22:08:39.190214: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:365] Ignored drop_control_dependency.
2024-05-08 22:08:39.190710: I tensorflow/cc/saved_model/reader.cc:45] Reading SavedModel from: /tmp/tmp02bl3ony
2024-05-08 22:08:39.191749: I tensorflow/cc/saved_model/reader.cc:89] Reading meta graph with tags { serve }
2024-05-08 22:08:39.191772: I tensorflow/cc/saved_model/reader.cc:130] Reading SavedModel debug info (if present) from: /tmp/tmp02bl3ony
2024-05-08 22:08:39.195468: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:357] MLIR V1 optimization pass is not enabled
2024-05-08 22:08:39.196228: I tensorflow/cc/saved_model/loader.cc:229] Restoring SavedModel bundle.
2024-05-08 22:08:39.227608: I tensorflow/cc/saved_model/loader.cc:213] Running initializatio

Base Model Accuracy: 98.22%
Test accuracy after quantization: 96.58%
Full Integer Quantized Model Size: 0.67 MB
Original Model size: 7.97 MB


## Quantization Aware Training - Full Integer Quantization

In [4]:
import tensorflow as tf
import tensorflow_model_optimization as tfmot

# Assuming 'model' is your original model
qat_model = tfmot.quantization.keras.quantize_model(model)

# Compile the quantization-aware trained model
qat_model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

# Train and fine-tune model with quantization awareness
qat_model.fit(train_images, train_labels, epochs=2, validation_split=0.1)


# Set up the converter for the quantization-aware trained model
converter = tf.lite.TFLiteConverter.from_keras_model(qat_model)
converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.representative_dataset = representative_data_gen
converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
converter.inference_input_type = tf.uint8
converter.inference_output_type = tf.uint8

# Convert the model
qat_tflite_model = converter.convert()

# Save the quantized model
with open('mnist_qat_model_quant.tflite', 'wb') as f:
    f.write(qat_tflite_model)


Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


Epoch 1/2
Epoch 2/2




INFO:tensorflow:Assets written to: /tmp/tmpgv6rwmk_/assets


INFO:tensorflow:Assets written to: /tmp/tmpgv6rwmk_/assets
2024-05-08 22:09:38.716550: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:362] Ignored output_format.
2024-05-08 22:09:38.716587: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:365] Ignored drop_control_dependency.
2024-05-08 22:09:38.716723: I tensorflow/cc/saved_model/reader.cc:45] Reading SavedModel from: /tmp/tmpgv6rwmk_
2024-05-08 22:09:38.718908: I tensorflow/cc/saved_model/reader.cc:89] Reading meta graph with tags { serve }
2024-05-08 22:09:38.718936: I tensorflow/cc/saved_model/reader.cc:130] Reading SavedModel debug info (if present) from: /tmp/tmpgv6rwmk_
2024-05-08 22:09:38.729265: I tensorflow/cc/saved_model/loader.cc:229] Restoring SavedModel bundle.
2024-05-08 22:09:38.780347: I tensorflow/cc/saved_model/loader.cc:213] Running initialization op on SavedModel bundle at path: /tmp/tmpgv6rwmk_
2024-05-08 22:09:38.793114: I tensorflow/cc/saved_model/loader.cc:305] SavedModel

In [5]:
# Load the quantized TFLite model
interpreter = tf.lite.Interpreter(model_content=qat_tflite_model)
interpreter.allocate_tensors()

# Evaluate the quantized model
accuracy_quant = evaluate_tflite_model(interpreter, test_images, test_labels)
print("Base Model Accuracy: {:.2f}%".format(accuracy * 100))
print("Test accuracy after QAT and full integer quantization: {:.2f}%".format(accuracy_quant * 100))
# Calculate and print the quantized model size
full_integer_qat_model_size = len(qat_tflite_model) / (1024 * 1024)
print(f'Full Integer Quantized Model Size (QAT): {full_integer_qat_model_size:.2f} MB')
model_size = os.path.getsize('mnist_original_cnn_model.h5') / (1024 * 1024)  # Size in MB
print("Original Model size: {:.2f} MB".format(model_size))

Base Model Accuracy: 98.22%
Test accuracy after QAT and full integer quantization: 98.55%
Full Integer Quantized Model Size (QAT): 0.67 MB
Original Model size: 7.97 MB


## Pruning and Post Training Quantization using Full Integer Quantization 

In [6]:
from tensorflow.keras.models import load_model

# Load your original model if it's not loaded
model_ = tf.keras.models.load_model('mnist_original_cnn_model.h5')

# Define the model for pruning
pruning_params = {
    'pruning_schedule': tfmot.sparsity.keras.PolynomialDecay(
        initial_sparsity=0.70,
        final_sparsity=0.90,
        begin_step=0,
        end_step=np.ceil(len(train_images) / 32).astype(np.int32) * 1  # example for 1 epochs, 32 batch size
    )
}

pruned_model = tfmot.sparsity.keras.prune_low_magnitude(model_, **pruning_params)

# Compile the pruned model
pruned_model.compile(optimizer='adam',
                     loss='sparse_categorical_crossentropy',
                     metrics=['accuracy'])

# Train the pruned model
callbacks = [tfmot.sparsity.keras.UpdatePruningStep()]
pruned_model.fit(train_images, train_labels, batch_size=32, epochs=1, validation_split=0.1, callbacks=callbacks)

# Remove pruning wrappers for further processing
model_for_export = tfmot.sparsity.keras.strip_pruning(pruned_model)

# Save the pruned model
model_for_export.save('pruned_model.h5')





In [7]:
# Save the pruned model
pruned_model.save('pruned_model_with_masks.h5')

## Check the sparsity of original and pruned model

In [8]:
def get_model_sparsity(model):
    total_elements = 0
    zero_elements = 0
    for layer in model.layers:
        if isinstance(layer, tf.keras.layers.Dense) or isinstance(layer, tf.keras.layers.Conv2D):
            weights = layer.get_weights()[0]
            total_elements += np.size(weights)
            zero_elements += np.sum(weights == 0)
    sparsity = zero_elements / total_elements
    return sparsity

In [9]:
sparsity = get_model_sparsity(model_for_export)
print(f'Prunned Model Sparsity: {sparsity:.2%}')

# Load the original pre-trained model
sparsity_original = get_model_sparsity(model)
print(f'Original Model Sparsity: {sparsity_original:.2%}')

Prunned Model Sparsity: 89.94%
Original Model Sparsity: 0.00%


## Pruning and PTQ using FIQ

In [10]:
# Load the pruned model
pruned_model = load_model('pruned_model.h5')

# Convert the pruned model to TensorFlow Lite with full integer quantization
converter = tf.lite.TFLiteConverter.from_keras_model(pruned_model)
converter.optimizations = [tf.lite.Optimize.DEFAULT, tf.lite.Optimize.EXPERIMENTAL_SPARSITY]
converter.representative_dataset = representative_data_gen
converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
converter.inference_input_type = tf.uint8
converter.inference_output_type = tf.uint8

tflite_pruned_quant_model = converter.convert()

# Save the fully quantized model
with open('pruned_quantized_model.tflite', 'wb') as f:
    f.write(tflite_pruned_quant_model)





INFO:tensorflow:Assets written to: /tmp/tmpu5j556mv/assets


INFO:tensorflow:Assets written to: /tmp/tmpu5j556mv/assets
2024-05-08 22:10:06.862639: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:362] Ignored output_format.
2024-05-08 22:10:06.862669: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:365] Ignored drop_control_dependency.
2024-05-08 22:10:06.862780: I tensorflow/cc/saved_model/reader.cc:45] Reading SavedModel from: /tmp/tmpu5j556mv
2024-05-08 22:10:06.863552: I tensorflow/cc/saved_model/reader.cc:89] Reading meta graph with tags { serve }
2024-05-08 22:10:06.863574: I tensorflow/cc/saved_model/reader.cc:130] Reading SavedModel debug info (if present) from: /tmp/tmpu5j556mv
2024-05-08 22:10:06.865738: I tensorflow/cc/saved_model/loader.cc:229] Restoring SavedModel bundle.
2024-05-08 22:10:06.878680: I tensorflow/cc/saved_model/loader.cc:213] Running initialization op on SavedModel bundle at path: /tmp/tmpu5j556mv
2024-05-08 22:10:06.883419: I tensorflow/cc/saved_model/loader.cc:305] SavedModel

In [11]:
# Load the pruned quantized TFLite model
interpreter = tf.lite.Interpreter(model_content=tflite_pruned_quant_model)
interpreter.allocate_tensors()

# Evaluate the quantized model
accuracy_pruned_quant = evaluate_tflite_model(interpreter, test_images, test_labels)
print("Base Model Accuracy: {:.2f}%".format(accuracy * 100))
print("Test accuracy for pruned+PTQ(FIQ): {:.2f}%".format(accuracy_pruned_quant * 100))
# Calculate and print the quantized model size
pruned_full_integer_qat_model_size = len(tflite_pruned_quant_model) / (1024 * 1024)
print(f'Full Integer Quantized Model Size (QAT): {pruned_full_integer_qat_model_size:.2f} MB')
model_size = os.path.getsize('mnist_original_cnn_model.h5') / (1024 * 1024)  # Size in MB
print("Original Model size: {:.2f} MB".format(model_size))

Base Model Accuracy: 98.22%
Test accuracy for pruned+PTQ(FIQ): 90.40%
Full Integer Quantized Model Size (QAT): 0.21 MB
Original Model size: 7.97 MB


## Pruning and QAT using FIQ

In [13]:
import tensorflow as tf
import tensorflow_model_optimization as tfmot

# Correctly set the custom object scope using the expected class reference
custom_objects = {
    'PruneLowMagnitude': tfmot.sparsity.keras.prune_low_magnitude
}

# Load your original model with the custom object scope for pruning
with tf.keras.utils.custom_object_scope(custom_objects):
    pruned_model = tf.keras.models.load_model('pruned_model.h5')

# Applying quantization-aware training within the same custom object scope
with tf.keras.utils.custom_object_scope(custom_objects):
    quant_aware_model = tfmot.quantization.keras.quantize_model(pruned_model)

# Compile the quantization-aware model
quant_aware_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
quant_aware_model.fit(train_images, train_labels, batch_size=32, epochs=2, validation_split=0.1, callbacks=[tfmot.sparsity.keras.UpdatePruningStep()])

# Strip the pruning wrappers after training
model_for_export2 = tfmot.sparsity.keras.strip_pruning(quant_aware_model)
model_for_export2.save('pruned_and_quant_aware_model.h5')






Epoch 1/2
Epoch 2/2




In [15]:
# Convert the quantization-aware and pruned model to TFLite using full integer quantization
converter = tf.lite.TFLiteConverter.from_keras_model(model_for_export2)
converter.optimizations = [tf.lite.Optimize.DEFAULT, tf.lite.Optimize.EXPERIMENTAL_SPARSITY]
converter.representative_dataset = representative_data_gen
converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
converter.inference_input_type = tf.uint8
converter.inference_output_type = tf.uint8

tflite_quant_model2 = converter.convert()

# Save the fully quantized model
with open('pruned_quantized_model.tflite', 'wb') as f:
    f.write(tflite_quant_model2)



INFO:tensorflow:Assets written to: /tmp/tmpgg61ytix/assets


INFO:tensorflow:Assets written to: /tmp/tmpgg61ytix/assets
2024-05-08 22:12:31.927678: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:362] Ignored output_format.
2024-05-08 22:12:31.927713: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:365] Ignored drop_control_dependency.
2024-05-08 22:12:31.927835: I tensorflow/cc/saved_model/reader.cc:45] Reading SavedModel from: /tmp/tmpgg61ytix
2024-05-08 22:12:31.929773: I tensorflow/cc/saved_model/reader.cc:89] Reading meta graph with tags { serve }
2024-05-08 22:12:31.929798: I tensorflow/cc/saved_model/reader.cc:130] Reading SavedModel debug info (if present) from: /tmp/tmpgg61ytix
2024-05-08 22:12:31.945006: I tensorflow/cc/saved_model/loader.cc:229] Restoring SavedModel bundle.
2024-05-08 22:12:31.982735: I tensorflow/cc/saved_model/loader.cc:213] Running initialization op on SavedModel bundle at path: /tmp/tmpgg61ytix
2024-05-08 22:12:31.999109: I tensorflow/cc/saved_model/loader.cc:305] SavedModel

In [16]:
# Load the pruned quantized TFLite model
interpreter = tf.lite.Interpreter(model_content=tflite_quant_model2)
interpreter.allocate_tensors()

# Evaluate the quantized model
accuracy_pruned_quant = evaluate_tflite_model(interpreter, test_images, test_labels)
print("Base Model Accuracy: {:.2f}%".format(accuracy * 100))
print("Test accuracy for pruned+PTQ(FIQ): {:.2f}%".format(accuracy_pruned_quant * 100))
# Calculate and print the quantized model size
pruned_full_integer_qat_model_size = len(tflite_quant_model2) / (1024 * 1024)
print(f'Full Integer Quantized Model Size (QAT): {pruned_full_integer_qat_model_size:.2f} MB')
model_size = os.path.getsize('mnist_original_cnn_model.h5') / (1024 * 1024)  # Size in MB
print("Original Model size: {:.2f} MB".format(model_size))

Base Model Accuracy: 98.22%
Test accuracy for pruned+PTQ(FIQ): 98.24%
Full Integer Quantized Model Size (QAT): 0.63 MB
Original Model size: 7.97 MB
