<a href="https://colab.research.google.com/github/khaledwaleedsamir/optimize-and-deploy-DL-models/blob/mobilenet/MobileNet_Quantization_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1- Imports

In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import MobileNet
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras.optimizers import Adam
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
import time
import random
import matplotlib.pyplot as plt
import os
import pandas as pd

# 2 - Loading the dataset

In [None]:
# Load cats_vs_dogs and carve its single 'train' split into
# 80% train / 10% dev / 10% test slices.
(train_dataset, dev_dataset, test_dataset), info = tfds.load(
    'cats_vs_dogs',
    split=['train[:80%]', 'train[80%:90%]', 'train[90%:]'],
    with_info=True,
    as_supervised=True  # Returns (image, label) pairs
) # NOTE(review): no seed is passed to tfds.load here; the seeds below are set after loading
# NOTE(review): this tuple of split sizes is evaluated but not assigned or printed;
# it is only displayed if it happens to be the last expression of its cell.
len(train_dataset),len(dev_dataset),len(test_dataset)
# Constants
IMG_SIZE = 224    # Images are resized to IMG_SIZE x IMG_SIZE in preprocess()
BATCH_SIZE = 32   # Batch size used by the train/dev/test pipelines
SEED = 42  # Seed shared by TensorFlow, NumPy, Python's random module and the shuffle

# Fix the random seeds for reproducibility.
# The global TensorFlow seed also covers other random operations
# (e.g. augmentations, weight initialization).
tf.random.set_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)
# Preprocessing function
def preprocess(image, label):
    """Resize an image to IMG_SIZE x IMG_SIZE and scale its values by 1/255.

    Args:
        image: Image tensor to resize (assumed to hold 0-255 pixel values,
            so the result lands in [0, 1] - confirm against the dataset).
        label: Label paired with the image; passed through unchanged.

    Returns:
        A ``(scaled_image, label)`` tuple.
    """
    target_size = (IMG_SIZE, IMG_SIZE)
    resized = tf.image.resize(image, target_size)
    scaled = resized / 255.0
    return scaled, label

# Build the input pipelines: preprocess -> (shuffle, train only) -> batch -> prefetch.
# The training shuffle uses a fixed seed and reshuffle_each_iteration=False, so the
# data is shuffled once, in the same order on every run, and the order does not
# change between epochs.
train_dataset = (
    train_dataset
    .map(preprocess)
    .shuffle(1000, seed=SEED, reshuffle_each_iteration=False)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)
# The dev and test splits are evaluated in their original order (no shuffle).
dev_dataset = (
    dev_dataset
    .map(preprocess)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)
test_dataset = (
    test_dataset
    .map(preprocess)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

# 3- Loading the Original saved model (protobuf file)

In [None]:
# Mount Google Drive at /content/drive so the model files under
# /content/drive/... used by the following cells are reachable.
# NOTE(review): this import only resolves inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Directory of the original (unquantized) model in SavedModel format;
# it is also the input of the TFLite converters in sections 4 and 5.
original_model_path = '/content/drive/MyDrive/saved_models/mobilenet_saved_model'
# Loading the original model without quantization
# NOTE(review): `model` is not referenced by any later cell visible in this notebook.
model = tf.saved_model.load(original_model_path)

# 4- Float16 Quantization
✅ What happens?

The model weights are converted from float32 to float16, which roughly halves the model size. On most hardware the computation still happens in float32; float16 arithmetic is used only when the device supports float16 acceleration (e.g., some GPUs and TPUs).
Because the weights stay in a floating-point format, the loss of numerical precision is minimal.

In [None]:
# Convert the SavedModel to TFLite format with float16 quantization
converter = tf.lite.TFLiteConverter.from_saved_model(original_model_path)
# Enable the default post-training optimizations ...
converter.optimizations = [tf.lite.Optimize.DEFAULT]
# ... and restrict the supported types to float16 so weights are stored as float16.
converter.target_spec.supported_types = [tf.float16]
tflite_model_fp16 = converter.convert()
# NOTE(review): this path is spelled 'My Drive' while original_model_path uses
# 'MyDrive' - confirm both resolve to the same Drive folder.
tflite_model_fp16_path = "/content/drive/My Drive/saved_models/mobilenet_quantized_fp16.tflite"
# Write the converted model to Drive (binary mode).
with open(tflite_model_fp16_path, "wb") as f:
    f.write(tflite_model_fp16)

print(f"Quantized Model Saved at: {tflite_model_fp16_path}")

Quantized Model Saved at: /content/drive/My Drive/saved_models/mobilenet_quantized_fp16.tflite


# 5- Dynamic Range Quantization
✅ What happens?

The model weights are quantized to 8-bit integers (INT8 or UINT8), while the model's inputs, outputs and stored activations remain in float32 during inference.
The model is smaller than the float16 and full float32 models and typically runs faster on CPU.
Since activations remain in float32, there is less precision loss compared to full INT8 quantization.

In [None]:
# Convert the SavedModel to TFLite format with dynamic range quantization
converter = tf.lite.TFLiteConverter.from_saved_model(original_model_path)
converter.optimizations = [tf.lite.Optimize.DEFAULT]  # Apply dynamic range quantization
# Convert and save
tflite_model_int8 = converter.convert()
tflite_model_int8_path = "/content/drive/My Drive/saved_models/mobilenet_quantized_part_INT8.tflite"
# NOTE(review): the fully quantized INT8 model is only referenced by path here; no
# visible cell in this notebook creates it, so the file must already exist on Drive
# for the size comparison and evaluation cells below to run.
tflite_model_full_quantized_int8_path='/content/drive/MyDrive/saved_models/cats_vs_dogs_full_int8.tflite'
# Write the dynamic-range-quantized model to Drive (binary mode).
with open(tflite_model_int8_path, "wb") as f:
    f.write(tflite_model_int8)

print(f"Quantized Model Saved at: {tflite_model_int8_path}")

Quantized Model Saved at: /content/drive/My Drive/saved_models/mobilenet_quantized_part_INT8.tflite


# 6- Comparing sizes of the Model

In [None]:
# NOTE(review): this rebinds `original_model_path` from the SavedModel directory
# (section 3) to a .tflite file. The "Original Model" evaluation cell relies on this
# new value, but re-running the converter cells in sections 4/5 after this cell would
# hand them a .tflite file instead of a SavedModel directory.
# NOTE(review): no visible cell creates mobilenet_model.tflite; it must already exist on Drive.
original_model_path = "/content/drive/My Drive/saved_models/mobilenet_model.tflite"
# Get file size in MB (bytes / 1024 / 1024)
size_in_mb_original = os.path.getsize(original_model_path) / (1024 * 1024)
size_in_mb_float16 = os.path.getsize(tflite_model_fp16_path) / (1024 * 1024)
size_in_mb_INT8 = os.path.getsize(tflite_model_int8_path) / (1024 * 1024)
size_in_mb_full_INT8=os.path.getsize(tflite_model_full_quantized_int8_path) / (1024 * 1024)
# Report every size with two decimals; the size_in_mb_* values are reused by the results table.
print(f"Model Size before Quantization: {size_in_mb_original:.2f} MB")
print(f"Model Size after float16 Quantization: {size_in_mb_float16:.2f} MB")
print(f"Model Size after INT8 Quantization: {size_in_mb_INT8:.2f} MB")
print(f"Model Size after Full INT8 Quantization: {size_in_mb_full_INT8:.2f} MB")

Model Size before Quantization: 12.71 MB
Model Size after float16 Quantization: 6.37 MB
Model Size after INT8 Quantization: 3.35 MB
Model Size after Full INT8 Quantization: 3.48 MB


# 7- Comparing performances of the Model


In [None]:
# Function to run inference on TFLite model
def _to_numpy(value):
    """Return ``value`` as a NumPy array, using ``.numpy()`` when the object provides it."""
    return value.numpy() if hasattr(value, "numpy") else np.asarray(value)


def evaluate_tflite_model(interpreter, dataset):
    """Run a binary-classification TFLite model over a dataset and return its accuracy.

    Images are fed to the interpreter one at a time (batch size 1). The tensor
    metadata is read from ``interpreter`` itself, so the result never depends on
    notebook globals describing a different model.

    Args:
        interpreter: An allocated TFLite interpreter (``allocate_tensors()``
            already called) with one input tensor and one output tensor whose
            first element is the positive-class score.
        dataset: Iterable of ``(images, labels)`` batches. Each element may be
            a tensor exposing ``.numpy()`` or anything ``np.asarray`` accepts.
            Images are expected in the float range the model was trained on.

    Returns:
        Accuracy in percent (0-100) as a float.

    Raises:
        ValueError: If the dataset yields no examples, or if the model has an
            integer input tensor whose quantization scale is 0.

    Side effects:
        Prints the mean wall-clock time of ``interpreter.invoke()`` per image.
    """
    input_detail = interpreter.get_input_details()[0]
    output_detail = interpreter.get_output_details()[0]

    input_dtype = input_detail['dtype']
    input_scale, input_zero_point = input_detail['quantization']  # (scale, zero_point)
    output_scale, output_zero_point = output_detail['quantization']

    quantized_input = np.issubdtype(input_dtype, np.integer)
    # Float outputs report a scale of 0, so they are never dequantized.
    quantized_output = (
        np.issubdtype(output_detail['dtype'], np.integer) and output_scale != 0
    )

    if quantized_input:
        if input_scale == 0:
            raise ValueError(
                "Input tensor has an integer dtype but a quantization scale of 0."
            )
        input_limits = np.iinfo(input_dtype)

    correct = 0
    total = 0
    total_invoke_seconds = 0.0

    for images, labels in dataset:
        images = _to_numpy(images)
        labels = _to_numpy(labels)
        for image, label in zip(images, labels):
            # The interpreter expects a leading batch dimension of 1.
            input_data = np.expand_dims(image, axis=0)

            if quantized_input:
                # real -> quantized: q = round(x / scale) + zero_point. Round
                # instead of truncating, and clip so out-of-range values
                # saturate instead of wrapping around on the integer cast.
                quantized = np.round(input_data / input_scale) + input_zero_point
                input_data = np.clip(
                    quantized, input_limits.min, input_limits.max
                ).astype(input_dtype)
            else:
                input_data = input_data.astype(np.float32)

            interpreter.set_tensor(input_detail['index'], input_data)

            # Time only the inference call and accumulate over all images.
            invoke_start = time.perf_counter()
            interpreter.invoke()
            total_invoke_seconds += time.perf_counter() - invoke_start

            output_data = interpreter.get_tensor(output_detail['index'])
            score = float(output_data[0][0])
            if quantized_output:
                # quantized -> real, so the 0.5 threshold is applied to a
                # probability rather than to a raw integer code.
                score = (score - output_zero_point) * output_scale

            predicted_label = 1 if score > 0.5 else 0  # 1 for Dog, 0 for Cat

            if predicted_label == label:
                correct += 1
            total += 1

    if total == 0:
        raise ValueError("Dataset yielded no examples; cannot compute accuracy.")

    execution_time = total_invoke_seconds / total
    print(f"Inference Time for a single image: {execution_time:.2f} seconds")
    accuracy = (correct / total) * 100
    return accuracy

## 1- Original Model

In [None]:
# Load the original (unquantized) TFLite model.
# `original_model_path` here is the .tflite path assigned in section 6,
# not the SavedModel directory from section 3.
interpreter = tf.lite.Interpreter(model_path=original_model_path)
interpreter.allocate_tensors()
# Tensor metadata (index, dtype, quantization parameters) for this interpreter;
# refresh it whenever a different model is loaded.
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()
# Run evaluation on the test split and report accuracy in percent
accuracy = evaluate_tflite_model(interpreter, test_dataset)
print(f"TFLite Model Accuracy: {accuracy:.2f}%")

Inference Time for a single image: 0.02 seconds
TFLite Model Accuracy: 98.80%


## 2- Float16 Quantized model

In [None]:
# Load the float16-quantized TFLite model written in section 4.
interpreter = tf.lite.Interpreter(model_path=tflite_model_fp16_path)
interpreter.allocate_tensors()

# Tensor metadata (index, dtype, quantization parameters) for this interpreter;
# refresh it whenever a different model is loaded.
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()


# Run evaluation on the test split and report accuracy in percent
accuracy = evaluate_tflite_model(interpreter, test_dataset)
print(f"TFLite Model Accuracy: {accuracy:.2f}%")

Inference Time for a single image: 0.02 seconds
TFLite Model Accuracy: 98.80%


## 3- Dynamic Range Quantized (INT8 weights)

In [None]:
# Load the dynamic-range-quantized TFLite model written in section 5.
interpreter = tf.lite.Interpreter(model_path=tflite_model_int8_path)
interpreter.allocate_tensors()
# Tensor metadata (index, dtype, quantization parameters) for this interpreter;
# refresh it whenever a different model is loaded.
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()
# Run evaluation on the test split and report accuracy in percent
accuracy = evaluate_tflite_model(interpreter, test_dataset)
print(f"TFLite Model Accuracy: {accuracy:.2f}%")

Inference Time for a single image: 0.03 seconds
TFLite Model Accuracy: 98.62%


In [None]:
# @title Dynamic Range Full Quantized
# NOTE(review): the title says "Dynamic Range", but the file loaded below is named
# cats_vs_dogs_full_int8.tflite and is reported elsewhere as "Full INT8 Quantized";
# presumably this is the full-integer model - confirm and retitle.
# NOTE(review): no visible cell creates this file; it must already exist on Drive.
interpreter = tf.lite.Interpreter(model_path=tflite_model_full_quantized_int8_path)
interpreter.allocate_tensors()
# Tensor metadata (index, dtype, quantization parameters) for this interpreter;
# refresh it whenever a different model is loaded.
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()
# Run evaluation on the test split and report accuracy in percent
accuracy = evaluate_tflite_model(interpreter, test_dataset)
print(f"TFLite Model Accuracy: {accuracy:.2f}%")

Inference Time for a single image: 0.02 seconds
TFLite Model Accuracy: 98.37%


In [None]:
# Summary table: one row per model variant.
# NOTE(review): accuracy and inference time are hard-coded rather than taken from the
# evaluation cells, so they go stale when those cells are re-run. The 30 ms listed for
# the original model also disagrees with the 0.02 s printed by its evaluation cell.
# Only the model sizes are read from variables computed earlier in the notebook.
data = {
    'Model': ['Original Model', 'FP16 Quantized', 'INT8 Quantized','Full INT8 Quantized'],
    'Accuracy (%)': [98.80, 98.80, 98.62,98.37],
    'Inference Time for a single image (ms)': [30, 20, 30, 20 ],
    'Model Size (MB)': [size_in_mb_original, size_in_mb_float16, size_in_mb_INT8,size_in_mb_full_INT8],
}
results_table = pd.DataFrame(data)
# A bare last expression renders the DataFrame with the notebook's rich display.
results_table

Unnamed: 0,Model,Accuracy (%),Inference Time for a single image (ms),Model Size (MB)
0,Original Model,98.8,30,12.708714
1,FP16 Quantized,98.8,20,6.368641
2,INT8 Quantized,98.62,30,3.351746
3,Full INT8 Quantized,98.37,20,3.476463
