### Import Packages

In [1]:
# Necessary imports

import tensorflow as tf
from tensorflow import keras
import numpy as np
import tempfile

In [2]:
import pathlib

### Loading MNIST DIGITS Dataset and Training CNN2

In [3]:
# Load MNIST dataset
mnist = tf.keras.datasets.mnist
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()

# Normalize the input image so that each pixel value is between 0 to 1.
train_images = train_images.astype(np.float32) / 255.0
test_images = test_images.astype(np.float32) / 255.0

# Define the model architecture
model = tf.keras.Sequential([
  tf.keras.layers.InputLayer(input_shape=(28, 28)),
  tf.keras.layers.Reshape(target_shape=(28, 28, 1)),
  tf.keras.layers.Conv2D(filters=12, kernel_size=(3, 3), activation='relu'),
  tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
  tf.keras.layers.Flatten(),
  tf.keras.layers.Dense(10)
])

# Train the digit classification model
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(
                  from_logits=True),
              metrics=['accuracy'])
model.fit(
  train_images,
  train_labels,
  epochs=5,
  validation_data=(test_images, test_labels)
)

Train on 60000 samples, validate on 10000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f8435957780>

In [4]:
# Saving Model
model.save('1_digits_mnist_model.h5')

### CNN2: Original model's (1_digits_mnist_model.h5) accuracy, model load and inference time and unit Inference Time

In [5]:
# Load trained .h5 model
model = tf.keras.models.load_model('./1_digits_mnist_model.h5')

In [6]:
# Evaluate the model on test set
score = model.evaluate(test_images, test_labels, verbose=0)

# Print test accuracy
print('\n', 'Test accuracy:', score[1])


 Test accuracy: 0.9787


In [7]:
import time
data = test_images[0]
data = data.reshape((1, 28, 28))
def orig_model_infer_time():
  start_time_full = time.time()
  model = tf.keras.models.load_model('./1_digits_mnist_model.h5', custom_objects=None, compile=True)
  start_time_infer = time.time()
  model.predict(data)
  results = {'Time to load model and then infer': (time.time() - start_time_full),
             'Time to only infer': (time.time() - start_time_infer)}
  
  return results

In [8]:
orig_model_infer_time()

{'Time to load model and then infer': 0.2886533737182617,
 'Time to only infer': 0.05460047721862793}

### Quantization-aware (Q-aware) training of CNN2

In [9]:
!pip install tensorflow_model_optimization
import tensorflow_model_optimization as tfmot

quantize_model = tfmot.quantization.keras.quantize_model

# q_aware stands for for quantization aware.
q_aware_model = quantize_model(model)

# `quantize_model` requires a recompile.
q_aware_model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

q_aware_model.summary()

You should consider upgrading via the '/home/db/.virtualenvs/LR/bin/python3 -m pip install --upgrade pip' command.[0m
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: expected an indented block (<unknown>, line 14)
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: expected an indented block (<unknown>, line 14)
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: expected an indented block (<unknown>, line 14)
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: expected an indented block (<unknown>, line 14)
Please report this to

In [10]:
# Train and evaluate the model against baseline

# train_images_subset = train_images[0:1000] # out of 60000
# train_labels_subset = train_labels[0:1000]

q_aware_model.fit(train_images, train_labels,
                  batch_size=10, epochs=5, validation_split=0.1)

Train on 54000 samples, validate on 6000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f840c59b860>

In [11]:
# Evaluate the model on test set
score = q_aware_model.evaluate(test_images, test_labels, verbose=0)

# Print test accuracy

print('\n', 'Test accuracy:', score[1])


 Test accuracy: 0.981


In [12]:
q_aware_model.save('2_digits_mnist_model_qaware.h5')

### Q-aware trained models unit inference time

In [13]:
data = test_images[0]
data = data.reshape((1, 28, 28))
print (data.shape)
data_y = train_labels[0:1]

(1, 28, 28)


In [14]:
# Unit inference time 
start_time_infer = time.time()
score = q_aware_model.evaluate(data, data_y, verbose=0)
results1 = {'Time to only infer': (time.time() - start_time_infer) }
print (results1)

{'Time to only infer': 0.03508925437927246}


### Pruning CNN2 (3_digits_mnist_model_pruning.h5)

In [15]:
! pip install -q tensorflow-model-optimization

You should consider upgrading via the '/home/db/.virtualenvs/LR/bin/python3 -m pip install --upgrade pip' command.[0m


In [16]:
import tensorflow_model_optimization as tfmot

prune_low_magnitude = tfmot.sparsity.keras.prune_low_magnitude

# Compute end step to finish pruning after 2 epochs.
batch_size = 128
epochs = 2
validation_split = 0.1 # 10% of training set will be used for validation set. 

num_images = train_images.shape[0] * (1 - validation_split)
end_step = np.ceil(num_images / batch_size).astype(np.int32) * epochs

# Define model for pruning.
pruning_params = {
      'pruning_schedule': tfmot.sparsity.keras.PolynomialDecay(initial_sparsity=0.50,
                                                               final_sparsity=0.80,
                                                               begin_step=0,
                                                               end_step=end_step)
}

model_for_pruning = prune_low_magnitude(model, **pruning_params)

# `prune_low_magnitude` requires a recompile.
model_for_pruning.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

model_for_pruning.summary()

Instructions for updating:
Please use `layer.add_weight` method instead.
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
prune_low_magnitude_reshape  (None, 28, 28, 1)         1         
_________________________________________________________________
prune_low_magnitude_conv2d ( (None, 26, 26, 12)        230       
_________________________________________________________________
prune_low_magnitude_max_pool (None, 13, 13, 12)        1         
_________________________________________________________________
prune_low_magnitude_flatten  (None, 2028)              1         
_________________________________________________________________
prune_low_magnitude_dense (P (None, 10)                40572     
Total params: 40,805
Trainable params: 20,410
Non-trainable params: 20,395
_________________________________________________________________


In [17]:
logdir = tempfile.mkdtemp()

callbacks = [
  tfmot.sparsity.keras.UpdatePruningStep(),
  tfmot.sparsity.keras.PruningSummaries(log_dir=logdir),
]
  
model_for_pruning.fit(train_images, train_labels,
                  batch_size=batch_size, epochs=epochs, validation_split=validation_split,
                  callbacks=callbacks)

Train on 54000 samples, validate on 6000 samples
Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7f84a18ba978>

In [18]:
_, model_for_pruning_accuracy = model_for_pruning.evaluate(
   test_images, test_labels, verbose=0)

print('Pruned test accuracy:', model_for_pruning_accuracy)

Pruned test accuracy: 0.9728


In [19]:
model_for_export = tfmot.sparsity.keras.strip_pruning(model_for_pruning)

pruned_keras_file = './3_digits_mnist_model_pruning.h5'
tf.keras.models.save_model(model_for_export, pruned_keras_file, include_optimizer=False)
print('Saved pruned Keras model to:', pruned_keras_file)

Saved pruned Keras model to: ./3_digits_mnist_model_pruning.h5


In [20]:
# This cell is usesd to find the Time to load model and then infer and Time to only infer
import time
data = test_images[0]
data = data.reshape((1, 28, 28))
def orig_model_infer_time():
  start_time_full = time.time()
  model = tf.keras.models.load_model('./3_digits_mnist_model_pruning.h5', custom_objects=None, compile=True)
  start_time_infer = time.time()
  model.predict(data)
  results = {'Time to load model and then infer': (time.time() - start_time_full)}
  results1 = {'Time to only infer': (time.time() - start_time_infer) }
  print (results)
  print (results1)
    
orig_model_infer_time()

{'Time to load model and then infer': 0.13429832458496094}
{'Time to only infer': 0.07329654693603516}


In [21]:
model_for_export = tfmot.sparsity.keras.strip_pruning(model_for_pruning)

tf.keras.models.save_model(model_for_export, '3_digits_mnist_model_pruning.h5', include_optimizer=False)

### Convert CNN2 (1_digits_mnist_model.h5) to TF Lite (4_digits_mnist_model_tflite.tflite)

In [22]:
# Load trained .h5 model
model = tf.keras.models.load_model('./1_digits_mnist_model.h5')

In [23]:
converter = tf.lite.TFLiteConverter.from_keras_model(model)
tflite_model_file = converter.convert()

In [24]:
tflite_models_dir = pathlib.Path("digits_mnist_tflite_models/")
tflite_models_dir.mkdir(exist_ok=True, parents=True)
tflite_model_file_dir = tflite_models_dir/"4_digits_mnist_model_tflite.tflite"
tflite_model_file_dir.write_bytes(tflite_model_file)

83280

### Convert CNN2 (1_digits_mnist_model.h5) to Integer with float fallback Quantized version (5_digits_mnist_Integer_float_model.tflite)

In [25]:
converter.optimizations = [tf.lite.Optimize.DEFAULT]

In [26]:
mnist_train, _ = tf.keras.datasets.mnist.load_data()
images = tf.cast(mnist_train[0], tf.float32) / 255.0
mnist_ds = tf.data.Dataset.from_tensor_slices((images)).batch(1)
def representative_data_gen():
  for input_value in mnist_ds.take(100):
    yield [input_value]

converter.representative_dataset = representative_data_gen

In [27]:
tflite_model_quant = converter.convert()
tflite_model_quant_file = tflite_models_dir/"5_digits_mnist_Integer_float_model.tflite"
tflite_model_quant_file.write_bytes(tflite_model_quant)

23208

### Convert CNN2 (1_digits_mnist_model.h5) to Float Quantized version (6_digits_mnist_float16_model.tflite)

In [28]:
converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.target_spec.supported_types = [tf.float16]

In [29]:
tflite_fp16_model = converter.convert()
tflite_model_fp16_file = tflite_models_dir/"6_digits_mnist_float16_model.tflite"
tflite_model_fp16_file.write_bytes(tflite_fp16_model)

42972

### Convert CNN2 (1_digits_mnist_model.h5) to Integer Only Quantized version (7_digits_mnist_Integer_model.tflite)

In [30]:
converter = tf.lite.TFLiteConverter.from_keras_model(model)
tflite_model = converter.convert()

In [31]:
converter.optimizations = [tf.lite.Optimize.DEFAULT]

In [32]:
mnist_train, _ = tf.keras.datasets.mnist.load_data()
images = tf.cast(mnist_train[0], tf.float32) / 255.0
mnist_ds = tf.data.Dataset.from_tensor_slices((images)).batch(1)
def representative_data_gen():
  for input_value in mnist_ds.take(100):
    yield [input_value]

converter.representative_dataset = representative_data_gen

In [33]:
converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
converter.inference_input_type = tf.int8  # or tf.uint8
converter.inference_output_type = tf.int8  # or tf.uint8
tflite_int_quant_model = converter.convert()
tflite_model_integeronly_file = tflite_models_dir/"7_digits_mnist_Integer_model.tflite"
tflite_model_integeronly_file.write_bytes(tflite_int_quant_model)

23208

### Evaluate Post training Quantized versions of CNN2

In [34]:
# Evaluate the mode
mnist = tf.keras.datasets.mnist
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()
def evaluate_model(interpreter):
  start_time = time.time()

  input_index = interpreter.get_input_details()[0]["index"]
  output_index = interpreter.get_output_details()[0]["index"]

  # Run predictions on every image in the "test" dataset.
  prediction_digits = []
  for test_image in test_images:
    # Pre-processing: add batch dimension and convert to float32 to match with
    # the model's input data format.
    test_image = np.expand_dims(test_image, axis=0).astype(np.float32)
    interpreter.set_tensor(input_index, test_image)
  
    # Run inference.
    interpreter.invoke()

    # Post-processing: remove batch dimension and find the digit with highest
    # probability.
    output = interpreter.tensor(output_index)
    digit = np.argmax(output()[0])
    prediction_digits.append(digit)

  # Compare prediction results with ground truth labels to calculate accuracy.
  accurate_count = 0
  for index in range(len(prediction_digits)):
    if prediction_digits[index] == test_labels[index]:
      accurate_count += 1
  accuracy = accurate_count * 1.0 / len(prediction_digits)

  results = {'time': (time.time() - start_time),
             'accuracy': accuracy}

 
  # Loading Test Image
  test_img = np.expand_dims(test_images[0], axis=0).astype(np.float32)

  interpreter.set_tensor(input_index, test_img)
  start_time_infer = 0
  start_time_infer = time.time()
  interpreter.invoke()

  predictions = interpreter.get_tensor(output_index)

  result1 = {"Time to only Infer" : (time.time() - start_time_infer),
            "Time to load Quantized model and Infer ": (time.time() - start_time)}
  
  return results, result1

### CNN2: Integer with float fallback quantized model's accuracy and inference time

In [35]:
import time
tflite_model_file = tflite_models_dir/'5_digits_mnist_Integer_float_model.tflite'
interpreter = tf.lite.Interpreter(model_path=str(tflite_model_file))
interpreter.allocate_tensors()

In [36]:
evaluate_model(interpreter)

({'time': 1.6143968105316162, 'accuracy': 0.9539},
 {'Time to only Infer': 0.00014972686767578125,
  'Time to load Quantized model and Infer ': 1.6145589351654053})

### CNN2: Float quantized model's accuracy and inference time

In [37]:
tflite_float_model_file = tflite_models_dir/'6_digits_mnist_float16_model.tflite'
interpreter_float = tf.lite.Interpreter(model_path=str(tflite_float_model_file))
interpreter_float.allocate_tensors()

In [38]:
evaluate_model(interpreter_float)

({'time': 1.0327961444854736, 'accuracy': 0.7017},
 {'Time to only Infer': 0.00010633468627929688,
  'Time to load Quantized model and Infer ': 1.0329153537750244})

### CNN2: Integer Only Quantized Model's Accuracy and Inference Time

In [40]:
tflite_int_model_file = tflite_models_dir/'7_digits_mnist_Integer_model.tflite'
interpreter_int = tf.lite.Interpreter(model_path=str(tflite_int_model_file))
interpreter_int.allocate_tensors()

In [41]:
evaluate_model(interpreter_int)

({'time': 1.638054370880127, 'accuracy': 0.9539},
 {'Time to only Infer': 0.0001494884490966797,
  'Time to load Quantized model and Infer ': 1.6382172107696533})

## Joint model optimization of CNN2 - applying both pre + post training optimizations

### Convert pruned CNN2 (3_digits_mnist_model_pruning.h5) into TFlite (8_digits_mnist_model_pruning.tflite)

In [43]:
  model = tf.keras.models.load_model('./3_digits_mnist_model_pruning.h5', custom_objects=None, compile=True)



In [44]:
import tensorflow_model_optimization as tfmot
model_for_export = tfmot.sparsity.keras.strip_pruning(model)
converter = tf.lite.TFLiteConverter.from_keras_model(model_for_export)
pruned_tflite_model = converter.convert()

pruned_tflite_file = tflite_models_dir/'8_digits_mnist_model_pruning.tflite'

with open(pruned_tflite_file, 'wb') as f:
  f.write(pruned_tflite_model)

print('Saved pruned TFLite model to:', pruned_tflite_file)

Saved pruned TFLite model to: 8_digits_mnist_model_pruning.tflite


In [45]:
tflite_pruning_model_file = '8_digits_mnist_model_pruning.tflite'
interpreter_pruning = tf.lite.Interpreter(model_path=str(tflite_pruning_model_file))
interpreter_pruning.allocate_tensors()

In [46]:
evaluate_model(interpreter_pruning)

({'time': 0.9892525672912598, 'accuracy': 0.7401},
 {'Time to only Infer': 9.942054748535156e-05,
  'Time to load Quantized model and Infer ': 0.9893736839294434})

### Convert Pruned CNN (3_digits_mnist_model_pruning.h5) to Int with float fallback quantized version (9_digits_mnist_model_pruning_int_with_float.tflite)

In [47]:
converter = tf.lite.TFLiteConverter.from_keras_model(model_for_export)
converter.optimizations = [tf.lite.Optimize.DEFAULT]
quantized_and_pruned_tflite_model = converter.convert()

quantized_and_pruned_tflite_file = tflite_models_dir/'9_digits_mnist_model_pruning_int_with_float.tflite'

with open(quantized_and_pruned_tflite_file, 'wb') as f:
  f.write(quantized_and_pruned_tflite_model)

print('Saved quantized and pruned TFLite model to:', quantized_and_pruned_tflite_file)

Saved quantized and pruned TFLite model to: 9_digits_mnist_model_pruning_int_with_float.tflite


In [48]:
tflite_pruning_10X_model_file = '9_digits_mnist_model_pruning_int_with_float.tflite'
interpreter_pruning_10X = tf.lite.Interpreter(model_path=str(tflite_pruning_10X_model_file))
interpreter_pruning_10X.allocate_tensors()

In [49]:
evaluate_model(interpreter_pruning_10X)

({'time': 1.088677167892456, 'accuracy': 0.7396},
 {'Time to only Infer': 0.00011301040649414062,
  'Time to load Quantized model and Infer ': 1.0888197422027588})

### Convert Pruned CNN (3_digits_mnist_model_pruning.h5) to Float 16 Quantized version (10_digits_mnist_float16_purning_model.tflite) ###  Purning

In [50]:
converter = tf.lite.TFLiteConverter.from_keras_model(model_for_export)
converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.target_spec.supported_types = [tf.float16]

In [51]:
tflite_fp16_purning_model = converter.convert()
tflite_model_fp16_purning_file = tflite_models_dir/"10_digits_mnist_float16_purning_model.tflite"
tflite_model_fp16_purning_file.write_bytes(tflite_fp16_purning_model)

42972

In [52]:
tflite_float16_purning_model_file = tflite_models_dir/'10_digits_mnist_float16_purning_model.tflite'
interpreter_pruning_float16_purning_model = tf.lite.Interpreter(model_path=str(tflite_float16_purning_model_file))
interpreter_pruning_float16_purning_model.allocate_tensors()
evaluate_model(interpreter_pruning_float16_purning_model)

({'time': 1.0293407440185547, 'accuracy': 0.7402},
 {'Time to only Infer': 0.00011658668518066406,
  'Time to load Quantized model and Infer ': 1.029484748840332})

### Convert Pruned CNN (3_digits_mnist_model_pruning.h5) to Int only Quantized version (11_digits_mnist_Integer_purning_model.tflite)

In [53]:
converter = tf.lite.TFLiteConverter.from_keras_model(model_for_export)
converter.optimizations = [tf.lite.Optimize.DEFAULT]

In [54]:
mnist_train, _ = tf.keras.datasets.mnist.load_data()
images = tf.cast(mnist_train[0], tf.float32) / 255.0
mnist_ds = tf.data.Dataset.from_tensor_slices((images)).batch(1)
def representative_data_gen():
  for input_value in mnist_ds.take(100):
    yield [input_value]

converter.representative_dataset = representative_data_gen

In [55]:
converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
converter.inference_input_type = tf.int8  # or tf.uint8
converter.inference_output_type = tf.int8  # or tf.uint8
tflite_int_purning_quant_model = converter.convert()
tflite_model_integeronly_purning_file = tflite_models_dir/"11_digits_mnist_Integer_purning_model.tflite"
tflite_model_integeronly_purning_file.write_bytes(tflite_int_purning_quant_model)

23208

In [56]:
tflite_intonly_purning_model_file = tflite_models_dir/'11_digits_mnist_Integer_purning_model.tflite'
interpreter_intonly_purning_model = tf.lite.Interpreter(model_path=str(tflite_intonly_purning_model_file))
interpreter_intonly_purning_model.allocate_tensors()
evaluate_model(interpreter_intonly_purning_model)

({'time': 1.5820226669311523, 'accuracy': 0.947},
 {'Time to only Infer': 0.00015592575073242188,
  'Time to load Quantized model and Infer ': 1.5821983814239502})

### Create TFlite version (12_digits_mnist_model_qaware.tflite) of qaware trained CNN2 (2_digits_mnist_model_qaware.h5)

In [57]:
# model = tf.keras.models.load_model('./digits_mnist_model_qaware.h5', custom_objects=None, compile=True)
# Cannot load a saved qaware .h5

converter = tf.lite.TFLiteConverter.from_keras_model(q_aware_model)
converter.optimizations = [tf.lite.Optimize.DEFAULT]

quantized_tflite_model = converter.convert()

quantized_tflite_file = tflite_models_dir/'12_digits_mnist_model_qaware.tflite'

with open(quantized_tflite_file, 'wb') as f:
  f.write(quantized_tflite_model)

print('Saved qaware trained TFLite model to:', quantized_tflite_file)

Saved qaware trained TFLite model to: 12_digits_mnist_model_qaware.tflite


In [58]:
import time
interpreter = tf.lite.Interpreter(model_content=quantized_tflite_model)
interpreter.allocate_tensors()

In [59]:
evaluate_model(interpreter)

({'time': 1.4810619354248047, 'accuracy': 0.1587},
 {'Time to only Infer': 0.0002460479736328125,
  'Time to load Quantized model and Infer ': 1.4814565181732178})

### Convert qaware trained CNN (2_digits_mnist_model_qaware.h5) to Int with float quantized version (13_digits_mnist_Int_float_qaware_model.tflite) 

In [60]:
converter = tf.lite.TFLiteConverter.from_keras_model(q_aware_model)
converter.optimizations = [tf.lite.Optimize.DEFAULT]
mnist_train, _ = tf.keras.datasets.mnist.load_data()
images = tf.cast(mnist_train[0], tf.float32) / 255.0
mnist_ds = tf.data.Dataset.from_tensor_slices((images)).batch(1)
def representative_data_gen():
  for input_value in mnist_ds.take(100):
    yield [input_value]

converter.representative_dataset = representative_data_gen

In [61]:
tflite_model_qaware_int_float = converter.convert()
tflite_model_quant_file = tflite_models_dir/"13_digits_mnist_Int_float_qaware_model.tflite"
tflite_model_quant_file.write_bytes(tflite_model_qaware_int_float)

24064

In [62]:
tflite_qaware_model_file = tflite_models_dir/'13_digits_mnist_Int_float_qaware_model.tflite'
interpreter_tflite_qaware = tf.lite.Interpreter(model_path=str(tflite_qaware_model_file))
interpreter_tflite_qaware.allocate_tensors()
evaluate_model(interpreter_tflite_qaware)

({'time': 2.072984457015991, 'accuracy': 0.9275},
 {'Time to only Infer': 0.00019884109497070312,
  'Time to load Quantized model and Infer ': 2.0732014179229736})

### Convert qaware trained CNN (2_digits_mnist_model_qaware.h5) to Float16 quantized version (14_digits_mnist_float16_qaware.tflite)

In [63]:
converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.target_spec.supported_types = [tf.float16]
tflite_fp16_model = converter.convert()
tflite_model_fp16_file = tflite_models_dir/"14_digits_mnist_float16_qaware.tflite"
tflite_model_fp16_file.write_bytes(tflite_fp16_model)

43568

In [64]:
tflite_qaware_float16_model_file = tflite_models_dir/'14_digits_mnist_float16_qaware.tflite'
interpreter_tflite_qaware_f16 = tf.lite.Interpreter(model_path=str(tflite_qaware_float16_model_file))
interpreter_tflite_qaware_f16.allocate_tensors()
evaluate_model(interpreter_tflite_qaware_f16)

({'time': 1.4069643020629883, 'accuracy': 0.1584},
 {'Time to only Infer': 0.00014066696166992188,
  'Time to load Quantized model and Infer ': 1.4071266651153564})

### Int only quantization of Quantization-aware trained CNN2 - NA Quantization not yet supported for op: FAKE_QUANT

In [65]:
converter = tf.lite.TFLiteConverter.from_keras_model(q_aware_model)
tflite_model = converter.convert()
converter.optimizations = [tf.lite.Optimize.DEFAULT]
mnist_train, _ = tf.keras.datasets.mnist.load_data()
images = tf.cast(mnist_train[0], tf.float32) / 255.0
mnist_ds = tf.data.Dataset.from_tensor_slices((images)).batch(1)
def representative_data_gen():
  for input_value in mnist_ds.take(100):
    yield [input_value]

converter.representative_dataset = representative_data_gen
converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
converter.inference_input_type = tf.int8  # or tf.uint8
converter.inference_output_type = tf.int8  # or tf.uint8
tflite_int_quant_model = converter.convert()
tflite_model_integeronly_file = tflite_models_dir/"digits_mnist_int_qaware.tflite"
tflite_model_integeronly_file.write_bytes(tflite_int_quant_model)

RuntimeError: Quantization not yet supported for op: FAKE_QUANT