In [3]:
import sys
from matplotlib import pyplot
from tensorflow.keras.datasets import cifar10
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import SeparableConv2D
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import MaxPooling2D
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Flatten
from tensorflow.keras.optimizers import SGD
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import os
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.image import ImageDataGenerator

Jupyter notebook to build, train, and deploy CNN models for TF Lite

In [2]:
# Notebook configuration. Every tunable value used by the cells below lives here.

# Name of the output folder and stem of every artifact written into it
# (saved model .h5, learning-curve .png, .tflite files).
filename = "tiny_vgg_q" # Set the name for the model output

# When True the baseline training run uses an EarlyStopping callback.
early_stop_training = True # Early stop training

#___________________________
# Baseline (pre-quantization) training settings.
training_batch_size = 20
training_epochs = 100
training_patience = 10  # EarlyStopping patience for the baseline run

#___________________________
# True: fine-tune with weight quantization applied during training.
# False: quantize the trained weights once (post-training quantization).
quantize_aware_training = True
quantize_training_epochs = 20
quantize_training_patience = 5  # EarlyStopping patience for the quantized run

# Bit widths of the custom floating-point format passed to quantize_model().
target_exponent = 5 # Target exponent bit size for custom floating-point
target_mantissa = 2 # Target mantissa bit size for custom floating-point

In [3]:
# Optional Google Colab support: when enabled, Google Drive is mounted at
# /content/gdrive. The import is kept inside the branch because
# google.colab is only needed when the flag is set.
google_colab = False # Set True when using on google colab
if google_colab:
  from google.colab import drive
  drive.mount('/content/gdrive')

CNN ARCHITECTURE:

In [4]:
# define cnn model
def define_model_CNN(input_shape=(32, 32, 3), num_classes=10):
	"""Build and compile the small VGG-style CNN used by this notebook.

	The network stacks three identical stages (3x3 Conv2D with ReLU ->
	BatchNormalization -> 2x2 MaxPooling -> Dropout(0.3)) with 40, 60 and 120
	filters, followed by Flatten, Dense(120, ReLU), Dropout(0.5) and a softmax
	output layer.

	Args:
		input_shape: Shape of one input sample, passed to the first Conv2D
			layer. Defaults to (32, 32, 3), the value previously hard-coded.
		num_classes: Number of units of the softmax output layer. Defaults
			to 10, the value previously hard-coded.

	Returns:
		A Sequential model compiled with the "adam" optimizer, categorical
		cross-entropy loss and the accuracy metric.
	"""
	model = Sequential()

	for stage_index, filters in enumerate((40, 60, 120)):
		# Only the very first layer of a Sequential model declares the input shape.
		extra_kwargs = {'input_shape': input_shape} if stage_index == 0 else {}
		model.add(Conv2D(filters, (3, 3), activation='relu', kernel_initializer='he_uniform', padding='same', **extra_kwargs))
		model.add(BatchNormalization())
		model.add(MaxPooling2D((2, 2)))
		model.add(Dropout(0.3))

	model.add(Flatten())
	model.add(Dense(120, activation='relu', kernel_initializer='he_uniform'))
	model.add(Dropout(0.5))
	model.add(Dense(num_classes, activation='softmax'))
	# compile model
	# Alternative optimizer that was tried: SGD(learning_rate=0.001, momentum=0.9)
	opt = "adam"
	model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
	return model

Quantization-aware training method.

In [5]:
import struct

def bin2float(b):
    '''Interpret a string of binary digits as a big-endian 64-bit double.

    Attributes:
        :b: String of '0'/'1' characters; its integer value must fit in 8 bytes.
    '''
    as_integer = int(b, 2)
    raw_bytes = as_integer.to_bytes(8, byteorder="big")
    (value,) = struct.unpack('>d', raw_bytes)
    return value


def float2bin(f):
    '''Return the big-endian 64-bit double encoding of a number as a bit string.

    Attributes:
        :f: Number to encode (anything ``struct.pack('>d', ...)`` accepts).
    '''
    packed = struct.pack(">d", f)
    as_integer = int.from_bytes(packed, byteorder="big")
    # Always 64 characters: zero-padded on the left.
    return format(as_integer, '064b')

In [6]:
def quantize_float(float_number, exponent_bits, mantissa_bits):
    """Round a number to a custom low-precision floating-point format.

    The value is taken apart through its 64-bit double encoding: the 52-bit
    fraction is rounded to ``mantissa_bits`` bits (ties and above round the
    magnitude up) and the unbiased exponent is limited to ``[-R, R]`` with
    ``R = 2 ** (exponent_bits - 1) - 1`` (one exponent bit is spent on the
    exponent's sign).

    Args:
        float_number: Value to quantize; anything ``struct.pack('>d', ...)``
            accepts.
        exponent_bits: Exponent width of the target format. Values below 1
            behave like 1 (``R = 0``); ``R`` is capped at 1023.
        mantissa_bits: Fraction width of the target format, clamped to 0..52.

    Returns:
        float: the quantized value.
        * Zero, subnormals and magnitudes whose exponent is below ``-R``
          are flushed to ``0.0``.
        * Magnitudes above the largest representable value saturate to
          ``+/-(2 - 2 ** -mantissa_bits) * 2 ** R``. This includes values that
          only exceed it after the mantissa round-up carries into the
          exponent (previously those escaped as ``2 ** (R + 1)``).
    """
    exponent_sign = 1  # exponent bits reserved for the sign of the exponent

    # Clamp the field widths to what a 64-bit double can provide.
    mantissa_bits = min(max(mantissa_bits, 0), 52)
    exponent_bits = max(exponent_bits, 0)

    # Split the encoding into sign / biased exponent / 52-bit fraction.
    (encoded,) = struct.unpack(">Q", struct.pack(">d", float_number))
    sign = encoded >> 63
    biased_exponent = (encoded >> 52) & 0x7FF
    fraction = encoded & ((1 << 52) - 1)
    exponent = biased_exponent - 1023

    exponent_full_range = min(pow(2, max(exponent_bits - exponent_sign, 0)) - 1, 1023)
    signed_one = -1.0 if sign else 1.0
    saturated_value = signed_one * (2.0 - 2.0 ** -mantissa_bits) * 2.0 ** exponent_full_range

    # biased_exponent == 0 covers +/-0 and subnormals, which have no implicit
    # leading one and therefore cannot go through the normal path below.
    if biased_exponent == 0 or exponent < -exponent_full_range:
        return 0.0
    if exponent > exponent_full_range:
        return saturated_value

    # Keep the top mantissa_bits of the fraction; the rest decides the rounding.
    residual_bits = 52 - mantissa_bits
    custom_mantissa = fraction >> residual_bits
    residual_mantissa = fraction & ((1 << residual_bits) - 1)
    if residual_bits > 0 and residual_mantissa >= (1 << (residual_bits - 1)):
        custom_mantissa += 1
        if custom_mantissa == (1 << mantissa_bits):
            # Mantissa overflowed: carry into the exponent.
            custom_mantissa = 0
            exponent += 1
            if exponent > exponent_full_range:
                return saturated_value

    return signed_one * (1 + custom_mantissa * 2.0 ** -mantissa_bits) * 2.0 ** exponent

def quantize_model(model, exponent_bits, mantissa_bits):
  """Quantize, in place, the weights of every Conv2D layer of a model.

  Only layers that are instances of ``tf.keras.layers.Conv2D`` and expose
  exactly two weight arrays (filters and bias) are touched. When the filter
  array is 4-D, every filter coefficient and every bias value is replaced by
  ``quantize_float(value, exponent_bits, mantissa_bits)``; the arrays are then
  written back with ``layer.set_weights``.

  Args:
    model: Object with a ``layers`` sequence (a Keras model).
    exponent_bits: Exponent width forwarded to ``quantize_float``.
    mantissa_bits: Mantissa width forwarded to ``quantize_float``.
  """
  for layer in model.layers:
    if not isinstance(layer, tf.keras.layers.Conv2D):
      continue
    layer_weight = layer.get_weights()
    if len(layer_weight) != 2:
      continue
    filter_matrix, bias_matrix = layer_weight
    if filter_matrix.ndim == 4:
      for id_i, plane in enumerate(filter_matrix):
        for id_j, row in enumerate(plane):
          for id_k, taps in enumerate(row):
            for id_l, weight in enumerate(taps):
              filter_matrix[id_i, id_j, id_k, id_l] = quantize_float(weight, exponent_bits, mantissa_bits)
      # Each bias is quantized exactly once; this used to be repeated inside
      # the innermost filter loop for every filter coefficient.
      for id_b, bias in enumerate(bias_matrix):
        bias_matrix[id_b] = quantize_float(bias, exponent_bits, mantissa_bits)
    layer.set_weights([filter_matrix, bias_matrix])

Early-stopping callback.

In [7]:
# Default early-stopping callback: watches the validation loss and restores the
# best weights when training stops.
# NOTE(review): patience is hard-coded to 10 here rather than taken from
# training_patience; the training cells further down rebind `monitor` with the
# configured patience before using it — confirm whether this cell is still needed.
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=10, verbose=1, mode='auto', restore_best_weights=True)

In [8]:
# Bit widths of the custom floating-point format; read by MyCallback and by the
# quantization cell further down.
mantissa_bits = target_mantissa
exponent_bits = target_exponent
class MyCallback(tf.keras.callbacks.Callback):
  """Keras callback that re-quantizes the model's Conv2D weights during training.

  quantize_model() is applied to ``self.model`` after every batch, after every
  epoch and once more when training ends, using the module-level
  ``exponent_bits`` / ``mantissa_bits`` values at call time.
  """
  # `logs` defaults to None instead of a shared mutable `{}`; it is not used.
  def on_epoch_end(self, epoch, logs=None):
    quantize_model(self.model, exponent_bits, mantissa_bits)
  def on_train_end(self, logs=None):
    quantize_model(self.model, exponent_bits, mantissa_bits)
  # The first argument keeps its original name `epoch`.
  # NOTE(review): for a batch hook this is presumably the batch index — confirm.
  def on_batch_end(self, epoch, logs=None):
    quantize_model(self.model, exponent_bits, mantissa_bits)


In [7]:
# Create the output folder if needed. exist_ok avoids the check-then-create
# race and makes the cell safe to re-run.
os.makedirs(filename, exist_ok=True)

# load train and test dataset
def load_dataset():
	"""Load CIFAR-10 and one-hot encode its labels.

	Returns:
		(trainX, trainY, testX, testY): images exactly as returned by
		cifar10.load_data(), labels passed through to_categorical().
	"""
	train_split, test_split = cifar10.load_data()
	train_images, train_labels = train_split
	test_images, test_labels = test_split
	# one hot encode target values
	train_targets = to_categorical(train_labels)
	test_targets = to_categorical(test_labels)
	return train_images, train_targets, test_images, test_targets

# scale pixels
def prep_pixels(train, test):
	"""Convert both image arrays to float32 and divide them by 255.

	Args:
		train: Array supporting ``astype`` (training images).
		test: Array supporting ``astype`` (test images).

	Returns:
		(train_norm, test_norm): the scaled float32 copies, in that order.
	"""
	scaled = [pixels.astype('float32') / 255.0 for pixels in (train, test)]
	train_norm, test_norm = scaled
	return train_norm, test_norm

# plot diagnostic learning curves
def summarize_diagnostics(history):
	"""Plot the loss and accuracy learning curves and save them as a PNG.

	Two stacked panels are drawn on a 10 x 20 figure: cross-entropy loss on
	top, classification accuracy below, each with the training curve in blue
	and the validation curve in orange. The figure is written to
	``<filename>/<filename>_plot.png`` and then closed.

	Args:
		history: Object whose ``history`` dict holds the 'loss', 'val_loss',
			'accuracy' and 'val_accuracy' series (as returned by model.fit).
	"""
	f = pyplot.figure()
	f.set_figwidth(10)
	f.set_figheight(20)

	# (subplot position, title, training series, validation series)
	panels = (
		(211, 'Cross Entropy Loss', 'loss', 'val_loss'),
		(212, 'Classification Accuracy', 'accuracy', 'val_accuracy'),
	)
	for position, title, train_key, test_key in panels:
		pyplot.subplot(position)
		pyplot.title(title)
		pyplot.plot(history.history[train_key], color='blue', label='train')
		pyplot.plot(history.history[test_key], color='orange', label='test')
		# The curves carry labels, so show them; previously no legend was drawn.
		pyplot.legend()
	# save plot to file
	pyplot.savefig(filename + '/' + filename + '_plot.png')
	pyplot.close()

# run the test harness for evaluating a model
def run_test_harness():
	"""All-in-one run: load data, train with weight quantization, evaluate, plot, save.

	The model is fitted for 20 epochs with batch size 64 and the MyCallback
	quantization callback, evaluated on the test split (accuracy is printed
	as a percentage), its learning curves are saved and the model is written
	to ``<filename>/<filename>.h5``.
	"""
	# load and scale the data
	train_images, train_targets, test_images, test_targets = load_dataset()
	train_images, test_images = prep_pixels(train_images, test_images)
	# build and fit the network
	network = define_model_CNN()
	fit_history = network.fit(train_images, train_targets, epochs=20, batch_size=64, validation_data=(test_images, test_targets), verbose=1, callbacks=[MyCallback()])
	# evaluate on the test split
	_loss, accuracy = network.evaluate(test_images, test_targets, verbose=0)
	print('> %.3f' % (accuracy * 100.0))
	# learning curves
	summarize_diagnostics(fit_history)
	# Save the entire model to a HDF5 file.
	network.save(filename + "/" + filename + '.h5')

In [8]:
# Entry point of the step-by-step workflow: the same steps as
# run_test_harness(), but spread over several cells.
# Load CIFAR-10 with one-hot encoded labels.
trainX, trainY, testX, testY = load_dataset()

# Scale pixel values to float32 in the range 0-1.
trainX, testX = prep_pixels(trainX, testX)

# Build and compile the CNN; requires the define_model_CNN cell to have been run.
model = define_model_CNN()
# The model is fitted in the next cell.

NameError: name 'define_model_CNN' is not defined

In [11]:
# Baseline training. With early stopping enabled, the validation loss is
# monitored and the best weights are restored when training stops.
training_callbacks = None
if early_stop_training:
    monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=training_patience, verbose=1, mode='auto', restore_best_weights=True)
    training_callbacks = [monitor]
history = model.fit(trainX, trainY, epochs=training_epochs, batch_size=training_batch_size, validation_data=(testX, testY), verbose=1, callbacks=training_callbacks)

# Accuracy of the unquantized model on the test split.
test_metrics = model.evaluate(testX, testY, verbose=0)
_, acc = test_metrics
print('Pre-quantization training > %.3f' % (acc * 100.0))

2021-11-10 18:28:28.120107: W tensorflow/core/framework/cpu_allocator_impl.cc:81] Allocation of 614400000 exceeds 10% of free system memory.


Epoch 1/20

2021-11-10 18:33:26.449827: W tensorflow/core/framework/cpu_allocator_impl.cc:81] Allocation of 122880000 exceeds 10% of free system memory.


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


2021-11-10 20:10:52.910363: W tensorflow/core/framework/cpu_allocator_impl.cc:81] Allocation of 122880000 exceeds 10% of free system memory.


> 80.730


In [31]:
# Save the loss/accuracy learning curves of the most recent fit to
# <filename>/<filename>_plot.png.
summarize_diagnostics(history)

In [23]:
# Quantization step, selected by the quantize_aware_training flag:
#  - False: quantize the trained Conv2D weights once and evaluate.
#  - True:  continue training with MyCallback re-quantizing the weights after
#           every batch/epoch, with early stopping on the validation loss.
if not quantize_aware_training:
    print ("_______ Post-training quantization _______")
    quantize_model(model, exponent_bits, mantissa_bits)
    # evaluate model
    _, acc = model.evaluate(testX, testY, verbose=0)
    print('Post-training quantization > %.3f' % (acc * 100.0))
else:
    print ("_______ Quantize aware training _______")
    # Separate patience/epoch budget from the baseline run; batch size is shared.
    monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=quantize_training_patience, verbose=1, mode='auto', restore_best_weights=True)
    history = model.fit(trainX, trainY, epochs=quantize_training_epochs, batch_size=training_batch_size, validation_data=(testX, testY), verbose=1, callbacks=[MyCallback(), monitor])
    # evaluate model
    _, acc = model.evaluate(testX, testY, verbose=0)
    print('Quantize aware training > %.3f' % (acc * 100.0))

_______ Post-training quantization _______
Post-training quantization > 80.090


In [24]:
# Save the entire (quantized) model to <filename>/<filename>.h5 in HDF5 format;
# the evaluation and TF Lite conversion cells below reload it from there.
model.save(filename + "/" + filename + '.h5')

_______ LOAD CNN MODEL FOR EVALUATION AND CONVERSION TO TF LITE FLOATING-POINT AND FIXED-POINT _______

In [4]:
# Single-image sanity check of the saved Keras model.
from tensorflow.keras.preprocessing import image
from tensorflow.keras.models import load_model
import numpy as np

# NOTE(review): class_names is defined here but not used in the visible cells;
# presumably the labels follow the CIFAR-10 class order — confirm.
class_names = ['airplane', 'automobile', 'bird', 'cat', 'deer',
               'dog', 'frog', 'horse', 'ship', 'truck']

# Load the test picture resized to the 32x32 network input.
img_path = "dog.jpg"
img = image.load_img(img_path, target_size=(32, 32))
x = image.img_to_array(img)
# Add a leading batch axis and apply the same float32 / 255 scaling as prep_pixels().
x = np.expand_dims(x, axis=0)
x = x.astype('float32')
x = x / 255.0

# Reload the model written by the save cell.
model = load_model(filename + "/" + filename + '.h5')

output_data = model.predict(x)

# Prints the raw output vector of the softmax layer.
print('TensorFlow Predicted:', output_data)

print("Done!")

2021-11-11 01:16:18.929698: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcuda.so.1
2021-11-11 01:16:18.966076: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-11 01:16:18.967451: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1716] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: GeForce GTX 860M computeCapability: 5.0
coreClock: 1.0195GHz coreCount: 5 deviceMemorySize: 1.96GiB deviceMemoryBandwidth: 74.65GiB/s
2021-11-11 01:16:18.967618: W tensorflow/stream_executor/platform/default/dso_loader.cc:59] Could not load dynamic library 'libcudart.so.10.1'; dlerror: libcudart.so.10.1: cannot open shared object file: No such file or directory
2021-11-11 01:16:18.968269: W tensorflow/stream_executor/platform/default/dso_loader.cc:59] Could not load dynamic library 'libc

TensorFlow Predicted: [[1.5144495e-05 3.9155074e-07 2.9066799e-03 4.8440439e-03 8.9457666e-04
  9.8720443e-01 2.0172596e-03 2.1131528e-03 2.4682145e-06 1.7672298e-06]]
Done!


In [5]:
def print_model(model):
  """Print the filter and bias arrays of each Conv2D layer of ``model``.

  Layers that are not ``tf.keras.layers.Conv2D`` instances, or that do not
  expose exactly two weight arrays, are skipped.
  """
  conv_layers = (candidate for candidate in model.layers
                 if isinstance(candidate, tf.keras.layers.Conv2D))
  for conv_layer in conv_layers:
    weights = conv_layer.get_weights()
    if len(weights) != 2:
      continue
    filters, biases = weights[0], weights[1]
    print (filters)
    print (biases)

In [9]:
# Dump the Conv2D weights of the reloaded model for visual inspection, then
# evaluate it on the test split.
# Requires testX/testY from the data-loading cell to still be in the kernel.
print_model (model)
_, acc = model.evaluate(testX, testY, verbose=0)
print('> %.3f' % (acc * 100.0))

[[[[ 5.0000000e-01 -3.7500000e-01  7.5000000e-01 ... -2.1875000e-01
     5.0000000e-01  1.0937500e-01]
   [ 5.0000000e-01  2.1875000e-01  4.3750000e-01 ... -3.1250000e-02
     4.3750000e-01 -2.5000000e-01]
   [ 7.8125000e-02 -3.1250000e-01  3.1250000e-01 ...  2.1875000e-01
     3.7500000e-01 -1.0937500e-01]]

  [[ 1.5625000e-01  1.2500000e-01 -1.8750000e-01 ... -1.8750000e-01
     2.5000000e-01 -2.1875000e-01]
   [-2.1875000e-01  6.2500000e-01  3.1250000e-01 ... -4.3750000e-01
     3.1250000e-01  2.1875000e-01]
   [-3.1250000e-01 -5.0000000e-01 -2.5000000e-01 ...  6.2500000e-01
     1.5625000e-01  5.0000000e-01]]

  [[ 6.2500000e-01 -4.3750000e-01 -3.7500000e-01 ... -5.0000000e-01
    -1.0937500e-01 -6.2500000e-01]
   [ 6.2500000e-01  4.3750000e-01 -1.5625000e-01 ... -7.8125000e-02
    -2.5000000e-01  4.3750000e-01]
   [ 3.7500000e-01  1.2500000e-01  2.5000000e-01 ...  5.0000000e-01
    -1.5625000e-01  3.1250000e-01]]]


 [[[ 2.1875000e-01 -4.3750000e-01 -5.0000000e-01 ...  2.5000000e-

2021-11-11 01:18:52.367867: W tensorflow/core/framework/cpu_allocator_impl.cc:81] Allocation of 122880000 exceeds 10% of free system memory.


> 80.090


In [15]:
import tensorflow as tf
from tensorflow.keras import datasets, layers, models, utils

# Stop with an exception instead of exit(): inside a notebook exit() targets
# the kernel rather than cleanly aborting the cell.
if not os.path.exists(filename):
  raise FileNotFoundError("Folder " + filename + " does not exist.")

# Recreate the exact same model, including its weights and the optimizer
model = tf.keras.models.load_model(filename + "/" + filename + '.h5')

model.summary()

tf.keras.utils.plot_model(model, filename + "/" + filename + ".png", show_shapes=True, show_layer_names=True, expand_nested=True)

(train_images, train_labels), (test_images, test_labels) = datasets.cifar10.load_data()

# Normalize pixel values to be between 0 and 1.
# Convert to float32 before dividing: dividing the uint8 arrays directly
# creates float64 copies of the whole dataset (about 1.2 GB for the training set).
train_images = train_images.astype('float32') / 255.0
test_images = test_images.astype('float32') / 255.0

# Calibration data for the quantizing converter: only the first 100 training
# images are ever consumed, so only those are placed in the dataset.
cifar_ds = tf.data.Dataset.from_tensor_slices(train_images[:100]).batch(1)
def representative_dataset():
  """Yield 100 single-image batches, each wrapped in a one-element list."""
  for input_value in cifar_ds.take(100):
    # Model has only one input so each data point has one element.
    yield [input_value]

# Convert the model to a float32 TF Lite flatbuffer. The quantization options
# stay disabled here; they are enabled in the int8 conversion cell.
converter = tf.lite.TFLiteConverter.from_keras_model(model)
#converter.optimizations = [tf.lite.Optimize.DEFAULT]
#converter.representative_dataset = representative_dataset
#converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
converter.inference_input_type = tf.float32  # or tf.uint8
converter.inference_output_type = tf.float32  # or tf.uint8
tflite_model = converter.convert()

# Save the model.
with open(filename + "/" + filename + "_f32" + '.tflite', 'wb') as f:
  f.write(tflite_model)

print("Done!")

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 32, 32, 64)        1792      
_________________________________________________________________
batch_normalization (BatchNo (None, 32, 32, 64)        256       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 16, 16, 64)        0         
_________________________________________________________________
dropout (Dropout)            (None, 16, 16, 64)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 16, 16, 128)       73856     
_________________________________________________________________
batch_normalization_1 (Batch (None, 16, 16, 128)       512       
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 8, 8, 128)         0

2021-11-10 20:12:23.873494: W tensorflow/core/framework/cpu_allocator_impl.cc:81] Allocation of 1228800000 exceeds 10% of free system memory.


Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.


2021-11-10 20:13:42.265662: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: /tmp/tmps7xca_qf/assets


2021-11-10 20:13:50.813863: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-10 20:13:50.823142: I tensorflow/core/grappler/devices.cc:69] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0
2021-11-10 20:13:50.830882: I tensorflow/core/grappler/clusters/single_machine.cc:356] Starting new session
2021-11-10 20:13:50.984778: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-10 20:13:50.987750: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x55f81d689a70 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2021-11-10 20:13:50.987796: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): GeForce GT

Done!


2021-11-10 20:13:51.464548: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:313] Ignored output_format.
2021-11-10 20:13:51.464597: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:316] Ignored drop_control_dependency.
2021-11-10 20:13:51.526225: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-10 20:13:51.527096: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1716] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: GeForce GTX 860M computeCapability: 5.0
coreClock: 1.0195GHz coreCount: 5 deviceMemorySize: 1.96GiB deviceMemoryBandwidth: 74.65GiB/s
2021-11-10 20:13:51.527269: W tensorflow/stream_executor/platform/default/dso_loader.cc:59] Could not load dynamic library 'libcudart.so.10.1'; dlerror: libcudart.so.10.1: cannot open shared object file: No such file or directory
2021-11-10 

In [16]:
# Convert the model.
# Convert the model a second time, now with the converter's quantization
# options enabled. Relies on `model` and `representative_dataset` from the
# previous cell.
converter = tf.lite.TFLiteConverter.from_keras_model(model)
converter.optimizations = [tf.lite.Optimize.DEFAULT]
# Calibration samples (100 training images).
converter.representative_dataset = representative_dataset
# Restrict the converted graph to the TFLITE_BUILTINS_INT8 op set.
converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
# The model's input and output tensors stay float32.
converter.inference_input_type = tf.float32  # or tf.uint8
converter.inference_output_type = tf.float32  # or tf.uint8
tflite_model = converter.convert()

# Save the model as <filename>/<filename>_i8.tflite.
with open(filename + "/" + filename + "_i8" + '.tflite', 'wb') as f:
  f.write(tflite_model)

print("Done!")

INFO:tensorflow:Assets written to: /tmp/tmp7b_9ubnk/assets


INFO:tensorflow:Assets written to: /tmp/tmp7b_9ubnk/assets
2021-11-10 20:13:58.661770: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-10 20:13:58.662467: I tensorflow/core/grappler/devices.cc:69] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0
2021-11-10 20:13:58.664582: I tensorflow/core/grappler/clusters/single_machine.cc:356] Starting new session
2021-11-10 20:13:58.665490: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-10 20:13:58.666174: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1716] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: GeForce GTX 860M computeCapability: 5.0
coreClock: 1.0195GHz coreCount: 5 deviceMemorySize: 1.96GiB devi

Done!


In [17]:

# Load the "_i8" TFLite model written by the previous cell and allocate its tensors.
interpreter = tf.lite.Interpreter(model_path=filename + "/" + filename + "_i8" + '.tflite')
interpreter.allocate_tensors()

# Get input and output tensor descriptions; their 'index' entries are used by
# the next cell to feed the input and read the result.
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

In [18]:

# Run the TFLite model on `x`, the preprocessed dog.jpg batch built in the
# Keras prediction cell, so both predictions can be compared.
interpreter.set_tensor(input_details[0]['index'], x)
interpreter.invoke()
output_data = interpreter.get_tensor(output_details[0]['index'])
print('TensorFlow Lite Predicted:', output_data)

TensorFlow Lite Predicted: [[0.         0.         0.         0.00390625 0.         0.99609375
  0.         0.         0.         0.        ]]
