In [1]:
#The scale and zero point parameteers are learned globally for the entire layer
import tensorflow as tf
import numpy as np
from tensorflow.keras.layers import Dense, Flatten, Input
from tensorflow.keras.models import Model
from tensorflow.keras.datasets import mnist
from tensorflow.keras.optimizers import Adam

(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train = x_train.astype('float32') / 255.0
x_test = x_test.astype('float32') / 255.0
x_train = np.expand_dims(x_train, -1)
x_test = np.expand_dims(x_test, -1)

initializer_type = "ones"

class LearnedQuantizedDense(tf.keras.layers.Layer):
    def __init__(self, units, activation=None):
        super(LearnedQuantizedDense, self).__init__()
        self.units = units
        self.activation = tf.keras.activations.get(activation)

    def build(self, input_shape):
        print(input_shape, self.units)
        self.w = self.add_weight(           shape=(input_shape[-1], self.units), #784x128 -> #128x10
                                            initializer=initializer_type,
                                            trainable=True)
        self.b = self.add_weight(           shape=(self.units,),                    
                                            initializer=initializer_type, 
                                            trainable=True)
        self.scale = self.add_weight(       shape=(1,),                             
                                            initializer=initializer_type, 
                                            trainable=True)
        self.zero_point = self.add_weight(  shape=(1,),                            
                                            initializer=initializer_type,
                                            trainable=True)

    def call(self, inputs):
        #during forward pass, the model uses quantized (rounded) weights to simulate the effect of quantization    
        #during backpropagation Gradients are calculated as if the weights were not rounded,
        #This prevents the non-differentiable rounding operation from disrupting the learning process.    
        quantized_w = tf.stop_gradient(tf.round(self.w / self.scale + self.zero_point)) + \
                (self.w / self.scale + self.zero_point - tf.stop_gradient(self.w / self.scale + self.zero_point))
        dequantized_w = (quantized_w - self.zero_point) * self.scale
        quantized_b = tf.stop_gradient(tf.round(self.b / self.scale + self.zero_point)) + \
                (self.b / self.scale + self.zero_point - tf.stop_gradient(self.b / self.scale + self.zero_point))
        dequantized_b = (quantized_b - self.zero_point) * self.scale

        output = tf.matmul(inputs, dequantized_w) + dequantized_b
        if self.activation is not None:
            output = self.activation(output)
        return output

input_layer = Input(shape=(28, 28, 1))
flatten_layer = Flatten()(input_layer)
quantized_dense_layer = LearnedQuantizedDense(128, activation='relu')(flatten_layer)
output_layer = LearnedQuantizedDense(10, activation='softmax')(quantized_dense_layer)

quantized_model = Model(inputs=input_layer, outputs=output_layer)
quantized_model.compile(optimizer=Adam(learning_rate=0.01), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
quantized_model.fit(x_train, y_train, epochs=10, validation_data=(x_test, y_test))

loss, accuracy = quantized_model.evaluate(x_test, y_test)
print(f'Quantized Model Test Accuracy: {accuracy}')




(None, 784) 128
(None, 128) 10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Quantized Model Test Accuracy: 0.17800000309944153


In [1]:
#Scale and zero point are learned as vectors, instead of scalars
import tensorflow as tf
import numpy as np
from tensorflow.keras.layers import Dense, Flatten, Input
from tensorflow.keras.models import Model
from tensorflow.keras.datasets import mnist
from tensorflow.keras.optimizers import Adam

# let's not normalize the inputs -> so that it the input quantization can be learned
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train = x_train.astype('float32') / 255.0 # this is essentially pre-qunatization -> there's already 255 buckets, we would go down, not up, so 
x_test = x_test.astype('float32') / 255.0
x_train = np.expand_dims(x_train, -1)
x_test = np.expand_dims(x_test, -1)

initializer_type = "random_normal"

class LearnedQuantizedDense(tf.keras.layers.Layer):
    def __init__(self, units, activation=None):
        super(LearnedQuantizedDense, self).__init__()
        self.units = units
        self.activation = tf.keras.activations.get(activation)

    def build(self, input_shape):
        print(input_shape, self.units)
        self.w = self.add_weight(           shape=(input_shape[-1], self.units), 
                                            initializer=initializer_type,
                                            trainable=True)
        self.b = self.add_weight(           shape=(self.units,), 
                                            initializer=initializer_type, 
                                            trainable=True)
        # This is good to go: scale value per hyperparameter
        self.scale = self.add_weight(       shape=(input_shape[-1], 1), 
                                            initializer=initializer_type, 
                                            trainable=True)
        self.zero_point = self.add_weight(  shape=(input_shape[-1], 1), 
                                            initializer=initializer_type,
                                            trainable=True)

    def call(self, inputs):
    # if you set the scale to a very small number, then the number of unique numbers is exactly the same.
    # the larger the scale is the fewer quantized values -> this is what we want.
    # pls take a lok into the scale -> most likely it's going to 0, which is n
    # self.scale and self.zero_point already have the correct shape
        quantized_w = tf.stop_gradient(tf.round(self.w / self.scale + self.zero_point)) + \
                    (self.w / self.scale + self.zero_point - tf.stop_gradient(self.w / self.scale + self.zero_point))
        dequantized_w = (quantized_w - self.zero_point) * self.scale
        quantized_b = tf.stop_gradient(tf.round(self.b / self.scale[0, 0] + self.zero_point[0, 0])) + \
                    (self.b / self.scale[0, 0] + self.zero_point[0, 0] - tf.stop_gradient(self.b / self.scale[0, 0] + self.zero_point[0, 0]))
        dequantized_b = (quantized_b - self.zero_point[0, 0]) * self.scale[0, 0]

    # other thing that would be cool to know is to on the inputs itself can we learn a quantization 
    # using the gradients, can we modify the input parameters and quantizise 
        output = tf.matmul(inputs, dequantized_w) + dequantized_b
        if self.activation is not None:
            output = self.activation(output)
        return output

input_layer = Input(shape=(28, 28, 1))
flatten_layer = Flatten()(input_layer)
quantized_dense_layer = LearnedQuantizedDense(128, activation='relu')(flatten_layer)
output_layer = LearnedQuantizedDense(10, activation='softmax')(quantized_dense_layer)

quantized_model = Model(inputs=input_layer, outputs=output_layer)
quantized_model.compile(optimizer=Adam(learning_rate=0.01), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
quantized_model.fit(x_train, y_train, epochs=5, validation_data=(x_test, y_test))

loss, accuracy = quantized_model.evaluate(x_test, y_test)
print(f'Quantized Model Test Accuracy: {accuracy}')




(None, 784) 128
(None, 128) 10
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Quantized Model Test Accuracy: 0.9527999758720398
