# Librerías y setup

In [1]:
!pip install -q tensorflow-model-optimization


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m241.2/241.2 kB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.3/17.3 MB[0m [31m97.2 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
numba 0.56.4 requires numpy<1.24,>=1.18, but you have numpy 1.24.3 which is incompatible.
tensorflow 2.12.0 requires numpy<1.24,>=1.22, but you have numpy 1.24.3 which is incompatible.[0m[31m
[0m

In [2]:
import tempfile
import os

import tensorflow as tf
import numpy as np

from tensorflow import keras
import tensorflow_model_optimization as tfmot
import time

# Objetivos de la compresión

La calidad de un algoritmo de compresión se evaluará en base a las siguientes reglas:

1. El nivel de compresión (en parámetros o memoria) debe maximizarse y la pérdida de precisión debe minimizarse
2. Debe proporcionar el máximo nivel de compresión en memoria
3. Debe maximizar la velocidad de inferencia


# Entrenamiento de un modelo sin reducir (MNIST)

In [3]:
# Load MNIST dataset
mnist = keras.datasets.mnist
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()

# Normalize the input image so that each pixel value is between 0 and 1.
train_images = train_images / 255.0
test_images = test_images / 255.0

# Define the model architecture.
model = keras.Sequential([
  keras.layers.InputLayer(input_shape=(28, 28)),
  keras.layers.Reshape(target_shape=(28, 28, 1)),
  keras.layers.Conv2D(filters=12, kernel_size=(3, 3), activation='relu'),
  keras.layers.MaxPooling2D(pool_size=(2, 2)),
  keras.layers.Flatten(),
  keras.layers.Dense(10)
])


# Train the digit classification model
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

model.fit(
  train_images,
  train_labels,
  epochs=6,
  validation_split=0.1,
)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.History at 0x7f1d36c3fc40>

Evaluamos la precisión base y guardamos el modelo para medir su peso en memoria

In [43]:
t_total = 0

for i in range(10):
  t_start = time.perf_counter()
  _, baseline_model_accuracy = model.evaluate(test_images, test_labels, verbose=0)
  t_end = time.perf_counter()

  t_total = t_total + (t_end - t_start)

print('Baseline test accuracy:', baseline_model_accuracy)

_, keras_file = tempfile.mkstemp('.h5')
tf.keras.models.save_model(model, keras_file, include_optimizer=False)
print('Saved baseline model to:', keras_file)

print('Average inference time (seconds): ',  t_total/10)

Baseline test accuracy: 0.9700999855995178
Saved baseline model to: /tmp/tmpdjvyzzme.h5
Average inference time (seconds):  1.3560929621000468


El modelo base realiza su inferencia en 1.35 segundos.

# Poda

Podamos el modelo pre-entrenado por medio de `sparsity`, que sustituye los pesos menos significativos por 0.



In [5]:
prune_low_magnitude = tfmot.sparsity.keras.prune_low_magnitude

# Compute end step to finish pruning after 2 epochs.
batch_size = 128
epochs = 2
validation_split = 0.1 # 10% of training set will be used for validation set. 

num_images = train_images.shape[0] * (1 - validation_split)
end_step = np.ceil(num_images / batch_size).astype(np.int32) * epochs

# Define model for pruning.
pruning_params = {
      'pruning_schedule': tfmot.sparsity.keras.PolynomialDecay(initial_sparsity=0.50,
                                                               final_sparsity=0.80,
                                                               begin_step=0,
                                                               end_step=end_step)
}

model_for_pruning = prune_low_magnitude(model, **pruning_params)

# `prune_low_magnitude` requires a recompile.
model_for_pruning.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

model_for_pruning.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 prune_low_magnitude_reshape  (None, 28, 28, 1)        1         
  (PruneLowMagnitude)                                            
                                                                 
 prune_low_magnitude_conv2d   (None, 26, 26, 12)       230       
 (PruneLowMagnitude)                                             
                                                                 
 prune_low_magnitude_max_poo  (None, 13, 13, 12)       1         
 ling2d (PruneLowMagnitude)                                      
                                                                 
 prune_low_magnitude_flatten  (None, 2028)             1         
  (PruneLowMagnitude)                                            
                                                                 
 prune_low_magnitude_dense (  (None, 10)               4

In [6]:
logdir = tempfile.mkdtemp()

callbacks = [
  tfmot.sparsity.keras.UpdatePruningStep(),
  tfmot.sparsity.keras.PruningSummaries(log_dir=logdir),
]

model_for_pruning.fit(train_images, train_labels,
                  batch_size=batch_size, epochs=epochs, validation_split=validation_split,
                  callbacks=callbacks)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f1d1c0fbaf0>

In [85]:
model_for_pruning.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 prune_low_magnitude_reshape  (None, 28, 28, 1)        1         
  (PruneLowMagnitude)                                            
                                                                 
 prune_low_magnitude_conv2d   (None, 26, 26, 12)       230       
 (PruneLowMagnitude)                                             
                                                                 
 prune_low_magnitude_max_poo  (None, 13, 13, 12)       1         
 ling2d (PruneLowMagnitude)                                      
                                                                 
 prune_low_magnitude_flatten  (None, 2028)             1         
  (PruneLowMagnitude)                                            
                                                                 
 prune_low_magnitude_dense (  (None, 10)               4

El algoritmo introduce varios pesos adicionales para calcular los menos significativos, despojamos estos pesos extra y guardamos el modelo.

In [8]:
model_for_export = tfmot.sparsity.keras.strip_pruning(model_for_pruning)

_, pruned_keras_file = tempfile.mkstemp('.h5')
tf.keras.models.save_model(model_for_export, pruned_keras_file, include_optimizer=False)
print('Saved pruned Keras model to:', pruned_keras_file)



Saved pruned Keras model to: /tmp/tmp6fepv87k.h5


In [86]:
model_for_export.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 reshape (Reshape)           (None, 28, 28, 1)         0         
                                                                 
 conv2d (Conv2D)             (None, 26, 26, 12)        120       
                                                                 
 max_pooling2d (MaxPooling2D  (None, 13, 13, 12)       0         
 )                                                               
                                                                 
 flatten (Flatten)           (None, 2028)              0         
                                                                 
 dense (Dense)               (None, 10)                20290     
                                                                 
Total params: 20,410
Trainable params: 20,410
Non-trainable params: 0
____________________________________________________

Podemos ver que ya no hay pesos adicionales.

In [21]:
model_for_export.compile()

t_total = 0

for i in range(10):
  t_start = time.perf_counter()
  _, model_for_pruning_accuracy = model_for_pruning.evaluate(
   test_images, test_labels, verbose=0)
  t_end = time.perf_counter()

  t_total = t_total + (t_end - t_start)

print("Average inference time (seconds): ", t_total/10)
print('Baseline test accuracy:', baseline_model_accuracy) 
print('Pruned test accuracy:', model_for_pruning_accuracy)

Average inference time (seconds):  1.195314035599972
Baseline test accuracy: 0.9700999855995178
Pruned test accuracy: 0.9700999855995178


El tiempo de inferencia se reduce a 1.19 segundos, con la precisión inalterada.

Ahora definiremos funciones para medir el tamaño en memoria de los modelos, comprimidos en memoria con gzip y sin comprimir. La compresión en memoria puede ser beneficiosa si se sustituyen muchos valores por 0, ya que los algoritmos de compresión como gzip tendrán facilidad para agrupar los valores establecidos a 0:

In [10]:
def get_gzipped_model_size(file):
  # Returns size of gzipped model, in bytes.
  import os
  import zipfile

  _, zipped_file = tempfile.mkstemp('.zip')
  with zipfile.ZipFile(zipped_file, 'w', compression=zipfile.ZIP_DEFLATED) as f:
    f.write(file)

  return os.path.getsize(zipped_file)

In [22]:
def get_normal_model_size(file):
  return os.path.getsize(file)

In [11]:
print("Size of gzipped baseline Keras model: %.2f bytes" % (get_gzipped_model_size(keras_file)))
print("Size of gzipped pruned Keras model: %.2f bytes" % (get_gzipped_model_size(pruned_keras_file)))
# print("Size of gzipped pruned TFlite model: %.2f bytes" % (get_gzipped_model_size(pruned_tflite_file)))

Size of gzipped baseline Keras model: 78127.00 bytes
Size of gzipped pruned Keras model: 25736.00 bytes


In [23]:
print("Size of baseline Keras model: %.2f bytes" % (get_normal_model_size(keras_file)))
print("Size of pruned Keras model: %.2f bytes" % (get_normal_model_size(pruned_keras_file)))

Size of baseline Keras model: 98968.00 bytes
Size of pruned Keras model: 98968.00 bytes


## Conclusiones

1. La precisión se ha reducido en algo menos de 0.02%.

2. El tamaño en memoria se ha reducido 3 veces al aplicar gzip, seguramente debido a que `sparsity` sustituye pesos por 0, permitiendo que el algoritmo de compresión en almacenamiento agrupe los valores a 0 de forma eficiente.

3. La velocidad de inferencia se ha acelerado en aproximadamente 0.12 segundos.

En conclusión, este método de poda por `sparsity` resulta efectivo (de cara a reducción en almacenamiento) cuando se combina con un algoritmo de compresión para el fichero de salida. 

# Cuantización

Este proceso convertirá los pesos en tipo int8 (entero de 8 bits) y las funciones de activación en uint8 (unsigned integer de 8 bits).

In [12]:
quantize_model = tfmot.quantization.keras.quantize_model

# q_aware stands for for quantization aware.
q_aware_model = quantize_model(model)

# `quantize_model` requires a recompile.
q_aware_model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

q_aware_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 quantize_layer (QuantizeLay  (None, 28, 28)           3         
 er)                                                             
                                                                 
 quant_reshape (QuantizeWrap  (None, 28, 28, 1)        1         
 perV2)                                                          
                                                                 
 quant_conv2d (QuantizeWrapp  (None, 26, 26, 12)       147       
 erV2)                                                           
                                                                 
 quant_max_pooling2d (Quanti  (None, 13, 13, 12)       1         
 zeWrapperV2)                                                    
                                                                 
 quant_flatten (QuantizeWrap  (None, 2028)             1

Se introducen 38 parámetros adicionales para cálculos internos del algoritmo de cuantización.

In [53]:
train_images_subset = train_images[:1000] # out of 60000
train_labels_subset = train_labels[:1000]

q_aware_model.fit(train_images_subset, train_labels_subset,
                  batch_size=500, epochs=2, validation_split=0.1)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f1d0dc7ad70>

In [14]:
_, quantized_keras_file = tempfile.mkstemp('.h5')
tf.keras.models.save_model(q_aware_model, quantized_keras_file, include_optimizer=False)
print('Saved pruned Keras model to:', quantized_keras_file)

Saved pruned Keras model to: /tmp/tmp8njuqvzr.h5


In [57]:
t_total = 0

for i in range(10):
  t_start = time.perf_counter()
  _, q_aware_model_accuracy = q_aware_model.evaluate(
   test_images, test_labels, verbose=0)
  t_end = time.perf_counter()

  t_total = t_total + (t_end - t_start)



print('Baseline test accuracy:', baseline_model_accuracy)
print('Quant test accuracy:', q_aware_model_accuracy)
print('Average inference time (seconds): ', t_total/10)

Baseline test accuracy: 0.9700999855995178
Quant test accuracy: 0.9782000184059143
Average inference time (seconds):  1.2131768901998838


El tiempo de inferencia se reduce, estando ahora en 1.21 segundos.
Merece la pena observar que el nuevo modelo sobreespecializa, ya que presenta precisión ligeramente mayor.

In [36]:
print("Size of gzipped baseline Keras model: %.2f bytes" % (get_gzipped_model_size(keras_file)))
print("Size of gzipped quantized Keras model: %.2f bytes" % (get_gzipped_model_size(quantized_keras_file)))

Size of gzipped baseline Keras model: 25736.00 bytes
Size of gzipped quantized Keras model: 56135.00 bytes


In [37]:
print("Size of baseline Keras model: %.2f bytes" % (get_normal_model_size(keras_file)))
print("Size of pruned Keras model: %.2f bytes" % (get_normal_model_size(quantized_keras_file)))

Size of baseline Keras model: 98968.00 bytes
Size of pruned Keras model: 116272.00 bytes


Curiosamente, sin gzip, el tamaño no disminuye, sino que aumenta ligeramente debido a los 38 parámetros que introduce el algoritmo de cuantización.

In [56]:
converter = tf.lite.TFLiteConverter.from_keras_model(q_aware_model)
converter.optimizations = [tf.lite.Optimize.DEFAULT]

quantized_tflite_model = converter.convert()



## Conclusiones

1. La precisión ha aumentado, conque el modelo puede estar sobreespecializando.

2. El tamaño en memoria se ha reducido en un 50%.

3. La velocidad de inferencia se acelera, aunque menos que en la poda.


# Knowledge Distillation

Vamos a crear un `distiller` personalizado, que tiene los siguientes componentes:

- Un modelo profesor entrenado.
- Un modelo alumno para entrenar.
- Una función de pérdida del estudiante (diferencia entre las predicciones del estudiante y la verdad objetivo).
- Una función de pérdida de destilación, junto con temperatura (diferencia entre las predicciones `soft` del estudiante y las etiquetas del profesor.
- Un factor $\alpha$ para ponderar las pérdidas del estudiante y la destilación.
- Un optimizador para el estudiante.


In [59]:
class Distiller(keras.Model):
    def __init__(self, student, teacher):
        super().__init__()
        self.teacher = teacher
        self.student = student

    def compile(
        self,
        optimizer,
        metrics,
        student_loss_fn,
        distillation_loss_fn,
        alpha=0.1,
        temperature=3,
    ):
        """ Configure the distiller.

        Args:
            optimizer: Keras optimizer for the student weights
            metrics: Keras metrics for evaluation
            student_loss_fn: Loss function of difference between student
                predictions and ground-truth
            distillation_loss_fn: Loss function of difference between soft
                student predictions and soft teacher predictions
            alpha: weight to student_loss_fn and 1-alpha to distillation_loss_fn
            temperature: Temperature for softening probability distributions.
                Larger temperature gives softer distributions.
        """
        super().compile(optimizer=optimizer, metrics=metrics)
        self.student_loss_fn = student_loss_fn
        self.distillation_loss_fn = distillation_loss_fn
        self.alpha = alpha
        self.temperature = temperature

    def train_step(self, data):
        # Unpack data
        x, y = data

        # Forward pass of teacher
        teacher_predictions = self.teacher(x, training=False)

        with tf.GradientTape() as tape:
            # Forward pass of student
            student_predictions = self.student(x, training=True)

            # Compute losses
            student_loss = self.student_loss_fn(y, student_predictions)

            # Compute scaled distillation loss from https://arxiv.org/abs/1503.02531
            # The magnitudes of the gradients produced by the soft targets scale
            # as 1/T^2, multiply them by T^2 when using both hard and soft targets.
            distillation_loss = (
                self.distillation_loss_fn(
                    tf.nn.softmax(teacher_predictions / self.temperature, axis=1),
                    tf.nn.softmax(student_predictions / self.temperature, axis=1),
                )
                * self.temperature**2
            )

            loss = self.alpha * student_loss + (1 - self.alpha) * distillation_loss

        # Compute gradients
        trainable_vars = self.student.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)

        # Update weights
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))

        # Update the metrics configured in `compile()`.
        self.compiled_metrics.update_state(y, student_predictions)

        # Return a dict of performance
        results = {m.name: m.result() for m in self.metrics}
        results.update(
            {"student_loss": student_loss, "distillation_loss": distillation_loss}
        )
        return results

    def test_step(self, data):
        # Unpack the data
        x, y = data

        # Compute predictions
        y_prediction = self.student(x, training=False)

        # Calculate the loss
        student_loss = self.student_loss_fn(y, y_prediction)

        # Update the metrics.
        self.compiled_metrics.update_state(y, y_prediction)

        # Return a dict of performance
        results = {m.name: m.result() for m in self.metrics}
        results.update({"student_loss": student_loss})
        return results

En `train_step` se hace un paso hacia adelante del estudiante y el profesor.
Calculamos `student loss`, la pérdida del estudiante y `distillation_loss`, pérdida de destilación con $\alpha$ y $1-\alpha$

En `test_step` se evalúa el modelo estudiante.

No emplearemos las clases definidas anteriormente porque el estudiante debe tener kernels más pequeños que el profesor:

In [60]:
from tensorflow.keras import layers

Preparamos los modelos profesor y alumno:

In [61]:
# profesor
alberto = keras.Sequential(
    [
        keras.Input(shape=(28, 28, 1)),
        layers.Conv2D(256, (3, 3), strides=(2, 2), padding="same"),
        layers.LeakyReLU(alpha=0.2),
        layers.MaxPooling2D(pool_size=(2, 2), strides=(1, 1), padding="same"),
        layers.Conv2D(512, (3, 3), strides=(2, 2), padding="same"),
        layers.Flatten(),
        layers.Dense(10),
    ],
    name="teacher",
)

# alumno(s)
neuralhive = keras.Sequential(
    [
        keras.Input(shape=(28, 28, 1)),
        layers.Conv2D(16, (3, 3), strides=(2, 2), padding="same"),
        layers.LeakyReLU(alpha=0.2),
        layers.MaxPooling2D(pool_size=(2, 2), strides=(1, 1), padding="same"),
        layers.Conv2D(32, (3, 3), strides=(2, 2), padding="same"),
        layers.Flatten(),
        layers.Dense(10),
    ],
    name="student",
)


# Estudiante sin destilar para comparar más adelante
student_scratch = keras.models.clone_model(neuralhive)

In [63]:
# Entrenamiento del profesor con normalidad
alberto.compile(
    optimizer=keras.optimizers.Adam(),
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[keras.metrics.SparseCategoricalAccuracy()],
)

# Entrenamos y evaluamos al profesor con los datos
alberto.fit(train_images[:1000], train_labels[:1000], epochs=5)
alberto.evaluate(test_images[:1000], test_labels[:1000])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


[0.20396357774734497, 0.9419999718666077]

Destilamos el profesor al estudiante:

In [64]:
# Initialize and compile distiller
distiller = Distiller(student=neuralhive, teacher=alberto)
distiller.compile(
    optimizer=keras.optimizers.Adam(),
    metrics=[keras.metrics.SparseCategoricalAccuracy()],
    student_loss_fn=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    distillation_loss_fn=keras.losses.KLDivergence(),
    alpha=0.1,
    temperature=10,
)

# Distill teacher to student
distiller.fit(train_images[:1000], train_labels[:1000], epochs=3)

# Evaluate student on test dataset
distiller.evaluate(test_images[:1000], test_labels[:1000])

Epoch 1/3
Epoch 2/3
Epoch 3/3


[0.7829999923706055, 1.4007198810577393]

In [65]:
# Entrenamos el alumno con normalidad
student_scratch.compile(
    optimizer=keras.optimizers.Adam(),
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[keras.metrics.SparseCategoricalAccuracy()],
)

# Train and evaluate student trained from scratch.
student_scratch.fit(train_images[:1000], train_labels[:1000], epochs=3)
student_scratch.evaluate(test_images[:1000], test_labels[:1000])

Epoch 1/3
Epoch 2/3
Epoch 3/3


[0.5920273065567017, 0.8149999976158142]

In [72]:
print(student_scratch.evaluate(test_images, test_labels))
print(distiller.evaluate(test_images, test_labels))

[0.5363089442253113, 0.8371000289916992]
[0.8109999895095825, 0.6327903866767883]


Podemos observar un ligero decremento de la precisión, un ligero aumento de loss, y una ligera disminución en el tiempo de inferencia.

In [70]:
t_total = 0

for i in range(10):
  t_start = time.perf_counter()
  _, baseline_model_accuracy = distiller.evaluate(test_images, test_labels, verbose=0)
  t_end = time.perf_counter()

  t_total = t_total + (t_end - t_start)

print("Average inference time (seconds): ", t_total/10)

Average inference time (seconds):  1.1929158142000234


Guardemos los modelos para comparar su tamaño en memoria

In [79]:
_, student_scratch_file = tempfile.mkstemp('.h5')
tf.keras.models.save_model(student_scratch, student_scratch_file, include_optimizer=False)
print('Saved student from scratch model to:', student_scratch_file)


_, student_final_file = tempfile.mkstemp('.h5')
tf.keras.models.save_model(neuralhive, student_final_file, include_optimizer=False)
print('Saved final student model to:', student_final_file)




Saved student from scratch model to: /tmp/tmplnjd0ht9.h5
Saved final student model to: /tmp/tmpe2o6e941.h5


In [82]:
print("Size of gzipped baseline Keras model: %.2f bytes" % (get_gzipped_model_size(keras_file)))
print("Size of gzipped quantized Keras model: %.2f bytes" % (get_gzipped_model_size(student_final_file)))

Size of gzipped baseline Keras model: 25736.00 bytes
Size of gzipped quantized Keras model: 78472.00 bytes


In [84]:
neuralhive.summary()

Model: "student"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_3 (Conv2D)           (None, 14, 14, 16)        160       
                                                                 
 leaky_re_lu_1 (LeakyReLU)   (None, 14, 14, 16)        0         
                                                                 
 max_pooling2d_2 (MaxPooling  (None, 14, 14, 16)       0         
 2D)                                                             
                                                                 
 conv2d_4 (Conv2D)           (None, 7, 7, 32)          4640      
                                                                 
 flatten_2 (Flatten)         (None, 1568)              0         
                                                                 
 dense_2 (Dense)             (None, 10)                15690     
                                                           

## Conclusiones

1. La precisión disminuye en un 6%

2. El tamaño en memoria aumenta.

3. La velocidad de inferencia se acelera a 1.19 segundos, resultado similar a la poda.


En conclusión, parece que Knowledge Distillation no tiene un gran impacto sobre el almacenamiento, pero acelera la inferencia de forma efectiva con escasa pérdida de precisión.