In [None]:
# Install compatible versions for Python 3.10.
# Both packages are pinned to exact versions so a fresh kernel always gets the same pair.
# `%pip` (rather than `!pip`) targets the environment of the running kernel.
%pip install tensorflow==2.13.0
%pip install tensorflow-model-optimization==0.7.5
# Note: Don't install tf-keras separately - use TensorFlow's built-in Keras

In [None]:
import tensorflow_model_optimization as tfmot
import tensorflow as tf
from tensorflow.keras.datasets import mnist

# Seed Python, NumPy and TensorFlow so weight initialisation, dropout and
# batch shuffling are repeatable across "Restart Kernel -> Run All".
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# Load dataset
(x_train, y_train), (x_test, y_test) = mnist.load_data()
# Scale pixels to [0, 1]. Casting to float32 first avoids the float64 copies
# that `uint8_array / 255.0` would create (twice the memory, and Keras trains
# in float32 anyway).
x_train = x_train.astype('float32') / 255.0
x_test = x_test.astype('float32') / 255.0

# Build a simple model: flatten 28x28 images, one hidden layer, 10-way softmax
model = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(input_shape=(28, 28)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(10, activation='softmax')
])

# Compile the model (labels are integer class ids, hence the sparse loss)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the dense baseline model
model.fit(x_train, y_train, epochs=5, validation_data=(x_test, y_test))

# Apply pruning to the model: sparsity ramps from 0% to 50% between
# optimizer step 0 and step 1000, then stays at 50% for the remaining steps.
pruning_params = {
    'pruning_schedule': tfmot.sparsity.keras.PolynomialDecay(
        initial_sparsity=0.0,
        final_sparsity=0.5,
        begin_step=0,
        end_step=1000,
    )
}
# NOTE(review): prune_low_magnitude wraps the layers of `model`; verify whether
# `model` still holds the unpruned weights after the fine-tuning below before
# treating it as a dense baseline.
pruned_model = tfmot.sparsity.keras.prune_low_magnitude(model, **pruning_params)

# Compile the pruned model (wrapping produces a new model that must be compiled again)
pruned_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the pruned model to finalize pruning.
# UpdatePruningStep advances the pruning step during training; tfmot requires
# it when fitting a model wrapped with prune_low_magnitude.
callbacks = [tfmot.sparsity.keras.UpdatePruningStep()]
pruned_model.fit(x_train, y_train, epochs=2, validation_data=(x_test, y_test), callbacks=callbacks)

# Strip pruning wrappers to remove pruning-specific layers and metadata
pruned_model = tfmot.sparsity.keras.strip_pruning(pruned_model)

In [None]:
# Baseline for the size comparison: the trained Keras model exported to
# TensorFlow Lite without any optimizations (float32 weights).
baseline_converter = tf.lite.TFLiteConverter.from_keras_model(model)
original_size = len(baseline_converter.convert()) / 1024  # Size in KB
print(f'Original model size: {original_size:.2f} KB')

# Convert the pruned model to a TensorFlow Lite quantized model.
# The converter must be built from `pruned_model`; building it from `model`
# would export a network that never went through the pruning step.
print("Converting to TensorFlow Lite with quantization...")
converter = tf.lite.TFLiteConverter.from_keras_model(pruned_model)
converter.optimizations = [tf.lite.Optimize.DEFAULT]
quantized_model = converter.convert()

# Save and get quantized model size
with open('quantized_model.tflite', 'wb') as f:
    f.write(quantized_model)

quantized_size = len(quantized_model) / 1024  # Size in KB
print(f'Quantized model size: {quantized_size:.2f} KB')
print(f'Size reduction: {((original_size - quantized_size) / original_size * 100):.2f}%')

## Model Quantization

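Convert the pruned model to TensorFlow Lite format with quantization enabled. With the default optimization setting and no representative dataset, the converter applies dynamic-range quantization: 32-bit floating-point weights are stored as 8-bit integers, while activations stay in floating point in the saved model and are quantized on the fly only inside kernels that support it. This reduces the model size and can improve inference speed while maintaining reasonable accuracy.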

In [None]:
# Score the quantized TFLite model on the held-out test images.
interpreter = tf.lite.Interpreter(model_content=quantized_model)
interpreter.allocate_tensors()

# Tensor handles for the model's first input and first output.
input_index = interpreter.get_input_details()[0]['index']
output_index = interpreter.get_output_details()[0]['index']


def _predict_label(image):
    """Run one image through the interpreter and return the index of its largest output value."""
    # Add a leading batch axis of size 1 and feed the sample as float32.
    interpreter.set_tensor(input_index, image[None, ...].astype('float32'))
    interpreter.invoke()
    return interpreter.get_tensor(output_index).argmax()


# Count the samples whose predicted label equals the ground-truth label.
correct_predictions = sum(
    int(_predict_label(image) == label)
    for image, label in zip(x_test, y_test)
)

accuracy = correct_predictions / len(x_test)
print(f'Quantized model accuracy: {accuracy * 100:.2f}%')