## Setup

In [None]:
# Mount Google Drive at /content/drive so project files stored there are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Print the version of the `python` executable available to this runtime.
!python -V

Python 3.8.15


In [None]:
import sys  
# Put the repository root first on the import path so its packages
# (e.g. `src`) can be imported from this notebook.
sys.path.insert(0, '/content/drive/MyDrive/TinyML/TinyML Project/kws_test/Spoken-Keyword-Spotting')

In [None]:
#arman
# Switch the working directory to the repository's src/ folder; relative paths
# used in this notebook (such as "../models/...") are resolved from here.
%cd /content/drive/MyDrive/TinyML/TinyML Project/kws_test/Spoken-Keyword-Spotting/src

/content/drive/MyDrive/TinyML/TinyML Project/kws_test/Spoken-Keyword-Spotting/src


In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np


In [None]:
# !pip install --upgrade pip
# !apt install libasound2-dev portaudio19-dev libportaudio2 libportaudiocpp0 ffmpeg

# !pip install pyaudio
# %cd /content/drive/MyDrive/Spoken-Keyword-Spotting
# !pip install -r requirements.txt

## Construct `Distiller()` class

The custom `Distiller()` class overrides the `Model` methods `train_step`, `test_step`,
and `compile()`. In order to use the distiller, we need:

- A trained teacher model
- A student model to train
- A student loss function on the difference between student predictions and ground-truth
- A distillation loss function, along with a `temperature`, on the difference between the
soft student predictions and the soft teacher labels
- An `alpha` factor to weight the student and distillation loss
- An optimizer for the student and (optional) metrics to evaluate performance

In the `train_step` method, we perform a forward pass of both the teacher and student,
calculate the loss with weighting of the `student_loss` and `distillation_loss` by `alpha` and
`1 - alpha`, respectively, and perform the backward pass. Note: only the student weights are updated,
and therefore we only calculate the gradients for the student weights.

In the `test_step` method, we evaluate the student model on the provided dataset.

In [None]:
class Distiller(keras.Model):
    """Keras model that trains a student network against a fixed teacher.

    The teacher is only ever run in inference mode and its weights are never
    passed to the optimizer; gradients are computed for, and applied to, the
    student's trainable variables only.
    """

    def __init__(self, student, teacher):
        """Store the two networks.

        Args:
            student: Keras model whose weights are updated during `fit`.
            teacher: Keras model used to produce the soft targets.
        """
        super().__init__()
        self.teacher = teacher
        self.student = student

    def compile(
        self,
        optimizer,
        metrics,
        student_loss_fn,
        distillation_loss_fn,
        alpha=0.1,
        temperature=3,
        run_eagerly=False
    ):
        """ Configure the distiller.

        Args:
            optimizer: Keras optimizer for the student weights
            metrics: Keras metrics for evaluation
            student_loss_fn: Loss function of difference between student
                predictions and ground-truth
            distillation_loss_fn: Loss function of difference between soft
                student predictions and soft teacher predictions
            alpha: weight to student_loss_fn and 1-alpha to distillation_loss_fn
            temperature: Temperature for softening probability distributions.
                Larger temperature gives softer distributions.
            run_eagerly: Forwarded to `keras.Model.compile`; set to True to
                execute `train_step` eagerly while debugging.
        """
        super().compile(optimizer=optimizer, metrics=metrics, run_eagerly=run_eagerly)
        self.student_loss_fn = student_loss_fn
        self.distillation_loss_fn = distillation_loss_fn
        self.alpha = alpha
        self.temperature = temperature

    def call(self, inputs, training=None):
        """Forward pass: delegate to the student network.

        Defined so that calling the distiller directly returns the student's
        predictions instead of raising for a missing `call` implementation.
        """
        return self.student(inputs, training=training)

    def train_step(self, data):
        """Run one optimization step on a batch.

        Args:
            data: An `(x, y)` pair as yielded by the training dataset.

        Returns:
            Dict mapping each compiled metric name to its current value, plus
            `student_loss` and `distillation_loss` for this batch.
        """
        # Unpack data
        x, y = data

        # Forward pass of teacher, outside the tape: no teacher gradients are needed.
        teacher_predictions = self.teacher(x, training=False)

        with tf.GradientTape() as tape:
            # Forward pass of student
            student_predictions = self.student(x, training=True)

            # Hard-label loss against the ground truth.
            student_loss = self.student_loss_fn(y, student_predictions)

            # Compute scaled distillation loss from https://arxiv.org/abs/1503.02531
            # The magnitudes of the gradients produced by the soft targets scale
            # as 1/T^2, multiply them by T^2 when using both hard and soft targets.
            # NOTE(review): the temperature softmax is applied to the raw model
            # outputs, which assumes both models emit logits. If they already end
            # in a softmax layer this re-normalizes probabilities -- TODO confirm.
            distillation_loss = (
                self.distillation_loss_fn(
                    tf.nn.softmax(teacher_predictions / self.temperature, axis=1),
                    tf.nn.softmax(student_predictions / self.temperature, axis=1),
                )
                * self.temperature**2
            )

            loss = self.alpha * student_loss + (1 - self.alpha) * distillation_loss

        # Compute gradients for the student only; the teacher stays untouched.
        trainable_vars = self.student.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)

        # Update weights
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))

        # Update the metrics configured in `compile()`.
        self.compiled_metrics.update_state(y, student_predictions)

        # Return a dict of performance
        results = {m.name: m.result() for m in self.metrics}
        results.update(
            {"student_loss": student_loss, "distillation_loss": distillation_loss}
        )
        return results

    def test_step(self, data):
        """Evaluate the student on one batch.

        Args:
            data: An `(x, y)` pair as yielded by the evaluation dataset.

        Returns:
            Dict mapping each compiled metric name to its current value, plus
            `student_loss` for this batch.
        """
        # Unpack the data
        x, y = data

        # Compute predictions
        y_prediction = self.student(x, training=False)

        # Calculate the loss
        student_loss = self.student_loss_fn(y, y_prediction)

        # Update the metrics.
        self.compiled_metrics.update_state(y, y_prediction)

        # Return a dict of performance
        results = {m.name: m.result() for m in self.metrics}
        results.update({"student_loss": student_loss})
        return results


## Create student and teacher models

Initially, we load a pre-trained teacher model and create a smaller student model. Both models are
convolutional neural networks built with `Sequential()`,
but could be any Keras model.

In [None]:
from src.models import create_model
from keras.models import load_model
# Load the already-trained teacher checkpoint and print its architecture.
teacher = load_model("../models/marvin_kws-3epoch.h5") #julian model: models/marvin_kws_Dec1_5pm.h5
teacher.summary()
# Build the student from the project's create_model helper.
# NOTE(review): the list is presumably the per-layer Conv2D filter counts -- confirm in src/models.
student = create_model([2, 4, 8, 16, 32])
# clone_model copies the architecture only: student_scratch gets freshly
# initialized weights and shares nothing with `student`.
student_scratch = keras.models.clone_model(student)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 reshape (Reshape)           (None, 99, 40, 1)         0         
                                                                 
 batch_normalization (BatchN  (None, 99, 40, 1)        4         
 ormalization)                                                   
                                                                 
 conv2d (Conv2D)             (None, 99, 40, 16)        160       
                                                                 
 batch_normalization_1 (Batc  (None, 99, 40, 16)       64        
 hNormalization)                                                 
                                                                 
 re_lu (ReLU)                (None, 99, 40, 16)        0         
                                                                 
 max_pooling2d (MaxPooling2D  (None, 49, 20, 16)       0

In [None]:
!pip install scikit_optimize python_speech_features path

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# create_train_and_val lives in the project's create_tf_dataset module. It returns
# the training and validation data plus a step count for each; the step counts are
# passed to fit() as steps_per_epoch / validation_steps further down.
from create_tf_dataset import create_train_and_val

train_data, train_steps, val_data, val_steps = create_train_and_val()

# Disabled debugging aid: convert to NumPy iterators and peek at the first batch.
# train_data = train_data.as_numpy_iterator()
# val_data = val_data.as_numpy_iterator()
# print(type(train_data), type(val_data))
# for x,y in train_data:
#   print(x)
#   break

train data setup complete.
test data setup complete.
Input data setup successful.
Dataset statistics
Train files: 51142
Validation files: 6798
Dev test files: 6835
Test files: 2567
pandas started to shuffle
pandas shuffling over
i:  0
i:  100
i:  200
i:  300
i:  400
i:  500
i:  600
i:  700
i:  800
i:  900
i:  1000
i:  1100
i:  1200
i:  1300
i:  1400
i:  1500
i:  1600
i:  1700
i:  1800
i:  1900
i:  2000
i:  2100
i:  2200
i:  2300
i:  2400
i:  2500
pandas started to shuffle val
pandas shuffling over
i:  0
i:  10
i:  20
i:  30
i:  40
i:  50
i:  60
i:  70
i:  80
i:  90
i:  100
i:  110
i:  120
i:  130
i:  140
i:  150
i:  160
i:  170
i:  180
i:  190
i:  200
i:  210
i:  220
i:  230
i:  240
i:  250


In [None]:
# for elt in train_data:
#   print(elt)
#   break

## Distill teacher to student

We have already trained the teacher model, and we only need to initialize a
`Distiller(student, teacher)` instance, `compile()` it with the desired losses,
hyperparameters and optimizer, and distill the teacher to the student.

In [None]:
# Wrap the student/teacher pair and configure the optimizer, metric and both losses.
distiller = Distiller(student=student, teacher=teacher)
distiller.compile(
    optimizer=keras.optimizers.Adam(),
    metrics=[keras.metrics.SparseCategoricalAccuracy()],
    student_loss_fn=keras.losses.SparseCategoricalCrossentropy(),
    distillation_loss_fn=keras.losses.KLDivergence(),
    alpha=0.1,
    temperature=10,
    # run_eagerly=True,  # uncomment to run train_step eagerly while debugging
)

# Both datasets are repeated indefinitely, so the epoch length is set
# explicitly through the step counts.
print('train steps: ', train_steps)
distiller.fit(
    train_data.repeat(),
    steps_per_epoch=train_steps,
    validation_data=val_data.repeat(),
    validation_steps=val_steps,
    epochs=180,
)

train steps:  20
Epoch 1/180
train step entered
Tensor("IteratorGetNext:0", shape=(None, 99, 40), dtype=float32)
Tensor("IteratorGetNext:1", shape=(None,), dtype=int32)
forward pass of teacher done
forward pass of student done
losses have been computed
gradients completed
optimizer done
train step function done:  {'sparse_categorical_accuracy': <tf.Tensor 'Identity:0' shape=() dtype=float32>, 'student_loss': <tf.Tensor 'sparse_categorical_crossentropy/weighted_loss/value:0' shape=() dtype=float32>, 'distillation_loss': <tf.Tensor 'mul:0' shape=() dtype=float32>}
train step entered
Tensor("IteratorGetNext:0", shape=(None, 99, 40), dtype=float32)
Tensor("IteratorGetNext:1", shape=(None,), dtype=int32)
forward pass of teacher done
forward pass of student done
losses have been computed
gradients completed
optimizer done
train step function done:  {'sparse_categorical_accuracy': <tf.Tensor 'Identity:0' shape=() dtype=float32>, 'student_loss': <tf.Tensor 'sparse_categorical_crossentropy/weig

<keras.callbacks.History at 0x7f37121c3100>

In [None]:
# Save model.
# Earlier attempts at saving the Distiller wrapper itself (kept for reference):
# print("Saving model")
#distiller.save("../models/student_128", save_format='tf')
#distiller.save_weights("../models/student_128_weights.h5")

# Persist only the distilled student network, as a single .h5 file.
student.save("../models/student_2560_60ep_conv2dis2.h5")



In [None]:
# Report the TensorFlow release this notebook ran against.
print(tf.__version__)

2.9.2


In [None]:
#test if the model will load properly
studLoaded = load_model("../models/student_2560_60ep_conv2dis2.h5")
student_scratch.compile(
    optimizer=keras.optimizers.Adam(),
    loss=keras.losses.SparseCategoricalCrossentropy(),
    metrics=[keras.metrics.SparseCategoricalAccuracy()],
)
for x, y in val_data:
  pred = student.predict(x)
  print(pred)
  break



[[1.06369100e-08 8.56202576e-09 3.49572343e-10 ... 3.30489214e-09
  2.19917656e-05 4.40445288e-13]
 [2.05039237e-06 4.40136070e-08 1.12538434e-09 ... 1.79116881e-07
  8.49887147e-05 7.46302380e-13]
 [1.00848325e-07 3.64679025e-08 7.26648874e-09 ... 1.42121246e-06
  1.16497839e-10 3.99900680e-13]
 ...
 [8.98901042e-09 1.08532805e-09 9.16671392e-11 ... 2.43806610e-08
  1.00140223e-10 3.88278548e-15]
 [5.84910813e-05 2.09029793e-04 5.76701888e-04 ... 3.24938089e-01
  3.41269970e-06 1.15929510e-09]
 [4.67523278e-05 1.86387841e-02 1.00867464e-05 ... 1.60899472e-05
  8.50854576e-07 3.89951405e-09]]


## Train student from scratch for comparison

We can also train an equivalent student model from scratch without the teacher, in order
to evaluate the performance gain obtained by knowledge distillation.

```
load the baseline model

for CONV2D param:
  record the size of the model
  for num_data:
    distill the model
    on validation set, record sparse_val_accuracy, pr/recall on Marvin, and training time
    save the model
    save the stats
```

  

In [None]:
def load_teacher(path_to_teacher):
  """
  path_to_teacher: location of the saved teacher model, passed to load_model
  returns: the loaded teacher model (its layer summary is printed as a side effect)
  """
  # alternative checkpoint (julian): models/marvin_kws_Dec1_5pm.h5
  teacher_model = load_model(path_to_teacher)
  teacher_model.summary()
  return teacher_model


def create_student(conv2dparam):
  """
  Build a student network together with an architecture-only copy of it.

  conv2dparam: a list of 5, each divisible by 2, specifying params in CONV2D filter layers
  returns: (student, student_scratch) -- the first is the model to distill into,
           the second is its clone, meant to be trained from scratch
  """
  distill_target = create_model(conv2dparam)
  return distill_target, keras.models.clone_model(distill_target)


In [None]:
# Teacher checkpoint read by train_loop(); this is the same file that is loaded
# as `teacher` earlier in the notebook.
path_to_teacher = "../models/marvin_kws-3epoch.h5"

# Prefixes prepended to the model file names when saving. An empty prefix means
# the files are written relative to the current working directory.
path_to_distilled_models = ""
path_to_scratch_models = ""

# File-name templates; the two slots are filled with the Conv2D parameter list
# and the number of data samples of the run.
distilled_models_name = "distilled_model_param_{}_data_samples_{}"
scratch_models_name = "scratch_model_param_{}_data_samples_{}"

# Destinations for per-run statistics (not written by train_loop yet).
path_to_distilled_stats = ""
path_to_scratch_stats = ""

def train_loop(conv2dls, data_samples, epochs=30):
  """
  Distill a student, and train an identical student from scratch, for every
  combination of Conv2D configuration and dataset size. Both models of each
  run are saved under the configured path prefixes / name templates.

  conv2dls: iterable of Conv2D parameter lists, each passed to create_student
  data_samples: iterable of dataset sizes, each passed to the dataset builder
  epochs: number of epochs used for both the distillation and the scratch run
  returns: None
  """
  # NOTE(review): `path_to_teacher` is read from module scope and must be
  # defined before this function is called.
  teacher = load_teacher(path_to_teacher)

  for conv2d in conv2dls:
    for data_length in data_samples:
      # NOTE(review): no callable named `create_tf_dataset` is defined in the
      # visible notebook (only `create_train_and_val` is imported from the
      # module of that name) -- confirm which builder accepts a sample count.
      train_data, train_steps, val_data, val_steps = create_tf_dataset(data_length)

      student, student_scratch = create_student(conv2d)

      # Step counts and epoch budget shared by both training runs.
      fit_kwargs = dict(
          steps_per_epoch=train_steps,
          validation_steps=val_steps,
          epochs=epochs,
      )

      distiller = Distiller(student=student, teacher=teacher)
      distiller.compile(
          optimizer=keras.optimizers.Adam(),
          metrics=[keras.metrics.SparseCategoricalAccuracy()],
          student_loss_fn=keras.losses.SparseCategoricalCrossentropy(),
          distillation_loss_fn=keras.losses.KLDivergence(),
          alpha=0.1,
          temperature=10,
          #run_eagerly=True
      )

      distiller.fit(train_data.repeat(), validation_data=val_data.repeat(), **fit_kwargs)
      #save student, save stats
      student.save(path_to_distilled_models + distilled_models_name.format(conv2d, data_length))

      # The clone returned by create_student is uncompiled, and Keras refuses to
      # fit an uncompiled model. Use the same optimizer, hard-label loss and
      # metric as the distilled student so the comparison is like for like.
      student_scratch.compile(
          optimizer=keras.optimizers.Adam(),
          loss=keras.losses.SparseCategoricalCrossentropy(),
          metrics=[keras.metrics.SparseCategoricalAccuracy()],
      )

      #train scratch, save stats
      student_scratch.fit(train_data.repeat(), validation_data=val_data.repeat(), **fit_kwargs)
      student_scratch.save(path_to_scratch_models + scratch_models_name.format(conv2d, data_length))

# Conv2D parameters of the largest student configuration in the sweep.
ogconv2d = [8, 16, 32, 64, 128]
# Sweep order after the reversal: quarter-size, half-size, then full-size parameters.
conv2dls = [ogconv2d, [elt//2 for elt in ogconv2d], [elt//4 for elt in ogconv2d]][::-1]
# Size of the full training subset. This was left as None, which made the
# integer divisions below raise TypeError.
# NOTE(review): 50000 is the provisional figure from the original "#50,000?"
# note -- confirm before running the sweep.
ds = 50000
# Sweep order after the reversal: 1/20, 1/4, 1/2 and all of `ds`.
data_samples = [ds, ds//2, ds//4, ds//20][::-1]

TypeError: ignored