# Keyword Spotting with different architectures

In [1]:
# Imports: standard library, third-party, then project-local helpers.
from os.path import join as pjoin

import numpy as np
import tensorflow as tf
from tensorflow.keras.initializers import glorot_uniform
from tensorflow.keras.models import Model, load_model

from utils import load_dataset, load_dataset_keywords

# Reproducibility: seeding NumPy alone does not seed TensorFlow's own random
# generators, so seed TensorFlow explicitly as well.
SEED = 1234
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Path prefix passed to model.save_weights / model.load_weights further down.
CHECKPOINTS_PATH = 'checkPoints/'

2022-12-17 13:04:37.336305: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-17 13:04:37.504464: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2022-12-17 13:04:37.511675: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-12-17 13:04:37.511691: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore 

## Data loading

In [10]:
# Words to be spotted. One additional label is reserved for "not a keyword".
keywords = ['marvin', 'no', 'yes']
n_labels = 1 + len(keywords)  # keywords + the "not a keyword" class

# Word list of the dataset (not referenced by the other cells shown in this notebook).
categories = [
    'bed', 'down', 'forward', 'house', 'nine', 'one', 'six', 'tree', 'visual',
    'bird', 'eight', 'four', 'learn', 'no', 'right', 'stop', 'two', 'wow',
    'cat', 'five', 'go', 'left', 'off', 'seven', 'up', 'yes',
    'backward', 'dog', 'follow', 'happy', 'marvin', 'on', 'sheila', 'three', 'zero',
]

# Load the train / test splits for the chosen keywords.
# NOTE(review): how load_dataset_keywords assigns integer labels is defined in utils — confirm there.
data_dir = 'speechdataset/processed_data'
X_train, Y_train, X_test, Y_test = load_dataset_keywords(data_dir, keywords)

In [3]:
# The sample axis is the last axis of X_train (it is moved to the front with
# np.transpose(..., [2,0,1]) before training), so it must match the label count.
assert X_train.shape[-1] == Y_train.shape[0], "X_train and Y_train disagree on the number of samples"
print(X_train.shape, Y_train.shape)

(99, 39, 19587) (19587,)


## Convolutional Neural Network Architecture

In [10]:
def KWS_CNN_model(input_shape, n_classes=None):
    """
    Build the (uncompiled) CNN keyword-spotting classifier.

    Layer stack: Conv2D -> MaxPooling2D -> ReLU -> Conv2D -> ReLU -> Flatten
    -> Dense(32) without activation -> 3 x Dense(128, ReLU) -> softmax.

    Arguments:
    :param input_shape: shape of one example (no batch axis). Its first entry
        is taken as the number of frames and sizes both convolution kernels
        along that axis; the notebook calls this with (99, 39, 1).
    :param n_classes: number of units of the final softmax layer. When None
        (default) the notebook-level global `n_labels` is used, which is what
        the function did before this parameter existed.

    :returns Model: a tf.keras.Model() instance named 'KWS_CNN'
    """
    if n_classes is None:
        # Backward-compatible fallback for existing calls that rely on the global.
        n_classes = n_labels

    layers = tf.keras.layers
    n_frames = input_shape[0]

    # Kernel extents along the frame axis: two thirds and one fifth of the
    # input length. Floor division keeps them integers; max(1, ...) avoids a
    # zero-sized kernel for very short inputs.
    first_kernel_frames = max(1, 2 * n_frames // 3)
    second_kernel_frames = max(1, n_frames // 5)

    X_input = tf.keras.Input(input_shape)

    # First convolution
    X = layers.Conv2D(filters=94, kernel_size=(first_kernel_frames, 8), strides=(1, 1),
                      padding='same', name='First_Conv')(X_input)

    # Pooling over both axes (2 along axis 0, 3 along axis 1), then the non-linearity
    X = layers.MaxPooling2D(pool_size=(2, 3), strides=(2, 3), padding='valid', name='MaxPooling')(X)
    X = layers.Activation('relu')(X)

    # Second convolution
    X = layers.Conv2D(filters=94, kernel_size=(second_kernel_frames, 4), strides=(1, 1),
                      padding='same', name='Second_Conv')(X)
    X = layers.Activation('relu')(X)

    # Linear layer: Dense without activation applied to the flattened feature maps
    X = layers.Flatten()(X)
    X = layers.Dense(32, name='Linear')(X)

    # Three fully connected ReLU layers: Dense1, Dense2, Dense3
    for idx in (1, 2, 3):
        X = layers.Dense(128, activation='relu', name='Dense{}'.format(idx))(X)

    # Softmax over the output classes
    X = layers.Dense(n_classes, activation='softmax', name='Softmax')(X)

    # MODEL
    return Model(inputs=X_input, outputs=X, name='KWS_CNN')

In [11]:
# Derive the per-example input shape from the loaded data instead of hard-coding it:
# the first two axes of X_train plus a single channel axis -> (99, 39, 1) for this dataset.
input_shape = tuple(X_train.shape[:2]) + (1,)
model = KWS_CNN_model(input_shape)
model.summary()

Model: "KWS_CNN"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 99, 39, 1)]       0         
                                                                 
 First_Conv (Conv2D)         (None, 99, 39, 94)        49726     
                                                                 
 MaxPooling (MaxPooling2D)   (None, 49, 13, 94)        0         
                                                                 
 activation_2 (Activation)   (None, 49, 13, 94)        0         
                                                                 
 Second_Conv (Conv2D)        (None, 49, 13, 94)        671630    
                                                                 
 activation_3 (Activation)   (None, 49, 13, 94)        0         
                                                                 
 flatten_1 (Flatten)         (None, 59878)             0   

In [18]:
# Adam optimiser with categorical cross-entropy; the labels are one-hot encoded
# with tf.one_hot in the training and evaluation cells.
model.compile(
    optimizer="adam",
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=["accuracy"],
)

In [None]:
# Train the model
# Stop training once the validation loss has not improved for 5 consecutive epochs.
early_stop_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

# One-hot encode the integer labels to match the CategoricalCrossentropy loss.
Y_train_onehot = tf.one_hot(Y_train, n_labels, 1, 0)
print(Y_train_onehot.shape)

# Fit
# X_train keeps the sample axis last, so it is moved to the front for Keras.
# The early-stopping callback is now actually passed to fit(); previously it
# was created but never used, so all 100 epochs always ran.
history = model.fit(np.transpose(X_train, [2,0,1]), Y_train_onehot, epochs=100, batch_size=10,
                    validation_split=0.1, callbacks=[early_stop_callback])

In [None]:
# Persist the trained weights under the checkpoint prefix; the evaluation cell
# restores them from the same CHECKPOINTS_PATH.
model.save_weights(filepath=CHECKPOINTS_PATH)

In [21]:
# Restore the weights previously written to CHECKPOINTS_PATH
model.load_weights(CHECKPOINTS_PATH)

# Score the restored model on the test split: samples moved to the first axis,
# labels one-hot encoded exactly as for training.
# NOTE(review): the recorded run reports 16.75% accuracy, which is below the 25%
# expected from uniform guessing over 4 classes — confirm that the checkpoint
# belongs to this architecture and a completed training run.
X_test_samples_first = np.transpose(X_test, [2,0,1])
Y_test_onehot = tf.one_hot(Y_test, n_labels, 1, 0)
loss, acc = model.evaluate(X_test_samples_first, Y_test_onehot, verbose=2)
print("Restored model, accuracy: {:5.2f}%".format(100 * acc))

109/109 - 49s - loss: 2.0072 - accuracy: 0.1675 - 49s/epoch - 446ms/step
Restored model, accuracy: 16.75%


## Further approaches (TODO): clustering, RNN, ...

In [None]:
# Placeholder cell: evaluating the bare name only displays the model object's repr.
# TODO: the approaches announced in this section's heading are not implemented in this cell.
model