# Keyword Spotting with different architectures

In [3]:
import numpy as np
np.random.seed(1234)

from os.path import join as pjoin
from pathlib import Path
import tensorflow as tf
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.initializers import glorot_uniform
from utils import load_dataset, load_dataset_keywords
CHECKPOINTS_PATH = 'models/'

## Convolutional Neural Network Architecture

In [5]:
def KWS_CNN_model(input_shape, n_classes=4):
    """
    Build the convolutional keyword-spotting network.

    Layer stack: Conv2D -> MaxPooling2D -> ReLU -> Conv2D -> ReLU -> Flatten
    -> Dense(32, no activation) -> 3 x Dense(128, relu)
    -> Dense(n_classes, softmax).

    Arguments:
    :param input_shape: shape of one sample, without the batch dimension.
        Its first entry is read as the number of frames and is used to size
        the height of both convolution kernels.
    :param n_classes: number of units of the final softmax layer. Defaults
        to 4, the value that used to be hard-coded.

    :returns Model: a tf.keras.Model() instance named 'KWS_CNN'; it is not
        compiled here.
    """

    X_input = tf.keras.Input(input_shape)
    n_frames = input_shape[0]

    # Kernel heights scale with the number of frames. They are clamped to 1
    # because the plain integer conversion yields 0 for very short inputs,
    # which is not a valid kernel size.
    first_kernel_height = max(1, int(2*n_frames/3))
    second_kernel_height = max(1, int(n_frames/5))

    # First convolution ('same' padding keeps the spatial dimensions)
    X = tf.keras.layers.Conv2D(filters=94, kernel_size=(first_kernel_height, 8), strides=(1, 1),
                               padding='same', name='First_Conv')(X_input)
    # (A Dropout(rate=0.2) layer was left disabled at this point.)

    # Pooling on time and frequency, followed by the non-linearity
    X = tf.keras.layers.MaxPooling2D(pool_size=(2, 3), strides=(2, 3), padding='valid', name='MaxPooling')(X)
    X = tf.keras.layers.Activation('relu')(X)

    # Second convolution
    X = tf.keras.layers.Conv2D(filters=94, kernel_size=(second_kernel_height, 4), strides=(1, 1),
                               padding='same', name='Second_Conv')(X)
    # (A Dropout(rate=0.2) layer was left disabled at this point.)

    X = tf.keras.layers.Activation('relu')(X)

    # Linear layer: Dense without activation applied to the flattened features
    X = tf.keras.layers.Flatten()(X)
    X = tf.keras.layers.Dense(32, name='Linear')(X)

    # Dense layers
    X = tf.keras.layers.Dense(128, activation='relu', name='Dense1')(X)
    X = tf.keras.layers.Dense(128, activation='relu', name='Dense2')(X)
    X = tf.keras.layers.Dense(128, activation='relu', name='Dense3')(X)

    # Softmax over the output classes
    X = tf.keras.layers.Dense(n_classes, activation='softmax', name='Softmax')(X)

    # MODEL
    model = Model(inputs=X_input, outputs=X, name='KWS_CNN')

    return model

In [3]:
# `frames` is not defined by any earlier cell, so it is set explicitly here.
# 99 matches the input shape (None, 99, 39, 1) shown in the recorded summary.
frames = 99
model = KWS_CNN_model((frames,39,1))
model.summary()

Model: "KWS_CNN"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 99, 39, 1)]       0         
                                                                 
 First_Conv (Conv2D)         (None, 99, 39, 94)        49726     
                                                                 
 MaxPooling (MaxPooling2D)   (None, 49, 13, 94)        0         
                                                                 
 activation (Activation)     (None, 49, 13, 94)        0         
                                                                 
 Second_Conv (Conv2D)        (None, 49, 13, 94)        671630    
                                                                 
 activation_1 (Activation)   (None, 49, 13, 94)        0         
                                                                 
 flatten (Flatten)           (None, 59878)             0   

2022-12-18 12:06:19.353184: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-12-18 12:06:19.354816: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-12-18 12:06:19.354883: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory
2022-12-18 12:06:19.354923: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublasLt.so.11'; dlerror: libcublasLt.so.11: cannot open shared object file: No such file or directory
2022-12-18 12:06:19.389303: W tensorflow/c

In [4]:
# Configure the model for training: Adam optimizer, categorical
# cross-entropy loss, and accuracy as the reported metric.
model.compile(
    optimizer="adam",
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=["accuracy"],
)

## Training for different preprocessing parameters

In [6]:
# Data directory handed to the dataset loader in the cells below.
data_dir = 'speechdataset/processed_data'

# Words treated as keywords.
keywords = ['marvin', 'no', 'yes']

# Full list of categories handed to the dataset loader (order kept as is).
categories = [
    'bed', 'down', 'forward', 'house', 'nine', 'one', 'six', 'tree', 'visual',
    'bird', 'eight', 'four', 'learn', 'no', 'right', 'stop', 'two', 'wow',
    'cat', 'five', 'go', 'left', 'off', 'seven', 'up', 'yes', 'backward',
    'dog', 'follow', 'happy', 'marvin', 'on', 'sheila', 'three', 'zero',
]

# One label per keyword plus one extra label for "not a keyword".
n_labels = 1 + len(keywords)

In [None]:
# Parameters
# Each entry of `windows` is (window length [s], window step [s], frames).
windows = [(0.025, 0.02, 66), (0.025, 0.005, 197), (0.030, 0.01, 99)] # s
num_filters_s = [26]

# Early stopping criteria: stop once val_loss fails to improve by 1e-5 for one epoch
early_stop_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=1e-5, patience=1)

# Make sure the checkpoint directory exists; otherwise the saves below fail
# on a fresh checkout where it has not been created yet.
Path(CHECKPOINTS_PATH).mkdir(parents=True, exist_ok=True)

# Save initial weights (of the `model` built in the earlier cell)
model.save_weights(pjoin(CHECKPOINTS_PATH, 'initial_weights.h5'))

# Train, save and evaluate one fresh model per preprocessing configuration
for window_len, window_step, frames in windows:
    for num_filt in num_filters_s:
        print(f'Training with preproessing parameters: wlen {window_len*1000}ms, wstep {window_step*1000}ms, {num_filt} filters')
        # Load data
        X_train, Y_train, X_test, Y_test = load_dataset_keywords(data_dir, keywords, categories, frames=frames,
                                                                 winlen=window_len, winstep=window_step, nfilt=num_filt)
        print('Dataset loaded')

        # Move the last axis of the feature arrays to the front.
        # NOTE(review): presumably the loader returns (frames, features, samples) - confirm in utils.
        train_inputs = np.transpose(X_train, [2,0,1])
        test_inputs = np.transpose(X_test, [2,0,1])

        # One-hot targets with on-value 1 and off-value 0
        train_targets = tf.one_hot(Y_train, n_labels, 1, 0)
        test_targets = tf.one_hot(Y_test, n_labels, 1, 0)

        # Train the model (rebuilt from scratch for every configuration)
        model = KWS_CNN_model((frames,39,1))
        model.compile(optimizer="adam", loss=tf.keras.losses.CategoricalCrossentropy(), metrics=["accuracy"])

        model.fit(train_inputs, train_targets, epochs=5, batch_size=10,
                  validation_split=0.1, callbacks=[early_stop_callback])

        # The checkpoint name encodes the preprocessing parameters in milliseconds.
        # int() truncates, which is exact for the window values listed above.
        model.save(pjoin(CHECKPOINTS_PATH, f'CNNmodel_{int(window_len*1000)}ms_{int(window_step*1000)}ms_{num_filt}'))

        loss, acc = model.evaluate(test_inputs, test_targets, verbose=2)
        print("Test accuracy: {:5.2f}%".format(100 * acc))

Training with preproessing parameters: wlen 25.0ms, wstep 20.0ms, 26 filters
Dataset loaded
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5




INFO:tensorflow:Assets written to: checkPoints/weights_25ms_20ms_26/assets


INFO:tensorflow:Assets written to: checkPoints/weights_25ms_20ms_26/assets


497/497 - 97s - loss: 0.1962 - accuracy: 0.9379 - 97s/epoch - 195ms/step
Test accuracy: 93.79%
Training with preproessing parameters: wlen 25.0ms, wstep 5.0ms, 26 filters
Dataset loaded
Epoch 1/5

In [None]:
# List the saved CNN checkpoints found under the checkpoint directory.
PATH = Path(CHECKPOINTS_PATH)
# The loop variable is deliberately not named `model`: that name is bound to
# the Keras model by earlier cells and would otherwise be replaced by a Path.
# Sorting makes the listing order deterministic (glob order is unspecified).
for checkpoint_path in sorted(PATH.glob('CNNmodel*')):
    print(checkpoint_path)

In [9]:
# Each tuple is (window length [s], window step [s], frames, number of filters).
params = [
    (0.02, 0.01, 99, 26),
    (0.025, 0.02, 66, 26),
    (0.025, 0.005, 197, 26),
    (0.030, 0.01, 99, 26),
    (0.025, 0.01, 99, 26),
    (0.025, 0.01, 99, 30),
    (0.025, 0.01, 99, 40),
    (0.025, 0.01, 99, 20),
] # s

# Evaluate the saved model of every configuration on its test split.
for window_len, window_step, frames, num_filt in params:
    print(f'Preprocessing parameters: wlen {window_len*1000}ms, wstep {window_step*1000}ms, {num_filt} filters')

    # Only the test split is used here; the training split is discarded.
    _, _, X_test, Y_test = load_dataset_keywords(
        data_dir, keywords, categories,
        frames=frames, winlen=window_len, winstep=window_step, nfilt=num_filt,
    )

    # Checkpoint directory named after the preprocessing parameters in milliseconds.
    checkpoint_dir = pjoin(
        CHECKPOINTS_PATH,
        f'CNNmodel_{int(window_len*1000)}ms_{int(window_step*1000)}ms_{num_filt}/',
    )
    new_model = tf.keras.models.load_model(checkpoint_dir)

    # Same input transposition and one-hot encoding as used in the training cell.
    test_inputs = np.transpose(X_test, [2,0,1])
    test_targets = tf.one_hot(Y_test, n_labels, 1, 0)
    loss, acc = new_model.evaluate(test_inputs, test_targets, verbose=2)
    print("Test accuracy: {:5.8f}%".format(100 * acc), "\n")

Preprocessing parameters: wlen 20.0ms, wstep 10.0ms, 26 filters
497/497 - 216s - loss: 0.4225 - accuracy: 0.9036 - 216s/epoch - 436ms/step
Test accuracy: 90.35590291% 

Preprocessing parameters: wlen 25.0ms, wstep 20.0ms, 26 filters
497/497 - 96s - loss: 0.1882 - accuracy: 0.9413 - 96s/epoch - 194ms/step
Test accuracy: 94.12913322% 

Preprocessing parameters: wlen 25.0ms, wstep 5.0ms, 26 filters
497/497 - 912s - loss: 0.4113 - accuracy: 0.9058 - 912s/epoch - 2s/step
Test accuracy: 90.57638049% 

Preprocessing parameters: wlen 30.0ms, wstep 10.0ms, 26 filters
497/497 - 219s - loss: 0.4064 - accuracy: 0.9077 - 219s/epoch - 440ms/step
Test accuracy: 90.77165127% 

Preprocessing parameters: wlen 25.0ms, wstep 10.0ms, 26 filters
497/497 - 213s - loss: 0.4156 - accuracy: 0.9054 - 213s/epoch - 429ms/step
Test accuracy: 90.54487944% 

Preprocessing parameters: wlen 25.0ms, wstep 10.0ms, 30 filters
497/497 - 216s - loss: 0.4136 - accuracy: 0.9055 - 216s/epoch - 435ms/step
Test accuracy: 90.5511