# Lab 5: Google Speech Commands

## Imports

In [1]:
import copy
import wave
from pathlib import Path
import numpy as np
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Input, Conv1D, AvgPool1D, MaxPool1D, ZeroPadding1D, BatchNormalization, Flatten, Dense, Activation, Dropout
from keras.utils.data_utils import get_file
from keras.utils.np_utils import to_categorical
import math

## Download, cache and extract Google Speech Commands

In [2]:
dataset_dir = Path('datasets')
if not (dataset_dir/'testing_list.txt').exists(): # Assume dataset already downloaded/extracted if testing list is present
    get_file(None, "http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz",
                    extract=True,
                    file_hash="6b74f3901214cb2c2934e98196829835",
                    cache_dir='.',
                    cache_subdir=dataset_dir)

## Load raw spoken digits data from Google Speech Commands

In [1]:
# Classes to handle, ordered by label
CLASSES = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine']

with (dataset_dir/'testing_list.txt').open() as f:
    testing_list = f.read().splitlines()

x_train = []
y_train = []
x_test = []
y_test = []

for recording in dataset_dir.glob(f'**/*.wav'):
    if not recording.parent.name in CLASSES: # Ignore unused classes
        continue
    label = CLASSES.index(recording.parent.name) # Assign class number
    
    with wave.open(str(recording)) as f: # Read wave file
        data = np.frombuffer(f.readframes(f.getnframes()), dtype=np.int16).copy() # As 16-bit signed integer
        
    data = data.astype(np.float32) # Convert to 32-bit floating-point
    data.resize((16000, 1)) # Resize to 1s (16kHz) with zero-padding, 1 channel

    if str(recording.relative_to(dataset_dir)).replace("\\","/") in testing_list: # Assign to test set if file in test list
        x_test.append(data)
        y_test.append(label)
    else:
        x_train.append(data)
        y_train.append(label)

x_train = np.array(x_train)
y_train = to_categorical(np.array(y_train))

x_test = np.array(x_test)
y_test = to_categorical(np.array(y_test))

NameError: name 'dataset_dir' is not defined

## Prepare for inference with fixed-point Q7.9 samples by scaling input data accordingly

In [4]:
FIXED_POINT = 9
x_train /= 2**FIXED_POINT
x_test  /= 2**FIXED_POINT

## Export small dataset (250 random vectors)

In [5]:
perms = np.random.permutation(len(y_test))[0:250]
x_test_250 = x_test[perms]
y_test_250 = y_test[perms]
np.savetxt('x_test_gsc_250.csv', x_test_250.reshape((x_test_250.shape[0], -1)), delimiter=',', fmt='%s')
np.savetxt('y_test_gsc_250.csv', y_test_250, delimiter=',', fmt='%s')

## Build model *not* M5

In [6]:
# Define hyperparameters
num_classes = 10 # Number of sound classes
num_filters = 2 # Number of filters for CNN layers
num_units = 128 # Number of units for RNN layers
num_heads = 8 # Number of heads for attention layers
dropout_rate = 0.1 # Dropout rate for regularization

model = Sequential()
model.add(Input(shape=(16000, 1)))
model.add(MaxPool1D(pool_size=(4)))
model.add(Conv1D(filters=num_filters, kernel_size=3, activation='relu'))
model.add(MaxPool1D(pool_size=(4)))
model.add(Conv1D(filters=num_filters, kernel_size=3, activation='relu'))
model.add(MaxPool1D(pool_size=(4)))
model.add(Conv1D(filters=num_filters, kernel_size=3, activation='relu'))
model.add(Flatten())
model.add(Dense(units=50, activation='relu'))
model.add(Dense(units=35, activation='relu'))
model.add(Dense(units=16, activation='relu'))
model.add(Dense(units=num_classes, activation='relu'))
model.add(Activation('softmax')) # SoftMax activation needs to be separate from Dense to remove it later on
# EXPLORE Learning Rate
opt = tf.keras.optimizers.Adam(learning_rate=10e-3)
model.summary()
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['categorical_accuracy'])

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 max_pooling1d (MaxPooling1D  (None, 4000, 1)          0         
 )                                                               
                                                                 
 conv1d (Conv1D)             (None, 3998, 2)           8         
                                                                 
 max_pooling1d_1 (MaxPooling  (None, 999, 2)           0         
 1D)                                                             
                                                                 
 conv1d_1 (Conv1D)           (None, 997, 2)            14        
                                                                 
 max_pooling1d_2 (MaxPooling  (None, 249, 2)           0         
 1D)                                                             
                                                        

## Train model

In [7]:
model.fit(x_train, y_train, epochs=30, batch_size=384, validation_data=(x_test, y_test))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x165ffd7ead0>

## Evaluate model on test dataset

In [26]:
# get new test data from board
x_test.shape
x_test_board = []
y_test_board = []

dataset_dir_board = Path('board_dataset')

for recording in dataset_dir_board.glob(f'**/*.wav'):
    if not recording.parent.name in CLASSES: # Ignore unused classes
        continue
    label = CLASSES.index(recording.parent.name) # Assign class number
    
    with wave.open(str(recording)) as f: # Read wave file
        data = np.frombuffer(f.readframes(f.getnframes()), dtype=np.int16).copy() # As 16-bit signed integer
        
    data = data.astype(np.float32) # Convert to 32-bit floating-point
    data.resize((16000, 1)) # Resize to 1s (16kHz) with zero-padding, 1 channel
    
    x_test_board.append(data)
    y_test_board.append(label)

print(len(x_test_board))
print(len(y_test_board))

x_test_board = np.array(x_test_board)
y_test_board = to_categorical(np.array(y_test_board))

FIXED_POINT = 9
x_test_board  /= 2**FIXED_POINT

# model.evaluate(x_test, y_test, verbose=2)
# pred_test = model.predict(x_test)
# print(tf.math.confusion_matrix(y_test.argmax(axis=1), pred_test.argmax(axis=1)))

13
13


In [27]:
model.evaluate(x_test_board, y_test_board, verbose=2)
pred_test_board = model.predict(x_test_board)
print(tf.math.confusion_matrix(y_test_board.argmax(axis=1), pred_test_board.argmax(axis=1)))

1/1 - 0s - loss: 2.7966 - categorical_accuracy: 0.2308 - 32ms/epoch - 32ms/step
tf.Tensor(
[[0 0 1 0 0 0 0 0 0 0]
 [0 1 0 0 0 0 0 0 0 0]
 [0 1 1 0 0 0 0 0 0 0]
 [0 1 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 1]
 [0 0 1 0 0 0 0 0 0 0]
 [0 0 1 0 0 1 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0]
 [0 1 0 0 0 0 0 0 1 0]], shape=(10, 10), dtype=int32)


## Evaluate model on small dataset

In [20]:
model.evaluate(x_test_250, y_test_250, verbose=2)
pred_test_250 = model.predict(x_test_250)
print(tf.math.confusion_matrix(y_test_250.argmax(axis=1), pred_test_250.argmax(axis=1)))

8/8 - 0s - loss: 1.7559 - categorical_accuracy: 0.3280 - 122ms/epoch - 15ms/step
tf.Tensor(
[[14  0  0  0  4  0  1  0  0  0]
 [ 4  3  0  0  2  6  0  2  0  3]
 [17  0  0  0  2  0  0  0  0  0]
 [14  0  0  0  2  0  5  1  0  0]
 [ 7  2  0  0 10  3  0  2  0  2]
 [ 1  6  0  0  2 21  1  4  0  2]
 [ 4  0  0  0  0  0 21  0  0  0]
 [ 6  3  0  0  4  2  0 12  0  2]
 [23  1  0  0  1  0  4  2  0  1]
 [ 1  6  0  0  4  7  1  1  0  1]], shape=(10, 10), dtype=int32)


## Save trained model

In [91]:
model.save('lab_gsc.h5')

## Remove SoftMax layer

In [92]:
model = tf.keras.Model(model.input, model.layers[-2].output, name=model.name)

## Install MicroAI for C inference code generation (kerascnn2c module)

In [93]:
%pip install https://bitbucket.org/edge-team-leat/microai_public/get/6adfbcb347d3.zip#subdirectory=third_party/kerascnn2c_fixed
import kerascnn2c

Collecting https://bitbucket.org/edge-team-leat/microai_public/get/6adfbcb347d3.zip#subdirectory=third_party/kerascnn2c_fixed
  Downloading https://bitbucket.org/edge-team-leat/microai_public/get/6adfbcb347d3.zip (1.9 MB)
     ---------------------------------------- 1.9/1.9 MB 4.5 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Note: you may need to restart the kernel to use updated packages.


## Generate C code for the trained model with 16-bit fixed-point representation

In [94]:
res = kerascnn2c.Converter(output_path=Path('gsc_output_fixed'),
                           fixed_point=FIXED_POINT, # Number of bits for the fractional part, Q7.9 format
                           number_type='int16_t', # Data type for weights/activations (16 bits quantization)
                           long_number_type='int32_t', # Data type for intermediate results
                           number_min=-(2**15), # Minimum value for 16-bit signed integers
                           number_max=(2**15)-1 # Maximum value for 16-bit signed integers
                          ).convert_model(copy.deepcopy(model))
with open('gsc_model_fixed.h', 'w') as f:
    f.write(res)

———————————————————————————————————————————————————————————————————————————————————————————————————————
Inputs                           | Layer                            | Outputs                         
———————————————————————————————————————————————————————————————————————————————————————————————————————
                                 | input_21                         | max_pooling1d_46                
-------------------------------------------------------------------------------------------------------
input_21                         | max_pooling1d_46                 | conv1d_54                       
-------------------------------------------------------------------------------------------------------
max_pooling1d_46                 | conv1d_54                        | max_pooling1d_47                
-------------------------------------------------------------------------------------------------------
conv1d_54                        | max_pooling1d_47                 

## Compile the 16-bit fixed-point C code for x86 and evaluate on small dataset

In [95]:
!g++ -Wall -Wextra -pedantic -Ofast -o gsc_fixed -Igsc_output_fixed/ gsc_output_fixed/model.c main.cpp 
!./gsc_fixed x_test_gsc_250.csv y_test_gsc_250.csv

gsc_output_fixed/model.c: In function 'void cnn(const number_t (*)[16000], number_t*)':
  124 |     activations1.average_pooling1d_11_output,
      |     ~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~
'.' is not recognized as an internal or external command,
operable program or batch file.


In [66]:
# Load new dataset
# get new test data from board
x_train_sapcm = []
y_train_sapcm = []
x_test_sapcm = []
y_test_sapcm = []

dataset_dir_sapcm = Path('dataset_sapcm')

CLASSES_SAPCM = ["Wednesday", "Purple", "Sunrise"]

with Path('dataset_sapcm/testing_list_sapcm.txt').open() as f:
    testing_list_sapcm = f.read().splitlines()
    
for recording in dataset_dir_sapcm.glob(f'**/*.wav'):
    if not recording.parent.name in CLASSES_SAPCM: # Ignore unused classes
        continue
    label = CLASSES_SAPCM.index(recording.parent.name) # Assign class number
    
    with wave.open(str(recording)) as f: # Read wave file
        data = np.frombuffer(f.readframes(f.getnframes()), dtype=np.int16).copy() # As 16-bit signed integer
        
    data = data.astype(np.float32) # Convert to 32-bit floating-point
    data.resize((16000, 1)) # Resize to 1s (16kHz) with zero-padding, 1 channel
    
    x_train_sapcm.append(data)
    y_train_sapcm.append(label)
    
x_train_sapcm = np.array(x_train_sapcm)
y_train_sapcm = to_categorical(np.array(y_train_sapcm))

perms = np.random.permutation(len(y_train_sapcm))[0:math.ceil(len(y_train_sapcm)*0.2)]
x_test_sapcm = x_train_sapcm[perms]
y_test_sapcm = y_train_sapcm[perms]

x_train_sapcm = np.delete(x_train_sapcm, perms, axis=0)
y_train_sapcm = np.delete(y_train_sapcm, perms, axis=0)

x_test_sapcm = np.array(x_test_sapcm)
y_test_sapcm = np.array(y_test_sapcm)

FIXED_POINT = 9
x_train_sapcm /= 2**FIXED_POINT
x_test_sapcm  /= 2**FIXED_POINT

print(x_train_sapcm.shape)
print(y_train_sapcm.shape)
print(x_test_sapcm.shape)
print(y_test_sapcm.shape)


(255, 16000, 1)
(255, 3)
(64, 16000, 1)
(64, 3)


In [55]:
# Define hyperparameters
num_classes = 3 # Number of sound classes
num_filters = 2 # Number of filters for CNN layers
num_units = 128 # Number of units for RNN layers
num_heads = 8 # Number of heads for attention layers
dropout_rate = 0.1 # Dropout rate for regularization

model1 = Sequential()
model1.add(Input(shape=(16000, 1)))
model1.add(MaxPool1D(pool_size=(4)))
model1.add(Conv1D(filters=num_filters, kernel_size=3, activation='relu'))
model1.add(MaxPool1D(pool_size=(4)))
model1.add(Conv1D(filters=num_filters, kernel_size=3, activation='relu'))
model1.add(MaxPool1D(pool_size=(4)))
model1.add(Conv1D(filters=num_filters, kernel_size=3, activation='relu'))
model1.add(Flatten())
model1.add(Dense(units=50, activation='relu'))
model1.add(Dense(units=35, activation='relu'))
model1.add(Dense(units=16, activation='relu'))
model1.add(Dense(units=num_classes, activation='relu'))
model1.add(Activation('softmax')) # SoftMax activation needs to be separate from Dense to remove it later on
# EXPLORE Learning Rate
opt = tf.keras.optimizers.Adam(learning_rate=10e-4)
model1.summary()
model1.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['categorical_accuracy'])

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 max_pooling1d_6 (MaxPooling  (None, 4000, 1)          0         
 1D)                                                             
                                                                 
 conv1d_6 (Conv1D)           (None, 3998, 2)           8         
                                                                 
 max_pooling1d_7 (MaxPooling  (None, 999, 2)           0         
 1D)                                                             
                                                                 
 conv1d_7 (Conv1D)           (None, 997, 2)            14        
                                                                 
 max_pooling1d_8 (MaxPooling  (None, 249, 2)           0         
 1D)                                                             
                                                      

In [69]:
# Define hyperparameters
num_classes = 3 # Number of sound classes
num_filters = 2 # Number of filters for CNN layers
num_units = 128 # Number of units for RNN layers
num_heads = 8 # Number of heads for attention layers
dropout_rate = 0.1 # Dropout rate for regularization

model2 = Sequential()
model2.add(Input(shape=(16000, 1)))
model2.add(MaxPool1D(pool_size=(4)))
model2.add(Conv1D(filters=64, kernel_size=5, activation='softmax'))
model2.add(MaxPool1D(pool_size=(4)))
model2.add(Conv1D(filters=128, kernel_size=5, activation='softmax'))
model2.add(MaxPool1D(pool_size=(4)))
model2.add(Conv1D(filters=256, kernel_size=5, activation='softmax'))
model2.add(Flatten())
model2.add(Dense(units=256, activation='softmax'))
model2.add(Dropout(0.2))
model2.add(Dense(units=128, activation='softmax'))
model2.add(Dropout(0.2))
model2.add(Dense(units=num_classes, activation='softmax'))

opt = tf.keras.optimizers.Adam(learning_rate=10e-2)
model2.summary()
model2.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['categorical_accuracy'])

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 max_pooling1d_18 (MaxPoolin  (None, 4000, 1)          0         
 g1D)                                                            
                                                                 
 conv1d_18 (Conv1D)          (None, 3996, 64)          384       
                                                                 
 max_pooling1d_19 (MaxPoolin  (None, 999, 64)          0         
 g1D)                                                            
                                                                 
 conv1d_19 (Conv1D)          (None, 995, 128)          41088     
                                                                 
 max_pooling1d_20 (MaxPoolin  (None, 248, 128)         0         
 g1D)                                                            
                                                      

In [70]:
model2.fit(x_train_sapcm, y_train_sapcm, epochs=10, batch_size=30, validation_data=(x_test_sapcm, y_test_sapcm))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x265d01f60e0>

In [71]:
model2.evaluate(x_test_sapcm, y_test_sapcm, verbose=2)
pred_test_sapcm = model2.predict(x_test_sapcm)
print(tf.math.confusion_matrix(y_test_sapcm.argmax(axis=1), pred_test_sapcm.argmax(axis=1)))

2/2 - 0s - loss: 1.1196 - categorical_accuracy: 0.2969 - 210ms/epoch - 105ms/step
tf.Tensor(
[[19  0  0]
 [27  0  0]
 [18  0  0]], shape=(3, 3), dtype=int32)
