In [1]:
import os
import sys
import numpy as np
from scipy.io import wavfile 
from librosa.feature import mfcc
from librosa import load

import tensorflow as tf
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import SimpleRNN, Activation, Dense
from keras.optimizers import Adam

INPUT_SIZE = 20
BATCH_SIZE = 100
BATCH_INDEX = 0
OUTPUT_SIZE = 10
CELL_SIZE = 50
LR = 5e-4

PATH = 'SpeechCommands/speech_commands_v0.02'
DIGITS = '0123456789'
LABELS = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine']

In [2]:
def load_wav_files():
    D_train, D_test = [], []
    with open('trainingspeakers.txt') as fileobj:
        trainingspeakers = [line.strip() for line in fileobj]
    fileobj.close()
    with open('validationspeakers.txt') as fileobj:
        validationspeakers = [line.strip() for line in fileobj]
    fileobj.close()
    for d in DIGITS:
        dpath = os.path.join(PATH, LABELS[int(d)])
        for filename in [x for x in os.listdir(dpath)]:
            filepath = os.path.join(dpath, filename)
            sig, rate = load(filepath)

            # Extract MFCC features
            mfcc_features = mfcc(sig, rate).T
            dp = np.vstack(([int(d)]*INPUT_SIZE, mfcc_features))
            if filename.split('_')[0] in validationspeakers:
                D_test.append(dp)
            else:
                D_train.append(dp)
    lt = len(D_train)
    D_train.extend(D_test)
    D = tf.keras.preprocessing.sequence.pad_sequences(D_train, padding="post")
    D_train = D[:lt]
    D_test = D[lt:]
    np.random.shuffle(D_train)
    np.random.shuffle(D_test)
    X_train = [dp[1:] for dp in D_train]
    X_test = [dp[1:] for dp in D_test]
    y_train = [dp[0][0] for dp in D_train]
    y_test = [dp[0][0] for dp in D_test]
    y_train = np_utils.to_categorical(y_train, num_classes=10)
    y_test = np_utils.to_categorical(y_test, num_classes=10)
    return (np.array(X_train), y_train), (np.array(X_test), y_test)

In [4]:
traind, testd = load_wav_files()
X_train, y_train = traind
X_test, y_test = testd
scl = np.max(np.abs(X_train))
X_train = X_train/scl
X_test = X_test/scl

In [5]:
TIME_STEPS = X_train.shape[1]

In [23]:
# build RNN model
model = Sequential()

# RNN cell
model.add(SimpleRNN(
    batch_input_shape=(None, TIME_STEPS, INPUT_SIZE),
    units=CELL_SIZE,
    activation="elu",
    kernel_initializer="orthogonal",
    recurrent_regularizer="l2"
))

# output layer
model.add(Dense(OUTPUT_SIZE))
model.add(Activation('softmax'))

# optimizer
adam = Adam(tf.keras.optimizers.schedules.ExponentialDecay(
    LR,
    decay_steps=500,
    decay_rate=0.94,
    staircase=True))
model.compile(optimizer=adam,
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [24]:
# training
for step in range(100001):
    # data shape = (batch_num, steps, inputs/outputs)
    X_batch = X_train[BATCH_INDEX: BATCH_INDEX+BATCH_SIZE, :, :]
    Y_batch = y_train[BATCH_INDEX: BATCH_INDEX+BATCH_SIZE, :]
    cost = model.train_on_batch(X_batch, Y_batch)
    BATCH_INDEX += BATCH_SIZE
    BATCH_INDEX = 0 if BATCH_INDEX >= X_train.shape[0] else BATCH_INDEX

    if step % 5000 == 0:
        loss, accuracy = model.evaluate(X_train, y_train, batch_size=y_train.shape[0], verbose=False)
        print('train loss: ', loss, 'train accuracy: ', accuracy)
        loss, accuracy = model.evaluate(X_test, y_test, batch_size=y_test.shape[0], verbose=False)
        print('test loss: ', loss, 'test accuracy: ', accuracy)

train loss:  2.8095529079437256 train accuracy:  0.10377146303653717
test loss:  2.809520721435547 test accuracy:  0.10563652962446213
train loss:  1.2783865928649902 train accuracy:  0.6426576972007751
test loss:  1.2976655960083008 test accuracy:  0.6434928178787231
train loss:  1.1305230855941772 train accuracy:  0.70171719789505
test loss:  1.1505022048950195 test accuracy:  0.6996001601219177
train loss:  1.0664575099945068 train accuracy:  0.7273952960968018
test loss:  1.092832326889038 test accuracy:  0.7228170037269592
train loss:  1.0390204191207886 train accuracy:  0.7394960522651672
test loss:  1.0655114650726318 test accuracy:  0.7351992726325989
train loss:  1.0107507705688477 train accuracy:  0.7502808570861816
test loss:  1.0390881299972534 test accuracy:  0.7444860339164734
train loss:  1.0001981258392334 train accuracy:  0.7544214129447937
test loss:  1.028967022895813 test accuracy:  0.7469366788864136
train loss:  0.9940862655639648 train accuracy:  0.75666826963424

In [25]:
y_true = np.argmax(y_test, axis=-1)
y_pred = np.argmax(model.predict(X_test), axis=-1)

In [26]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_true, y_pred)

array([[685,   0,  70,  28,  15,   0,   5,  22,   1,   6],
       [  5, 546,   0,   5,  36,  70,   0,   9,   3,  93],
       [ 60,   2, 518,  95,   5,   1,  11,  51,  13,   8],
       [  3,   0,  64, 518,   0,   9,   1,  41,  37,  62],
       [ 16,  37,   4,   7, 613,  17,   2,  44,   4,  12],
       [  3, 119,   0,   5,  14, 516,   1,   9,  16, 103],
       [  3,   0,  36,  21,   1,   4, 651,  25,  35,   3],
       [  4,   0,  41,  32,  31,   9,   6, 637,   5,  26],
       [  2,   3,  10,  89,   1,  24,  13,   6, 592,  13],
       [  0, 111,   2,  38,   9,  55,   0,  24,   2, 549]])