In [4]:
import os
import sys
import numpy as np
from scipy.io import wavfile 
from librosa.feature import mfcc
from librosa import load

import tensorflow as tf
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import SimpleRNN, Activation, Dense
from keras.optimizers import Adam

INPUT_SIZE = 20
BATCH_SIZE = 100
BATCH_INDEX = 0
OUTPUT_SIZE = 10
CELL_SIZE = 50
LR = 5e-4

PATH = 'free-spoken-digit-dataset-master/recordings'
DIGITS = '0123456789'

In [5]:
def load_wav_files(test_pct=0.1, test_speaker=None):
    D_train, D_test = [], []
    for d in DIGITS:
        for filename in [x for x in os.listdir(PATH) if x.endswith('.wav') and x.startswith(d)]:
            filepath = os.path.join(PATH, filename)
            sig, rate = load(filepath)

            # Extract MFCC features
            mfcc_features = mfcc(sig, rate).T
            dp = np.vstack(([int(d)]*INPUT_SIZE, mfcc_features))
            if test_speaker:
                if test_speaker in filename:
                    D_test.append(dp)
                else:
                    D_train.append(dp)
            else:
                D_train.append(dp)
    if test_speaker:
        lt = len(D_train)
    else:
        lt = int(len(D_train)*(1-test_pct))
    D_train.extend(D_test)
    D = tf.keras.preprocessing.sequence.pad_sequences(D_train, padding="post")
    if test_speaker:
        D_train = D[:lt]
        D_test = D[lt:]
        np.random.shuffle(D_train)
        np.random.shuffle(D_test)
        X_train = [dp[1:] for dp in D_train]
        X_test = [dp[1:] for dp in D_test]
        y_train = [dp[0][0] for dp in D_train]
        y_test = [dp[0][0] for dp in D_test]
        y_train = np_utils.to_categorical(y_train, num_classes=10)
        y_test = np_utils.to_categorical(y_test, num_classes=10)
        return (np.array(X_train), y_train), (np.array(X_test), y_test)
    else:
        np.random.shuffle(D)
        X = [dp[1:] for dp in D]
        y = [dp[0][0] for dp in D]
        y = np_utils.to_categorical(y, num_classes=10)
        return (np.array(X[:lt]), y[:lt]), (np.array(X[lt:]), y[lt:])

In [6]:
traind, testd = load_wav_files()
X_train, y_train = traind
X_test, y_test = testd
scl = np.max(np.abs(X_train))

In [7]:
X_train = X_train/scl
X_test = X_test/scl

In [8]:
TIME_STEPS = X_train.shape[1]

In [21]:
# build RNN model
model = Sequential()

# RNN cell
model.add(SimpleRNN(
    batch_input_shape=(None, TIME_STEPS, INPUT_SIZE),
    units=CELL_SIZE,
    activation="elu",
    kernel_initializer="orthogonal",
    recurrent_regularizer="l2"
))

# output layer
model.add(Dense(OUTPUT_SIZE))
model.add(Activation('softmax'))

# optimizer
adam = Adam(tf.keras.optimizers.schedules.ExponentialDecay(
    LR,
    decay_steps=500,
    decay_rate=0.94,
    staircase=True))
model.compile(optimizer=adam,
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [22]:
# training
for step in range(30001):
    # data shape = (batch_num, steps, inputs/outputs)
    X_batch = X_train[BATCH_INDEX: BATCH_INDEX+BATCH_SIZE, :, :]
    Y_batch = y_train[BATCH_INDEX: BATCH_INDEX+BATCH_SIZE, :]
    cost = model.train_on_batch(X_batch, Y_batch)
    BATCH_INDEX += BATCH_SIZE
    BATCH_INDEX = 0 if BATCH_INDEX >= X_train.shape[0] else BATCH_INDEX

    if step % 1000 == 0:
        loss, accuracy = model.evaluate(X_train, y_train, batch_size=y_train.shape[0], verbose=False)
        print('train loss: ', loss, 'train accuracy: ', accuracy)
        loss, accuracy = model.evaluate(X_test, y_test, batch_size=y_test.shape[0], verbose=False)
        print('test loss: ', loss, 'test accuracy: ', accuracy)

train loss:  2.79999041557312 train accuracy:  0.09370370209217072
test loss:  2.802091121673584 test accuracy:  0.09333333373069763
train loss:  1.8447370529174805 train accuracy:  0.41740739345550537
test loss:  1.8977195024490356 test accuracy:  0.3799999952316284
train loss:  1.448486328125 train accuracy:  0.597777783870697
test loss:  1.5100284814834595 test accuracy:  0.5600000023841858
train loss:  1.1955831050872803 train accuracy:  0.7062963247299194
test loss:  1.2538082599639893 test accuracy:  0.653333306312561
train loss:  1.0415067672729492 train accuracy:  0.7651851773262024
test loss:  1.0619148015975952 test accuracy:  0.7433333396911621
train loss:  0.9965206980705261 train accuracy:  0.7781481742858887
test loss:  1.0202271938323975 test accuracy:  0.7733333110809326
train loss:  0.9517961740493774 train accuracy:  0.8018518686294556
test loss:  1.0015106201171875 test accuracy:  0.7866666913032532
train loss:  0.9474383592605591 train accuracy:  0.79666668176651
te

In [32]:
y_true = np.argmax(y_test, axis=-1)
y_pred = np.argmax(model.predict(X_test), axis=-1)

In [34]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_true, y_pred)

array([[28,  0,  0,  2,  1,  0,  0,  0,  0,  0],
       [ 0, 25,  0,  0,  3,  5,  0,  0,  0,  0],
       [ 1,  0, 25,  1,  0,  0,  0,  0,  1,  0],
       [ 4,  0,  1, 22,  0,  0,  4,  2,  0,  0],
       [ 6,  1,  0,  0, 36,  0,  0,  0,  0,  0],
       [ 0,  5,  0,  0,  0, 26,  0,  0,  0,  3],
       [ 0,  0,  1,  0,  0,  0, 28,  0,  3,  0],
       [ 1,  0,  0,  0,  0,  0,  0, 16,  1,  0],
       [ 0,  0,  0,  0,  0,  0,  3,  0, 23,  0],
       [ 0,  0,  0,  0,  0,  1,  0,  0,  0, 21]])