TODO: add a validation set and a test set.

# Define Functions for Loading Dataset

In [44]:
import os 
from random import shuffle
from keras.utils import to_categorical
import wave
import numpy as np
import librosa

# Number of frames requested per read; the returned vector holds CHUNK * 2 values.
CHUNK = 4096
# Directory that the generators scan for .wav files.
path = 'data/spoken_numbers_pcm/'

def load_wav_file(name):
    """Load the start of a WAV file as a fixed-length list of floats.

    The raw frame bytes are read one byte at a time as ``uint8``, shifted by
    128 (with uint8 wraparound) and divided by 255. The result is truncated or
    zero-padded to exactly ``CHUNK * 2`` values.

    Args:
        name: Path of the WAV file to read.

    Returns:
        A list of ``CHUNK * 2`` floats.

    Raises:
        wave.Error: If the file is not a WAV file the ``wave`` module can read.
    """
    target_len = CHUNK * 2
    samples = np.zeros(target_len)  # positions never written stay as zero padding
    filled = 0
    # The context manager closes the file even if decoding raises.
    with wave.open(name, 'rb') as wav_file:
        # Stop as soon as enough values are collected; the rest would be discarded.
        while filled < target_len:
            raw = wav_file.readframes(CHUNK)
            if not raw:
                break
            # NOTE(review): every byte is treated as one sample regardless of
            # the file's sample width -- confirm this is intended for 16-bit data.
            data = np.frombuffer(raw, dtype='uint8')
            # uint8 addition wraps modulo 256 before the float division.
            data = (data + 128) / 255.
            take = min(len(data), target_len - filled)
            samples[filled:filled + take] = data[:take]
            filled += take
    return list(samples)

def wavGenerator(batch_size=1000):
    """Yield endless batches of raw-audio vectors and one-hot labels.

    The ``.wav`` files under ``path`` are reshuffled on every pass. The class
    label is the first character of the file name, parsed as a digit. A
    partially filled batch at the end of a pass is carried into the next pass.

    Args:
        batch_size: Number of examples per yielded batch.

    Yields:
        ``(waves, labels)``: ``waves`` is an array with one ``load_wav_file``
        vector per row; ``labels`` is a 2-D array with one one-hot row
        (10 classes) per example.

    Raises:
        ValueError: If ``path`` contains no ``.wav`` files (raised on the
            first ``next()`` instead of looping forever).
    """
    # Filter once up front; the directory listing is not refreshed between passes.
    wav_files = [name for name in os.listdir(path) if name.endswith(".wav")]
    if not wav_files:
        raise ValueError("no .wav files found in %r" % path)

    batch_waves = []
    labels = []
    while True:
        shuffle(wav_files)
        for wav in wav_files:
            labels.append(to_categorical(int(wav[0]), num_classes=10))
            batch_waves.append(load_wav_file(os.path.join(path, wav)))
            if len(batch_waves) >= batch_size:
                # reshape (not squeeze) keeps the batch axis when batch_size == 1,
                # whatever per-label shape to_categorical returned.
                yield (np.asarray(batch_waves),
                       np.asarray(labels).reshape(len(labels), -1))
                batch_waves = []  # Reset for next batch
                labels = []

def mfccGenerator(batch_size=10, max_frames=80):
    """Yield endless batches of fixed-width MFCC matrices and one-hot labels.

    The ``.wav`` files under ``path`` are reshuffled on every pass. Each file
    is loaded as mono audio with librosa, converted to MFCCs, and then cut or
    zero-padded along the frame axis to exactly ``max_frames`` columns. The
    class label is the first character of the file name, parsed as a digit.

    Args:
        batch_size: Number of examples per yielded batch.
        max_frames: Number of MFCC frames (columns) kept per example.

    Yields:
        ``(features, labels)``: ``features`` stacks one MFCC matrix per
        example; ``labels`` is a 2-D array with one one-hot row (10 classes)
        per example.

    Raises:
        ValueError: If ``path`` contains no ``.wav`` files (raised on the
            first ``next()`` instead of looping forever).
    """
    # Filter once up front; the directory listing is not refreshed between passes.
    wav_files = [name for name in os.listdir(path) if name.endswith(".wav")]
    if not wav_files:
        raise ValueError("no .wav files found in %r" % path)

    batch_features = []
    labels = []
    while True:
        shuffle(wav_files)
        for wav in wav_files:
            labels.append(to_categorical(int(wav[0]), num_classes=10))
            # Named `signal` so the imported `wave` module is not shadowed.
            signal, sr = librosa.load(os.path.join(path, wav), mono=True)
            # Keyword arguments work on both old and keyword-only librosa APIs.
            mfcc = librosa.feature.mfcc(y=signal, sr=sr)
            # Cut long clips first so the pad width below is never negative.
            mfcc = mfcc[:, :max_frames]
            mfcc = np.pad(mfcc, ((0, 0), (0, max_frames - mfcc.shape[1])),
                          mode='constant', constant_values=0)
            batch_features.append(np.array(mfcc))
            if len(batch_features) >= batch_size:
                # reshape (not squeeze) keeps the batch axis when batch_size == 1.
                yield (np.asarray(batch_features),
                       np.asarray(labels).reshape(len(labels), -1))
                batch_features = []  # Reset for next batch
                labels = []

# Simple NN on Truncated Raw Audio

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout

# Two-layer fully connected classifier over 8192-value raw audio vectors.
model = Sequential([
    Dense(64, activation='relu', input_shape=(8192,)),
    Dense(10, activation='softmax'),
])

# Print the layer / parameter table.
model.summary()

# Ten-way classification with one-hot targets.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Fit from the raw-waveform generator; one epoch covers the directory
# listing length divided by the batch size (integer division).
batch_size = 1000
hist = model.fit_generator(
    wavGenerator(batch_size),
    steps_per_epoch=len(os.listdir(path)) // batch_size,
    epochs=5,
    verbose=1,
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_31 (Dense)             (None, 64)                524352    
_________________________________________________________________
dense_32 (Dense)             (None, 10)                650       
Total params: 525,002
Trainable params: 525,002
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

# LSTM on MFCC Features

In [64]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM

# Single LSTM layer over (20, 80) MFCC inputs, followed by a softmax head.
model = Sequential([
    LSTM(128, activation='relu', input_shape=(20, 80)),
    Dense(10, activation='softmax'),
])

# Print the layer / parameter table.
model.summary()

# Ten-way classification with one-hot targets.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Fit from the MFCC generator; one epoch covers the directory
# listing length divided by the batch size (integer division).
batch_size = 100
hist = model.fit_generator(
    mfccGenerator(batch_size),
    steps_per_epoch=len(os.listdir(path)) // batch_size,
    epochs=5,
    verbose=1,
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_20 (LSTM)               (None, 128)               107008    
_________________________________________________________________
dense_28 (Dense)             (None, 10)                1290      
Total params: 108,298
Trainable params: 108,298
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


# Conv1D + GRU on MFCC Features

In [51]:
from keras.models import Sequential
from keras.layers import Convolution1D, Dense, GRU

# Width-2 convolution over (20, 80) MFCC inputs, summarised by a GRU and
# classified by a softmax head.
model = Sequential([
    Convolution1D(200, 2, padding='same', activation='relu', input_shape=(20, 80)),
    GRU(200, activation='relu'),
    Dense(10, activation='softmax'),
])

# Print the layer / parameter table.
model.summary()

# Ten-way classification with one-hot targets.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Fit from the MFCC generator; one epoch covers the directory
# listing length divided by the batch size (integer division).
batch_size = 100
hist = model.fit_generator(
    mfccGenerator(batch_size),
    steps_per_epoch=len(os.listdir(path)) // batch_size,
    epochs=5,
    verbose=1,
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_22 (Conv1D)           (None, 20, 200)           32200     
_________________________________________________________________
gru_25 (GRU)                 (None, 200)               240600    
_________________________________________________________________
dense_18 (Dense)             (None, 10)                2010      
Total params: 274,810
Trainable params: 274,810
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


# Conv1D on MFCC Features

In [62]:
from keras.models import Sequential
# TimeDistributed is used below; without this import the cell raises a
# NameError when run in a fresh kernel.
from keras.layers import (Convolution1D, Dense, MaxPooling1D, Activation,
                          Flatten, TimeDistributed)

# define the model
model = Sequential()
model.add(Convolution1D(200, 2, padding='same', activation='relu', input_shape=(20, 80)))
# Pooling halves the 20 steps to 10, which is what yields 10 outputs below.
model.add(MaxPooling1D(2))
# One scalar per remaining step; flattened, these 10 values are the class scores.
model.add(TimeDistributed(Dense(1)))
model.add(Flatten())
model.add(Activation('softmax'))

# summarize the model
model.summary()

# compile the model 
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# train the model 
batch_size = 100
hist = model.fit_generator(mfccGenerator(batch_size), 
                           steps_per_epoch=len(os.listdir(path)) // batch_size, 
                           epochs=5, verbose=1)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_33 (Conv1D)           (None, 20, 200)           32200     
_________________________________________________________________
max_pooling1d_9 (MaxPooling1 (None, 10, 200)           0         
_________________________________________________________________
time_distributed_14 (TimeDis (None, 10, 1)             201       
_________________________________________________________________
flatten_3 (Flatten)          (None, 10)                0         
_________________________________________________________________
activation_2 (Activation)    (None, 10)                0         
Total params: 32,401
Trainable params: 32,401
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM

# Two stacked LSTM layers over (20, 80) MFCC inputs; the first returns its
# full sequence so the second can consume it, then a softmax head classifies.
model = Sequential([
    LSTM(100, activation='relu', return_sequences=True, input_shape=(20, 80)),
    LSTM(100, activation='relu'),
    Dense(10, activation='softmax'),
])

# Print the layer / parameter table.
model.summary()

# Ten-way classification with one-hot targets.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Fit from the MFCC generator; one epoch covers the directory
# listing length divided by the batch size (integer division).
batch_size = 100
hist = model.fit_generator(
    mfccGenerator(batch_size),
    steps_per_epoch=len(os.listdir(path)) // batch_size,
    epochs=5,
    verbose=1,
)