# Simple NN on Truncated Raw Audio

**TODO:** add a validation set and a test set.

In [47]:
import os 
from random import shuffle
from keras.utils import to_categorical
import wave
import numpy as np

CHUNK = 4096  # frames requested per readframes() call; each clip is fixed to CHUNK * 2 values
path = 'data/spoken_numbers_pcm/'  # directory scanned for .wav files

def load_wav_file(name):
    """Load a WAV file as a fixed-length list of scaled byte values.

    The raw frame bytes are interpreted as unsigned 8-bit integers
    (whatever the file's actual sample width is), offset by 128 and divided
    by 255. The result is truncated or zero-padded to exactly ``CHUNK * 2``
    values.

    Args:
        name: Path of the WAV file to read.

    Returns:
        A list of ``CHUNK * 2`` floats.

    Raises:
        wave.Error: If the file is not a WAV file the ``wave`` module can read.
    """
    target_len = CHUNK * 2
    samples = []
    # The context manager closes the file handle even if reading fails.
    with wave.open(name, 'rb') as f:
        # Stop as soon as enough values are collected; the rest of the file
        # would be discarded by the truncation below anyway.
        while len(samples) < target_len:
            raw = f.readframes(CHUNK)
            if not raw:
                break
            # frombuffer replaces the deprecated binary mode of np.fromstring.
            data = np.frombuffer(raw, dtype='uint8')
            # uint8 addition wraps modulo 256 before the float division.
            data = (data + 128) / 255.
            samples.extend(data)
    samples = samples[:target_len]
    samples.extend(np.zeros(target_len - len(samples)))
    return samples

def wavGenerator(batch_size=1000):
    """Yield endless batches of (waveforms, one-hot labels) from ``path``.

    Each ``.wav`` file in ``path`` is loaded with ``load_wav_file``; its label
    is the digit given by the first character of the file name. The file list
    is reshuffled on every pass, and a partially filled batch carries over
    into the next pass.

    Args:
        batch_size: Number of clips per yielded batch.

    Yields:
        A ``(waves, labels)`` tuple of NumPy arrays with one row per clip;
        ``labels`` holds the 10-class one-hot encodings.

    Raises:
        ValueError: If ``path`` contains no ``.wav`` files, or a file name
            does not start with a digit.
    """
    files = [name for name in os.listdir(path) if name.endswith(".wav")]
    if not files:
        # Without this guard the loop below would spin forever without yielding.
        raise ValueError("no .wav files found in %r" % path)
    batch_waves = []
    labels = []
    while True:
        shuffle(files)
        for wav in files:
            labels.append(to_categorical(int(wav[0]), num_classes=10))
            batch_waves.append(load_wav_file(os.path.join(path, wav)))
            if len(batch_waves) >= batch_size:
                # Flatten each label to one row explicitly; np.squeeze would also
                # drop the batch axis when the batch holds a single clip.
                batch_labels = np.asarray(labels).reshape(len(labels), -1)
                yield np.asarray(batch_waves), batch_labels
                batch_waves = []  # Reset for next batch
                labels = []

In [48]:
from keras.models import Sequential
from keras.layers import Dense, Dropout

# define the model: one hidden layer over the CHUNK * 2 values of each clip
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(CHUNK * 2,)))
model.add(Dropout(0.5))
model.add(Dense(10, activation='softmax'))

# summarize the model
model.summary()

# compile the model 
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# train the model 
batch_size = 1000
# Count only the .wav files the generator actually consumes, and run at least
# one step per epoch even when there are fewer clips than batch_size.
num_wavs = sum(1 for name in os.listdir(path) if name.endswith(".wav"))
hist = model.fit_generator(wavGenerator(batch_size), 
                           steps_per_epoch = max(1, num_wavs // batch_size), 
                           epochs=50, verbose=1)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_10 (Dense)             (None, 64)                524352    
_________________________________________________________________
dropout_9 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_11 (Dense)             (None, 10)                650       
Total params: 525,002
Trainable params: 525,002
Non-trainable params: 0
_________________________________________________________________
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
E

# LSTM on MFCC Features

In [44]:
import librosa

def mfccGenerator(batch_size=10):
    """Yield endless batches of (MFCC features, one-hot labels) from ``path``.

    Each ``.wav`` file in ``path`` is loaded with librosa, converted to MFCCs,
    and truncated or zero-padded along the frame axis to exactly 80 frames.
    The label is the digit given by the first character of the file name. The
    file list is reshuffled on every pass, and a partially filled batch
    carries over into the next pass.

    Args:
        batch_size: Number of clips per yielded batch.

    Yields:
        A ``(features, labels)`` tuple of NumPy arrays with one entry per
        clip; ``labels`` holds the 10-class one-hot encodings.

    Raises:
        ValueError: If ``path`` contains no ``.wav`` files, or a file name
            does not start with a digit.
    """
    max_frames = 80  # frame-axis length; matches the LSTM input_shape=(20, 80)
    files = [name for name in os.listdir(path) if name.endswith(".wav")]
    if not files:
        # Without this guard the loop below would spin forever without yielding.
        raise ValueError("no .wav files found in %r" % path)
    batch_features = []
    labels = []
    while True:
        shuffle(files)
        for wav in files:
            labels.append(to_categorical(int(wav[0]), num_classes=10))
            # Named `signal` rather than `wave` to avoid shadowing the wave module.
            signal, sr = librosa.load(os.path.join(path, wav), mono=True)
            # Keyword arguments work on old and new librosa releases alike.
            mfcc = librosa.feature.mfcc(y=signal, sr=sr)
            # Truncate first so the pad width below can never be negative.
            mfcc = mfcc[:, :max_frames]
            mfcc = np.pad(mfcc, ((0, 0), (0, max_frames - mfcc.shape[1])),
                          mode='constant', constant_values=0)
            batch_features.append(mfcc)
            if len(batch_features) >= batch_size:
                # Flatten each label to one row explicitly; np.squeeze would also
                # drop the batch axis when the batch holds a single clip.
                batch_labels = np.asarray(labels).reshape(len(labels), -1)
                yield np.asarray(batch_features), batch_labels
                batch_features = []  # Reset for next batch
                labels = []

In [46]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM

# define the model: an LSTM over the (20, 80) padded MFCC matrix of each clip
model = Sequential()
model.add(LSTM(128, input_shape=(20, 80)))
model.add(Dropout(0.8))
model.add(Dense(10, activation='softmax'))

# summarize the model
model.summary()

# compile the model 
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# train the model 
batch_size = 100
# Count only the .wav files the generator actually consumes, and run at least
# one step per epoch even when there are fewer clips than batch_size.
num_wavs = sum(1 for name in os.listdir(path) if name.endswith(".wav"))
hist = model.fit_generator(mfccGenerator(batch_size), 
                           steps_per_epoch = max(1, num_wavs // batch_size), 
                           epochs = 50, verbose = 1)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_7 (LSTM)                (None, 128)               107008    
_________________________________________________________________
dropout_8 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_9 (Dense)              (None, 10)                1290      
Total params: 108,298
Trainable params: 108,298
Non-trainable params: 0
_________________________________________________________________
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
E

# TODO: insert a more complex model here