In [59]:
import numpy as np
import os
import random
import tensorflow as tf

from tensorflow.keras.layers import Input, Conv2D, MaxPool2D, Dropout, Flatten, Dense, LSTM
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical

from scipy.io import wavfile
from glob import glob
from python_speech_features import mfcc

In [39]:
# Dataset root. Files are expected at <input_dir>/<label>/<clip>: the loader
# looks up each clip's parent folder name in `labels` to get its class index.
input_dir = "speaker"
train_batch_size = 32
val_batch_size = 8
# os.listdir returns entries in arbitrary order, so sort them to keep the
# label -> class-index mapping identical across runs and machines (otherwise a
# saved model's output indices cannot be mapped back to folders reliably).
# Only directories are kept so stray files in the root do not become classes.
labels = sorted(
    entry for entry in os.listdir(input_dir)
    if os.path.isdir(os.path.join(input_dir, entry))
)

In [40]:
# Settings for the MFCC extraction and the random crop taken from each clip.
rate = 16000       # nominal sample rate (presumably Hz); only used to derive `step`
step = rate // 10  # crop length in samples: one tenth of `rate`
nfft = 512         # forwarded to mfcc(..., nfft=nfft)
nfilt = 26         # forwarded to mfcc(..., nfilt=nfilt)
nfeat = 13         # forwarded to mfcc(..., numcep=nfeat)

In [47]:
def features_from_txt(filename):
    """Return MFCC features of a random crop of one clip plus its one-hot label.

    Despite the name, ``filename`` is read with ``scipy.io.wavfile.read`` and
    therefore has to be a WAV file. The name of its parent directory is the
    class label and is looked up in the module-level ``labels`` list.

    Args:
        filename: Path to a WAV file stored as ``<root>/<label>/<clip>``.

    Returns:
        Tuple ``(X_sample, label)``: ``X_sample`` is the transposed ``mfcc``
        output for a randomly positioned crop of ``step`` samples, ``label``
        is ``to_categorical`` of the folder's index with ``len(labels)``
        classes.

    Raises:
        ValueError: If the clip holds fewer than ``step`` samples, or the
            parent folder name is not in ``labels``.
    """
    folder = os.path.basename(os.path.dirname(filename))
    # Distinct local name so the file's own sample rate is not confused with
    # the module-level `rate` constant that was used to size `step`.
    sample_rate, wav = wavfile.read(filename)

    n_samples = wav.shape[0]
    if n_samples < step:
        raise ValueError(
            "{!r} has {} samples but a window of {} samples is required".format(
                filename, n_samples, step
            )
        )

    # randint's upper bound is exclusive: the +1 makes the last full window
    # reachable and lets a clip of exactly `step` samples work (start index 0).
    rand_index = np.random.randint(0, n_samples - step + 1)
    sample = wav[rand_index:rand_index + step]

    # Transposed so the coefficient axis comes first; the dataset cell declares
    # this element as shape (13, 9).
    X_sample = mfcc(sample, sample_rate, numcep=nfeat, nfilt=nfilt, nfft=nfft).T
    label = to_categorical(labels.index(folder), num_classes=len(labels))
    return (X_sample, label)

In [48]:
# features_from_txt("speaker/0016/00003.wav")

In [49]:
def generate_data(input_dir, shuffle=False):
    """Create a zero-argument generator function over the clips in ``input_dir``.

    Every regular file matching ``<input_dir>/*/*`` is collected once, put in a
    random order, and then served endlessly through ``features_from_txt``.

    Args:
        input_dir: Dataset root containing one sub-directory per label.
        shuffle: If true, each item is drawn with ``random.choice`` (sampling
            with replacement). If false, the once-shuffled list is cycled in
            order, so every file is visited once per pass.

    Returns:
        A callable taking no arguments that returns an infinite generator of
        ``features_from_txt(path)`` results (suitable for
        ``tf.data.Dataset.from_generator``).

    Raises:
        FileNotFoundError: If no files are found under ``input_dir``.
    """
    # glob order depends on the filesystem; sorting first makes the shuffle
    # below reproducible whenever the `random` module is seeded. Directories
    # matched by the pattern are dropped because they cannot be read as clips.
    files = sorted(
        path for path in glob(os.path.join(input_dir, "*", "*"))
        if os.path.isfile(path)
    )
    # Fail here with a clear message instead of an IndexError/ZeroDivisionError
    # raised later from inside the generator.
    if not files:
        raise FileNotFoundError(
            "no files found under {!r}; expected <input_dir>/<label>/<clip>".format(input_dir)
        )
    random.shuffle(files)

    def fetch():
        while True:
            if shuffle:
                yield features_from_txt(random.choice(files))
            else:
                # One full pass in the fixed shuffled order, then start over.
                for path in files:
                    yield features_from_txt(path)

    return fetch

In [56]:
def audio_model(input_shape=(13, 9), num_classes=20):
    """Build the stacked-LSTM classifier used for the audio features.

    Args:
        input_shape: Shape of one (unbatched) feature matrix. Defaults to
            ``(13, 9)``, matching the dataset's declared feature shape.
        num_classes: Width of the softmax output layer. Defaults to 20; it has
            to equal the length of the one-hot label vectors fed to the model.

    Returns:
        An uncompiled ``Model`` mapping ``audio_input`` to ``audio_output``.
    """
    # NOTE(review): Keras recurrent layers step over axis 1 of the batched
    # input. With the default (13, 9) input that axis has length 13, which
    # looks like the cepstral-coefficient axis rather than the frame axis;
    # confirm this orientation is intended.
    audio_input = Input(shape=input_shape, name='audio_input')
    x = LSTM(128, return_sequences=True)(audio_input)
    x = LSTM(64)(x)
    audio_output = Dense(num_classes, activation='softmax', name='audio_output')(x)
    return Model(inputs=audio_input, outputs=audio_output)

In [60]:
# Wrap the endless sample generator in a tf.data pipeline and batch it.
# `batch_size` is read again by the training cell when naming the saved model.
batch_size = 32
dataset = tf.data.Dataset.from_generator(
    generate_data(input_dir),
    output_types=(tf.float64, tf.uint8),
    output_shapes=((13, 9), (20,)),
).batch(batch_size)

# Sanity check: pull a single batch and show the feature / label shapes.
for features, targets in dataset.take(1):
    print(features.shape, targets.shape)

(32, 13, 9) (32, 20)


In [61]:
# Training hyper-parameters.
epochs = 50
lr = 0.0001
steps_per_epoch = 20
save_model_as = "audio_epochs{}_lr{}_batch{}"

# `lr` is a deprecated alias that newer Keras releases reject;
# `learning_rate` is accepted by both old and new versions.
optimizer = Adam(learning_rate=lr)
model = audio_model()
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# The generator behind `dataset` never terminates, so the length of an epoch
# has to be bounded explicitly with steps_per_epoch.
history = model.fit(dataset, epochs=epochs, steps_per_epoch=steps_per_epoch)
model.save(save_model_as.format(epochs, lr, batch_size))

NameError: name 'train_generator' is not defined