In [0]:
#https://github.com/PacktPublishing/Python-Deep-Learning-Cookbook/blob/master/Chapter09/Chapter%209%20-%20Identifying%20speakers%20with%20voice%20recognition.ipynb

In [0]:
!wget https://www.dropbox.com/s/u94fie2pwddwasv/spoken_numbers_pcm.tar?dl=0

--2020-02-11 09:56:48--  https://www.dropbox.com/s/u94fie2pwddwasv/spoken_numbers_pcm.tar?dl=0
Resolving www.dropbox.com (www.dropbox.com)... 162.125.8.1, 2620:100:601b:1::a27d:801
Connecting to www.dropbox.com (www.dropbox.com)|162.125.8.1|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /s/raw/u94fie2pwddwasv/spoken_numbers_pcm.tar [following]
--2020-02-11 09:56:48--  https://www.dropbox.com/s/raw/u94fie2pwddwasv/spoken_numbers_pcm.tar
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uc581891a486c1a07f9088b41d5b.dl.dropboxusercontent.com/cd/0/inline/Ax5eEa1vCF_rMy0K6ytL4SVJ_rXtOd8tpCb0a2_wA5J8F35KYR_CMiZwSum5sXkloYHa7JXcIcizuyiYE62b_FUfpQ32dEz_4sGOqJs6XKM-OA/file# [following]
--2020-02-11 09:56:48--  https://uc581891a486c1a07f9088b41d5b.dl.dropboxusercontent.com/cd/0/inline/Ax5eEa1vCF_rMy0K6ytL4SVJ_rXtOd8tpCb0a2_wA5J8F35KYR_CMiZwSum5sXkloYHa7JXcIcizuyiYE62b_FUfpQ32dEz_4sGO

In [0]:
!tar -xvf /content/spoken_numbers_pcm.tar?dl=0

spoken_numbers_pcm/
spoken_numbers_pcm/0_Agnes_100.wav
spoken_numbers_pcm/0_Agnes_120.wav
spoken_numbers_pcm/0_Agnes_140.wav
spoken_numbers_pcm/0_Agnes_160.wav
spoken_numbers_pcm/0_Agnes_180.wav
spoken_numbers_pcm/0_Agnes_200.wav
spoken_numbers_pcm/0_Agnes_220.wav
spoken_numbers_pcm/0_Agnes_240.wav
spoken_numbers_pcm/0_Agnes_260.wav
spoken_numbers_pcm/0_Agnes_280.wav
spoken_numbers_pcm/0_Agnes_300.wav
spoken_numbers_pcm/0_Agnes_320.wav
spoken_numbers_pcm/0_Agnes_340.wav
spoken_numbers_pcm/0_Agnes_360.wav
spoken_numbers_pcm/0_Agnes_380.wav
spoken_numbers_pcm/0_Agnes_400.wav
spoken_numbers_pcm/0_Albert_100.wav
spoken_numbers_pcm/0_Albert_120.wav
spoken_numbers_pcm/0_Albert_140.wav
spoken_numbers_pcm/0_Albert_160.wav
spoken_numbers_pcm/0_Albert_180.wav
spoken_numbers_pcm/0_Albert_200.wav
spoken_numbers_pcm/0_Albert_220.wav
spoken_numbers_pcm/0_Albert_240.wav
spoken_numbers_pcm/0_Albert_260.wav
spoken_numbers_pcm/0_Albert_280.wav
spoken_numbers_pcm/0_Albert_300.wav
spoken_numbers_pcm/0_Alb

In [0]:

import glob
import numpy as np
import random
import librosa
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

import keras
from keras.layers import LSTM, Dense, Dropout, Flatten
from keras.models import Sequential
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint

In [0]:
SEED = 2017
DATA_DIR = '/content/spoken_numbers_pcm/'

In [0]:
files = glob.glob(DATA_DIR + "*.wav")
X_train, X_val = train_test_split(files, test_size=0.2, random_state=SEED)

print('# Training examples: {}'.format(len(X_train)))
print('# Validation examples: {}'.format(len(X_val)))

# Training examples: 1920
# Validation examples: 480


In [0]:
labels = []
for i in range(len(X_train)):
    label = X_train[i].split('/')[-1].split('_')[1]
    if label not in labels:
        labels.append(label)
print(labels)

['Agnes', 'Daniel', 'Albert', 'Tom', 'Junior', 'Bruce', 'Steffi', 'Fred', 'Kathy', 'Ralph', 'Princess', 'Victoria', 'Vicki', 'Alex', 'Samantha']


In [0]:
label_binarizer = LabelBinarizer()
label_binarizer.fit(list(set(labels)))

def one_hot_encode(x): return label_binarizer.transform(x)

In [0]:
n_features = 20
max_length = 80
n_classes = len(labels)

In [0]:
def batch_generator(data, batch_size=16):
    while 1:
        random.shuffle(data)
        X, y = [], []
        for i in range(batch_size):
            wav = data[i]
            wave, sr = librosa.load(wav, mono=True)
            label = wav.split('/')[-1].split('_')[1]
            y.append(label)
            mfcc = librosa.feature.mfcc(wave, sr)
            mfcc = np.pad(mfcc, ((0,0), (0, max_length-len(mfcc[0]))), mode='constant', constant_values=0) 
            X.append(np.array(mfcc))
        yield np.array(X), np.array(one_hot_encode(y))

In [0]:
learning_rate = 0.001
batch_size = 64
n_epochs = 10
dropout = 0.5

input_shape = (n_features, max_length)
steps_per_epoch = 50

In [0]:
model = Sequential()
model.add(LSTM(256, return_sequences=True, input_shape=input_shape,
dropout=dropout))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(dropout))
model.add(Dense(n_classes, activation='softmax'))

In [0]:
opt = Adam(lr=learning_rate)
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_2 (LSTM)                (None, 20, 256)           345088    
_________________________________________________________________
flatten_2 (Flatten)          (None, 5120)              0         
_________________________________________________________________
dense_3 (Dense)              (None, 128)               655488    
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 15)                1935      
Total params: 1,002,511
Trainable params: 1,002,511
Non-trainable params: 0
_________________________________________________________________


In [0]:
callbacks = [ModelCheckpoint('/content/drive/My Drive/Models/Speech Recognition/voice_recognition_best_model_{epoch:02d}.hdf5', save_best_only=True),
            EarlyStopping(monitor='val_acc', patience=2)]

In [0]:

history = model.fit_generator(
 generator=batch_generator(X_train, batch_size),
 steps_per_epoch=steps_per_epoch,
 epochs=n_epochs,
 verbose=1,
 validation_data=batch_generator(X_val, 32),
 validation_steps=5,
 callbacks=callbacks
 )

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



Epoch 1/10


