# 1 - Packages

In [2]:
import os
import random
import keras
import librosa
import numpy as np
from os import listdir
from os.path import isdir, join
from keras.models import Model
from sklearn.model_selection import train_test_split
from keras.initializers import glorot_normal
from keras.layers import Input, Activation, LSTM
from keras.layers import Dense, Dropout, Bidirectional
from keras.optimizers import Adam

# 2 - Input

In [3]:
dir_path = input()
#speech_recognition_dataset/train/audio

speech_recognition_dataset/train/audio


# 3 - Find All Labels

In [4]:
categories = [f for f in listdir(dir_path) if isdir(join(dir_path, f))]
categories.sort()
categories.remove('.ipynb_checkpoints')
print('Number of Categories: ', len(categories[1:]))
print(categories[1:])

Number of Categories:  30
['bed', 'bird', 'cat', 'dog', 'down', 'eight', 'five', 'four', 'go', 'happy', 'house', 'left', 'marvin', 'nine', 'no', 'off', 'on', 'one', 'right', 'seven', 'sheila', 'six', 'stop', 'three', 'tree', 'two', 'up', 'wow', 'yes', 'zero']


# 4 - Constants

In [5]:
target_list = ['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go']
unknown_list = [d for d in categories if d not in target_list and d != '_background_noise_' ]
print('Target List: ', target_list)
print('\nUnknowns List : ', unknown_list)

Target List:  ['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go']

Unknowns List :  ['bed', 'bird', 'cat', 'dog', 'eight', 'five', 'four', 'happy', 'house', 'marvin', 'nine', 'one', 'seven', 'sheila', 'six', 'three', 'tree', 'two', 'wow', 'zero']


# 5 - Load Data

In [6]:
file_name = 'background_noise_mfcc.memmap'
#393 is the number of the Silence Examples
fpc = np.memmap(file_name, dtype='float32', mode='r', shape=((393) * 101,20)) 

In [7]:
background_noise = []
with open('background_noise_label.txt') as f:
    background_noise_label = f.readlines()
for i in range(0, len(background_noise_label)):
    x = fpc[i*101:(i+1)*101, :]
    y = background_noise_label[i].strip()
    background_noise.append([x, y])

In [8]:
file_name = 'all_waves_mfcc.memmap'
#58252 is the number of the Known and Unknown Examples
fpc = np.memmap(file_name, dtype='float32', mode='r', shape=((58252) * 101,20)) 

In [9]:
all_dataset_wav = []
unknown_wav = []
label_all = []
with open('all_waves_label.txt') as f:
    all_waves_label = f.readlines()
for i in range(0, len(all_waves_label)):
    mfcc = fpc[i*101:(i+1)*101, :]
    y = all_waves_label[i].strip()
    if y in unknown_list:
        unknown_wav.append(mfcc)
    else:
        all_dataset_wav.append([mfcc, y])

# 6 - Prepare Data

In [10]:
waves = np.reshape(np.delete(all_dataset_wav,1,1),(len(all_dataset_wav)))
label_all = [i for i in np.delete(all_dataset_wav,0,1).tolist()]

In [11]:
wave_values = np.array([x for x in waves])
label_values = [x for x in label_all]
label_values = np.array(label_values)

In [12]:
print(np.array(unknown_wav).shape, wave_values.shape)

(35871, 101, 20) (21312, 101, 20)


In [13]:
unknown = unknown_wav
np.random.shuffle(unknown)
unknown = np.array(unknown)
unknown = unknown[:2000*(3)]
unknown_label = np.array(['unknown' for _ in range(2000*(3))])
unknown_label = unknown_label.reshape(2000*(3),1)
print(unknown.shape, unknown_label.shape)

(6000, 101, 20) (6000, 1)


In [14]:
wav_silence = np.reshape(np.delete(background_noise,1,1),(len(background_noise)))
silence_wave = np.array([x for x in wav_silence])
silence_label = np.array(['silence' for _ in range(len(background_noise))])
silence_label = silence_label.reshape(-1,1)

In [15]:
print(wave_values.shape, label_values.shape)
print(unknown.shape, unknown_label.shape)
print(silence_wave.shape, silence_label.shape)

(21312, 101, 20) (21312, 1)
(6000, 101, 20) (6000, 1)
(393, 101, 20) (393, 1)


In [16]:
wave_values = np.concatenate((wave_values, silence_wave), axis = 0)
wave_values = np.concatenate((wave_values, unknown), axis = 0)
label_values = np.concatenate((label_values, unknown_label), axis = 0)
label_values = np.concatenate((label_values, silence_label), axis = 0)
print(len(wave_values), len(label_values))

27705 27705


In [17]:
X_train, X_test, Y_train, Y_test = train_test_split(wave_values, label_values, 
                                                                    test_size=0.2,
                                                                    random_state = 1993,
                                                                   shuffle=True)

In [18]:
label_value = target_list
label_value.append('unknown')
label_value.append('silence')

# 7 - Convert to one hot vector 

In [19]:
new_label_value = dict()
for i, l in enumerate(label_value):
    new_label_value[l] = i
label_value = new_label_value
print(label_value)

{'yes': 0, 'no': 1, 'up': 2, 'down': 3, 'left': 4, 'right': 5, 'on': 6, 'off': 7, 'stop': 8, 'go': 9, 'unknown': 10, 'silence': 11}


In [20]:
temp = []
cnt = 0

for v in Y_train:
    temp.append(label_value[v[0]])
Y_train = np.array(temp)

temp = []
for v in Y_test:
    temp.append(label_value[v[0]])
Y_test = np.array(temp)

#Make Label data 'class num' -> 'One hot vector'
Y_train = keras.utils.to_categorical(Y_train, len(label_value))
Y_test = keras.utils.to_categorical(Y_test, len(label_value))

In [21]:
X_train = X_train.reshape(-1, X_train.shape[1], X_train.shape[2])

In [22]:
print('Train_Wav Demension : ' + str(np.shape(X_train)))
print('Train_Label Demension : ' + str(np.shape(Y_train)))
print('Test_Wav Demension : ' + str(np.shape(X_test)))
print('Test_Label Demension : ' + str(np.shape(Y_test)))
print('Number Of Labels : ' + str(len(label_value)))

Train_Wav Demension : (22164, 101, 20)
Train_Label Demension : (22164, 12)
Test_Wav Demension : (5541, 101, 20)
Test_Label Demension : (5541, 12)
Number Of Labels : 12


# 7 - Model Structure

In [23]:
inputs = Input(shape=((101, 20)))
blstm1 = Bidirectional(LSTM(units=256 ,return_sequences=True,kernel_initializer=glorot_normal(seed=961)))(inputs)
dropout1 = Dropout(0.5)(blstm1)
blstm2 = Bidirectional(LSTM(units=256 ,return_sequences=False,kernel_initializer=glorot_normal(seed=961)))(dropout1)
dropout2 = Dropout(0.5)(blstm2)
dense1 = Dense(units=512, activation='relu', kernel_initializer=glorot_normal(seed=961))(dropout2)
dense2 = Dense(units=512, activation='relu', kernel_initializer=glorot_normal(seed=961))(dense1)
output = Dense(units=len(label_value),  activation='softmax', kernel_initializer=glorot_normal(seed=961))(dense2)
model = Model(inputs, output)
model.compile(loss=keras.losses.categorical_crossentropy,
             optimizer=keras.optimizers.Adam(lr = 0.001),
             metrics=['accuracy'])

In [24]:
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 101, 20)           0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 101, 512)          567296    
_________________________________________________________________
dropout_1 (Dropout)          (None, 101, 512)          0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, 512)               1574912   
_________________________________________________________________
dropout_2 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 512)               262656    
_________________________________________________________________
dense_2 (Dense)              (None, 512)               2626

# 8 - Training

In [25]:
history = model.fit(X_train, Y_train, validation_data=[X_test, Y_test],
          batch_size=512, 
          epochs=12,
          verbose=1)


Train on 22164 samples, validate on 5541 samples
Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


In [27]:
_, test_acc = model.evaluate(X_train, Y_train)

print('Test accuracy:', test_acc)

Test accuracy: 0.9330445528030396
