# 30 Speechs Audio Commands Recognition

In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import librosa
import tensorflow as tf
import tensorflow.keras as keras
from my_classes import AudioDataGenerator

In [None]:
tf.test.is_gpu_available()

In [None]:
%%time
path = "C:\\Users\\Beranger\\Desktop\\Datasets\\Audio\\Speech_Commands\\"

all_classes = ['bed','bird','cat','dog','down','eight','five','four','go','happy','house',
         'left','marvin','nine','no','off','on','one','right','seven','sheila','six',
         'stop','three','tree','two','up','wow','yes','zero']
all_files = []
labels = []

for c in all_classes:
    all_files = all_files + os.listdir(path+c)
    for file in os.listdir(path+c):

        labels.append((file,c))
file_labels = dict(labels)

In [None]:
DF = pd.DataFrame(labels)
train_files, test_files, train_labels, test_labels = train_test_split(DF[0] , DF[1], test_size=0.20, random_state=42)
valid_files, test_files, valid_labels, test_labels = train_test_split(test_files , test_labels, test_size=0.50, random_state=42)

train_files = list(train_files.values)
valid_files = list(valid_files.values)
test_files = list(test_files.values)

In [None]:
dim_spec = (20, 44)

train_gen = AudioDataGenerator(list_IDs=train_files,
    all_classes=all_classes,
    base_path=path,
    dim_spec=dim_spec,
    dim_2=(1, 8000),
    labels=file_labels,
    option = 'mfcc',                                     
    batch_size=32,
    shuffle=True,
    n_channels=1)

valid_gen = AudioDataGenerator(list_IDs=valid_files,
    all_classes=all_classes,
    base_path=path,
    dim_spec=dim_spec,
    dim_2=(1, 8000),
    labels=file_labels,
    option='mfcc',
    batch_size=32,
    shuffle=True,
    n_channels=1)

test_gen = AudioDataGenerator(list_IDs=test_files,
    all_classes=all_classes,
    base_path=path,
    dim_spec=dim_spec,
    dim_2=(1, 8000),
    labels=file_labels,
    option='mfcc',
    batch_size=32,
    shuffle=True,
    n_channels=1)

In [29]:
valid_gen[0][0][1].shape

(32, 1, 8000)

# Machine Learning

In [43]:
from keras.models import Model, Input
from keras.layers import Dense, Conv2D, LSTM, BatchNormalization, Flatten, Lambda, Conv1D, MaxPool1D, Reshape, Bidirectional
from keras.layers import Reshape, MaxPooling2D, AveragePooling1D, Dropout, Concatenate, GRU, Permute, GlobalMaxPool1D
from keras.callbacks import ModelCheckpoint, EarlyStopping

callbacks_list = [EarlyStopping(monitor='val_acc',
                                patience=5),
                  ModelCheckpoint(filepath='audio_best_model.h5',
                                  monitor='val_loss',
                                  save_best_only=True)]

def build_model():
    # First branch
    input_mfcc = Input(shape=(*dim_spec, 1))
    f = Conv2D(64, 2, activation='relu')(input_mfcc)
    f = BatchNormalization(axis=2)(f)
    f = Dropout(0.2)(f)
    
    f = Conv2D(64, 2, activation='relu')(f)
    f = BatchNormalization(axis=2)(f)
    f = Dropout(0.2)(f)
    
    f = Permute((2, 1, 3))(f)
    dim_1 = int(f.shape[1])
    dim_2 = int(f.shape[2])
    dim_3 = int(f.shape[3])
    f = Reshape((dim_1, dim_2*dim_3))(f)
    
    f = Bidirectional(GRU(32, return_sequences=True))(f)
    f = BatchNormalization()(f)
    f = Dropout(0.2)(f)
    
    f = Bidirectional(GRU(32, return_sequences=False))(f)
    f = BatchNormalization()(f)
    f = Dropout(0.2)(f)
    
    #f = Flatten()(f)
    
    # Second branch
    input_wave = Input(shape=(1, 8000))
                
    g = Conv1D(8, 13, activation='relu', data_format='channels_first', name='Conv_bloc_1')(input_wave)
    g = MaxPool1D(3)(g)
    g = BatchNormalization()(g)
    g = Dropout(0.3)(g)
    
    g = Conv1D(16, 11, activation='relu', data_format='channels_first', name='Conv_bloc_2')(g)
    g = MaxPool1D(3)(g)
    g = BatchNormalization()(g)
    g = Dropout(0.3)(g)
    
    g = Conv1D(32, 9, activation='relu', data_format='channels_first', name='Conv_bloc_3')(g)
    g = MaxPool1D(3)(g)
    g = BatchNormalization()(g)
    g = Dropout(0.3)(g)   
    
    g = Bidirectional(GRU(64, return_sequences=True))(g)
    g = BatchNormalization()(g)
    
    g = Bidirectional(GRU(64, return_sequences=False))(g)
    g = BatchNormalization()(g)
    
    h = Concatenate()([f, g])
    
    # Joining of the 02 branches                
    outputs = Dense(30, activation='softmax')(h)
    model = Model([input_mfcc, input_wave], outputs)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    print(model.summary())
    return model

In [44]:
model = build_model()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_14 (InputLayer)           (None, 1, 8000)      0                                            
__________________________________________________________________________________________________
Conv_bloc_1 (Conv1D)            (None, 8, 7988)      112         input_14[0][0]                   
__________________________________________________________________________________________________
input_13 (InputLayer)           (None, 20, 44, 1)    0                                            
__________________________________________________________________________________________________
max_pooling1d_19 (MaxPooling1D) (None, 2, 7988)      0           Conv_bloc_1[0][0]                
__________________________________________________________________________________________________
conv2d_13 

In [45]:
%%time
history = model.fit_generator(generator=train_gen,
                              validation_data=valid_gen,
                              steps_per_epoch=1618,
                              epochs=200,
                              validation_steps=202,
                              callbacks=callbacks_list)#,
                    #use_multiprocessing=True,
                    #workers=4)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Wall time: 2h 36min 41s


In [50]:
# Load the best model
saved_model_path = os.path.join(os.getcwd(), 'audio_best_model.h5')
model = keras.models.load_model(saved_model_path)

In [51]:
# Evaluation on test set to be more confident on our model.
model.evaluate_generator(test_gen, steps=202, verbose=1)



[0.17761864562754176, 0.95606434]

In [None]:
#librosa.load("C:\\Users\\Beranger\\Desktop\\Datasets\\Audio\\Speech_Commands\\bird\\0a9f9af7_nohash_0.wav")

In [83]:
dim_spec=(20, 44)
n_channels=1
n_fft=2048
hop_length=512
power=2.0
ref_log_scal=1
n_mfcc=20

def wave2mfcc(path_wav):
    wave, srate = librosa.load(path_wav, duration=2, mono=True, sr=None)
        
    # We create the mfcc
    mfccs = librosa.feature.mfcc(y=wave, sr=samplerate, n_fft=n_fft, hop_length=hop_length,
                                    power=power, n_mels=20) # a changer
        
    wave = librosa.resample(wave, srate, 8000)
    
    return  mfccs, wave
    

def predict(inputs, classes=all_classes):
    mat, audio= inputs
    if mat.shape[1] < 44:
        mat = np.array(np.pad(mat, ((0,0), (0, 44 - mat.shape[1])),'constant', constant_values= 0))
              
    
    
    prob=model.predict([ mat.reshape(1, 20, 44, 1), audio.reshape(1, 1,8000)])
    index = np.argmax(prob[0])
    return classes[index]


In [141]:
import sounddevice as sd
import soundfile as sf

samplerate = 16000
duration = 1 # seconds
filename = 'file.wav'
print("start")
mydata = sd.rec(int(samplerate*duration), samplerate=samplerate,
               channels=1, blocking=True)
print("end")

sd.wait()
sf.write(filename, mydata, samplerate)


start
end


In [142]:
import IPython.display as ipd
filepath = os.path.join(os.getcwd(), 'file.wav')

# reading
samples, sample_rate = librosa.load(filepath, sr=samplerate)
samples = librosa.resample(samples, sample_rate, 8000)
ipd.Audio(samples, rate=8000)

In [143]:
predict([*wave2mfcc(filepath)])

'house'

In [67]:
wave2mfcc(filepath)[0].shape

(8000,)

In [68]:
wave2mfcc(filepath)[1].shape

(20, 32)