## Building NN for speech recogniton and optimize result 

In [208]:
import numpy as np
import os
from scipy.io import wavfile
from python_speech_features import mfcc, logfbank
import matplotlib.pyplot as plt
import pickle
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn import preprocessing
%matplotlib inline

from utils import *


#### Get data

In [11]:
fpaths = []
labels = []
word_spoken = []


dataset = '41'
input_folder = 'data\{}'.format(dataset)

# pars the input directory that contains audio files
# get audio files and their lables

for f in os.listdir(input_folder):
    for w in os.listdir(input_folder+'\\'+ f):
        # check wheter files is wav or not
        
        if (w.find('wav')!=-1):
            fpaths.append(input_folder+'\\'+f+'\\'+w)
            labels.append(f)
            if f not in word_spoken:
                word_spoken.append(f)
print("Spoken words: "+ str(word_spoken))

Spoken words: ['0_zero', '1_one', '2_two', '3_three', '4_four', '5_five', '6_six', '7_seven', '8_eight', '9_nine', 'aboard', 'adjusted & locked', 'All switches', 'Alternate air door', 'A_C Documents', 'Battery+Main bus', 'Cabin doors', 'Checked', 'Circuit Breakers', 'Closed', 'Cockpit', 'Cockpit checklist completed', 'Completed', 'decimal', 'Flight Controls', 'Fuel Quantity', 'Fuel Selector', 'Fuel Shutoff Valve', 'Fuel Temperature', 'in', 'locked', 'off', 'On', 'open', 'preflight_inspection', 'removed', 'Seats & Belts', 'Shut-off cabin heat', 'sufficient', 'Towbar', 'Weight and balance']


In [12]:
# size of dataset
print(len(labels))

21433


## Extracting frequeny domain features

At the second stage we convet a signal into the freqency domain. In monst modern speech recognitoon freqeency-domain features are used as key component. In case of multispeakers MFFC feature extraction works best. After convert a signal into a freq domain, it's requered to convert it into a useable form. **Mel Frequency Cepstral Coefficients (MFCC)** is a good way to do that. *MFCC* takes the power spectrum of a signal and then uses a combination of filter banks and disrete cosinetransform to extract pattern of phones or features.

After extracting **MFFC** features we exract data into single data matrix, and a label vector with the correct label for eac data file is ceated. 

In [334]:
from scipy.io import wavfile

data = []

mfcc_max_length = 0

# first file desitination name and index
file_name = ''
word_spoken_index = 0

for n,file in enumerate(fpaths):
    
    # show current desintation 
    if (file.find(file_name)<=0):
        file_name=word_spoken[word_spoken_index]
        print(word_spoken[word_spoken_index])
        word_spoken_index+=1

    # read file 
    sampling_freq, audio = wavfile.read(file)
    # Extract MFCC features
    mfcc_features = mfcc(audio, sampling_freq)

    
    mfcc_len=mfcc_features.shape[0]*mfcc_features.shape[1]
    # get length of largets feature array
    if mfcc_len>mfcc_max_length:    
        mfcc_max_length=mfcc_len
     
    # flat data into 2D array
    mfcc_features=np.resize(mfcc_features,(1,mfcc_len))
    
    data.insert(n,mfcc_features)

0_zero
1_one
2_two
3_three
4_four
5_five
6_six
7_seven
8_eight
9_nine
aboard
adjusted & locked
All switches
Alternate air door
A_C Documents
Battery+Main bus
Cabin doors
Checked
Circuit Breakers
Closed
Cockpit
Cockpit checklist completed
Completed
decimal
Flight Controls
Fuel Quantity
Fuel Selector
Fuel Shutoff Valve
Fuel Temperature
in
locked
off
On
open
preflight_inspection
removed
Seats & Belts
Shut-off cabin heat
sufficient
Towbar
Weight and balance


#### Pad zeros to small arrays

In [343]:
x_data = np.zeros((len(data),mfcc_max_length), dtype=float)
for i,_d in enumerate(data):
    x_data[i,0:_d.shape[1]]=_d

### Get labels

In [345]:
#Each sample file is one row in data, and has one entry in labels
print('Number of files total:', len(data))
all_labels = np.zeros(len(data),dtype=int)
for n, l in enumerate(set(labels)):
    all_labels[np.array([i for i, _ in enumerate(labels) if _ == l])] = n
print('Labels and label indices', all_labels)

Number of files total: 21433
Labels and label indices [14 14 14 ... 31 31 31]


In [346]:
target = np.zeros((len(all_labels),41),dtype=int)
for i,_ in enumerate(all_labels):
    target[i][_]=1

## Create train and test dataset

In [349]:
from sklearn.model_selection import StratifiedShuffleSplit
sss = StratifiedShuffleSplit(n_splits=10, test_size=0.15, random_state=0)

for train_index, test_index in sss.split(x_data,target):
    X_train, X_test = x_data[train_index], x_data[test_index]
    y_train, y_test = target[train_index], target[test_index]
    

print('Size of training matrix:', X_train.shape)
print('Size of testing matrix:', X_test.shape)

Size of training matrix: (18218, 2808)
Size of testing matrix: (3215, 2808)


#### Normalize values

In [350]:
# normalize train
scaler = preprocessing.StandardScaler().fit(X_train)
X_train=scaler.transform(X_train)

In [357]:
# normalize test
scaler = preprocessing.StandardScaler().fit(X_test)
X_test=scaler.transform(X_test)

In [351]:
#get number of columns in training data
n_cols = x_data.shape[1]

In [352]:
model = Sequential()

#add layers to model
model.add(Dense(200, activation='sigmoid', input_shape=(n_cols,)))
model.add(Dense(200, activation='sigmoid'))
model.add(Dense(200, activation='sigmoid'))
model.add(Dense(41, activation='softmax'))

In [353]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_45 (Dense)             (None, 200)               561800    
_________________________________________________________________
dense_46 (Dense)             (None, 200)               40200     
_________________________________________________________________
dense_47 (Dense)             (None, 200)               40200     
_________________________________________________________________
dense_48 (Dense)             (None, 41)                8241      
Total params: 650,441
Trainable params: 650,441
Non-trainable params: 0
_________________________________________________________________


In [354]:
early_stopping_monitor = EarlyStopping(patience=3)

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [356]:
#train model
model.fit(X_train, y_train, epochs=50, batch_size=20,validation_split=0.2)

Train on 14574 samples, validate on 3644 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x233c49abf28>

### Evaluate model

In [358]:
scores = model.evaluate(X_test, y_test, verbose=0)
print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))

acc: 98.44%
