In [4]:
from os import listdir
from os.path import isdir, join
import librosa
import random
import numpy as np
import matplotlib.pyplot as plt
import python_speech_features

In [5]:
# Dataset path and view possible targets
dataset_path = 'D:\wake word detection\data set for wake word detection'
for name in listdir(dataset_path):
    if isdir(join(dataset_path, name)):
        print(name)

backward
bed
bird
cat
dog
down
eight
five
follow
forward
four
go
happy
house
learn
left
marvin
nine
no
off
on
one
right
seven
sheila
six
stop
three
tree
two
up
visual
wow
yes
zero
_background_noise_


In [6]:
# Create an all targets list
all_targets = [ name for name in listdir(dataset_path) if isdir(join(dataset_path, name))]
print(all_targets)

['backward', 'bed', 'bird', 'cat', 'dog', 'down', 'eight', 'five', 'follow', 'forward', 'four', 'go', 'happy', 'house', 'learn', 'left', 'marvin', 'nine', 'no', 'off', 'on', 'one', 'right', 'seven', 'sheila', 'six', 'stop', 'three', 'tree', 'two', 'up', 'visual', 'wow', 'yes', 'zero', '_background_noise_']


In [7]:
# Leave off background noise set
all_targets.remove('_background_noise_')
print(all_targets)

['backward', 'bed', 'bird', 'cat', 'dog', 'down', 'eight', 'five', 'follow', 'forward', 'four', 'go', 'happy', 'house', 'learn', 'left', 'marvin', 'nine', 'no', 'off', 'on', 'one', 'right', 'seven', 'sheila', 'six', 'stop', 'three', 'tree', 'two', 'up', 'visual', 'wow', 'yes', 'zero']


In [8]:
# See how many files are there in each target directory that we choose
num_samples = 0
for target in all_targets:
    print(len(listdir(join(dataset_path, target))))
    num_samples += len(listdir(join(dataset_path, target)))
print('Total samples:', num_samples)

1664
2014
2064
2031
2128
3917
3787
4052
1579
1557
3728
3880
2054
2113
1575
3801
2100
3934
3941
3745
3845
3890
3778
3998
2022
3860
3872
3727
1759
3880
3723
1592
2123
4044
4052
Total samples: 105829


In [9]:
#each word has few thousand samples
# Settings
target_list = all_targets
feature_sets_file = 'all_targets_mfcc_sets.npz' #we would have feature file with us which will be a matrix in the form of features
#we will store it in a npz file
perc_keep_samples = 0.1 # 1.0 is keep all samples #we use 10% of data set for feature extraction in prototype process,for training 
val_ratio = 0.1 #to test how well model performs in testing on the data it has not been worked on
test_ratio = 0.1#to test how our mode perform
sample_rate = 8000 # final model run faster in a low sampling rate
num_mfcc = 16
len_mfcc = 16

In [10]:
# Create list of filenames along with ground truth vector (y)
#supervised learning for classification so we would need the labels for signals during the training step
filenames = []
y = [] #hold the actual values
for index, target in enumerate(target_list):
    print(join(dataset_path, target))
    filenames.append(listdir(join(dataset_path, target)))
    y.append(np.ones(len(filenames[index])) * index)

D:\wake word detection\data set for wake word detection\backward
D:\wake word detection\data set for wake word detection\bed
D:\wake word detection\data set for wake word detection\bird
D:\wake word detection\data set for wake word detection\cat
D:\wake word detection\data set for wake word detection\dog
D:\wake word detection\data set for wake word detection\down
D:\wake word detection\data set for wake word detection\eight
D:\wake word detection\data set for wake word detection\five
D:\wake word detection\data set for wake word detection\follow
D:\wake word detection\data set for wake word detection\forward
D:\wake word detection\data set for wake word detection\four
D:\wake word detection\data set for wake word detection\go
D:\wake word detection\data set for wake word detection\happy
D:\wake word detection\data set for wake word detection\house
D:\wake word detection\data set for wake word detection\learn
D:\wake word detection\data set for wake word detection\left
D:\wake word det

In [11]:
# Check ground truth Y vector
#number is assigned to words in the alphabetical order i.e. backward = 0, bed=1, bird = 2, etc.
print(y)
for item in y:
    print(len(item))

[array([0., 0., 0., ..., 0., 0., 0.]), array([1., 1., 1., ..., 1., 1., 1.]), array([2., 2., 2., ..., 2., 2., 2.]), array([3., 3., 3., ..., 3., 3., 3.]), array([4., 4., 4., ..., 4., 4., 4.]), array([5., 5., 5., ..., 5., 5., 5.]), array([6., 6., 6., ..., 6., 6., 6.]), array([7., 7., 7., ..., 7., 7., 7.]), array([8., 8., 8., ..., 8., 8., 8.]), array([9., 9., 9., ..., 9., 9., 9.]), array([10., 10., 10., ..., 10., 10., 10.]), array([11., 11., 11., ..., 11., 11., 11.]), array([12., 12., 12., ..., 12., 12., 12.]), array([13., 13., 13., ..., 13., 13., 13.]), array([14., 14., 14., ..., 14., 14., 14.]), array([15., 15., 15., ..., 15., 15., 15.]), array([16., 16., 16., ..., 16., 16., 16.]), array([17., 17., 17., ..., 17., 17., 17.]), array([18., 18., 18., ..., 18., 18., 18.]), array([19., 19., 19., ..., 19., 19., 19.]), array([20., 20., 20., ..., 20., 20., 20.]), array([21., 21., 21., ..., 21., 21., 21.]), array([22., 22., 22., ..., 22., 22., 22.]), array([23., 23., 23., ..., 23., 23., 23.]), arr

In [12]:
# Flatten filename and y vectors so that only a list is present which contains the collection of arrays rather than something else
filenames = [item for sublist in filenames for item in sublist]
y = [item for sublist in y for item in sublist]

In [13]:
# Associate filenames with true output using zip  and shuffle them so that the model trains on the random data rather than on the same data
#on a hierarchical order
filenames_y = list(zip(filenames, y))#to link file name with its associated value, i.e. we make all the similar file names to have same value
random.shuffle(filenames_y) # we shuffle the whole files so that it can be used for testing and training the data sets randomly  
filenames, y = zip(*filenames_y)#unzip the files

In [14]:
# Only keep the specified number of samples (shorter extraction/training)
print(len(filenames))
filenames = filenames[:int(len(filenames) * perc_keep_samples)] 
print(len(filenames))#total no of train set based on available size

105829
10582


In [15]:
# Calculate validation and test set sizes
val_set_size = int(len(filenames) * val_ratio)#total size of validation set based on the total available set size
test_set_size = int(len(filenames) * test_ratio)#total size of test set based on the total available set size


In [16]:
# Break dataset apart into train, validation, and test sets
filenames_val = filenames[:val_set_size] #total no of files for validation set
filenames_test = filenames[val_set_size:(val_set_size + test_set_size)] #total no of files for the test set
filenames_train = filenames[(val_set_size + test_set_size):] #total no of files for the train set

In [17]:
# Break y apart into train, validation, and test sets
y_orig_val = y[:val_set_size]
y_orig_test = y[val_set_size:(val_set_size + test_set_size)]
y_orig_train = y[(val_set_size + test_set_size):]

In [18]:
# Function: Create MFCC from given path
def calc_mfcc(path):  #load wavefile from a given path
    
    # Load wavefile
    signal, fs = librosa.load(path, sr=sample_rate) #resample the audio wave file to 8000 samples per second
    
    # Create MFCCs from sound clip
    mfccs = python_speech_features.base.mfcc(signal, 
                                            samplerate=fs,
                                            winlen=0.256, #window size of MFCCs, we want low no of MFCCs so we keep the size more
                                            winstep=0.050, #distance between window
                                            numcep=num_mfcc, #we want 16 MFCCs
                                            nfilt=26, #we want filter to be 26
                                            nfft=2048, #samples for fft depend on size of the window
                                            preemph=0.0,
                                            ceplifter=0,
                                            appendEnergy=False,#to give total energy of frame once the 0th element is thrown out 
                                            winfunc=np.hanning)#hamming window so that fft didn't produce many unwanted artifacts at higher freq 
    return mfccs.transpose()

In [19]:
# TEST: Construct test set by computing MFCC of each WAV file
prob_cnt = 0
x_test = []
y_test = []
for index, filename in enumerate(filenames_train):
    
    # Stop after 500
    if index >= 500:
        break
    
    # Create path from given filename and target item
    path = join(dataset_path, target_list[int(y_orig_train[index])], 
                filename)
    
    # Create MFCCs
    mfccs = calc_mfcc(path)
    
    if mfccs.shape[1] == len_mfcc:
        x_test.append(mfccs)
        y_test.append(y_orig_train[index])
    else:
        print('Dropped:', index, mfccs.shape)
        prob_cnt += 1

Dropped: 21 (16, 12)
Dropped: 28 (16, 7)
Dropped: 30 (16, 13)
Dropped: 52 (16, 11)
Dropped: 61 (16, 15)
Dropped: 80 (16, 10)
Dropped: 105 (16, 14)
Dropped: 107 (16, 15)
Dropped: 121 (16, 14)
Dropped: 152 (16, 8)
Dropped: 182 (16, 11)
Dropped: 231 (16, 7)
Dropped: 233 (16, 13)
Dropped: 257 (16, 15)
Dropped: 268 (16, 9)
Dropped: 284 (16, 15)
Dropped: 290 (16, 10)
Dropped: 297 (16, 15)
Dropped: 298 (16, 13)
Dropped: 309 (16, 13)
Dropped: 312 (16, 15)
Dropped: 346 (16, 14)
Dropped: 397 (16, 11)
Dropped: 428 (16, 12)
Dropped: 434 (16, 8)
Dropped: 464 (16, 14)
Dropped: 465 (16, 12)
Dropped: 475 (16, 12)
Dropped: 480 (16, 15)
Dropped: 490 (16, 14)


In [20]:
print('% of problematic samples:', prob_cnt / 500)

% of problematic samples: 0.06


In [21]:
#drop those samples which didn't produce 16 sets of coefficients exactly
# Function: Create MFCCs, keeping only ones of desired length
def extract_features(in_files, in_y):
    prob_cnt = 0
    out_x = []
    out_y = []
        
    for index, filename in enumerate(in_files):
    
        # Create path from given filename and target item
        path = join(dataset_path, target_list[int(in_y[index])], 
                    filename)
        
        # Check to make sure we're reading a .wav file
        if not path.endswith('.wav'):
            continue

        # Create MFCCs
        mfccs = calc_mfcc(path)

        # Only keep MFCCs with given length
        if mfccs.shape[1] == len_mfcc:
            out_x.append(mfccs)
            out_y.append(in_y[index])
        else:
            print('Dropped:', index, mfccs.shape)
            prob_cnt += 1
            
    return out_x, out_y, prob_cnt

In [22]:

# Create train, validation, and test sets
x_train, y_train, prob = extract_features(filenames_train, 
                                          y_orig_train)
print('Removed percentage:', prob / len(y_orig_train))
x_val, y_val, prob = extract_features(filenames_val, y_orig_val)
print('Removed percentage:', prob / len(y_orig_val))
x_test, y_test, prob = extract_features(filenames_test, y_orig_test)
print('Removed percentage:', prob / len(y_orig_test))

Dropped: 21 (16, 12)
Dropped: 28 (16, 7)
Dropped: 30 (16, 13)
Dropped: 52 (16, 11)
Dropped: 61 (16, 15)
Dropped: 80 (16, 10)
Dropped: 105 (16, 14)
Dropped: 107 (16, 15)
Dropped: 121 (16, 14)
Dropped: 152 (16, 8)
Dropped: 182 (16, 11)
Dropped: 231 (16, 7)
Dropped: 233 (16, 13)
Dropped: 257 (16, 15)
Dropped: 268 (16, 9)
Dropped: 284 (16, 15)
Dropped: 290 (16, 10)
Dropped: 297 (16, 15)
Dropped: 298 (16, 13)
Dropped: 309 (16, 13)
Dropped: 312 (16, 15)
Dropped: 346 (16, 14)
Dropped: 397 (16, 11)
Dropped: 428 (16, 12)
Dropped: 434 (16, 8)
Dropped: 464 (16, 14)
Dropped: 465 (16, 12)
Dropped: 475 (16, 12)
Dropped: 480 (16, 15)
Dropped: 490 (16, 14)
Dropped: 511 (16, 14)
Dropped: 533 (16, 13)
Dropped: 539 (16, 10)
Dropped: 554 (16, 15)
Dropped: 556 (16, 15)
Dropped: 575 (16, 13)
Dropped: 589 (16, 10)
Dropped: 592 (16, 13)
Dropped: 597 (16, 15)
Dropped: 615 (16, 12)
Dropped: 619 (16, 15)
Dropped: 632 (16, 12)
Dropped: 637 (16, 7)
Dropped: 685 (16, 13)
Dropped: 689 (16, 15)
Dropped: 696 (16, 15)


Dropped: 4676 (16, 13)
Dropped: 4679 (16, 10)
Dropped: 4684 (16, 10)
Dropped: 4728 (16, 12)
Dropped: 4730 (16, 12)
Dropped: 4733 (16, 10)
Dropped: 4750 (16, 15)
Dropped: 4753 (16, 11)
Dropped: 4767 (16, 15)
Dropped: 4769 (16, 13)
Dropped: 4775 (16, 13)
Dropped: 4806 (16, 15)
Dropped: 4811 (16, 13)
Dropped: 4812 (16, 11)
Dropped: 4836 (16, 15)
Dropped: 4842 (16, 13)
Dropped: 4858 (16, 15)
Dropped: 4859 (16, 9)
Dropped: 4877 (16, 14)
Dropped: 4895 (16, 9)
Dropped: 4903 (16, 10)
Dropped: 4907 (16, 10)
Dropped: 4910 (16, 9)
Dropped: 4912 (16, 13)
Dropped: 4916 (16, 13)
Dropped: 4929 (16, 8)
Dropped: 4937 (16, 10)
Dropped: 4938 (16, 11)
Dropped: 4939 (16, 10)
Dropped: 4986 (16, 11)
Dropped: 4987 (16, 12)
Dropped: 5014 (16, 13)
Dropped: 5021 (16, 11)
Dropped: 5034 (16, 11)
Dropped: 5041 (16, 11)
Dropped: 5043 (16, 15)
Dropped: 5045 (16, 11)
Dropped: 5054 (16, 15)
Dropped: 5069 (16, 13)
Dropped: 5077 (16, 8)
Dropped: 5086 (16, 12)
Dropped: 5097 (16, 12)
Dropped: 5113 (16, 15)
Dropped: 5116 (1

Dropped: 640 (16, 11)
Dropped: 651 (16, 14)
Dropped: 658 (16, 12)
Dropped: 662 (16, 11)
Dropped: 669 (16, 14)
Dropped: 673 (16, 12)
Dropped: 683 (16, 14)
Dropped: 716 (16, 13)
Dropped: 753 (16, 10)
Dropped: 767 (16, 12)
Dropped: 771 (16, 11)
Dropped: 773 (16, 8)
Dropped: 782 (16, 11)
Dropped: 786 (16, 14)
Dropped: 810 (16, 15)
Dropped: 811 (16, 15)
Dropped: 832 (16, 8)
Dropped: 845 (16, 13)
Dropped: 865 (16, 11)
Dropped: 869 (16, 9)
Dropped: 886 (16, 11)
Dropped: 921 (16, 13)
Dropped: 937 (16, 15)
Dropped: 958 (16, 14)
Dropped: 972 (16, 9)
Dropped: 986 (16, 11)
Dropped: 1001 (16, 7)
Dropped: 1003 (16, 13)
Dropped: 1033 (16, 3)
Dropped: 1035 (16, 14)
Dropped: 1039 (16, 11)
Dropped: 1049 (16, 13)
Removed percentage: 0.08601134215500945
Dropped: 3 (16, 11)
Dropped: 7 (16, 7)
Dropped: 18 (16, 13)
Dropped: 20 (16, 12)
Dropped: 139 (16, 10)
Dropped: 153 (16, 13)
Dropped: 164 (16, 13)
Dropped: 169 (16, 11)
Dropped: 170 (16, 14)
Dropped: 171 (16, 14)
Dropped: 174 (16, 10)
Dropped: 191 (16, 15)

In [23]:
# Save features and truth vector (y) sets to disk
np.savez(feature_sets_file, 
         x_train=x_train, 
         y_train=y_train, 
         x_val=x_val, 
         y_val=y_val, 
         x_test=x_test, 
         y_test=y_test)

In [24]:

# TEST: Load features
feature_sets = np.load(feature_sets_file)
feature_sets.files

['x_train', 'y_train', 'x_val', 'y_val', 'x_test', 'y_test']