In [112]:
import os
import math
import random
import tensorflow as tf
import numpy as np
from sklearn.utils import shuffle
from python_speech_features import mfcc
from python_speech_features import delta
from python_speech_features import logfbank
import scipy.io.wavfile as wav
import sys
%matplotlib inline

In [185]:
NUM_LABELS = 10

def genOneHot(indexToBeOne):
    global NUM_LABELS
    oneHot = []
    for i in range(NUM_LABELS):
        if i == indexToBeOne:
            oneHot.append(1)
        else:
            oneHot.append(0)
    return np.array(oneHot)

def extractFeatures(raw, sr):
    stft = np.abs(librosa.stft(raw))
    mfccs = np.mean(librosa.feature.mfcc(y=raw, sr=sr, n_mfcc=40).T,axis=0)
    mel = np.mean(librosa.feature.melspectrogram(raw, sr=sr).T,axis=0)
#     chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sr).T,axis=0)
#     contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sr).T,axis=0)
#     tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(raw),sr=sr).T,axis=0)
    
    return mel
    
def loadSoundFiles(filePath, index):
    IDEAL_SIZE = 22050
    NUM_SAMPLES = 1000
    oneHots = []
    soundFiles = os.listdir(filePath)
    features = np.empty((0, 128))
    
    counter = 0
    for soundFile in soundFiles:
        raw, sr = librosa.load(filePath + soundFile)
        if len(raw) == IDEAL_SIZE:
            mel = extractFeatures(raw, sr)
            internalFeatures = np.hstack([mel])
            features = np.vstack([features, internalFeatures])
            counter += 1
            
            oneHot = genOneHot(index)
            if len(oneHots) == 0:
                oneHots = oneHot
            else:
                oneHots = np.vstack((oneHots, oneHot))
        
            if (counter % (NUM_SAMPLES / 10)) == 0:
                print(str((counter/NUM_SAMPLES) * 100) + "% complete")
        
            if counter == NUM_SAMPLES:
                break
            
    print("The one hot signature for this word is: " + str(oneHots[0]))
    print("The number of samples for this word is: " + str(len(features)))
        
    return np.array(features), oneHots
    
data = []
oneHots = []
rootDir = "./train/audio/"
acceptedWords = ["yes", "no", "up", "down", "left", "right", "on", "off", "stop", "go"]
count = 1
for acceptedWord in acceptedWords:
    print(str(count) + "/" + str(NUM_LABELS))
    features, oneHot = loadSoundFiles(rootDir + acceptedWord + "/", count-1)
    data.append(features)
    oneHots.append(oneHot)
    count += 1
    
print("DONE")

1/10
10.0% complete
20.0% complete
30.0% complete
40.0% complete
50.0% complete
60.0% complete
70.0% complete
80.0% complete
90.0% complete
100.0% complete
The one hot signature for this word is: [1 0 0 0 0 0 0 0 0 0]
The number of samples for this word is: 1000
2/10
10.0% complete
20.0% complete
30.0% complete
40.0% complete
50.0% complete
60.0% complete
70.0% complete
70.0% complete
80.0% complete
80.0% complete
80.0% complete
80.0% complete
90.0% complete
90.0% complete
100.0% complete
The one hot signature for this word is: [0 1 0 0 0 0 0 0 0 0]
The number of samples for this word is: 1000
3/10
0.0% complete
10.0% complete
20.0% complete
30.0% complete
30.0% complete
40.0% complete
50.0% complete
60.0% complete
70.0% complete
80.0% complete
90.0% complete
100.0% complete
The one hot signature for this word is: [0 0 1 0 0 0 0 0 0 0]
The number of samples for this word is: 1000
4/10
10.0% complete
10.0% complete
20.0% complete
30.0% complete
30.0% complete
40.0% complete
50.0% comple

In [189]:
dataTraining = []
trainingLabels = []
dataTest = []
testLabels = []

minSamples = 1000000
for training in data:
    if minSamples > len(training):
        minSamples = len(training)

numTrainingSamples = int(minSamples * 0.8)
numTestSamples = int(minSamples * 0.2)

for datasubset in data:
    if len(dataTraining) == 0:
        dataTraining = datasubset[:numTrainingSamples]
        dataTest = datasubset[numTrainingSamples:numTrainingSamples+numTestSamples]
    else:
        dataTraining = np.vstack((dataTraining, datasubset[:numTrainingSamples]))
        dataTest = np.vstack((dataTest, datasubset[numTrainingSamples:numTrainingSamples+numTestSamples]))
               
for oneHotInfo in oneHots:
    if len(trainingLabels) == 0:
        trainingLabels = oneHotInfo[:numTrainingSamples]
        testLabels = oneHotInfo[numTrainingSamples:numTrainingSamples+numTestSamples]
    else:
        trainingLabels = np.vstack((trainingLabels, oneHotInfo[:numTrainingSamples]))
        testLabels = np.vstack((testLabels, oneHotInfo[numTrainingSamples:numTrainingSamples+numTestSamples]))

print("Finished formatting the data to remove bias and to be ready for training")

Finished formatting the data to remove bias and to be ready for training


In [197]:
dataTraining, trainingLabels = shuffle(dataTraining, trainingLabels)
dataTest, testLabels = shuffle(dataTest, testLabels)
print("Randomized the order of the data and labels keeping the relationship 1 to 1")

Randomized the order of the data and labels keeping the relationship 1 to 1


In [220]:
def getBatch(data, labels, batchSize):
    randomIndexes = np.random.choice(len(data), batchSize)
    return data[randomIndexes], labels[randomIndexes]

tf.reset_default_graph()

NUM_SAMPLES = 128
NUM_CLASSIFICATIONS = NUM_LABELS
FIRST_LAYER_OUTPUT = 100
RATE = 0.01
BATCH_SIZE = 200
EPOCHS = 5000
ITERATIONS_PER_EPOCH = 100

# Shape = (N x 22050)
x = tf.placeholder(tf.float32, (None, NUM_SAMPLES))
# Shape = (N x 2)
ref = tf.placeholder(tf.float32, (None, NUM_CLASSIFICATIONS))
# Calculate the logits
logits1 = tf.layers.dense(inputs=x, units=FIRST_LAYER_OUTPUT, activation=tf.nn.relu, name="L1")
logits2 = tf.layers.dense(inputs=logits1, units=NUM_CLASSIFICATIONS, activation=None, name="L2")

# The mean cross entropy as the cost function
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits2, labels=ref))

# Initialize the tensorflow session
optimizer = tf.train.GradientDescentOptimizer(RATE).minimize(cost)
s = tf.Session()
s.run(tf.global_variables_initializer())

# Do the training
count = 0
for _ in range(EPOCHS):
    inputData, correctAns = getBatch(dataTraining, trainingLabels, BATCH_SIZE)
    crossData, crossAns = getBatch(dataTraining, trainingLabels, BATCH_SIZE)
    for _ in range(ITERATIONS_PER_EPOCH):
        err, _ = s.run((cost, optimizer), feed_dict={x: inputData, ref: correctAns})
        count += 1
        if (count % ((EPOCHS * ITERATIONS_PER_EPOCH) / 100)) == 0:
            crossErr = s.run((cost), feed_dict={x: crossData, ref: crossAns})
            print(str(int(((count / (EPOCHS * ITERATIONS_PER_EPOCH)) * 100) + 0.5)) + "% complete: Training Error = " + str(err) + ". Cross Validation Error = " + str(crossErr))

1% complete: Training Error = 0.962552. Cross Validation Error = 3.22797
2% complete: Training Error = 0.778844. Cross Validation Error = 3.00522
3% complete: Training Error = 0.942308. Cross Validation Error = 3.79976
4% complete: Training Error = 0.806947. Cross Validation Error = 3.25792
5% complete: Training Error = 0.752114. Cross Validation Error = 3.06435
6% complete: Training Error = 0.715763. Cross Validation Error = 2.80659
7% complete: Training Error = 0.702454. Cross Validation Error = 3.7587
8% complete: Training Error = 0.649847. Cross Validation Error = 3.98267
9% complete: Training Error = 0.58479. Cross Validation Error = 4.60736
10% complete: Training Error = 0.56113. Cross Validation Error = 3.35113
11% complete: Training Error = 0.517044. Cross Validation Error = 3.7597
12% complete: Training Error = 0.594371. Cross Validation Error = 3.07002
13% complete: Training Error = 0.368475. Cross Validation Error = 3.54908
14% complete: Training Error = 0.499825. Cross Vali

In [221]:
# Take the softmax to convert the logit value to a percentage guess
probability = tf.nn.softmax(logits2)
# Take the highest probability value as the neural network's guess
prediction = tf.argmax((probability), axis=1)
# Check how accurate the training is
guesses = s.run((prediction), feed_dict={x: dataTraining})

count = 0
corr = 0
categoriesRight = np.zeros(10)
for trainingLabel in trainingLabels:
    if (np.argmax(trainingLabel)) == guesses[count]:
        corr += 1
        categoriesRight[np.argmax(trainingLabel)] += 1
    count += 1

print("The neural network is " + str((corr / len(guesses)) * 100) + "% accurate on the training data")
print(acceptedWords)
count = 0
print((categoriesRight / numTrainingSamples) * 100)

The neural network is 69.65% accurate on the training data
['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go']
[ 81.875  67.875  65.5    58.25   66.75   66.25   74.25   78.875  70.5
  66.375]


In [222]:
# Take the softmax to convert the logit value to a percentage guess
probability = tf.nn.softmax(logits2)
# Take the highest probability value as the neural network's guess
prediction = tf.argmax((probability), axis=1)
# Check how accurate the test is
guesses = s.run((prediction), feed_dict={x: dataTest})

count = 0
corr = 0
categoriesRight = np.zeros(10)
for testLabel in testLabels:
    if (np.argmax(testLabel)) == guesses[count]:
        corr += 1
        categoriesRight[np.argmax(testLabel)] += 1
    count += 1
    
print("The neural network is " + str((corr / len(guesses)) * 100) + "% accurate on the test data")
print(acceptedWords)
print((categoriesRight / numTestSamples) * 100)

The neural network is 40.25% accurate on the test data
['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go']
[ 60.   34.   41.5  34.   34.   32.   37.   51.5  37.   41.5]


In [236]:
BATCH_SIZE = 200

def weight_variable(shape):
    initial = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial)

def bias_variable(shape):
    initial = tf.constant(0.1, shape=shape)
    return tf.Variable(initial)

def conv2d(x, W):
    return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')

def max_pool_2x2(x):
    return tf.nn.max_pool(x, ksize=[1, 2, 2, 1],
                        strides=[1, 2, 2, 1], padding='SAME')

W_conv1 = weight_variable([5, 5, 1, 32])
b_conv1 = bias_variable([32])
subAudio = tf.reshape(x, [-1, 16, 8, 1])

h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)
h_pool1 = max_pool_2x2(h_conv1)

W_conv2 = weight_variable([5, 5, 32, 64])
b_conv2 = bias_variable([64])

h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
h_pool2 = max_pool_2x2(h_conv2)

W_fc1 = weight_variable([4 * 2 * 64, 1024])
b_fc1 = bias_variable([1024])

h_pool2_flat = tf.reshape(h_pool2, [-1, 4 * 2 * 64])
h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)

keep_prob = tf.placeholder(tf.float32)
h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)

W_fc2 = weight_variable([1024, 10])
b_fc2 = bias_variable([10])

convolution = tf.matmul(h_fc1_drop, W_fc2) + b_fc2

ref = tf.placeholder(tf.float32, (None, NUM_CLASSIFICATIONS))

cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=convolution, labels=ref))

s = tf.Session()
optimizer = tf.train.GradientDescentOptimizer(RATE).minimize(cost)
s.run(tf.global_variables_initializer())
err, _ = s.run((cost, optimizer), feed_dict={x:dataTraining, ref:trainingLabels})
print(err)

# Initialize the tensorflow session
# optimizer = tf.train.GradientDescentOptimizer(RATE).minimize(cost)
# s = tf.Session()
# s.run(tf.global_variables_initializer())

# Do the training
# count = 0
# for _ in range(EPOCHS):
#     inputData, correctAns = getBatch(dataTraining, trainingLabels, BATCH_SIZE)
#     crossData, crossAns = getBatch(dataTraining, trainingLabels, BATCH_SIZE)
#     for _ in range(ITERATIONS_PER_EPOCH):
#         err, _ = s.run((cost, optimizer), feed_dict={x: inputData, ref: correctAns})
#         count += 1
#         if (count % ((EPOCHS * ITERATIONS_PER_EPOCH) / 100)) == 0:
#             crossErr = s.run((cost), feed_dict={x: crossData, ref: crossAns})
#             print(str(int(((count / (EPOCHS * ITERATIONS_PER_EPOCH)) * 100) + 0.5)) + "% complete: Training Error = " + str(err) + ". Cross Validation Error = " + str(crossErr))

InvalidArgumentError: You must feed a value for placeholder tensor 'Placeholder_8' with dtype float
	 [[Node: Placeholder_8 = Placeholder[dtype=DT_FLOAT, shape=<unknown>, _device="/job:localhost/replica:0/task:0/cpu:0"]()]]

Caused by op 'Placeholder_8', defined at:
  File "/usr/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/usr/lib/python3.6/site-packages/ipykernel/__main__.py", line 3, in <module>
    app.launch_new_instance()
  File "/usr/lib/python3.6/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/usr/lib/python3.6/site-packages/ipykernel/kernelapp.py", line 474, in start
    ioloop.IOLoop.instance().start()
  File "/usr/lib/python3.6/site-packages/zmq/eventloop/ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "/usr/lib/python3.6/site-packages/tornado/ioloop.py", line 888, in start
    handler_func(fd_obj, events)
  File "/usr/lib/python3.6/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/usr/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/usr/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/usr/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/usr/lib/python3.6/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/usr/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 276, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/usr/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 228, in dispatch_shell
    handler(stream, idents, msg)
  File "/usr/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 390, in execute_request
    user_expressions, allow_stdin)
  File "/usr/lib/python3.6/site-packages/ipykernel/ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/usr/lib/python3.6/site-packages/ipykernel/zmqshell.py", line 501, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/usr/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2717, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/usr/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2821, in run_ast_nodes
    if self.run_code(code, result):
  File "/usr/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2881, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-236-c082f27d4590>", line 37, in <module>
    keep_prob = tf.placeholder(tf.float32)
  File "/usr/lib/python3.6/site-packages/tensorflow/python/ops/array_ops.py", line 1548, in placeholder
    return gen_array_ops._placeholder(dtype=dtype, shape=shape, name=name)
  File "/usr/lib/python3.6/site-packages/tensorflow/python/ops/gen_array_ops.py", line 2094, in _placeholder
    name=name)
  File "/usr/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 767, in apply_op
    op_def=op_def)
  File "/usr/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 2630, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "/usr/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 1204, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

InvalidArgumentError (see above for traceback): You must feed a value for placeholder tensor 'Placeholder_8' with dtype float
	 [[Node: Placeholder_8 = Placeholder[dtype=DT_FLOAT, shape=<unknown>, _device="/job:localhost/replica:0/task:0/cpu:0"]()]]
