In [8]:
import os
import math
import random
import tensorflow as tf
import numpy as np
from sklearn.utils import shuffle
from python_speech_features import mfcc
from python_speech_features import delta
from python_speech_features import logfbank
import scipy.io.wavfile as wav
import librosa
import sys
%matplotlib inline

In [49]:
NUM_LABELS = 10

def genOneHot(indexToBeOne):
    global NUM_LABELS
    oneHot = []
    for i in range(NUM_LABELS):
        if i == indexToBeOne:
            oneHot.append(1)
        else:
            oneHot.append(0)
    return np.array(oneHot)

def extractFeatures(raw, sr):
    stft = np.abs(librosa.stft(raw))
    mfccs = np.mean(librosa.feature.mfcc(y=raw, sr=sr, n_mfcc=40).T,axis=0)
    mel = np.mean(librosa.feature.melspectrogram(raw, sr=sr).T,axis=0)
#     chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sr).T,axis=0)
#     contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sr).T,axis=0)
#     tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(raw),sr=sr).T,axis=0)
    
    return mel
    
def loadSoundFiles(filePath, index):
    IDEAL_SIZE = 22050
    NUM_SAMPLES = 1000
    oneHots = []
    soundFiles = os.listdir(filePath)
    features = np.empty((0, 144))
    
    counter = 0
    for soundFile in soundFiles:
        raw, sr = librosa.load(filePath + soundFile)
        if len(raw) == IDEAL_SIZE:
            mel = extractFeatures(raw, sr)
            mel = np.lib.pad(mel, (8,8), 'constant', constant_values=(0, 0))
            internalFeatures = np.hstack([mel])
            features = np.vstack([features, internalFeatures])
            counter += 1
            
            oneHot = genOneHot(index)
            if len(oneHots) == 0:
                oneHots = oneHot
            else:
                oneHots = np.vstack((oneHots, oneHot))
        
            if (counter % (NUM_SAMPLES / 10)) == 0:
                print(str((counter/NUM_SAMPLES) * 100) + "% complete")
        
            if counter == NUM_SAMPLES:
                break
            
    print("The one hot signature for this word is: " + str(oneHots[0]))
    print("The number of samples for this word is: " + str(len(features)))
        
    return np.array(features), oneHots
    
data = []
oneHots = []
rootDir = "./train/audio/"
acceptedWords = ["yes", "no", "up", "down", "left", "right", "on", "off", "stop", "go"]
count = 1
for acceptedWord in acceptedWords:
    print(str(count) + "/" + str(NUM_LABELS))
    features, oneHot = loadSoundFiles(rootDir + acceptedWord + "/", count-1)
    data.append(features)
    oneHots.append(oneHot)
    count += 1
    
print("DONE")

1/10
10.0% complete
20.0% complete
30.0% complete
40.0% complete
50.0% complete
60.0% complete
70.0% complete
80.0% complete
90.0% complete
100.0% complete
The one hot signature for this word is: [1 0 0 0 0 0 0 0 0 0]
The number of samples for this word is: 1000
2/10
10.0% complete
20.0% complete
30.0% complete
40.0% complete
50.0% complete
60.0% complete
70.0% complete
80.0% complete
90.0% complete
100.0% complete
The one hot signature for this word is: [0 1 0 0 0 0 0 0 0 0]
The number of samples for this word is: 1000
3/10
10.0% complete
20.0% complete
30.0% complete
40.0% complete
50.0% complete
60.0% complete
70.0% complete
80.0% complete
90.0% complete
100.0% complete
The one hot signature for this word is: [0 0 1 0 0 0 0 0 0 0]
The number of samples for this word is: 1000
4/10
10.0% complete
20.0% complete
30.0% complete
40.0% complete
50.0% complete
60.0% complete
70.0% complete
80.0% complete
90.0% complete
100.0% complete
The one hot signature for this word is: [0 0 0 1 0 0 0 

In [291]:
dataTraining = []
trainingLabels = []
dataTest = []
testLabels = []

minSamples = 1000000
for training in data:
    if minSamples > len(training):
        minSamples = len(training)

numTrainingSamples = int(minSamples * 0.8)
numTestSamples = int(minSamples * 0.2)

for datasubset in data:
    if len(dataTraining) == 0:
        dataTraining = datasubset[:numTrainingSamples]
        dataTest = datasubset[numTrainingSamples:numTrainingSamples+numTestSamples]
    else:
        dataTraining = np.vstack((dataTraining, datasubset[:numTrainingSamples]))
        dataTest = np.vstack((dataTest, datasubset[numTrainingSamples:numTrainingSamples+numTestSamples]))
               
for oneHotInfo in oneHots:
    if len(trainingLabels) == 0:
        trainingLabels = oneHotInfo[:numTrainingSamples]
        testLabels = oneHotInfo[numTrainingSamples:numTrainingSamples+numTestSamples]
    else:
        trainingLabels = np.vstack((trainingLabels, oneHotInfo[:numTrainingSamples]))
        testLabels = np.vstack((testLabels, oneHotInfo[numTrainingSamples:numTrainingSamples+numTestSamples]))

print("Finished formatting the data to remove bias and to be ready for training")

Finished formatting the data to remove bias and to be ready for training


In [292]:
dataTraining, trainingLabels = shuffle(dataTraining, trainingLabels)
dataTest, testLabels = shuffle(dataTest, testLabels)
print("Randomized the order of the data and labels keeping the relationship 1 to 1")

Randomized the order of the data and labels keeping the relationship 1 to 1


In [293]:
def getBatch(data, labels, batchSize):
    randomIndexes = np.random.choice(len(data), batchSize)
    return data[randomIndexes], labels[randomIndexes]

In [220]:
tf.reset_default_graph()

NUM_SAMPLES = 144
NUM_CLASSIFICATIONS = NUM_LABELS
FIRST_LAYER_OUTPUT = 100
RATE = 0.01
BATCH_SIZE = 200
EPOCHS = 5000
ITERATIONS_PER_EPOCH = 100

# Shape = (N x 128)
x = tf.placeholder(tf.float32, (None, NUM_SAMPLES))
# Shape = (N x 2)
ref = tf.placeholder(tf.float32, (None, NUM_CLASSIFICATIONS))
# Calculate the logits
logits1 = tf.layers.dense(inputs=x, units=FIRST_LAYER_OUTPUT, activation=tf.nn.relu, name="L1")
logits2 = tf.layers.dense(inputs=logits1, units=NUM_CLASSIFICATIONS, activation=None, name="L2")

# The mean cross entropy as the cost function
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits2, labels=ref))

# Initialize the tensorflow session
optimizer = tf.train.GradientDescentOptimizer(RATE).minimize(cost)
s = tf.Session()
s.run(tf.global_variables_initializer())

# Do the training
count = 0
for _ in range(EPOCHS):
    inputData, correctAns = getBatch(dataTraining, trainingLabels, BATCH_SIZE)
    crossData, crossAns = getBatch(dataTraining, trainingLabels, BATCH_SIZE)
    for _ in range(ITERATIONS_PER_EPOCH):
        err, _ = s.run((cost, optimizer), feed_dict={x: inputData, ref: correctAns})
        count += 1
        if (count % ((EPOCHS * ITERATIONS_PER_EPOCH) / 100)) == 0:
            crossErr = s.run((cost), feed_dict={x: crossData, ref: crossAns})
            print(str(int(((count / (EPOCHS * ITERATIONS_PER_EPOCH)) * 100) + 0.5)) + "% complete: Training Error = " + str(err) + ". Cross Validation Error = " + str(crossErr))

1% complete: Training Error = 0.962552. Cross Validation Error = 3.22797
2% complete: Training Error = 0.778844. Cross Validation Error = 3.00522
3% complete: Training Error = 0.942308. Cross Validation Error = 3.79976
4% complete: Training Error = 0.806947. Cross Validation Error = 3.25792
5% complete: Training Error = 0.752114. Cross Validation Error = 3.06435
6% complete: Training Error = 0.715763. Cross Validation Error = 2.80659
7% complete: Training Error = 0.702454. Cross Validation Error = 3.7587
8% complete: Training Error = 0.649847. Cross Validation Error = 3.98267
9% complete: Training Error = 0.58479. Cross Validation Error = 4.60736
10% complete: Training Error = 0.56113. Cross Validation Error = 3.35113
11% complete: Training Error = 0.517044. Cross Validation Error = 3.7597
12% complete: Training Error = 0.594371. Cross Validation Error = 3.07002
13% complete: Training Error = 0.368475. Cross Validation Error = 3.54908
14% complete: Training Error = 0.499825. Cross Vali

In [294]:
# Take the softmax to convert the logit value to a percentage guess
probability = tf.nn.softmax(logits2)
# Take the highest probability value as the neural network's guess
prediction = tf.argmax((probability), axis=1)
# Check how accurate the training is
guesses = s.run((prediction), feed_dict={x: dataTraining})

count = 0
corr = 0
categoriesRight = np.zeros(10)
for trainingLabel in trainingLabels:
    if (np.argmax(trainingLabel)) == guesses[count]:
        corr += 1
        categoriesRight[np.argmax(trainingLabel)] += 1
    count += 1

print("The mlp neural network is " + str((corr / len(guesses)) * 100) + "% accurate on the training data")
print(acceptedWords)
count = 0
print((categoriesRight / numTrainingSamples) * 100)

NameError: name 'logits2' is not defined

In [222]:
# Take the softmax to convert the logit value to a percentage guess
probability = tf.nn.softmax(logits2)
# Take the highest probability value as the neural network's guess
prediction = tf.argmax((probability), axis=1)
# Check how accurate the test is
guesses = s.run((prediction), feed_dict={x: dataTest})

count = 0
corr = 0
categoriesRight = np.zeros(10)
for testLabel in testLabels:
    if (np.argmax(testLabel)) == guesses[count]:
        corr += 1
        categoriesRight[np.argmax(testLabel)] += 1
    count += 1
    
print("The mlp neural network is " + str((corr / len(guesses)) * 100) + "% accurate on the test data")
print(acceptedWords)
print((categoriesRight / numTestSamples) * 100)

The neural network is 40.25% accurate on the test data
['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go']
[ 60.   34.   41.5  34.   34.   32.   37.   51.5  37.   41.5]


In [299]:
cnnTraining = dataTraining
cnnTrainingLabels = trainingLabels
cnnTest = dataTest
cnnTestLabels = testLabels

def fixShape(data):
    return data.reshape(-1, 1, 144, 1)

In [300]:
tf.reset_default_graph()

NUM_SAMPLES = 144
DENSE_LAYER = 256
NUM_CLASSIFICATIONS = NUM_LABELS
RATE = 0.01
BATCH_SIZE = 100
EPOCHS = 5000
ITERATIONS_PER_EPOCH = 100
DROP_OUT_RATE = 0.5

class N():
    pass

model = N()

model.x = tf.placeholder(tf.float32, (None, 1, None, 1))
model.ref = tf.placeholder(tf.float32, (None, NUM_CLASSIFICATIONS))
model.drop = tf.placeholder(tf.float32)
model.L1 = tf.layers.conv2d(model.x, filters=32, kernel_size=(1,5), padding="SAME", activation=tf.nn.relu)
model.L2 = tf.layers.max_pooling2d(inputs=model.L1, pool_size=(1,2), strides=(1, 2))
model.L3 = tf.reshape(model.L2, (-1, 32*72))
model.L4 = tf.layers.dense(inputs=model.L3, units=DENSE_LAYER)
model.L5 = tf.nn.dropout(model.L4, model.drop)
model.L6 = tf.layers.dense(inputs=model.L5, units=NUM_CLASSIFICATIONS)
model.err = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=model.L6, labels=model.ref))

optimizer = tf.train.GradientDescentOptimizer(RATE).minimize(model.err)
s = tf.Session()
s.run(tf.global_variables_initializer())

count = 0
for _ in range(EPOCHS):
    inputData, correctAns = getBatch(cnnTraining, cnnTrainingLabels, BATCH_SIZE)
    inputData = fixShape(inputData)
    crossData, crossAns = getBatch(cnnTraining, cnnTrainingLabels, BATCH_SIZE)
    crossData = fixShape(crossData)
    for _ in range(ITERATIONS_PER_EPOCH):
        err, _ = s.run((model.err, optimizer), feed_dict={model.x: inputData, model.ref: correctAns, model.drop: DROP_OUT_RATE})
        count += 1
        if (count % ((EPOCHS * ITERATIONS_PER_EPOCH) / 100)) == 0:
            crossErr = s.run((model.err), feed_dict={model.x: crossData, model.ref: crossAns, model.drop: DROP_OUT_RATE})
            print(str(int(((count / (EPOCHS * ITERATIONS_PER_EPOCH)) * 100) + 0.5)) + "% complete: Training Error = " + str(err) + ". Cross Validation Error = " + str(crossErr))

1% complete: Training Error = 1.16044. Cross Validation Error = 2.36456


KeyboardInterrupt: 

In [175]:
# Take the softmax to convert the logit value to a percentage guess
probability = tf.nn.softmax(model.L6)
# Take the highest probability value as the neural network's guess
prediction = tf.argmax((probability), axis=1)
# Check how accurate the training is
guesses = s.run((prediction), feed_dict={model.x: fixShape(cnnTraining)})

count = 0
corr = 0
categoriesRight = np.zeros(10)
for cnnTrainingLabel in cnnTrainingLabels:
    if (np.argmax(cnnTrainingLabel)) == guesses[count]:
        corr += 1
        categoriesRight[np.argmax(cnnTrainingLabel)] += 1
    count += 1

print("The cnn neural network is " + str((corr / len(guesses)) * 100) + "% accurate on the training data")
print(acceptedWords)
count = 0
print((categoriesRight / numTrainingSamples) * 100)

The cnn neural network is 10.0% accurate on the training data
['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go']
[   0.    0.    0.    0.    0.    0.    0.    0.    0.  100.]


In [174]:
# Take the softmax to convert the logit value to a percentage guess
probability = tf.nn.softmax(model.L6)
# Take the highest probability value as the neural network's guess
prediction = tf.argmax((probability), axis=1)
# Check how accurate the test is
guesses = s.run((prediction), feed_dict={model.x: fixShape(cnnTest)})

count = 0
corr = 0
categoriesRight = np.zeros(10)
for cnnTestLabel in cnnTestLabels:
    if (np.argmax(cnnTestLabel)) == guesses[count]:
        corr += 1
        categoriesRight[np.argmax(cnnTestLabel)] += 1
    count += 1
    
print("The cnn neural network is " + str((corr / len(guesses)) * 100) + "% accurate on the test data")
print(acceptedWords)
print((categoriesRight / numTestSamples) * 100)

The cnn neural network is 10.0% accurate on the test data
['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go']
[   0.    0.    0.    0.    0.    0.    0.    0.    0.  100.]
