In [1]:
import glob
import hashlib
import os
import time

import numpy as np
import tensorflow as tf
from tensorflow.contrib.framework.python.ops import audio_ops as contrib_audio
keras = tf.keras

# Make sure ONLY use this for the wav->tensor conversion
import scipy.io.wavfile as wavfile

  return f(*args, **kwds)


In [2]:
LABELS = ['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go', 'silence', 'unknown']
NUM_FOLDS = 5
SAMPLE_INPUT_LENGTH = 16000

In [3]:
def collect_files(directory):
    metadata = {}
    rel_start = len(directory)
    for path in glob.glob(directory + '/**/*.wav'):
        rel_path = path[rel_start + 1:]
        label, fname = os.path.split(path)
        label = os.path.basename(label)
        speaker = fname.split('_')[0]
        try:
            metadata[label][speaker].append(path)
        except KeyError:
            try:
                metadata[label][speaker] = [path]
            except KeyError:
                metadata[label] = {speaker: [path]}    
        
    return metadata

In [4]:
metadata = collect_files('../data/train/audio')
metadata.update(collect_files('../data/samples'))

In [5]:
splits = [[] for _ in range(NUM_FOLDS)]
labels_set = set(LABELS)
for label, v in metadata.items():
    if label == '_background_noise_':
        continue
    if label not in labels_set:
        label = 'unknown'
    for speaker, files in v.items():
        speaker_hash = hashlib.sha1(speaker.encode('utf-8')).hexdigest()
        split_num = int(speaker_hash, 16) % NUM_FOLDS
        splits[split_num].extend([(fname, label) for fname in files])

In [6]:
label_lookup = dict(zip(LABELS, range(len(LABELS))))

In [7]:
data_splits = [
    [(wavfile.read(fname), label) for fname, label in s]
    for s in splits
]

# resize arrays in place
[e[0][1].resize((1, SAMPLE_INPUT_LENGTH)) for s in data_splits for e in s]
data_splits = [
    [np.array([e[0][0] for e in s]),
     np.vstack([e[0][1].astype(np.float) for e in s]),
     np.array([label_lookup[e[1]] for e in s])]
    for s in data_splits
]

In [8]:
# Normalize values to [-1, 1] and convert labels to one-hot
for split in data_splits:
    split[1] = split[1] / np.abs(split[1]).max(axis=1)[:, np.newaxis]
    one_hot = np.zeros((len(split[2]), len(LABELS)))
    one_hot[np.arange(len(split[2])), split[2]] = 1
    split[2] = one_hot

  This is separate from the ipykernel package so we can avoid doing imports until


In [9]:
def weight_variable(shape):
    initial = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial)

def bias_variable(shape):
    initial = tf.constant(0.1, shape=shape)
    return tf.Variable(initial)

def conv2d(x, W):
    return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')

def max_pool_2x2(x):
    return tf.nn.max_pool(x, ksize=[1, 2, 2, 1],
                          strides=[1, 2, 2, 1], padding='SAME')

In [10]:
x = tf.placeholder(tf.float32, shape=[None, SAMPLE_INPUT_LENGTH])
s = tf.placeholder(tf.int32, shape=[None, 1])
y_ = tf.placeholder(tf.float32, shape=[None, len(LABELS)])

See https://www.tensorflow.org/api_guides/python/contrib.signal#Computing_spectrograms

In [11]:
stfts = tf.contrib.signal.stft(x, frame_length=256, frame_step=128,
                               fft_length=1024)
power_spectrograms = tf.real(stfts * tf.conj(stfts))
magnitude_spectrograms = tf.abs(stfts)
log_offset = 1e-6
log_magnitude_spectrograms = tf.log(magnitude_spectrograms + log_offset)

In [12]:
num_spectrogram_bins = magnitude_spectrograms.shape[-1].value
lower_edge_hertz, upper_edge_hertz, num_mel_bins = 80.0, 7600.0, 64
linear_to_mel_weight_matrix = tf.contrib.signal.linear_to_mel_weight_matrix(
    num_mel_bins, num_spectrogram_bins, 16000, lower_edge_hertz,
    upper_edge_hertz)
mel_spectrograms = tf.tensordot(magnitude_spectrograms,
                                linear_to_mel_weight_matrix, 1)
mel_spectrograms.set_shape(
    magnitude_spectrograms.shape[:-1].concatenate(
        linear_to_mel_weight_matrix.shape[-1:]))

In [13]:
log_offset = 1e-6
log_mel_spectrograms = tf.log(mel_spectrograms + log_offset)

In [14]:
num_mfccs = 13
mfccs = tf.contrib.signal.mfccs_from_log_mel_spectrograms(
    log_mel_spectrograms)[..., :num_mfccs]

See https://www.tensorflow.org/get_started/mnist/pros

In [15]:
image_size = [mfccs.shape[-2].value, mfccs.shape[-1].value]
W_conv1 = weight_variable([5, 5, 1, 32])
b_conv1 = bias_variable([32])
mfcc_images = tf.reshape(mfccs, [-1] + image_size + [1], name="mfcc_resize")
h_conv1 = tf.nn.relu(conv2d(mfcc_images, W_conv1) + b_conv1)
h_pool1 = max_pool_2x2(h_conv1)

In [16]:
W_conv2 = weight_variable([5, 5, 32, 64])
b_conv2 = bias_variable([64])
h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
h_pool2 = max_pool_2x2(h_conv2)

In [17]:
flatten_size = h_pool2.shape[-3].value * h_pool2.shape[-2].value * h_pool2.shape[-1].value
W_fc1 = weight_variable([flatten_size, 1024])
b_fc1 = bias_variable([1024])
h_pool2_flat = tf.reshape(h_pool2, [-1, flatten_size])
h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)

In [18]:
keep_prob = tf.placeholder(tf.float32)
h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)

In [19]:
W_fc2 = weight_variable([1024, len(LABELS)])
b_fc2 = bias_variable([len(LABELS)])

y_conv = tf.matmul(h_fc1_drop, W_fc2) + b_fc2

In [20]:
test = [
    np.concatenate([d[0] for d in data_splits[1:]]),
    np.vstack([d[1] for d in data_splits[1:]]),
    np.concatenate([d[2] for d in data_splits[1:]]),
]

In [21]:
def next_batch(num_samples):
    batch_index = 0
    train_length = len(data_splits[0][0])
    order = np.arange(train_length)
    np.random.shuffle(order)
    while True:
        if batch_index + num_samples >= train_length:
            np.random.shuffle(order)
            batch_index = 0

        ret = order[batch_index:batch_index+num_samples] 
        batch_index += num_samples
        yield (data_splits[0][1][ret], data_splits[0][2][ret],)

In [22]:
cross_entropy = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y_conv))
train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
cast_float = tf.cast(correct_prediction, tf.float32)
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

hist = {
    'train': [],
    'test': None
}
batch_size = 64
num_epochs = 10
steps = int(num_epochs * len(data_splits[0][1]) / batch_size)
start = time.time()
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for i in range(steps):
        batch = next(next_batch(batch_size))
        if i % 1000 == 0:
            train_accuracy = accuracy.eval(feed_dict={
                x: batch[0], y_: batch[1], keep_prob: 1.0})
            hist['train'].append(train_accuracy)
            print('step %d, training_accuracy %g' % (i, train_accuracy))
        
        train_step.run(feed_dict={x: batch[0], y_: batch[1], keep_prob: 0.5})
        
    test_accuracy = []
    for i in range(0, len(test[1]), 1024):
        val = accuracy.eval(feed_dict={
            x: test[1][i:i+1024], y_: test[2][i:i+1024], keep_prob: 1.0})
        test_accuracy.append(val)
    test_accuracy = np.mean(test_accuracy)
    hist['test'] = test_accuracy
    print('test_accuracy %g' % test_accuracy)
stop = time.time()
duration = stop - start

step 0, training_accuracy 0.015625
step 1000, training_accuracy 0.703125
step 2000, training_accuracy 0.625
test_accuracy 0.608976


In [23]:
(duration, len(data_splits[0][2]), len(test[2]))

(25.346879720687866, 12982, 53739)