In [1]:
%pylab inline

import pandas as pd
import nltk
import keras
import keras.backend as K
import kapre
import arrow
import pprint
import threading
import pprint
import tensorflow as tf
import tensorflow.contrib.signal
from soph import center_wave
from soph import ex_generator as old_gen

ex_df = pd.read_pickle("data/ex_df.pkl")

Populating the interactive namespace from numpy and matplotlib


Using TensorFlow backend.


In [2]:
tf.__version__

'1.4.1'

In [3]:
VAL = ["val", "test"]
TRAIN = ["train"]
N_BATCH = 512

callbacks = [
    keras.callbacks.EarlyStopping(
        patience=6, verbose=1),
    keras.callbacks.ReduceLROnPlateau(
        factor=.5, patience=3, verbose=1, min_lr=1e-7)
]


val_data = next(old_gen(
    batch_size=sum(ex_df.state.isin(VAL)),
    shuffle=False,
    raw_label=True,
    state=VAL))

traing_gen = old_gen(
    batch_size=N_BATCH,
    raw_label=True,
    state=TRAIN,
    vol_range=.1,
    p_transform=1,
    shift=0
)

# Test base RNN

There are 32 classes. 

Random guesses (balanced) give {{100*(1/32)}}% accuracy. 

Guessing the largest class (unbalanced) give {{100*(2380/64727)}}% accuracy.

First, as an experiment, let's look at a 1D convolution on raw audio.

# Base Spectrogram RNN

A spectrogram is constructed through a short-time fourier transform. 

In [11]:
class LogMagSpectrogram(keras.engine.Layer):
    """
    """

    def __init__(self,
                 frame_length=1024,
                 frame_step=512,
                 fft_length=1024,
                 **kwargs):

        self.frame_length = frame_length
        self.frame_step = frame_step
        self.fft_length = fft_length
        self.log_offset = 1e-6
        super(LogMagSpectrogram, self).__init__(**kwargs)

    def build(self, input_shape):

        super(LogMagSpectrogram, self).build(input_shape)

    def compute_output_shape(self, input_shape):
        batch = input_shape[0]
        n_samples = input_shape[1]
        n_seq = n_samples//self.frame_step
        n_bins = self.fft_length//2 +1
        
        return batch, n_seq, n_bins

    def call(self, x):
        # `stfts` is a complex64 Tensor representing the Short-time Fourier Transform of
        # each signal in `signals`. Its shape is [batch_size, ?, fft_unique_bins]
        # where fft_unique_bins = fft_length // 2 + 1 = 513.
        stfts = tf.contrib.signal.stft(
            x,
            frame_length=self.frame_length,
            frame_step=self.frame_step,
            fft_length=self.fft_length,
            pad_end=True,
        )

        # An energy spectrogram is the magnitude of the complex-valued STFT.
        # A float32 Tensor of shape [batch_size, ?, 513].
        magnitude_spectrograms = tf.abs(stfts)

        log_magnitude_spectrograms = tf.log(
            magnitude_spectrograms + self.log_offset)
        return log_magnitude_spectrograms

    def get_config(self):
        self.frame_length = frame_length
        self.frame_step = frame_step
        self.fft_length = fft_length
        self.log_offset = 1e-6
        config = {
            'frame_length': self.frame_length,
            'frame_step': self.frame_step,
            'fft_length': self.fft_length,
            'log_offset': self.log_offset
        }
        base_config = super(Spectrogram, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))


In [6]:
SR = 16000
N_SEQ = 100
N_STEP = SR//(N_SEQ)
N_LEN = 2*N_STEP
N_DFT = max(2**(int(log2(N_STEP))+1),1024)
DROP = .25
INIT = "he_normal"
ACT = "elu"
N_CAT = int(ex_df.raw_label_i.max()+1)
N_BATCH = 512
VAL = ["val", "test"]
TRAIN = ["train"]
REG = None # keras.regularizers.l2(0.1)

input_layer = keras.layers.Input(shape=(SR,))
input_block = LogMagSpectrogram(frame_length=N_LEN, frame_step=N_STEP, fft_length=1024)(input_layer)

input_block = keras.layers.BatchNormalization()(input_block)
input_block = keras.layers.Dropout(DROP)(input_block)

input_block = keras.layers.Dense(50, activation=ACT)(input_block)
input_block = keras.layers.BatchNormalization()(input_block)
input_block = keras.layers.Dropout(DROP)(input_block)

rnn_block = input_block

rnn_block = keras.layers.Bidirectional(
    keras.layers.GRU(
        50,
        activation=ACT,
        kernel_initializer=INIT,
        dropout=DROP,
        recurrent_dropout=DROP,
        kernel_regularizer=REG, 
        recurrent_regularizer=REG, 
        bias_regularizer=REG
    ), merge_mode="concat")(rnn_block)
rnn_block = keras.layers.BatchNormalization()(rnn_block)

output_layer = keras.layers.Dense(
    N_CAT, activation="softmax", kernel_initializer=INIT)(rnn_block)
rnn_model = keras.Model(inputs=input_layer, outputs=output_layer)
rnn_model.summary()
rnn_model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='nadam',
    metrics=['sparse_categorical_accuracy'])

[None, 100, 513]
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 16000)             0         
_________________________________________________________________
log_mag_spectrogram_1 (LogMa (None, 100, 513)          0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 100, 513)          2052      
_________________________________________________________________
dropout_1 (Dropout)          (None, 100, 513)          0         
_________________________________________________________________
dense_1 (Dense)              (None, 100, 50)           25700     
_________________________________________________________________
batch_normalization_2 (Batch (None, 100, 50)           200       
_________________________________________________________________
dropout_2 (Dropout)          (None, 100, 50)           0   

In [7]:
traing_gen = old_gen(
    batch_size=N_BATCH,
    raw_label=True,
    state=TRAIN,
    vol_range=.1,
    p_transform=1,
    shift=0
)

history = rnn_model.fit_generator(
    generator=traing_gen,
    steps_per_epoch=sum(ex_df.state.isin(TRAIN)) / N_BATCH,
    epochs=200,
    verbose=1,
    max_queue_size=100,
    callbacks=callbacks,
    validation_data=val_data
)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 00028: reducing learning rate to 0.0010000000474974513.
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 00047: reducing learning rate to 0.0005000000237487257.
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 00057: reducing learning rate to 0.0002500000118743628.
Epoch 58/200
Epoch 59/200
Epoch 00059: early stopping


# Base Spectrogram with CNN

# Log Mel Spectrogram



In [8]:
class LogMelSpectrogram(keras.engine.Layer):
    """
    """

    def __init__(self,
                 frame_length=1024,
                 frame_step=512,
                 fft_length=1024,
                 lower_edge_hertz=80.0,
                 upper_edge_hertz=7600.0,
                 num_mel_bins=64,
                 sr=16000,
                 **kwargs):

        self.frame_length = frame_length
        self.frame_step = frame_step
        self.fft_length = fft_length
        self.n_bins = self.fft_length // 2 + 1
        self.log_offset = 1e-6
        self.lower_edge_hertz = lower_edge_hertz
        self.upper_edge_hertz = upper_edge_hertz
        self.num_mel_bins = num_mel_bins
        self.sr = sr
        super(LogMelSpectrogram, self).__init__(**kwargs)

    def build(self, input_shape):
        self.linear_to_mel_weight_matrix = tf.contrib.signal.linear_to_mel_weight_matrix(
            self.num_mel_bins, self.n_bins, self.sr, self.lower_edge_hertz,
            self.upper_edge_hertz)
        self.non_trainable_weights.append(self.linear_to_mel_weight_matrix)
        
        super(LogMelSpectrogram, self).build(input_shape)

    def compute_output_shape(self, input_shape):
        batch = input_shape[0]
        n_samples = input_shape[1]
        n_seq = n_samples//self.frame_step

        return batch, n_seq, self.num_mel_bins

    def call(self, x):
        # `stfts` is a complex64 Tensor representing the Short-time Fourier Transform of
        # each signal in `signals`. Its shape is [batch_size, ?, fft_unique_bins]
        # where fft_unique_bins = fft_length // 2 + 1 = 513.
        stfts = tf.contrib.signal.stft(
            x,
            frame_length=self.frame_length,
            frame_step=self.frame_step,
            fft_length=self.fft_length,
            pad_end=True,
        )

        # An energy spectrogram is the magnitude of the complex-valued STFT.
        # A float32 Tensor of shape [batch_size, ?, 513].
        magnitude_spectrograms = tf.abs(stfts)

        #         log_magnitude_spectrograms = tf.log(
        #             magnitude_spectrograms + self.log_offset)

#         mel_spectrograms = tf.tensordot(magnitude_spectrograms,
#                                         self.linear_to_mel_weight_matrix, 1)
        # Note: Shape inference for `tf.tensordot` does not currently handle this case.
#         mel_spectrograms.set_shape(
#             magnitude_spectrograms.shape[:-1].concatenate(
#                 self.linear_to_mel_weight_matrix.shape[-1:]))
        
        mel_spectrograms = K.dot(magnitude_spectrograms, self.linear_to_mel_weight_matrix)

        log_mel_spectrograms = tf.log(mel_spectrograms + self.log_offset)
        
        return mel_spectrograms

    def get_config(self):
        self.lower_edge_hertz = lower_edge_hertz
        self.upper_edge_hertz = upper_edge_hertz
        self.num_mel_bins = num_mel_bins

        config = {
            'frame_length': self.frame_length,
            'frame_step': self.frame_step,
            'fft_length': self.fft_length,
            'log_offset': self.log_offset,
            'n_bins': self.n_bins,
            'lower_edge_hertz': self.lower_edge_hertz,
            'upper_edge_hertz': self.upper_edge_hertz,
            'num_mel_bins': self.num_mel_bins,
            'sr': self.sr
        }
        base_config = super(LogMelSpectrogram, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

In [9]:
SR = 16000
N_SEQ = 100
F_STEP = SR//(N_SEQ)
F_LEN = 2*N_STEP
FFT_LEN = max(2**(int(log2(N_STEP))+1),1024)
N_MELS = 160
DROP = .25
INIT = "he_normal"
ACT = "elu"
N_CAT = int(ex_df.raw_label_i.max()+1)
N_BATCH = 512
VAL = ["val", "test"]
TRAIN = ["train"]
REG = None # keras.regularizers.l2(0.1)

input_layer = keras.layers.Input(shape=(SR,))
input_block = LogMelSpectrogram(
    frame_length=N_LEN,
    frame_step=N_STEP,
    fft_length=FFT_LEN,
    lower_edge_hertz=80.0,
    upper_edge_hertz=7600.0,
    num_mel_bins=N_MELS,
    sr=SR,
)(input_layer)
print(input_block.shape.as_list())

input_block = keras.layers.BatchNormalization()(input_block)
input_block = keras.layers.Dropout(DROP)(input_block)

input_block = keras.layers.Dense(50, activation=ACT)(input_block)
input_block = keras.layers.BatchNormalization()(input_block)
input_block = keras.layers.Dropout(DROP)(input_block)

rnn_block = input_block

rnn_block = keras.layers.Bidirectional(
    keras.layers.GRU(
        50,
        activation=ACT,
        kernel_initializer=INIT,
        dropout=DROP,
        recurrent_dropout=DROP,
        kernel_regularizer=REG, 
        recurrent_regularizer=REG, 
        bias_regularizer=REG
    ), merge_mode="concat")(rnn_block)
rnn_block = keras.layers.BatchNormalization()(rnn_block)

output_layer = keras.layers.Dense(
    N_CAT, activation="softmax", kernel_initializer=INIT)(rnn_block)
rnn_model = keras.Model(inputs=input_layer, outputs=output_layer)
rnn_model.summary()
rnn_model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='nadam',
    metrics=['sparse_categorical_accuracy'])

[None, 100, 160]
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 16000)             0         
_________________________________________________________________
log_mel_spectrogram_1 (LogMe (None, 100, 160)          82080     
_________________________________________________________________
batch_normalization_4 (Batch (None, 100, 160)          640       
_________________________________________________________________
dropout_3 (Dropout)          (None, 100, 160)          0         
_________________________________________________________________
dense_3 (Dense)              (None, 100, 50)           8050      
_________________________________________________________________
batch_normalization_5 (Batch (None, 100, 50)           200       
_________________________________________________________________
dropout_4 (Dropout)          (None, 100, 50)           0   

In [None]:
traing_gen = old_gen(
    batch_size=N_BATCH,
    raw_label=True,
    state=TRAIN,
    vol_range=.1,
    p_transform=1,
    shift=0
)

history = rnn_model.fit_generator(
    generator=traing_gen,
    steps_per_epoch=sum(ex_df.state.isin(TRAIN)) / N_BATCH,
    epochs=200,
    verbose=1,
    max_queue_size=100,
    callbacks=callbacks,
    validation_data=val_data
)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200

In [4]:
class MFCC(keras.engine.Layer):
    """
    """

    def __init__(self,
                 frame_length=1024,
                 frame_step=512,
                 fft_length=1024,
                 lower_edge_hertz=None,
                 upper_edge_hertz=None,
                 num_mel_bins=64,
                 sr=16000,
                 n_mfcc=13,
                 **kwargs):

        self.frame_length = frame_length
        self.frame_step = frame_step
        self.fft_length = fft_length
        self.n_bins = self.fft_length // 2 + 1
        self.log_offset = 1e-6
        if lower_edge_hertz:
            self.lower_edge_hertz = lower_edge_hertz
        else:
            self.lower_edge_hertz = 0
        if upper_edge_hertz:
            self.upper_edge_hertz = upper_edge_hertz
        else:
            self.upper_edge_hertz = sr//2
        self.num_mel_bins = num_mel_bins
        self.sr = sr
        self.n_mfcc = n_mfcc
        super(MFCC, self).__init__(**kwargs)

    def build(self, input_shape):
        self.linear_to_mel_weight_matrix = tf.contrib.signal.linear_to_mel_weight_matrix(
            self.num_mel_bins, self.n_bins, self.sr, self.lower_edge_hertz,
            self.upper_edge_hertz)
        
        self.non_trainable_weights.append(self.linear_to_mel_weight_matrix)
        super(MFCC, self).build(input_shape)

    def compute_output_shape(self, input_shape):
        batch = input_shape[0]
        n_samples = input_shape[1]
        n_seq = n_samples//self.frame_step

        return batch, n_seq, self.n_mfcc

    def call(self, x):

        stfts = tf.contrib.signal.stft(
            x,
            frame_length=self.frame_length,
            frame_step=self.frame_step,
            fft_length=self.fft_length,
            pad_end=True,
        )
        magnitude_spectrograms = tf.abs(stfts)

        mel_spectrograms = K.dot(magnitude_spectrograms, self.linear_to_mel_weight_matrix)

        log_mel_spectrograms = tf.log(mel_spectrograms + self.log_offset)
        
        mfccs = tf.contrib.signal.mfccs_from_log_mel_spectrograms(
                  log_mel_spectrograms
        )[..., :self.n_mfcc]
        
        return mfccs

    def get_config(self):
        self.lower_edge_hertz = lower_edge_hertz
        self.upper_edge_hertz = upper_edge_hertz
        self.num_mel_bins = num_mel_bins

        config = {
            'frame_length': self.frame_length,
            'frame_step': self.frame_step,
            'fft_length': self.fft_length,
            'log_offset': self.log_offset,
            'n_bins': self.n_bins,
            'lower_edge_hertz': self.lower_edge_hertz,
            'upper_edge_hertz': self.upper_edge_hertz,
            'num_mel_bins': self.num_mel_bins,
            'sr': self.sr,
            'n_mfcc': n_mfcc
        }
        base_config = super(MFCC, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

In [5]:
class DeltaDelta(keras.engine.Layer):
    '''
    Layer that appends deltas as an extra channel
    '''

    def __init__(self, n=2, order=2, **kwargs):
        assert order==1 or order==2
        self.n = n
        self.order = order
        super(DeltaDelta, self).__init__(**kwargs)

    def compute_output_shape(self, input_shape):
        batch = input_shape[0]
        time = input_shape[1]
        features = input_shape[2]


        return batch, time, features, self.order+1

    def build(self, input_shape):

        delta_kernel = np.arange(-self.n, self.n + 1
                                 ).reshape((1, 2 * self.n + 1, 1, 1))
        delta_kernel = delta_kernel/(2*sum(np.arange(self.n+1)**2))

        self.delta_kernel = K.variable(delta_kernel, dtype=K.floatx())

        self.non_trainable_weights.append(self.delta_kernel)
        self.paddings = K.constant([[0,0], [0, 0], [self.n, self.n], [0,0]], dtype="int32")
        super(DeltaDelta, self).build(input_shape)

    def call(self, x, mask=None):
        
        x_orig = tf.expand_dims(x, -1)
        deltas = [x_orig]
        
        to_delta = x_orig
        for i in range(self.order):
            x_pad = tf.pad(to_delta, self.paddings)
            delta = K.conv2d(x_pad, self.delta_kernel, data_format="channels_last")
            deltas.append(delta)
            to_delta = delta

        return K.concatenate(deltas, axis=-1)

    def get_config(self):
        config = {'n': self.n, 'order': self.order}
        base_config = super(DeltaDelta, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

In [8]:
SR = 16000
N_SEQ = 64
F_STEP = SR//(N_SEQ)
F_LEN = 2*F_STEP
FFT_LEN = max(2**(int(log2(F_STEP))+1),1024)
N_MELS = 64
DROP = .2
INIT = "he_normal"
ACT = "elu"
N_MFCC = N_MELS
D_ORDER = 2
REG = None
N_CAT = int(ex_df.raw_label_i.max()+1)

input_layer = keras.layers.Input(shape=(SR,))
input_layer = keras.layers.Input(shape=(SR,))
input_block = MFCC(
    frame_length=F_LEN,
    frame_step=F_STEP,
    fft_length=FFT_LEN,
    num_mel_bins=N_MELS,
    sr=SR,
    n_mfcc=N_MFCC,
    upper_edge_hertz=4000,
    lower_edge_hertz=40
)(input_layer)
print(input_block.shape.as_list())
input_block = DeltaDelta()(input_block)

# input_block = keras.layers.BatchNormalization()(input_block)
input_block = keras.layers.Dropout(DROP)(input_block)


cnn_block = input_block 

cnn_block = keras.layers.Conv2D(20, 16, padding="same", activation=ACT)(cnn_block)
cnn_block = keras.layers.MaxPool2D()(cnn_block)
cnn_block = keras.layers.BatchNormalization()(cnn_block)
cnn_block = keras.layers.Dropout(DROP)(cnn_block)

cnn_block = keras.layers.Conv2D(20, 8, padding="same", activation=ACT)(cnn_block)
cnn_block = keras.layers.MaxPool2D()(cnn_block)
cnn_block = keras.layers.BatchNormalization()(cnn_block)
cnn_block = keras.layers.Dropout(DROP)(cnn_block)

cnn_block = keras.layers.Conv2D(20, 4, padding="same", activation=ACT)(cnn_block)
cnn_block = keras.layers.MaxPool2D()(cnn_block)
cnn_block = keras.layers.BatchNormalization()(cnn_block)
cnn_block = keras.layers.Dropout(DROP)(cnn_block)


cnn_block = keras.layers.Conv2D(30, 2, padding="same", activation=ACT)(cnn_block)
cnn_block = keras.layers.MaxPool2D()(cnn_block)
cnn_block = keras.layers.BatchNormalization()(cnn_block)
cnn_block = keras.layers.Dropout(DROP)(cnn_block)

cnn_block = keras.layers.Conv2D(40, 2, padding="same", activation=ACT)(cnn_block)
cnn_block = keras.layers.MaxPool2D()(cnn_block)
cnn_block = keras.layers.BatchNormalization()(cnn_block)
cnn_block = keras.layers.Dropout(DROP)(cnn_block)

cnn_block = keras.layers.Conv2D(50, 2, padding="same", activation=ACT)(cnn_block)
cnn_block = keras.layers.MaxPool2D()(cnn_block)
cnn_block = keras.layers.BatchNormalization()(cnn_block)
cnn_block = keras.layers.Dropout(DROP)(cnn_block)

output_block = keras.layers.Flatten()(cnn_block)

output_layer = keras.layers.Dense(
    N_CAT, activation="softmax", kernel_initializer=INIT)(output_block)
cnn_model = keras.Model(inputs=input_layer, outputs=output_layer)
cnn_model.summary(line_length=100)
cnn_model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='nadam',
    metrics=['sparse_categorical_accuracy'])

history = cnn_model.fit_generator(
    generator=traing_gen,
    steps_per_epoch=sum(ex_df.state.isin(TRAIN)) / N_BATCH,
    epochs=200,
    verbose=1,
    max_queue_size=100,
    callbacks=callbacks,
    validation_data=val_data
)

[None, 64, 64]
____________________________________________________________________________________________________
Layer (type)                                 Output Shape                            Param #        
input_2 (InputLayer)                         (None, 16000)                           0              
____________________________________________________________________________________________________
mfcc_1 (MFCC)                                (None, 64, 64)                          32832          
____________________________________________________________________________________________________
delta_delta_1 (DeltaDelta)                   (None, 64, 64, 3)                       5              
____________________________________________________________________________________________________
dropout_1 (Dropout)                          (None, 64, 64, 3)                       0              
____________________________________________________________________________

KeyboardInterrupt: 

In [14]:
F_LEN

400

In [19]:
SR = 16000
N_SEQ = 100
F_STEP = 160
F_LEN = 400
FFT_LEN = 512
N_MELS = 40
DROP = .1
INIT = "he_normal"
ACT = "elu"
N_MFCC = 13
D_ORDER = 2
REG = None
N_CAT = int(ex_df.raw_label_i.max()+1)

input_layer = keras.layers.Input(shape=(SR,))
input_block = MFCC(
    frame_length=F_LEN,
    frame_step=F_STEP,
    fft_length=FFT_LEN,
    num_mel_bins=N_MELS,
    upper_edge_hertz=4000,
    lower_edge_hertz=40,
    sr=SR,
    n_mfcc=N_MFCC,
)(input_layer)
# input_block = DeltaDelta()(input_block)
# input_block = keras.layers.Reshape((N_SEQ, N_MFCC*(D_ORDER+1)))(input_block)

input_block = keras.layers.BatchNormalization()(input_block)
input_block = keras.layers.Dropout(DROP)(input_block)



rnn_block = input_block

# rnn_block = keras.layers.Bidirectional(
#     keras.layers.GRU(
#         32,
#         activation=ACT,
#         kernel_initializer=INIT,
#         dropout=DROP,
#         recurrent_dropout=DROP,
#         kernel_regularizer=REG, 
#         recurrent_regularizer=REG, 
#         bias_regularizer=REG,
#         return_sequences=True
#     ), merge_mode="concat")(rnn_block)
# rnn_block = keras.layers.BatchNormalization()(rnn_block)

rnn_block = keras.layers.Bidirectional(
    keras.layers.GRU(
        25,
        activation=ACT,
        kernel_initializer=INIT,
        dropout=DROP,
        recurrent_dropout=DROP,
        kernel_regularizer=REG, 
        recurrent_regularizer=REG, 
        bias_regularizer=REG
    ), merge_mode="concat")(rnn_block)
rnn_block = keras.layers.BatchNormalization()(rnn_block)

output_layer = keras.layers.Dense(
    N_CAT, activation="softmax", kernel_initializer=INIT)(rnn_block)
rnn_model = keras.Model(inputs=input_layer, outputs=output_layer)
rnn_model.summary(line_length=100)
rnn_model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='nadam',
    metrics=['sparse_categorical_accuracy'])

____________________________________________________________________________________________________
Layer (type)                                 Output Shape                            Param #        
input_10 (InputLayer)                        (None, 16000)                           0              
____________________________________________________________________________________________________
mfcc_9 (MFCC)                                (None, 100, 13)                         10280          
____________________________________________________________________________________________________
batch_normalization_25 (BatchNormalization)  (None, 100, 13)                         52             
____________________________________________________________________________________________________
dropout_14 (Dropout)                         (None, 100, 13)                         0              
___________________________________________________________________________________________

In [None]:
traing_gen = old_gen(
    batch_size=N_BATCH,
    raw_label=True,
    state=TRAIN,
    vol_range=.1,
    p_transform=.5,
    shift=.5
)

history = rnn_model.fit_generator(
    generator=traing_gen,
    steps_per_epoch=sum(ex_df.state.isin(TRAIN)) / N_BATCH,
    epochs=200,
    verbose=1,
    max_queue_size=100,
    callbacks=callbacks,
    validation_data=val_data
)

Epoch 1/200
 17/103 [===>..........................] - ETA: 1:19 - loss: 0.6702 - sparse_categorical_accuracy: 0.8009

# CTC RNN

That RNN is pretty good. I've designed it to be sparse and super fast—it does an epoch of ~60k examples in 13s and has only 12k trainable params. And it still gets ~80% validation accuracy! 

Now use as similar architecture to build a CTC-based RNN

In [109]:
nltk.download('cmudict')
arpabet = nltk.corpus.cmudict.dict()

words = list(ex_df.raw_label.unique())
words.remove(np.nan)
words.remove("silence")

phone_dict = dict()
phone_set = set()
maxlen = 0
for w in words:
    phones = arpabet[w][0]
    phones = [p.strip("01") for p in phones] #remove emphasis on vowels
    phones_b = phones
    phone_dict[w] = phones_b
    phone_set |= set(phones)
    if (len(phones_b)) > maxlen:
        maxlen = len(phones_b)
# phone_dict["silence"] = ["-"]
alphabet = sorted(list(phone_set)) + ["-"]

def text_to_labels(text):
    phones = phone_dict[text]
    ret = [alphabet.index(p) for p in phones]
    return ret

N_CAT = len(alphabet)
pprint.pprint(phone_dict, compact=True)
print("{} phonemes in alphabet".format(N_CAT))

pprint.pprint(alphabet, compact=True)

[nltk_data] Downloading package cmudict to /home/ubuntu/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!
{'bed': ['B', 'EH', 'D'],
 'bird': ['B', 'ER', 'D'],
 'cat': ['K', 'AE', 'T'],
 'dog': ['D', 'AO', 'G'],
 'down': ['D', 'AW', 'N'],
 'eight': ['EY', 'T'],
 'five': ['F', 'AY', 'V'],
 'four': ['F', 'AO', 'R'],
 'go': ['G', 'OW'],
 'happy': ['HH', 'AE', 'P', 'IY'],
 'house': ['HH', 'AW', 'S'],
 'left': ['L', 'EH', 'F', 'T'],
 'marvin': ['M', 'AA', 'R', 'V', 'IH', 'N'],
 'nine': ['N', 'AY', 'N'],
 'no': ['N', 'OW'],
 'off': ['AO', 'F'],
 'on': ['AA', 'N'],
 'one': ['W', 'AH', 'N'],
 'right': ['R', 'AY', 'T'],
 'seven': ['S', 'EH', 'V', 'AH', 'N'],
 'sheila': ['SH', 'IY', 'L', 'AH'],
 'six': ['S', 'IH', 'K', 'S'],
 'stop': ['S', 'T', 'AA', 'P'],
 'three': ['TH', 'R', 'IY'],
 'tree': ['T', 'R', 'IY'],
 'two': ['T', 'UW'],
 'up': ['AH', 'P'],
 'wow': ['W', 'AW'],
 'yes': ['Y', 'EH', 'S'],
 'zero': ['Z', 'IH', 'R', 'OW']}
33 phonemes in alphabet
['AA', 'AE', 'AH', 'AO', 'A

In [111]:
print("zero", text_to_labels("zero"))
print("silence", text_to_labels("wow"))

zero [31, 14, 22, 20]
silence [29, 4]


In [112]:
def ex_generator(
        batch_size=32,
        shuffle=True,
        state="train",
        num_seq=None,
        input_len=10,
        p_transform=0,
        vol_range=0,
        shift=0,
):

    epoch_df = ex_df[ex_df.state.isin(state)&(ex_df.raw_label != "silence")]
    num_ex = len(epoch_df)
    indices = np.arange(num_ex)

    # epoch loop runs
    while True:

        # shuffle anew every epoch
        if shuffle:
            epoch_df = epoch_df.sample(frac=1)

        # batch loop
        for i in np.arange(0, num_ex, batch_size):

            batch_df = epoch_df.iloc[i:i + batch_size, :]

            x = np.zeros((len(batch_df), 16000))
            labels = np.zeros((len(batch_df), maxlen))
            label_len = np.zeros((len(batch_df), 1))

            # example loop
            for b in range(len(batch_df)):

                x[b, ...] = center_wave(
                    epoch_df.fn.values[b],
                    vol_range=vol_range,
                    shift=shift,
                    p_transform=p_transform)

                labels_i = text_to_labels(epoch_df.raw_label.values[b])
                label_len[b] = len(labels_i)
                labels[b, :len(labels_i)] = labels_i

            inputs = {
                'wav': x,
                'labels': labels,
                'input_len': np.full((len(batch_df), 1), N_SEQ),
                'label_len': label_len
            }
            outputs = {
                'ctc': np.zeros([len(batch_df)]),
                'ler': np.zeros([len(batch_df)])
            }  # dummy data for dummy loss function

            yield (inputs, outputs)

In [174]:
SR = 16000
N_SEQ = 100
F_STEP = SR // (N_SEQ)
F_LEN = 2 * N_STEP
FFT_LEN = max(2**(int(log2(N_STEP)) + 1), 1024)
N_MELS = 160
DROP = .25
INIT = "he_normal"
ACT = "elu"
N_MFCC = N_MELS
D_ORDER = 2
REG = None

input_layer = keras.layers.Input(shape=(SR, ), name='wav')
input_block = MFCC(
    frame_length=N_LEN,
    frame_step=N_STEP,
    fft_length=FFT_LEN,
    num_mel_bins=N_MELS,
    sr=SR,
    n_mfcc=N_MFCC,
)(input_layer)
print(input_block.shape.as_list())
input_block = DeltaDelta()(input_block)
input_block = keras.layers.Reshape((N_SEQ,
                                    N_MFCC * (D_ORDER + 1)))(input_block)

input_block = keras.layers.BatchNormalization(center=False, scale=False)(input_block)
input_block = keras.layers.Dropout(DROP)(input_block)

input_block = keras.layers.Dense(
    50,
    activation=ACT,
    kernel_initializer=INIT,
    kernel_regularizer=REG,
    bias_regularizer=REG
)(input_block)
input_block = keras.layers.BatchNormalization(center=False, scale=False)(input_block)
input_block = keras.layers.Dropout(DROP)(input_block)

rnn_block = input_block

rnn_block = keras.layers.Bidirectional(
    keras.layers.GRU(
        50,
        activation=ACT,
        kernel_initializer=INIT,
        dropout=DROP,
        recurrent_dropout=DROP,
        kernel_regularizer=REG,
        recurrent_regularizer=REG,
        bias_regularizer=REG,
        return_sequences=True,
    ),
    merge_mode="concat")(rnn_block)
rnn_block = keras.layers.BatchNormalization(center=False, scale=False)(rnn_block)

# def ctc_lambda_func(args):
#     y_pred, labels, input_length, label_length = args
#     return K.ctc_batch_cost(labels, y_pred, input_length, label_length)

## OUT BLOCK
logits = keras.layers.Dense(N_CAT, kernel_initializer=INIT, name='logits')(rnn_block)

# these inputs are for the CTC loss
labels = keras.layers.Input(name='labels', shape=[maxlen], dtype='float32')
input_len = keras.layers.Input(name='input_len', shape=[1], dtype='int64')
label_len = keras.layers.Input(name='label_len', shape=[1], dtype='int64')

# LOSS
def ctc_loss(args):
    labels, label_len, logits, input_len = args
    input_len = tf.to_int32(tf.squeeze(input_len))
    label_len = tf.to_int32(tf.squeeze(label_len))
    sparse_labels = tf.to_int32(K.ctc_label_dense_to_sparse(labels, label_len))

    logits = tf.transpose(logits, perm=[1, 0, 2])

    return tf.expand_dims(
        tf.nn.ctc_loss(
            inputs=logits, labels=sparse_labels, sequence_length=input_len), 1)

loss_out = keras.layers.Lambda(
    ctc_loss, output_shape=(1, ),
    name='ctc')([labels, label_len, logits, input_len])



# def get_ler(args):
#     labels, label_len, logits, input_len = args
#     input_len = tf.to_int32(tf.squeeze(input_len))
#     batch_n = K.shape(input_len)[0]
#     print(batch_n)

#     decoded, log_prob = tf.nn.ctc_greedy_decoder(logits, input_len)
#     # Inaccuracy: label error rate

#     label_len = tf.to_int32(tf.squeeze(label_len))
#     sparse_labels =  tf.to_int32(K.ctc_label_dense_to_sparse(labels, label_len))

#     ler = tf.reduce_mean(tf.edit_distance(tf.cast(decoded[0], tf.int32),
#                                           sparse_labels))
    
#     return ler

# ler_out = keras.layers.Lambda(
#     get_ler,
#     name="ler"
# )([labels, label_len, logits, input_len])

get_logits = K.function([input_layer], [logits])


ctc_model = keras.Model(
    inputs=[input_layer, labels, input_len, label_len], outputs=loss_out)
ctc_model.summary()

[None, 100, 160]
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
wav (InputLayer)                (None, 16000)        0                                            
__________________________________________________________________________________________________
mfcc_23 (MFCC)                  (None, 100, 160)     82080       wav[0][0]                        
__________________________________________________________________________________________________
delta_delta_22 (DeltaDelta)     (None, 100, 160, 3)  5           mfcc_23[0][0]                    
__________________________________________________________________________________________________
reshape_23 (Reshape)            (None, 100, 480)     0           delta_delta_22[0][0]             
____________________________________________________________________________________________

In [214]:
## MODEL

val_data = next(ex_generator(
    batch_size=sum(ex_df.state.isin(VAL)&(ex_df.raw_label != "silence")),
    shuffle=False,
    state=VAL))

train_gen = ex_generator(
    batch_size=N_BATCH,
    state=TRAIN,
    vol_range=.1,
    p_transform=1,
    shift=0
)

def acc(y_true, y_pred): 
    return y_pred

ctc_model.compile(
    loss={
        "ctc": lambda y_true, y_pred: y_pred
         },
    optimizer="nadam",
#     metrics={"ler": acc}
)

callbacks = [
    keras.callbacks.EarlyStopping(patience=6, verbose=1),
    keras.callbacks.ReduceLROnPlateau(
        factor=.5, patience=3, verbose=1, min_lr=1e-8)
]

history = ctc_model.fit_generator(
    generator=train_gen,
    steps_per_epoch=sum(ex_df.state.isin(TRAIN)&(ex_df.raw_label != "silence")) / N_BATCH,
    epochs=10,
    verbose=1,
    max_queue_size=100,
    callbacks=callbacks,
    validation_data=val_data)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 00005: reducing learning rate to 0.0010000000474974513.
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Generate predictions

In [176]:
test_data = next(ex_generator(
    batch_size=512,
    shuffle=False,
    state=VAL))

In [178]:
test_data[0].keys()

dict_keys(['labels', 'input_len', 'wav', 'label_len'])

In [180]:
logits_arr = test_func([test_data[0]['wav']])[0]

In [181]:
logits_arr.shape

(512, 100, 33)

In [208]:
input_len = K.variable(value=np.full((512,1),100), dtype='int64')
logits = K.variable(value=logits_arr, dtype='float32')
labels = K.variable(value=test_data[0]['labels'], dtype='float32')
label_len = K.variable(value=test_data[0]['label_len'], dtype='int64')

In [209]:
input_len = tf.to_int32(tf.squeeze(input_len))
logits = K.permute_dimensions(logits, (1,0,2))
decoded, log_prob = tf.nn.ctc_greedy_decoder(logits, input_len)

In [210]:
label_len = tf.to_int32(tf.squeeze(label_len))
sparse_labels =  tf.to_int32(K.ctc_label_dense_to_sparse(labels, label_len))
dist = tf.edit_distance(tf.cast(decoded[0], tf.int32),
                                          sparse_labels)
ler = tf.reduce_mean(dist)

In [211]:
sess = K.get_session()

In [213]:
with sess.as_default():
    print(ler.eval())

2.1601562


In [198]:
labels = keras.layers.Input(name='labels', shape=[maxlen], dtype='float32')
input_len = keras.layers.Input(name='input_len', shape=[1], dtype='int64')
label_len = keras.layers.Input(name='label_len', shape=[1], dtype='int64')
logits = keras.layers.Input(name='logits', shape=[N_SEQ, N_CAT], dtype='float32')

def get_ler(args):
    labels, label_len, logits, input_len = args
    input_len = tf.to_int32(tf.squeeze(input_len))
    batch_n = K.shape(input_len)[0]

    decoded, log_prob = tf.nn.ctc_greedy_decoder(logits, input_len)
    # Inaccuracy: label error rate

    label_len = tf.to_int32(tf.squeeze(label_len))
    sparse_labels =  tf.to_int32(K.ctc_label_dense_to_sparse(labels, label_len))

    dist = tf.edit_distance(tf.cast(decoded[0], tf.int32),
                                          sparse_labels)
    
    print(K.int_shape(dist))
    ler = tf.reduce_mean(dist)
    print(ler)
    return tf.expand_dims(ler,1)

ler_out = keras.layers.Lambda(
    get_ler,
    name="ler"
)([labels, label_len, logits, input_len])

ler_model = keras.Model(inputs=[labels, input_len, label_len, logits], outputs=ler_out)

ler_model.compile(loss=lambda y_true, y_pred: y_pred, optimizer="nadam")

None
Tensor("ler_9/Mean:0", dtype=float32)
None
Tensor("ler_9/Mean_1:0", dtype=float32)


TypeError: object of type 'NoneType' has no len()

In [191]:
x['input_len'].shape

(512, 1)

In [189]:
x = test_data[0]
x["logits"] = logits_arr
y = {"ler":test_data[1]['ctc']}

In [192]:
ler_model.predict(x, batch_size=512, verbose=1)

FailedPreconditionError: len(sequence_length) != batch_size.  len(sequence_length):  512 batch_size: 100
	 [[Node: ler_3/CTCGreedyDecoder = CTCGreedyDecoder[merge_repeated=true, _device="/job:localhost/replica:0/task:0/device:CPU:0"](_arg_logits_2_0_3, ler_3/ToInt32/_4811)]]
	 [[Node: ler_3/ToInt64_1/_4858 = _Send[T=DT_INT64, client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_150_ler_3/ToInt64_1", _device="/job:localhost/replica:0/task:0/device:GPU:0"](ler_3/ToInt64_1)]]

Caused by op 'ler_3/CTCGreedyDecoder', defined at:
  File "/usr/lib/python3.5/runpy.py", line 184, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/lib/python3.5/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.5/dist-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python3.5/dist-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/usr/local/lib/python3.5/dist-packages/ipykernel/kernelapp.py", line 478, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.5/dist-packages/zmq/eventloop/ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "/usr/local/lib/python3.5/dist-packages/tornado/ioloop.py", line 888, in start
    handler_func(fd_obj, events)
  File "/usr/local/lib/python3.5/dist-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.5/dist-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/usr/local/lib/python3.5/dist-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/usr/local/lib/python3.5/dist-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/usr/local/lib/python3.5/dist-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.5/dist-packages/ipykernel/kernelbase.py", line 281, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/usr/local/lib/python3.5/dist-packages/ipykernel/kernelbase.py", line 232, in dispatch_shell
    handler(stream, idents, msg)
  File "/usr/local/lib/python3.5/dist-packages/ipykernel/kernelbase.py", line 397, in execute_request
    user_expressions, allow_stdin)
  File "/usr/local/lib/python3.5/dist-packages/ipykernel/ipkernel.py", line 208, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/usr/local/lib/python3.5/dist-packages/ipykernel/zmqshell.py", line 533, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/usr/local/lib/python3.5/dist-packages/IPython/core/interactiveshell.py", line 2728, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/usr/local/lib/python3.5/dist-packages/IPython/core/interactiveshell.py", line 2850, in run_ast_nodes
    if self.run_code(code, result):
  File "/usr/local/lib/python3.5/dist-packages/IPython/core/interactiveshell.py", line 2910, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-183-409dc4287154>", line 24, in <module>
    )([labels, label_len, logits, input_len])
  File "/usr/local/lib/python3.5/dist-packages/keras/engine/topology.py", line 603, in __call__
    output = self.call(inputs, **kwargs)
  File "/usr/local/lib/python3.5/dist-packages/keras/layers/core.py", line 651, in call
    return self.function(inputs, **arguments)
  File "<ipython-input-183-409dc4287154>", line 11, in get_ler
    decoded, log_prob = tf.nn.ctc_greedy_decoder(logits, input_len)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/ctc_ops.py", line 221, in ctc_greedy_decoder
    inputs, sequence_length, merge_repeated=merge_repeated)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/gen_ctc_ops.py", line 147, in _ctc_greedy_decoder
    merge_repeated=merge_repeated, name=name)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py", line 2956, in create_op
    op_def=op_def)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py", line 1470, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

FailedPreconditionError (see above for traceback): len(sequence_length) != batch_size.  len(sequence_length):  512 batch_size: 100
	 [[Node: ler_3/CTCGreedyDecoder = CTCGreedyDecoder[merge_repeated=true, _device="/job:localhost/replica:0/task:0/device:CPU:0"](_arg_logits_2_0_3, ler_3/ToInt32/_4811)]]
	 [[Node: ler_3/ToInt64_1/_4858 = _Send[T=DT_INT64, client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_150_ler_3/ToInt64_1", _device="/job:localhost/replica:0/task:0/device:GPU:0"](ler_3/ToInt64_1)]]


In [188]:
test_data[1]['ctc']

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

In [167]:
layer_test = ctc_model.layers_by_depth[0][0]

In [173]:
layer_test.get_output_at(0)

<tf.Tensor 'ctc_8/ExpandDims:0' shape=(?, 1) dtype=float32>

In [27]:
ctc_pred = K.function([input_layer], [y_pred])

def decode_batch(test_func, word_batch):
    out = test_func([word_batch])[0]
    ret = []
    for j in range(out.shape[0]):
        out_best = list(np.argmax(out[j, 2:], 1))
        out_best = [k for k, g in itertools.groupby(out_best)]
        outstr = labels_to_text(out_best)
        ret.append(outstr)
    return ret

In [129]:
ctc_model.predict(val_data[0], batch_size=512, verbose=1)



array([[20.72673 ],
       [16.544771],
       [20.867598],
       ...,
       [19.715319],
       [18.101973],
       [19.872498]], dtype=float32)

In [116]:
val_data

({'input_len': array([[100],
         [100],
         [100],
         ...,
         [100],
         [100],
         [100]]), 'label_len': array([[3.],
         [3.],
         [3.],
         ...,
         [3.],
         [3.],
         [3.]]), 'labels': array([[22.,  5., 25.,  0.,  0.,  0.],
         [22.,  5., 25.,  0.,  0.,  0.],
         [22.,  5., 25.,  0.,  0.,  0.],
         ...,
         [11.,  3., 22.,  0.,  0.,  0.],
         [11.,  3., 22.,  0.,  0.,  0.],
         [11.,  3., 22.,  0.,  0.,  0.]]), 'wav': array([[ 3.59990406e-07,  3.89989607e-07,  3.39990939e-07, ...,
           2.99992005e-07,  3.59990406e-07,  3.79989873e-07],
         [-3.99894024e-08, -4.89870180e-07, -5.09864881e-07, ...,
          -5.79846335e-07, -4.79872829e-07, -3.49907271e-07],
         [-5.99836401e-08, -5.99836401e-08, -5.99836401e-08, ...,
          -1.09970007e-07, -2.09942740e-07, -3.49904567e-07],
         ...,
         [-1.89965367e-07, -1.49972658e-07,  2.49954431e-07, ...,
           3.199416

In [26]:
test_ex = ex_df[ex_df.state=="test"].sample(n=1)

label = test_ex.raw_label.values[0]
fn = test_ex.fn.values[0]
wav = center_wave(fn)

test_batch = np.zeros((1,1,16000))
test_batch[0,...] = wav

In [30]:
ctc_model.trainable = False

In [None]:
ctc_mode

In [31]:
decode_batch(ctc_pred, test_batch)


InvalidArgumentError: You must feed a value for placeholder tensor 'batch_normalization_1/keras_learning_phase' with dtype bool
	 [[Node: batch_normalization_1/keras_learning_phase = Placeholder[dtype=DT_BOOL, shape=<unknown>, _device="/job:localhost/replica:0/task:0/gpu:0"]()]]

Caused by op 'batch_normalization_1/keras_learning_phase', defined at:
  File "/usr/lib/python3.5/runpy.py", line 184, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/lib/python3.5/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.5/dist-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python3.5/dist-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/usr/local/lib/python3.5/dist-packages/ipykernel/kernelapp.py", line 477, in start
    ioloop.IOLoop.instance().start()
  File "/usr/local/lib/python3.5/dist-packages/zmq/eventloop/ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "/usr/local/lib/python3.5/dist-packages/tornado/ioloop.py", line 888, in start
    handler_func(fd_obj, events)
  File "/usr/local/lib/python3.5/dist-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.5/dist-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/usr/local/lib/python3.5/dist-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/usr/local/lib/python3.5/dist-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/usr/local/lib/python3.5/dist-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.5/dist-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/usr/local/lib/python3.5/dist-packages/ipykernel/kernelbase.py", line 235, in dispatch_shell
    handler(stream, idents, msg)
  File "/usr/local/lib/python3.5/dist-packages/ipykernel/kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "/usr/local/lib/python3.5/dist-packages/ipykernel/ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/usr/local/lib/python3.5/dist-packages/ipykernel/zmqshell.py", line 533, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/usr/local/lib/python3.5/dist-packages/IPython/core/interactiveshell.py", line 2728, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/usr/local/lib/python3.5/dist-packages/IPython/core/interactiveshell.py", line 2850, in run_ast_nodes
    if self.run_code(code, result):
  File "/usr/local/lib/python3.5/dist-packages/IPython/core/interactiveshell.py", line 2910, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-5-b8de7b69af0a>", line 7, in <module>
    "DeltaDelta": kapre.utils.DeltaDelta,
  File "/usr/local/lib/python3.5/dist-packages/keras/models.py", line 240, in load_model
    model = model_from_config(model_config, custom_objects=custom_objects)
  File "/usr/local/lib/python3.5/dist-packages/keras/models.py", line 314, in model_from_config
    return layer_module.deserialize(config, custom_objects=custom_objects)
  File "/usr/local/lib/python3.5/dist-packages/keras/layers/__init__.py", line 55, in deserialize
    printable_module_name='layer')
  File "/usr/local/lib/python3.5/dist-packages/keras/utils/generic_utils.py", line 140, in deserialize_keras_object
    list(custom_objects.items())))
  File "/usr/local/lib/python3.5/dist-packages/keras/engine/topology.py", line 2500, in from_config
    process_node(layer, node_data)
  File "/usr/local/lib/python3.5/dist-packages/keras/engine/topology.py", line 2457, in process_node
    layer(input_tensors[0], **kwargs)
  File "/usr/local/lib/python3.5/dist-packages/keras/engine/topology.py", line 603, in __call__
    output = self.call(inputs, **kwargs)
  File "/usr/local/lib/python3.5/dist-packages/keras/layers/normalization.py", line 190, in call
    training=training)
  File "/usr/local/lib/python3.5/dist-packages/keras/backend/tensorflow_backend.py", line 2740, in in_train_phase
    training = learning_phase()
  File "/usr/local/lib/python3.5/dist-packages/keras/backend/tensorflow_backend.py", line 121, in learning_phase
    name='keras_learning_phase')
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/array_ops.py", line 1548, in placeholder
    return gen_array_ops._placeholder(dtype=dtype, shape=shape, name=name)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/gen_array_ops.py", line 2094, in _placeholder
    name=name)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/op_def_library.py", line 767, in apply_op
    op_def=op_def)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py", line 2630, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py", line 1204, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

InvalidArgumentError (see above for traceback): You must feed a value for placeholder tensor 'batch_normalization_1/keras_learning_phase' with dtype bool
	 [[Node: batch_normalization_1/keras_learning_phase = Placeholder[dtype=DT_BOOL, shape=<unknown>, _device="/job:localhost/replica:0/task:0/gpu:0"]()]]


# Test tensorflow layer for reading wav files

In [None]:
class MyLayer(Layer):

    def __init__(self, output_dim, **kwargs):
        self.output_dim = output_dim
        super(MyLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        # Create a trainable weight variable for this layer.
        self.kernel = self.add_weight(name='kernel', 
                                      shape=(input_shape[1], self.output_dim),
                                      initializer='uniform',
                                      trainable=True)
        super(MyLayer, self).build(input_shape)  # Be sure to call this somewhere!

    def call(self, x):
        return K.dot(x, self.kernel)

    def compute_output_shape(self, input_shape):
        return (input_shape[0], self.output_dim)