In [5]:
%pylab inline
import keras
import keras.backend as K
import kapre
import pandas as pd
import scipy.io.wavfile as wav
import numpy as np
import random
import arrow
import threading
import pprint
from soph import ex_generator, center_wave

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


In [6]:
# Load the pickled example table; later cells use its `state` column to size
# the training and validation splits.
# NOTE: unpickling can execute arbitrary code, so only load pickles from a trusted source.
ex_df = pd.read_pickle("data/ex_df.pkl")

# Hyper-parameters and data options read by the cells below.
src_args = dict(
    # --- regularisation, activation, batching ---
    dropout_prob=0.1,
    activation="elu",
    batch_size=512,
    regularize=True,
    l2_reg=0.0005,
    # --- weight initialisation (glorot, tnormal, lsuv) ---
    init="glorot_normal",
    init_param=0.05,
    # --- convolution / pooling options ---
    filters_start=50,
    filters_step=25,
    kernel_size=10,
    cnn_pad="same",
    cnn_stride=1,
    cnn_stack=2,
    # --- audio front end ---
    n_mfcc=40,
    n_mels=80,
    n_dft=1024,
    n_hop=160,
    pool="max",
    pool_pad="same",
    batch_normalize=True,
    # --- data split selection and augmentation ---
    train_state=["train"],
    val_state=["val", "test"],
    p_transform=1,
    vol_range=0.1,
    shift=1,
    delta_delta=True,
    # --- learning-rate schedule ---
    lr_step=0.5,
    lr_patience=1,
    # --- mel-spectrogram options ---
    power_melgram=2.0,
    return_decibel_melgram=True,
    trainable_fb=False,
    trainable_kernel=False,
    # --- early stopping ---
    early_patience=4,
)
pprint.pprint(src_args)

{'activation': 'elu',
 'batch_normalize': True,
 'batch_size': 512,
 'cnn_pad': 'same',
 'cnn_stack': 2,
 'cnn_stride': 1,
 'delta_delta': True,
 'dropout_prob': 0.1,
 'early_patience': 4,
 'filters_start': 50,
 'filters_step': 25,
 'init': 'glorot_normal',
 'init_param': 0.05,
 'kernel_size': 10,
 'l2_reg': 0.0005,
 'lr_patience': 1,
 'lr_step': 0.5,
 'n_dft': 1024,
 'n_hop': 160,
 'n_mels': 80,
 'n_mfcc': 40,
 'p_transform': 1,
 'pool': 'max',
 'pool_pad': 'same',
 'power_melgram': 2.0,
 'regularize': True,
 'return_decibel_melgram': True,
 'shift': 1,
 'train_state': ['train'],
 'trainable_fb': False,
 'trainable_kernel': False,
 'val_state': ['val', 'test'],
 'vol_range': 0.1}


In [7]:
# Width of the softmax output layer built further down.
num_cat = 12

# Timestamp (US/Eastern, minute resolution) used to name this run's log directory.
start_time = arrow.now()
current_time = start_time.to('US/Eastern').format('YYYY-MM-DD-HH-mm')

drop = src_args["dropout_prob"]

# Resolve the configured initializer name into something Keras accepts.
# Names other than "tnormal"/"lsuv" are passed through unchanged.
init = src_args["init"]

if init is None:
    init = 'glorot_uniform'
elif init == "tnormal":
    # Fall back to 0.01 when init_param is missing or falsy.
    # The original read the non-existent key "init_stdd" here, which raised KeyError.
    init_param = src_args.get("init_param") or 0.01
    init = keras.initializers.TruncatedNormal(stddev=init_param)
elif init == "lsuv":
    init = keras.initializers.RandomNormal(stddev=1)

# Optional L2 weight penalty; None disables regularisation.
reg = keras.regularizers.l2(src_args["l2_reg"]) if src_args["regularize"] else None

In [8]:
# Model input with shape (1, 16000) per example.
input_layer = keras.layers.Input(shape=(1, 16000))

# MFCC front end configured from src_args.
mfcc_layer = kapre.time_frequency.MFCC(
    n_mfcc=int(src_args["n_mfcc"]),
    n_mels=int(src_args["n_mels"]),
    n_dft=int(src_args["n_dft"]),
    n_hop=int(src_args["n_hop"]),
    power_melgram=src_args["power_melgram"],
    return_decibel_melgram=src_args["return_decibel_melgram"],
    trainable_kernel=src_args["trainable_kernel"],
    trainable_fb=src_args["trainable_fb"],
)
input_block = mfcc_layer(input_layer)

# With delta features enabled the last axis grows from 1 to 3 (see the model
# summary), so the flattened feature width is tripled.
if src_args["delta_delta"]:
    input_block = kapre.utils.DeltaDelta(n=2)(input_block)
    feature_mult = 3
else:
    feature_mult = 1

# Swap the last two axes, fold everything into 10 columns, then transpose so
# the axis of length 10 comes first.
input_block = keras.layers.Permute((1, 3, 2))(input_block)
flat_width = src_args["n_mfcc"] * feature_mult * 10
input_block = keras.layers.Reshape((flat_width, 10))(input_block)
input_block = keras.layers.Permute((2, 1))(input_block)

In [9]:
def time(x, y):
    """Wrap layer `x` in `keras.layers.TimeDistributed` and apply it to tensor `y`."""
    wrapped = keras.layers.TimeDistributed(x)
    return wrapped(y)

In [24]:
# Two time-distributed Dense -> BatchNorm -> Dropout stages applied to every step.
rnn_block = time(keras.layers.Dense(100, activation=src_args["activation"]), input_block)
rnn_block = time(keras.layers.BatchNormalization(), rnn_block)
rnn_block = time(keras.layers.Dropout(drop), rnn_block)
rnn_block = time(keras.layers.Dense(100, activation=src_args["activation"]), rnn_block)
rnn_block = time(keras.layers.BatchNormalization(), rnn_block)
rnn_block = time(keras.layers.Dropout(drop), rnn_block)

# Small bidirectional GRU that keeps the full sequence; the forward and
# backward outputs are multiplied element-wise (merge_mode='mul').
att_layer = keras.layers.Bidirectional(
    keras.layers.GRU(
        10,
        activation=src_args["activation"],
        dropout=drop,
        recurrent_dropout=drop,
        return_sequences=True,
    ), merge_mode='mul')(rnn_block)
att_layer = keras.layers.BatchNormalization()(att_layer)

# Append the GRU features to the dense features along the last axis.
# `keras.layers.merge(..., mode='concat', concat_axis=2)` is the deprecated
# Keras 1 functional merge; `concatenate` is its Keras 2 replacement.
rnn_block = keras.layers.concatenate([rnn_block, att_layer], axis=2)

# Final bidirectional GRU returns only its last output (return_sequences is
# left at its default), followed by batch normalisation.
rnn_block = keras.layers.Bidirectional(
    keras.layers.GRU(
        50,
        activation=src_args["activation"],
        dropout=drop,
        recurrent_dropout=drop,
    ))(rnn_block)
rnn_block = keras.layers.BatchNormalization()(rnn_block)


  name=name)


In [25]:
# Inspect the static shape of the recurrent block's output as a plain list.
rnn_block.shape.as_list()

[None, 100]

In [26]:
# Classification head: a softmax Dense layer with num_cat units on top of rnn_block.
softmax_head = keras.layers.Dense(num_cat, activation="softmax")
output_layer = softmax_head(rnn_block)

In [27]:
# Assemble the functional model running from input_layer to output_layer,
# then print its layer-by-layer summary.
rnn_model = keras.Model(inputs=input_layer, outputs=output_layer)
rnn_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 1, 16000)     0                                            
__________________________________________________________________________________________________
mfcc_2 (MFCC)                   (None, 40, 100, 1)   1094864     input_2[0][0]                    
__________________________________________________________________________________________________
delta_delta_2 (DeltaDelta)      (None, 40, 100, 3)   5           mfcc_2[0][0]                     
__________________________________________________________________________________________________
permute_3 (Permute)             (None, 40, 3, 100)   0           delta_delta_2[0][0]              
__________________________________________________________________________________________________
reshape_2 

In [None]:
# Every artefact of this run goes under a timestamped directory.
log_base = "logs/rnn/{}/".format(current_time)

# TensorBoard event files are written to <log_base>tb.
tensorboard_cb = keras.callbacks.TensorBoard(
    log_dir=log_base + 'tb',
    batch_size=src_args["batch_size"],
    histogram_freq=0,
    write_grads=False,
    write_images=True,
)

# Save the full model whenever the monitored validation accuracy improves.
checkpoint_cb = keras.callbacks.ModelCheckpoint(
    filepath=log_base + 'model-checkpoint.hdf5',
    monitor='val_acc',
    verbose=0,
    save_best_only=True,
    save_weights_only=False,
    mode='auto',
    period=1,
)

# Per-epoch metrics as a CSV file.
csv_log_cb = keras.callbacks.CSVLogger(log_base + 'training.log')

# Early stopping and learning-rate reduction, both configured from src_args.
early_stop_cb = keras.callbacks.EarlyStopping(
    patience=src_args["early_patience"],
    verbose=1,
)
reduce_lr_cb = keras.callbacks.ReduceLROnPlateau(
    factor=src_args["lr_step"],
    patience=src_args["lr_patience"],
    verbose=1,
    min_lr=1e-7,
)

callbacks = [
    tensorboard_cb,
    checkpoint_cb,
    csv_log_cb,
    early_stop_cb,
    reduce_lr_cb,
]

# Targets are integer class ids, hence the sparse cross-entropy loss.
rnn_model.compile(
    optimizer='nadam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)


def launchTensorBoard():
    """Restart TensorBoard on the current run's log directory.

    Runs `pkill tensorboard` first, then starts `tensorboard` pointed at the
    `tb` directory under the global `log_base`. Both commands go through
    `os.system`, so this call blocks until the tensorboard command exits.
    """
    from os import system as run_shell

    run_shell('pkill tensorboard')
    run_shell('tensorboard --logdir={}tb'.format(log_base))


# Run the blocking TensorBoard launcher on its own thread so the notebook
# can carry on; the target takes no arguments.
t = threading.Thread(target=launchTensorBoard)
t.start()

# Pull the whole validation split as one batch, with every augmentation
# argument (vol_range, displacement, p_transform) set to 0.
val_data = next(ex_generator(
    batch_size=sum(ex_df.state.isin(src_args["val_state"])),
    shuffle=False,
    state=src_args["val_state"],
    vol_range=0,
    displacement=0,
    p_transform=0))

# Shuffled training batches, with augmentation arguments taken from src_args.
traing_gen = ex_generator(
    batch_size=src_args["batch_size"],
    shuffle=True,
    state=src_args["train_state"],
    vol_range=src_args["vol_range"],
    shift=src_args["shift"],
    p_transform=src_args["p_transform"])

# Keras documents steps_per_epoch as an integer. The original passed the float
# result of a true division; rounding up keeps the same number of batches per
# epoch (including the final partial batch) with the documented type.
n_train = sum(ex_df.state.isin(src_args["train_state"]))
steps_per_epoch = int(np.ceil(n_train / src_args["batch_size"]))

history = rnn_model.fit_generator(
    generator=traing_gen,
    steps_per_epoch=steps_per_epoch,
    epochs=200,
    verbose=1,
    max_queue_size=100,
    callbacks=callbacks,
    validation_data=val_data
)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 00009: reducing learning rate to 0.0010000000474974513.
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 00014: reducing learning rate to 0.0005000000237487257.
Epoch 15/200
Epoch 16/200

In [5]:
# Six identical rows [1, 2, 3, 4], reshaped into a batch of one 6x4 matrix.
row = [1, 2, 3, 4]
a = array([row] * 6).reshape(1, 6, 4)
print(a, a.shape)

[[[1 2 3 4]
  [1 2 3 4]
  [1 2 3 4]
  [1 2 3 4]
  [1 2 3 4]
  [1 2 3 4]]] (1, 6, 4)


In [6]:
# Regroup the six rows into three groups; -1 lets numpy infer the group size.
b = a.reshape(3, -1, 4)
print(b, b.shape)

[[[1 2 3 4]
  [1 2 3 4]]

 [[1 2 3 4]
  [1 2 3 4]]

 [[1 2 3 4]
  [1 2 3 4]]] (3, 2, 4)


In [7]:
# Front-end layers, kept as an ordered list that is concatenated with the
# other layer lists when the Sequential model is built.
# Trailing shape comments use None for the batch axis.
waveform_input = keras.layers.InputLayer(input_shape=(1, maxlen))  # None, 1, 16000

mel_layer = kapre.time_frequency.Melspectrogram(
    sr=16000,
    n_mels=n_mels,
    n_dft=256,
    n_hop=int(n_hop),
    power_melgram=2.0,
    trainable_kernel=False,
    return_decibel_melgram=True,
)                                                                  # None, n_mels, n_steps, 1

reverse_axes = keras.layers.Permute((3, 2, 1))                     # None, 1, n_steps, n_mels
split_into_seq = keras.layers.Reshape((n_seq, -1, n_mels))         # None, n_seq, n_steps, n_mels

input_block = [
    waveform_input,
    mel_layer,
    reverse_axes,
    split_into_seq,
]

In [8]:
# Layer list for the convolutional stage. Every layer is wrapped in
# TimeDistributed.
# NOTE(review): `src_args["cnn_blocks"]` and `act` are not defined in the cells
# visible in this notebook -- confirm they are set before running this cell.
cnn_block = []

# Each entry of "cnn_blocks" is an iterable of (filters, kernel_size) pairs.
# The enumerate index in the original loop was never used, so it is dropped.
for block in src_args["cnn_blocks"]:
    for n_filters, kernel_width in block:
        # Conv1D followed by batch normalisation for every pair in the block.
        cnn_block.append(
            keras.layers.TimeDistributed(
                keras.layers.Conv1D(
                    n_filters,
                    kernel_size=kernel_width,
                    padding="same",
                    activation=act,
                    kernel_initializer=init)))
        cnn_block.append(
            keras.layers.TimeDistributed(keras.layers.BatchNormalization()))
    # Each block ends with max-pooling (pool_size=2) and dropout.
    cnn_block.append(
        keras.layers.TimeDistributed(
            keras.layers.MaxPooling1D(pool_size=2, padding='same')))
    cnn_block.append(
        keras.layers.TimeDistributed(keras.layers.Dropout(drop)))

# Flatten the per-step feature maps after the last block.
cnn_block.append(keras.layers.TimeDistributed(keras.layers.Flatten()))

In [9]:
# Recurrent classification head appended after the front-end and CNN layers.
rnn_head = [
    keras.layers.Bidirectional(
        keras.layers.GRU(
            128,
            activation=act,
            dropout=drop,
            recurrent_dropout=drop,
        )
    ),
    keras.layers.BatchNormalization(),
#     keras.layers.Dropout(drop),
    keras.layers.Dense(num_cat, activation="softmax"),
]

# Full pipeline: front end -> time-distributed CNN -> bidirectional GRU -> softmax.
all_layers = input_block + cnn_block + rnn_head
rnn_cnn_model = keras.Sequential(all_layers)
rnn_cnn_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 1, 16000)          0         
_________________________________________________________________
melspectrogram_1 (Melspectro (None, 40, 160, 1)        71208     
_________________________________________________________________
permute_1 (Permute)          (None, 1, 160, 40)        0         
_________________________________________________________________
reshape_1 (Reshape)          (None, 16, 10, 40)        0         
_________________________________________________________________
time_distributed_1 (TimeDist (None, 16, 10, 128)       15488     
_________________________________________________________________
time_distributed_2 (TimeDist (None, 16, 10, 128)       512       
_________________________________________________________________
time_distributed_3 (TimeDist (None, 16, 5, 128)        0         
__________

In [10]:
# Output locations for this run, all under a timestamped directory.
log_base = "logs/rnn_cnn/{}/".format(current_time)
tb_dir = log_base + 'tb'
checkpoint_path = log_base + 'model-checkpoint.hdf5'
csv_path = log_base + 'training.log'

# TensorBoard logging with graph, gradient and image writing switched on.
tb_callback = keras.callbacks.TensorBoard(
    log_dir=tb_dir,
    histogram_freq=0,
    batch_size=src_args["batch_size"],
    write_graph=True,
    write_grads=True,
    write_images=True,
)

# Save the full model whenever the monitored validation loss improves.
ckpt_callback = keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    save_weights_only=False,
    mode='auto',
    period=1,
)

csv_callback = keras.callbacks.CSVLogger(csv_path)
stop_callback = keras.callbacks.EarlyStopping(patience=10, verbose=1)
lr_callback = keras.callbacks.ReduceLROnPlateau(
    factor=0.5,
    patience=1,
    verbose=1,
    min_lr=1e-8,
)

callbacks = [
    tb_callback,
    ckpt_callback,
    csv_callback,
    stop_callback,
    lr_callback,
]

rnn_cnn_model.compile(
    optimizer='nadam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [11]:
def launchTensorBoard():
    """Kill any running tensorboard process and start a new one.

    The new process is pointed at the `tb` directory under the global
    `log_base`. Both commands run through `os.system`, so the call returns
    only when the tensorboard command exits.
    """
    import os

    os.system('pkill tensorboard')
    os.system('tensorboard --logdir=%stb' % log_base)

import threading
# Launch TensorBoard on a separate thread because the launcher blocks;
# the target takes no arguments.
t = threading.Thread(target=launchTensorBoard)
t.start()

In [12]:
# Rows whose `state` belongs to the validation split.
val_mask = ex_df.state.isin(src_args["val_state"])

# One unshuffled batch the size of the whole validation split, with every
# augmentation argument set to 0.
val_generator = ex_generator(
    batch_size=sum(val_mask),
    shuffle=False,
    state=src_args["val_state"],
    vol_range=0,
    displacement=0,
    p_transform=0,
)
val_data = next(val_generator)

In [13]:
# Keras documents steps_per_epoch as an integer. The original passed the float
# result of a true division; rounding up keeps the same number of batches per
# epoch (including the final partial batch) with the documented type.
n_train = sum(ex_df.state.isin(src_args["train_state"]))
steps_per_epoch = int(np.ceil(n_train / src_args["batch_size"]))

# NOTE(review): "displacement" is not among the src_args keys defined in the
# config cell visible in this notebook -- confirm it is set before running.
rnn_cnn_model.fit_generator(
    generator=ex_generator(
        batch_size=src_args["batch_size"],
        shuffle=True,
        state=src_args["train_state"],
        shift=src_args["shift"],
        vol_range=src_args["vol_range"],
        displacement=src_args["displacement"],
        p_transform=src_args["p_transform"]),
    steps_per_epoch=steps_per_epoch,
    epochs=200,
    verbose=1,
    max_queue_size=100,
    callbacks=callbacks,
    validation_data=val_data
)

Epoch 1/200
Epoch 2/200
Epoch 3/200

Epoch 00003: reducing learning rate to 0.0010000000474974513.
Epoch 4/200

Epoch 00004: reducing learning rate to 0.0005000000237487257.
Epoch 5/200

Epoch 00005: reducing learning rate to 0.0002500000118743628.
Epoch 6/200

Epoch 00006: reducing learning rate to 0.0001250000059371814.
Epoch 7/200

Epoch 00007: reducing learning rate to 6.25000029685907e-05.
Epoch 8/200

KeyboardInterrupt: 