In [1]:
from __future__ import print_function
from __future__ import division
import os, sys
import numpy as np
import pandas as pd
from builtins import range
from sklearn.metrics import roc_auc_score
import librosa, librosa.display
import matplotlib.pyplot as plt
% matplotlib inline

import tensorflow as tf
import keras
from keras.models import Sequential, Model
from keras.layers import Activation, Dense, Flatten, Input, Reshape, Dropout, Permute
from keras.layers.convolutional import Conv2D
from keras.layers.normalization import BatchNormalization
from keras.layers.recurrent import GRU
from keras.layers.pooling import MaxPooling2D, GlobalAveragePooling2D
from keras.layers.merge import Concatenate
from keras import backend as K
from keras.backend.tensorflow_backend import set_session

# Restrict this process to a single CUDA device and cap its GPU memory use
# before any Keras model is built.
os.environ["CUDA_VISIBLE_DEVICES"]="0" # index of the GPU made visible to this process
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.5 # fraction (0-1) of GPU memory this process may use
# Hand Keras a TensorFlow session created with the configuration above.
set_session(tf.Session(config=config))

from kapre.time_frequency import Melspectrogram
from global_config import *


Using TensorFlow backend.


Channel-last, i.e., (None, n_freq, n_time, n_ch)


In [2]:
def data_gen(df_subset, ys, is_shuffle, batch_size=20):
    """Endlessly yield (audio, label) batches for Keras ``*_generator`` methods.

    Parameters
    ----------
        df_subset: pandas dataframe, with rows subset. Must have a ``filepath``
            column; each entry is mapped to a ``.npy`` file under
            ``DIR_PEDAL_SEGMENT_NPY``.
        ys: numpy array, N-by-n_classes one-hot-encoded labels, row-aligned
            (by position) with ``df_subset``.
        is_shuffle: if True, every batch is an independent random draw of
            ``batch_size`` distinct rows from the whole subset (so an "epoch"
            is not guaranteed to visit every row exactly once). If False,
            batches are consecutive slices in row order.
        batch_size: integer, size of batch. ``len(df_subset) % batch_size``
            should be 0; otherwise the residual rows are never yielded in
            the non-shuffled mode.

    Yields
    ------
        (src_batch, y_batch): ``src_batch`` has shape (batch, 1, N) and
        ``y_batch`` has shape (batch, n_classes), both in ``K.floatx()``.

    Raises
    ------
        ValueError: on the first ``next()`` if the subset holds fewer rows
            than ``batch_size``. Without this check the generator would spin
            forever without yielding anything.
    """
    n_data = len(df_subset)
    n_batch = n_data // batch_size
    if n_batch == 0:
        # An empty inner loop inside `while True` would hang the consumer.
        raise ValueError(
            "data_gen needs at least batch_size={} rows but got {}.".format(batch_size, n_data))
    if n_data % batch_size != 0:
        print("= WARNING =")
        print("  n_data % batch_size != 0 but this code does not assume it")
        print("  so the residual {} sample(s) will be ignored.".format(n_data % batch_size))

    # Positional view of the path column: cheaper than building a row Series
    # per sample and independent of the dataframe's index labels.
    filepaths = df_subset.filepath.values

    def _npy_path(row_pos):
        # NOTE(review): everything after the FIRST '.' is dropped, so a path
        # containing an extra dot (e.g. './x/a.wav') is truncated early.
        # Kept as-is because the pre-processing naming scheme is not visible here.
        return os.path.join(DIR_PEDAL_SEGMENT_NPY, filepaths[row_pos].split('.')[0] + '.npy')

    while True:
        for batch_i in range(n_batch):
            if is_shuffle:
                batch_idxs = np.random.choice(n_data, batch_size, replace=False)
            else:
                batch_idxs = range(batch_i * batch_size, (batch_i + 1) * batch_size)

            src_batch = np.array([np.load(_npy_path(i)) for i in batch_idxs],
                                 dtype=K.floatx())
            src_batch = src_batch[:, np.newaxis, :]  # make (batch, N) to (batch, 1, N) for kapre compatible

            y_batch = np.array([ys[i] for i in batch_idxs],
                               dtype=K.floatx())

            yield src_batch, y_batch
        
        
def get_callbacks(name, patience):
    """Create the Keras callbacks used for one training run.

    Ensures ``DIR_SAVE_MODEL`` exists, then builds, in this order:
    an early stopper, a best-full-model checkpoint, a best-weights-only
    checkpoint and a CSV logger. All files are written under
    ``DIR_SAVE_MODEL`` and prefixed with ``name``.

    Parameters
    ----------
        name: string, experiment name used as the file-name prefix.
        patience: integer, passed to ``EarlyStopping(patience=...)``.

    Returns
    -------
        list: [early_stopper, model_saver, weight_saver, csv_logger]
    """
    if not os.path.exists(DIR_SAVE_MODEL):
        os.makedirs(DIR_SAVE_MODEL)

    def in_save_dir(filename):
        # Every artefact of the run lives directly inside DIR_SAVE_MODEL.
        return os.path.join(DIR_SAVE_MODEL, filename)

    stopper = keras.callbacks.EarlyStopping(patience=patience)
    full_model_ckpt = keras.callbacks.ModelCheckpoint(
        in_save_dir("{}_best_model.h5".format(name)),
        save_best_only=True)
    weights_ckpt = keras.callbacks.ModelCheckpoint(
        in_save_dir("{}_best_weights.h5".format(name)),
        save_best_only=True,
        save_weights_only=True)
    history_csv = keras.callbacks.CSVLogger(in_save_dir("{}.log".format(name)))

    return [stopper, full_model_ckpt, weights_ckpt, history_csv]

## Time-Invariant Models for Pedal-Segment Binary Classification

These models take raw audio as input, which is converted to a mel-spectrogram by Kapre.

In [3]:
def model_multi_kernel_shape(n_out, input_shape=SEGMENT_INPUT_SHAPE,
                             out_activation='softmax'):
    """CNN whose first convolution stage uses three kernel shapes in parallel.

    Symbolic summary:
    > c2' - p2 - c2 - p2 - c2 - p2 - c2 - p3 - d1
    where c2' -> multiple kernel shapes

    The raw-audio input is turned into a decibel-scaled mel-spectrogram
    (128 mel bins) by kapre. The first stage runs (20, 3), (3, 3) and (3, 20)
    convolutions with 7 filters each and concatenates them along the channel
    axis; three 21-filter (3, 3) stages follow, then global average pooling
    and a dense output layer. L2 weight decay uses the module-level ``reg_w``.

    Parameters
    ----------
        n_out: integer, number of output nodes
        input_shape: tuple, an input shape, which doesn't include batch-axis.
        out_activation: activation function on the output

    Returns
    -------
        keras.models.Model (not compiled)
    """
    def bn_relu_pool_drop(tensor, pool_size):
        # Tail shared by every convolution stage.
        tensor = BatchNormalization(axis=channel_axis)(tensor)
        tensor = Activation('relu')(tensor)
        tensor = MaxPooling2D(pool_size, padding='same')(tensor)
        return Dropout(0.25)(tensor)

    audio_input = Input(shape=input_shape)

    melgram = Melspectrogram(n_dft=N_FFT, n_hop=HOP_LENGTH, sr=SR, n_mels=128,
                             power_melgram=2.0, return_decibel_melgram=True)(audio_input)
    features = BatchNormalization(axis=channel_axis)(melgram)

    # Stage 1: tall, square and wide kernels look at the same input side by side.
    branches = []
    for kernel_shape in ((20, 3), (3, 3), (3, 20)):
        branches.append(
            Conv2D(7, kernel_shape, padding='same',
                   kernel_regularizer=keras.regularizers.l2(reg_w))(features))
    features = Concatenate(axis=channel_axis)(branches)
    features = bn_relu_pool_drop(features, (2, 2))

    # Stages 2-4: plain 3x3 convolutions; only the last one pools by 4.
    for pool_size in ((2, 2), (2, 2), (4, 4)):
        features = Conv2D(21, (3, 3), padding='same',
                          kernel_regularizer=keras.regularizers.l2(reg_w))(features)
        features = bn_relu_pool_drop(features, pool_size)

    pooled = GlobalAveragePooling2D()(features)

    out = Dense(n_out, activation=out_activation,
                kernel_regularizer=keras.regularizers.l2(reg_w))(pooled)

    return Model(audio_input, out)


def model_crnn_icassp2017_choi(n_out, input_shape=SEGMENT_INPUT_SHAPE,
                               out_activation='softmax'):
    """A simplified model of
    Convolutional Recurrent Neural Networks for Music Classification,
    K Choi, G Fazekas, M Sandler, K Cho, ICASSP, 2017, New Orleans, USA

    Symbolic summary:
    > c2 - p2 - c2 - p2 - c2 - p2 - c2 - p2 - r1 - r2 - d1

    The raw-audio input is turned into a decibel-scaled mel-spectrogram
    (128 mel bins) by kapre, passed through four 21-filter (3, 3) convolution
    stages (max-pooling (2, 2) three times, then (4, 4)), flattened into a
    sequence of 21-dimensional vectors and fed to two GRU layers.
    L2 weight decay uses the module-level ``reg_w``.

    Parameters
    ----------
        n_out: integer, number of output nodes
        input_shape: tuple, an input shape, which doesn't include batch-axis.
        out_activation: activation function on the output

    Returns
    -------
        keras.models.Model (not compiled)
    """

    audio_input = Input(shape=input_shape)

    x = Melspectrogram(n_dft=N_FFT, n_hop=HOP_LENGTH, sr=SR, n_mels=128, power_melgram=2.0, return_decibel_melgram=True)(audio_input)
    x = BatchNormalization(axis=channel_axis)(x)

    # Four identical conv stages; only the pooling size of the last differs.
    for pool_size in ((2, 2), (2, 2), (2, 2), (4, 4)):
        x = Conv2D(21, (3, 3), padding='same', kernel_regularizer=keras.regularizers.l2(reg_w))(x)
        x = BatchNormalization(axis=channel_axis)(x)
        x = Activation('relu')(x)
        x = MaxPooling2D(pool_size, padding='same')(x)
        x = Dropout(0.25)(x)

    # The reshape below needs the 21 channels on the last axis.
    # K.image_dim_ordering() only ever returns 'tf'/'th', so comparing it with
    # 'channels_first' never matched; image_data_format() is the call that
    # returns 'channels_first'/'channels_last'. For channels-first data,
    # (ch, freq, time) -> (freq, time, ch) reproduces the channels-last layout.
    if K.image_data_format() == 'channels_first':
        x = Permute((2, 3, 1))(x)

    x = Reshape((-1, 21))(x)

    # GRU block 1, 2, output
    x = GRU(41, return_sequences=True, name='gru1')(x)
    x = GRU(41, return_sequences=False, name='gru2')(x)
    x = Dropout(0.3)(x)

    out = Dense(n_out, activation=out_activation, kernel_regularizer=keras.regularizers.l2(reg_w))(x)

    model = Model(audio_input, out)

    return model


def model_conv3x3_ismir2016_choi(n_out, input_shape=SEGMENT_INPUT_SHAPE,
                                 out_activation='softmax'):
    """ A simplified model of
    Automatic Tagging Using Deep Convolutional Neural Networks,
    K Choi, G Fazekas, M Sandler, ISMIR, 2016, New York, USA

    Symbolic summary:
    > c2 - p2 - c2 - p2 - c2 - p2 - c2 - p2 - c2 - p3 - d1

    As implemented here:
        * kapre mel-spectrogram front end with n_mels=128, decibel-scaled
        * five (3, 3) convolution stages with [10, 15, 15, 20, 20] filters
        * every stage: BatchNormalization - ReLU - MaxPooling (2, 2) - Dropout(0.25)
        * GlobalAveragePooling2D before the dense output layer
        * L2 weight decay taken from the module-level ``reg_w``

    Parameters
    ----------
        n_out: integer, number of output nodes
        input_shape: tuple, an input shape, which doesn't include batch-axis.
        out_activation: activation function on the output

    Returns
    -------
        keras.models.Sequential (not compiled)
    """
    front_end = Melspectrogram(n_dft=N_FFT, n_hop=HOP_LENGTH, sr=SR, n_mels=128,
                               power_melgram=2.0,
                               return_decibel_melgram=True,
                               input_shape=input_shape)

    model = Sequential()
    model.add(front_end)
    model.add(BatchNormalization(axis=channel_axis))

    # One entry per convolution stage: number of output filters.
    for n_filters in (10, 15, 15, 20, 20):
        model.add(Conv2D(n_filters, (3, 3), padding='same',
                         kernel_regularizer=keras.regularizers.l2(reg_w)))
        model.add(BatchNormalization(axis=channel_axis))
        model.add(Activation('relu'))
        model.add(MaxPooling2D((2, 2), padding='same'))
        model.add(Dropout(0.25))

    model.add(GlobalAveragePooling2D())
    model.add(Dense(n_out, activation=out_activation,
                    kernel_regularizer=keras.regularizers.l2(reg_w)))

    return model


def model_conv1d_icassp2014_sander(n_out, input_shape=SEGMENT_INPUT_SHAPE,
                                   out_activation='softmax'):
    """A simplified model of
    End-to-end learning for music audio,
    Sander Dieleman and Benjamin Schrauwen, ICASSP, 2014

    Symbolic summary:
    > c1 - p1 - c1 - p1 - c1 - p1 - p3 - d1

    As implemented here:
        * kapre mel-spectrogram front end with n_mels=128, decibel-scaled
        * three convolution stages with 30 filters each: a (32, 4) 'valid'
          kernel first, then two (1, 4) 'same' kernels
        * every stage: BatchNormalization - ReLU - MaxPooling (1, 4) - Dropout(0.25)
        * GlobalAveragePooling2D before the dense output layer
        * L2 weight decay taken from the module-level ``reg_w``

    NOTE(review): an earlier comment gave the first convolution's output as
    (None, 16, 1, N), i.e. a collapsed frequency axis. With n_mels=128 and a
    32-bin 'valid' kernel the frequency axis is presumably NOT collapsed to 1,
    so the stack may no longer be a pure 1-D convolution over time - confirm.

    Parameters
    ----------
        n_out: integer, number of output nodes
        input_shape: tuple, an input shape, which doesn't include batch-axis.
        out_activation: activation function on the output

    Returns
    -------
        keras.models.Sequential (not compiled)
    """
    front_end = Melspectrogram(n_dft=N_FFT, n_hop=HOP_LENGTH, sr=SR, n_mels=128,
                               power_melgram=2.0,
                               return_decibel_melgram=True, input_shape=input_shape)

    model = Sequential()
    model.add(front_end)

    # (kernel shape, padding mode) for each of the three convolution stages.
    stage_specs = (((32, 4), 'valid'),
                   ((1, 4), 'same'),
                   ((1, 4), 'same'))
    for kernel_shape, conv_padding in stage_specs:
        model.add(Conv2D(30, kernel_shape, padding=conv_padding,
                         kernel_regularizer=keras.regularizers.l2(reg_w)))
        model.add(BatchNormalization(axis=channel_axis))
        model.add(Activation('relu'))
        model.add(MaxPooling2D((1, 4), padding='same'))
        model.add(Dropout(0.25))

    model.add(GlobalAveragePooling2D())
    model.add(Dense(n_out, activation=out_activation,
                    kernel_regularizer=keras.regularizers.l2(reg_w)))

    return model


In [4]:
# Experiment configuration: small pedal-segment dataset, 3x3 CNN.
dataset_name = 'pedal-segment_npydf_small.csv'
model_name = 'cnn3x3'
exp_name = 'small-segment_{}'.format(model_name)
reg_w = 1e-4  # L2 weight-decay factor read (as a global) by the model builders
batch_size = 250
epochs = 20
patience = 10

print("-" * 60)
print("Beici: Welcome! Lets do something deep with {}.".format(dataset_name))
print("       I'm assuming you finished pre-processing.")
print("       We're gonna use {} model.".format(model_name))
csv_path = os.path.join(DIR_PEDAL_METADATA, dataset_name)

# The metadata CSV carries the train/valid/test split in its 'category' column.
tracks = pd.read_csv(csv_path)
training = tracks.loc[tracks['category'] == 'train']
validation = tracks.loc[tracks['category'] == 'valid']
test = tracks.loc[tracks['category'] == 'test']

# print("Beici: We're loading and modifying label values.")
y_train = training.label.values
y_valid = validation.label.values
y_test = test.label.values

# Binary labels -> N-by-2 one-hot arrays.
y_train = keras.utils.to_categorical(y_train, 2)
y_valid = keras.utils.to_categorical(y_valid, 2)
y_test = keras.utils.to_categorical(y_test, 2)

# callbacks
callbacks = get_callbacks(name=exp_name, patience=patience)
early_stopper, model_saver, weight_saver, csv_logger = callbacks

# print("Beici: Preparing data generators for training and validation...")
steps_per_epoch = len(y_train) // batch_size
gen_train = data_gen(training, y_train, True, batch_size=batch_size)
gen_valid = data_gen(validation, y_valid, False, batch_size=batch_size)
gen_test = data_gen(test, y_test, False, batch_size=batch_size)

print("Beici: Getting model...")
model_builders = {
    'multi_kernel': model_multi_kernel_shape,
    'crnn': model_crnn_icassp2017_choi,
    'cnn3x3': model_conv3x3_ismir2016_choi,
    'cnn1d': model_conv1d_icassp2014_sander,
}
if model_name not in model_builders:
    # Fail loudly instead of silently training a stale `model` left in the kernel.
    raise ValueError("Unknown model_name {!r}; expected one of {}.".format(
        model_name, sorted(model_builders)))
model = model_builders[model_name](n_out=2)

model.compile('adam', 'categorical_crossentropy', metrics=['accuracy'])

# model.summary()

print("Beici: Starting to train...")
model.fit_generator(gen_train, steps_per_epoch, epochs=epochs,
                    callbacks=callbacks,
                    validation_data=gen_valid,
                    validation_steps=len(y_valid) // batch_size)

------------------------------------------------------------
Beici: Welcome! Lets do something deep with pedal-segment_npydf_small.csv.
       I'm assuming you finished pre-processing.
       We're gonna use cnn3x3 model.
Beici: Getting model...
Beici: Starting to train...
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0xbbb9e50>

In [5]:
print("Beici: Training is done. Loading the best weights...")
model.load_weights(os.path.join(DIR_SAVE_MODEL,"{}_best_weights.h5".format(exp_name)))

print("       Evaluating...")
n_test_steps = len(y_test) // batch_size
scores = model.evaluate_generator(gen_test, n_test_steps)

# Predict from a FRESH generator: the one used for evaluation may already have
# been advanced past the start of the test set by Keras' background prefetch
# queue, which would pair predictions with the wrong labels.
gen_pred = data_gen(test, y_test, False, batch_size=batch_size)
y_pred = model.predict_generator(gen_pred, n_test_steps)

# Only whole batches are predicted, so drop the residual labels (if any).
auc = roc_auc_score(y_test[:len(y_pred)], y_pred)

print("Beici: Done for {}!".format(model_name))
print("       test set loss: {}".format(scores[0]))
print("       test set accuracy: {}".format(scores[1]))
print("       test set auc: {}".format(auc))

Beici: Training is done. Loading the best weights...
       Evaluating...
Beici: Done for cnn3x3!
       test set loss: 0.172541262023
       test set accuracy: 0.93580005765
       test set auc: 0.99490304


In [4]:
# Experiment configuration: full pedal-segment dataset, multi-kernel CNN.
dataset_name = 'pedal-segment_vd.csv'
model_name = 'multi_kernel'
exp_name = 'segment_{}'.format(model_name)
reg_w = 1e-4  # L2 weight-decay factor read (as a global) by the model builders
batch_size = 250
epochs = 50
patience = 10

print("-" * 60)
print("Beici: Welcome! Lets do something deep with {}.".format(dataset_name))
print("       I'm assuming you finished pre-processing.")
print("       We're gonna use {} model.".format(model_name))
csv_path = os.path.join(DIR_PEDAL_METADATA, dataset_name)

# The metadata CSV carries the train/valid/test split in its 'category' column.
tracks = pd.read_csv(csv_path)
training = tracks.loc[tracks['category'] == 'train']
validation = tracks.loc[tracks['category'] == 'valid']
test = tracks.loc[tracks['category'] == 'test']

# print("Beici: We're loading and modifying label values.")
y_train = training.label.values
y_valid = validation.label.values
y_test = test.label.values

# Binary labels -> N-by-2 one-hot arrays.
y_train = keras.utils.to_categorical(y_train, 2)
y_valid = keras.utils.to_categorical(y_valid, 2)
y_test = keras.utils.to_categorical(y_test, 2)

# callbacks
callbacks = get_callbacks(name=exp_name, patience=patience)
early_stopper, model_saver, weight_saver, csv_logger = callbacks

# print("Beici: Preparing data generators for training and validation...")
steps_per_epoch = len(y_train) // batch_size
gen_train = data_gen(training, y_train, True, batch_size=batch_size)
gen_valid = data_gen(validation, y_valid, False, batch_size=batch_size)
gen_test = data_gen(test, y_test, False, batch_size=batch_size)

print("Beici: Getting model...")
model_builders = {
    'multi_kernel': model_multi_kernel_shape,
    'crnn': model_crnn_icassp2017_choi,
    'cnn3x3': model_conv3x3_ismir2016_choi,
    'cnn1d': model_conv1d_icassp2014_sander,
}
if model_name not in model_builders:
    # Fail loudly instead of silently training a stale `model` left in the kernel.
    raise ValueError("Unknown model_name {!r}; expected one of {}.".format(
        model_name, sorted(model_builders)))
model = model_builders[model_name](n_out=2)

model.compile('adam', 'categorical_crossentropy', metrics=['accuracy'])

# model.summary()

print("Beici: Starting to train...")
model.fit_generator(gen_train, steps_per_epoch, epochs=epochs,
                    callbacks=callbacks,
                    validation_data=gen_valid,
                    validation_steps=len(y_valid) // batch_size)

------------------------------------------------------------
Beici: Welcome! Lets do something deep with pedal-segment_vd.csv.
       I'm assuming you finished pre-processing.
       We're gonna use multi_kernel model.
Beici: Getting model...
Beici: Starting to train...
  n_data % batch_size != 0 but this code does not assume it

  so the residual 194 sample(s) will be ignored.
  n_data % batch_size != 0 but this code does not assume it
  so the residual 204 sample(s) will be ignored.
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50


<keras.callbacks.History at 0x1356d850>

In [None]:
print("Beici: Training is done. Loading the best weights...")
model.load_weights(os.path.join(DIR_SAVE_MODEL,"{}_best_weights.h5".format(exp_name)))

print("       Evaluating...")
n_test_steps = len(y_test) // batch_size
scores = model.evaluate_generator(gen_test, n_test_steps)

# Predict from a FRESH generator: the one used for evaluation may already have
# been advanced past the start of the test set by Keras' background prefetch
# queue, which would pair predictions with the wrong labels downstream.
gen_pred = data_gen(test, y_test, False, batch_size=batch_size)
y_pred = model.predict_generator(gen_pred, n_test_steps)

In [7]:
# Predictions cover whole batches only, so compare against the labels of
# exactly as many leading test rows as were predicted.
n_scored = len(y_pred)
auc = roc_auc_score(y_test[:n_scored], y_pred)

# `scores` comes from evaluate_generator: index 0 is the loss and index 1 the
# accuracy metric the model was compiled with.
test_loss = scores[0]
test_accuracy = scores[1]

print("Beici: Done for {}!".format(model_name))
print("       test set loss: {}".format(test_loss))
print("       test set accuracy: {}".format(test_accuracy))
print("       test set auc: {}".format(auc))

Beici: Done for multi_kernel!
       test set loss: 0.0483642033362
       test set accuracy: 0.988722819034
       test set auc: 0.998992069409


In [None]:
print("Beici: Plot for {}!".format(model_name))

# Per-epoch history written by the CSVLogger callback during training.
log_path = os.path.join(DIR_SAVE_MODEL, "{}.log".format(exp_name))
df_log = pd.read_csv(log_path)
epoch_axis = range(len(df_log))

fig = plt.figure(figsize=(14,7))
fig.suptitle("Accuracy and Loss Plot of Model: {}".format(model_name))

# Left panel: training vs. validation accuracy.
ax1 = plt.subplot(121)
tr_acc, = ax1.plot(epoch_axis, df_log['acc'].values)
val_acc, = ax1.plot(epoch_axis, df_log['val_acc'].values)
ax1.legend([tr_acc, val_acc], ['Train Accuracy', 'Val Accuracy'])
ax1.set_xlabel('Epochs')
ax1.set_ylabel('Accuracy')

# Right panel: training vs. validation loss.
ax2 = plt.subplot(122)
tr_loss, = ax2.plot(epoch_axis, df_log['loss'].values)
val_loss, = ax2.plot(epoch_axis, df_log['val_loss'].values)
ax2.legend([tr_loss, val_loss], ['Train Loss', 'Val Loss'])
ax2.set_xlabel('Epochs')
ax2.set_ylabel('Loss')

plt.show()