In [11]:
from __future__ import print_function
from __future__ import division
import os, sys
import numpy as np
import pandas as pd
from builtins import range
from sklearn.metrics import roc_auc_score
import librosa, librosa.display
import matplotlib.pyplot as plt
% matplotlib inline

import tensorflow as tf
import keras
from keras.models import Sequential, Model
from keras.layers import Activation, Dense, Flatten, Input, Reshape, Dropout, Permute
from keras.layers.convolutional import Conv2D
from keras.layers.normalization import BatchNormalization
from keras.layers.recurrent import GRU
from keras.layers.pooling import MaxPooling2D, GlobalAveragePooling2D
from keras.layers.merge import Concatenate
from keras import backend as K
from keras.backend.tensorflow_backend import set_session

# Expose a single GPU to this process and cap TensorFlow's share of its memory.
os.environ["CUDA_VISIBLE_DEVICES"]="0" # index of the GPU made visible to this process
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.5 # fraction of GPU memory this process may claim
# Hand the configured session to Keras via set_session.
set_session(tf.Session(config=config))

from kapre.time_frequency import Melspectrogram
from global_config import *

import multiprocessing
from multiprocessing import Pool
# Worker processes for MFCC extraction: at most 50, and never more than the
# cores actually present, so smaller machines are not oversubscribed.
N_JOBS = min(50, multiprocessing.cpu_count())

In [13]:
def gen_filepaths(df, dataroot=None):
    """Yield the path of every file listed in the dataframe's 'filepath' column.

    Args:
        df: DataFrame with a 'filepath' column.
        dataroot: Optional directory joined in front of each entry. With the
            default ``None`` the entries are yielded unchanged (previously the
            default raised a TypeError inside ``os.path.join``).

    Yields:
        One path per row, in row order.
    """
    for filepath in df.filepath.values:
        if dataroot is None:
            yield filepath
        else:
            yield os.path.join(dataroot, filepath)
        
def get_mfcc(filename, dataroot):
    """Extract and cache MFCC statistics for the train and valid splits of one dataset.

    Reads ``pedal-{filename}_npydf_small.csv`` from ``DIR_PEDAL_METADATA``,
    splits its rows on the 'category' column and, for each split, maps
    ``_path_to_mfccs`` over the listed files in a pool of ``N_JOBS`` worker
    processes. The stacked result is saved as
    ``small-{filename}_{train|valid}_mfcc.npy`` in ``DIR_SAVE_MODEL``.

    Args:
        filename: Dataset tag used in the CSV and output file names
            (the notebook uses 'onset' and 'segment').
        dataroot: Directory that the CSV's 'filepath' entries are joined onto.
    """
    csv_filename = 'pedal-{}_npydf_small.csv'.format(filename)
    df = pd.read_csv(os.path.join(DIR_PEDAL_METADATA, csv_filename))
    training = df.loc[df['category'] == 'train']
    validation = df.loc[df['category'] == 'valid']
    print('pedal-{}: Dataframe with size {} for training and {} for validation.'.format(filename,len(training),len(validation)))

    for (task_data, task_name) in zip([training, validation], ['train', 'valid']):
        print('Getting MFCC features to {}...'.format(task_name))
        npy_filename = 'small-{}_{}_mfcc.npy'.format(filename, task_name)
        paths = list(gen_filepaths(task_data, dataroot=dataroot))

        pool = Pool(N_JOBS)
        try:
            feats = pool.map(_path_to_mfccs, paths)
        finally:
            # Release the worker processes even when a file fails to load;
            # otherwise a failed run leaves N_JOBS idle processes behind.
            pool.close()
            pool.join()

        np.save(os.path.join(DIR_SAVE_MODEL, npy_filename), np.array(feats))
        print('  done!')
    
def _path_to_mfccs(path):
    """Turn one saved audio excerpt into a fixed-length MFCC summary vector.

    Loads the array stored at ``path``, computes 20 MFCCs at sample rate
    ``SR``, and takes first- and second-order differences between consecutive
    frames. The result concatenates, in this order, the per-coefficient mean
    and standard deviation of the MFCCs, of the first differences and of the
    second differences.

    Args:
        path: Path to a ``.npy`` file readable by ``np.load``
            (presumably a mono waveform -- confirm against the data export).

    Returns:
        1-D array of the six statistics blocks joined along axis 0.
    """
    src = np.load(path)
    # Keyword arguments: newer librosa releases reject positional y/sr here,
    # while older releases accept the keywords as well.
    mfcc = librosa.feature.mfcc(y=src, sr=SR, n_mfcc=20)
    dmfcc = mfcc[:, 1:] - mfcc[:, :-1]      # frame-to-frame delta
    ddmfcc = dmfcc[:, 1:] - dmfcc[:, :-1]   # delta of the delta
    return np.concatenate((np.mean(mfcc, axis=1), np.std(mfcc, axis=1),
                           np.mean(dmfcc, axis=1), np.std(dmfcc, axis=1),
                           np.mean(ddmfcc, axis=1), np.std(ddmfcc, axis=1)), axis=0)

def load_xy(filename, task_name):
    """Load the cached MFCC features and the matching labels for one split.

    Args:
        filename: Dataset tag used in the file names
            (the notebook uses 'onset' and 'segment').
        task_name: Value of the CSV's 'category' column to select
            (the notebook uses 'train' and 'valid').

    Returns:
        Tuple ``(x, y)``: the array stored in
        ``small-{filename}_{task_name}_mfcc.npy`` under ``DIR_SAVE_MODEL``, and
        the 'label' values of the rows of ``pedal-{filename}_npydf_small.csv``
        whose category equals ``task_name``.
    """
    feature_path = os.path.join(
        DIR_SAVE_MODEL, 'small-{}_{}_mfcc.npy'.format(filename, task_name))
    metadata_path = os.path.join(
        DIR_PEDAL_METADATA, 'pedal-{}_npydf_small.csv'.format(filename))

    features = np.load(feature_path)
    metadata = pd.read_csv(metadata_path)
    split_rows = metadata.loc[metadata['category'] == task_name]
    return features, split_rows.label.values

### Get MFCC features from the small datasets

Features are saved in `./save-model/small-{onset or segment}_{train or valid}_mfcc.npy`

In [14]:
# Feature extraction is disabled in this cell: the later cells read the saved
# small-*_mfcc.npy files through load_xy. Uncomment the loop below to
# (re)generate those files with get_mfcc.
# filenames = ['onset', 'segment']
# dataroots = [DIR_PEDAL_ONSET_NPY, DIR_PEDAL_SEGMENT_NPY]
# for idx, (filename, dr) in enumerate(zip(filenames, dataroots)):
#     get_mfcc(filename, dataroot=dr)

# Presumably the console output of an earlier run of the loop above, kept for reference:
# '''
# pedal-onset: Dataframe with size 70000 for training and 20000 for validation.
# Getting MFCC features to train...
#   done!
# Getting MFCC features to valid...
#   done!
# pedal-segment: Dataframe with size 70000 for training and 20000 for validation.
# Getting MFCC features to train...
#   done!
# Getting MFCC features to valid...
#   done!
# '''

### Run grid-search cross-validation to find the best SVM parameters for binary classification

Run `python mfcc_gridsearch.py` to save the best parameters in `./save-model/small-{onset or segment}_mfcc_svc_best_params.npy`

In [None]:
# tiny version of mfcc_gridsearch.py using a small portion of data

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import log_loss, roc_auc_score

class OptionalStandardScaler(StandardScaler):
    """StandardScaler that can be switched on or off through a single ``on`` flag.

    With ``on`` truthy it centers and scales (``with_mean=True``,
    ``with_std=True``); otherwise both are disabled, so whether to standardize
    can be searched as one more hyper-parameter.

    Args:
        on: Whether standardization is applied.
    """

    def __init__(self, on=False):
        self.on = on  # bool
        enabled = bool(on)
        super(OptionalStandardScaler, self).__init__(with_mean=enabled, with_std=enabled)

    def set_params(self, **params):
        """Set parameters and keep ``with_mean``/``with_std`` in step with ``on``.

        Grid search changes ``on`` through ``set_params``, which only assigns
        the attribute and does not re-run ``__init__``. Without this override
        the scaler kept the on/off state it was constructed with, so every
        ``stdd__on`` grid point behaved the same.
        """
        super(OptionalStandardScaler, self).set_params(**params)
        self.with_mean = self.with_std = bool(self.on)
        return self

n_cpu = multiprocessing.cpu_count()
# Use about 80% of the cores, but at least one: int(1 * 0.8) is 0, and
# n_jobs=0 is not a valid value for the grid search below.
n_jobs = max(1, int(n_cpu * 0.8))
print('There are {} cpu available, {} (80%) of them will be used for our jobs.'.format(n_cpu, n_jobs))

# Larger grid, disabled for this tiny run:
# gps = [{"C": [0.1, 2.0, 8.0, 32.0], "kernel": ['rbf'],
#         "gamma": [0.5 ** i for i in [3, 5, 7, 9, 11, 13]] + ['auto']},
#        {"C": [0.1, 2.0, 8.0, 32.0], "kernel": ['linear']}
#       ]

# Reduced grid: one sub-grid per kernel; gamma is only searched for the RBF kernel.
gps = [
    {"C": [0.1, 2.0], "kernel": ['rbf'], "gamma": [0.5 ** 3, 0.5 ** 5]},
    {"C": [0.1, 2.0], "kernel": ['linear']},
]
classifier = SVC
dataroots = [DIR_PEDAL_ONSET_NPY, DIR_PEDAL_SEGMENT_NPY]
filenames = ['onset', 'segment']

# Number of rows taken from each split for this tiny demonstration run.
N_TINY_TRAIN = 30
N_TINY_VALID = 10

for filename in filenames:

    x_train, y_train = load_xy(filename, task_name='train')
    x_valid, y_valid = load_xy(filename, task_name='valid')
    # Full-data variant:
    # x = np.concatenate((x_train, x_valid), axis=0)
    # y = np.concatenate((y_train, y_valid), axis=0)
    # cv = [(np.arange(len(x_train)), np.arange(len(x_train),len(x)))]
    x_train_tiny, y_train_tiny = x_train[:N_TINY_TRAIN], y_train[:N_TINY_TRAIN]
    x_valid_tiny, y_valid_tiny = x_valid[:N_TINY_VALID], y_valid[:N_TINY_VALID]
    x = np.concatenate((x_train_tiny, x_valid_tiny), axis=0)
    y = np.concatenate((y_train_tiny, y_valid_tiny), axis=0)
    # One predefined split: fit on the train rows, score on the valid rows.
    # The boundary comes from the actual slice length so the indices stay
    # correct even when a split holds fewer rows than requested.
    n_fit = len(x_train_tiny)
    cv = [(np.arange(n_fit), np.arange(n_fit, len(x)))]
    clname = classifier.__name__
    estimators = [('stdd', OptionalStandardScaler()), ('clf', classifier())]
    pipe = Pipeline(estimators)

    # Prefix every SVC parameter with the pipeline step name and add the
    # scaler switch to each sub-grid.
    params = []
    for dct in gps:
        sub_params = {'stdd__on': [True, False]}
        # .items() exists on Python 2 and 3; .iteritems() is Python 2 only.
        sub_params.update({'clf__' + key: value for (key, value) in dct.items()})
        params.append(sub_params)

    clf = GridSearchCV(pipe, params, cv=cv, n_jobs=n_jobs, pre_dispatch='8*n_jobs').fit(x, y)
    save_npy_path = os.path.join(DIR_SAVE_MODEL,'testmfcc-{}.npy'.format(filename))
    # Stored as a one-element object array holding the best-parameter dict.
    np.save(save_npy_path, [clf.best_params_])
    print('best score of pedal-{} {}: {}'.format(filename, clname, clf.best_score_))
    print(clf.best_params_)

### Train an SVM with the best parameters and score it on the small validation dataset

In [15]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, ParameterGrid
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import log_loss, roc_auc_score

filenames = ['onset', 'segment']
dataroots = [DIR_PEDAL_ONSET_NPY, DIR_PEDAL_SEGMENT_NPY]

for filename in filenames:
    print('===== Pedal-{} SVC Best Parameters ====='.format(filename))
    x_train, y_train = load_xy(filename, task_name='train')
    x_valid, y_valid = load_xy(filename, task_name='valid')

    save_npy_path = os.path.join(DIR_SAVE_MODEL,'small-{}_mfcc_svc_best_params.npy'.format(filename))
    # The file holds a pickled dict inside an object array, which current
    # numpy refuses to load unless allow_pickle=True. Unpickling can execute
    # arbitrary code, so only load parameter files produced by this project.
    parameter = np.load(save_npy_path, allow_pickle=True)[0]

    # gamma is only part of the search space for non-linear kernels.
    # NOTE(review): a 'stdd__on' entry in the saved parameters is not applied
    # here -- confirm whether the evaluation should standardize the features.
    svc_kwargs = {'kernel': parameter['clf__kernel'], 'C': parameter['clf__C']}
    if parameter['clf__kernel'] != 'linear':
        svc_kwargs['gamma'] = parameter['clf__gamma']
    clf = SVC(**svc_kwargs).fit(x_train, y_train)

    y_pred = clf.predict(x_valid)
    # NOTE(review): this log loss is computed from hard class predictions, not
    # probabilities; kept as in the original so reported numbers stay comparable.
    loss = log_loss(y_valid, y_pred)
    acc = clf.score(x_valid, y_valid)
    # ROC-AUC ranks samples by a continuous score; hard labels would collapse
    # the curve to a single operating point.
    auc = roc_auc_score(y_valid, clf.decision_function(x_valid))

    print('{}'.format(parameter))
    print("      valid set loss: {}".format(loss))
    print("  valid set accuracy: {}".format(acc))
    print("       valid set auc: {}".format(auc))