In [1]:
# Run-mode switch. The feature-extraction, training and prediction cells below are
# guarded by `if not LOAD:`, so with True they are skipped and the previously saved
# weights / prediction arrays are read from disk instead.
LOAD = True

In [2]:
import numpy as np
np.random.seed(1001)

import os
import shutil

import IPython
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from tqdm import tqdm_notebook
from sklearn.model_selection import StratifiedKFold

import wave
from scipy.io import wavfile

import librosa
import numpy as np
import scipy

from keras import losses, models, optimizers
from keras.activations import relu, softmax
from keras.callbacks import (EarlyStopping, LearningRateScheduler,
                             ModelCheckpoint, TensorBoard, ReduceLROnPlateau)
from keras.layers import (Convolution1D, Dense, Dropout, GlobalAveragePooling1D, 
                          GlobalMaxPool1D, Input, MaxPool1D, concatenate)
from keras.utils import Sequence, to_categorical

from keras.layers import (Convolution2D, GlobalAveragePooling2D, BatchNormalization, Flatten,
                          GlobalMaxPool2D, MaxPool2D, concatenate, Activation)
from keras.utils import Sequence, to_categorical
from keras import backend as K
from keras.models import Sequential

%matplotlib inline
matplotlib.style.use('ggplot')

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
# Annotation tables for the training set and the (post-competition, labelled) test set
train = pd.read_csv("../data/train.csv")
test = pd.read_csv("../data/test_post_competition.csv")

In [4]:
# Preview the first rows of the training annotations
train.head()

Unnamed: 0,fname,label,manually_verified
0,00044347.wav,Hi-hat,0
1,001ca53d.wav,Saxophone,1
2,002d256b.wav,Trumpet,0
3,0033e230.wav,Glockenspiel,1
4,00353774.wav,Cello,1


In [5]:
# Remove unlabeled clips from the test data.
# Older pandas keeps the CSV token 'None' as a string, while pandas >= 2.0 parses it
# as NaN by default, so both forms are filtered out here. Rebinding (instead of
# drop(..., inplace=True)) keeps the cell free of in-place mutation.
test = test.loc[test.label.notna() & (test.label != 'None')]
test.head()

Unnamed: 0,fname,label,usage,freesound_id,license
4,00326aa9.wav,Oboe,Private,355125,Attribution
5,0038a046.wav,Bass_drum,Private,90621,Creative Commons 0
8,007759c4.wav,Saxophone,Private,13406,Creative Commons 0
9,008afd93.wav,Saxophone,Private,358962,Attribution
12,00ae03f6.wav,Chime,Private,78203,Attribution


In [6]:
# Preview the filtered test annotations (same view as the previous cell's output)
test.head()

Unnamed: 0,fname,label,usage,freesound_id,license
4,00326aa9.wav,Oboe,Private,355125,Attribution
5,0038a046.wav,Bass_drum,Private,90621,Creative Commons 0
8,007759c4.wav,Saxophone,Private,13406,Creative Commons 0
9,008afd93.wav,Saxophone,Private,358962,Attribution
12,00ae03f6.wav,Chime,Private,78203,Attribution


In [7]:
# Keep only the classes with at least 300 training rows
# (rows are counted on the `manually_verified` column of the grouped frame).
label_counts = (
    train.groupby(['label'])
         .count()
         .reset_index()
         .sort_values(['manually_verified'], ascending=False)
)
category_group = label_counts.loc[label_counts.manually_verified >= 300, 'label']
print('No. of Classes: ', len(category_group))

# Restrict both splits to the selected classes
train = train.loc[train['label'].isin(category_group)]
test = test.loc[test['label'].isin(category_group)]
print('No. of Training Samples: {}, No. of Test Samples: {}'.format(len(train), len(test)))

No. of Classes:  18
No. of Training Samples: 5400, No. of Test Samples: 823


#### Hyper-parameters

In [8]:
# --- audio ---
SAMPLING_RATE = 44100                      # Hz; clips are loaded at this rate
DURATION = 2                               # seconds of audio kept per clip
AUDIO_LENGTH = SAMPLING_RATE * DURATION    # samples per clip after crop / pad

# --- features ---
N_MFCC = 40                                # MFCC coefficients per frame
# (coefficients, frames, channels). 512 is presumably librosa's default hop
# length used by the MFCC call -- TODO confirm against the installed version.
DIM = (N_MFCC, 1 + AUDIO_LENGTH // 512, 1)

# --- model / training ---
CLASSES = 18                               # width of the softmax output layer
LEARNING_RATE = 0.001
MAX_EPOCHS = 50
# NOTE(review): the fit / predict / evaluate calls visible in this notebook pass
# batch_size=64 literally rather than reading this constant.
BATCH_SIZE = 32

#### MFCC Conv2D Model

In [9]:
def build_model():
    """Build and compile the 2-D convolutional classifier for MFCC inputs.

    Architecture: four identical stages of
    Conv2D(32, kernel 4x10, 'same') -> BatchNorm -> ReLU -> MaxPool2D,
    followed by Flatten -> Dense(64) -> BatchNorm -> ReLU -> Dense(CLASSES, softmax).

    The input shape is (DIM[0], DIM[1], 1). The model is compiled with Adam at
    LEARNING_RATE, categorical cross-entropy loss and the 'acc' metric.

    Returns:
        The compiled Keras model.
    """
    inp = Input(shape=(DIM[0], DIM[1], 1))

    # Convolutional feature extractor: the same stage repeated four times.
    x = inp
    for _ in range(4):
        x = Convolution2D(32, (4, 10), padding="same")(x)
        x = BatchNormalization()(x)
        x = Activation("relu")(x)
        x = MaxPool2D()(x)

    # Classification head
    x = Flatten()(x)
    x = Dense(64)(x)
    x = BatchNormalization()(x)
    x = Activation("relu")(x)
    out = Dense(CLASSES, activation=softmax)(x)

    model = models.Model(inputs=inp, outputs=out)
    model.compile(
        optimizer=optimizers.Adam(LEARNING_RATE),
        loss=losses.categorical_crossentropy,
        metrics=['acc'],
    )
    return model

In [10]:
# Print the layer-by-layer summary (output shapes, parameter counts) of a freshly built model
build_model().summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 40, 173, 1)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 40, 173, 32)       1312      
_________________________________________________________________
batch_normalization_1 (Batch (None, 40, 173, 32)       128       
_________________________________________________________________
activation_1 (Activation)    (None, 40, 173, 32)       0         
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 20, 86, 32)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 20, 86, 32)        40992     
_________________________________________________________________
batch_normalization_2 (Batch (None, 20, 86, 32)        128       
__________

In [11]:
# Class names in order of first appearance in `train`, and the name -> integer id mapping
LABELS = list(train.label.unique())
label_idx = {label: i for i, label in enumerate(LABELS)}

# Index both frames by file name. Guarded so the cell can be re-run: once `fname`
# is the index it is no longer a column, and an unconditional set_index("fname")
# would raise KeyError.
if train.index.name != "fname":
    train = train.set_index("fname")
if test.index.name != "fname":
    test = test.set_index("fname")

# Attach the integer class id. `assign` returns a new frame rather than writing a
# column into a frame that was produced by boolean filtering. A label missing from
# `label_idx` still raises KeyError, as before.
train = train.assign(label_idx=train.label.apply(lambda x: label_idx[x]))
test = test.assign(label_idx=test.label.apply(lambda x: label_idx[x]))

In [12]:
def prepare_data(df, data_dir):
    """Load every clip listed in ``df.index`` and turn it into an MFCC feature map.

    Each file is loaded at SAMPLING_RATE, brought to exactly AUDIO_LENGTH samples
    (random crop when longer, zero padding at a random offset when shorter) and
    converted to N_MFCC MFCC coefficients.

    Args:
        df: DataFrame whose index holds the audio file names.
        data_dir: Directory that contains those audio files.

    Returns:
        Array of shape (len(df), DIM[0], DIM[1], 1).
    """
    X = np.empty(shape=(df.shape[0], DIM[0], DIM[1], 1))
    input_length = AUDIO_LENGTH
    # Wrap the index itself (not the enumerate object) so tqdm can read its length
    # and display a total / ETA.
    for i, fname in enumerate(tqdm_notebook(df.index)):
        # os.path.join also works when data_dir has no trailing separator
        file_path = os.path.join(data_dir, fname)
        data, _ = librosa.core.load(file_path, sr=SAMPLING_RATE, res_type="kaiser_fast")

        # Random offset / Padding
        if len(data) > input_length:
            max_offset = len(data) - input_length
            offset = np.random.randint(max_offset)
            data = data[offset:offset + input_length]
        else:
            pad_total = input_length - len(data)
            # randint(0) is invalid, so an exact-length clip gets no random draw
            offset = np.random.randint(pad_total) if pad_total > 0 else 0
            data = np.pad(data, (offset, pad_total - offset), "constant")

        # Pass the signal by keyword: recent librosa releases accept `y` as
        # keyword-only, and the keyword form also works on older releases.
        data = librosa.feature.mfcc(y=data, sr=SAMPLING_RATE, n_mfcc=N_MFCC)
        X[i,] = np.expand_dims(data, axis=-1)
    return X

In [13]:
# Feature extraction is skipped when LOAD is set; in that case X_train / X_test
# are never defined and only the cells that read saved artifacts can run.
if not LOAD:
    X_train = prepare_data(train, '../data/audio_train/')
    X_test = prepare_data(test, '../data/audio_test/')

In [14]:
# One-hot encode the integer class ids (CLASSES columns per row)
y_train = to_categorical(train.label_idx, num_classes=CLASSES)
y_test = to_categorical(test.label_idx, num_classes=CLASSES)

In [15]:
# Normalization
if not LOAD:
    # Statistics are taken over the sample axis of the training features only,
    # then the same shift / scale is applied to the test features.
    mean = X_train.mean(axis=0)
    std = X_train.std(axis=0)

    X_train = (X_train - mean) / std
    X_test = (X_test - mean) / std

### Training

In [16]:
# Folder the prediction arrays are saved to / loaded from; the same name is also
# used as the sub-folder of `logs/` for this run.
PREDICTION_FOLDER = "predictions"
if not LOAD:
    # Create the prediction folder if it is not there yet
    if not os.path.exists(PREDICTION_FOLDER):
        os.mkdir(PREDICTION_FOLDER)
    # Delete this run's previous log folder so old logs are not mixed with new ones
    if os.path.exists('logs/' + PREDICTION_FOLDER):
        shutil.rmtree('logs/' + PREDICTION_FOLDER)

In [17]:
# Train the model from scratch (skipped when LOAD is set).
if not LOAD:
    # Reset the Keras backend session before building a new model
    K.clear_session()

    # Keep only the weights with the lowest validation loss seen so far
    checkpoint = ModelCheckpoint('best_weights.h5', monitor='val_loss', verbose=1, save_best_only=True)
    # Stop once validation loss has not improved for 5 epochs
    early = EarlyStopping(monitor="val_loss", mode="min", patience=5)
    tb = TensorBoard(log_dir='./logs/' + PREDICTION_FOLDER, write_graph=True)
    callbacks_list = [checkpoint, early, tb]

    model = build_model()

    # NOTE(review): the test split is passed as validation data, so both early
    # stopping and the "best" checkpoint are selected on the same data that is
    # later reported as test performance -- those numbers are likely optimistic.
    # NOTE(review): batch_size is the literal 64 here, not the BATCH_SIZE constant.
    history = model.fit(X_train, y_train, validation_data=(X_test, y_test), callbacks=callbacks_list, 
                        batch_size=64, epochs=MAX_EPOCHS)

    # Weights after the last epoch (may differ from the best checkpoint)
    model.save_weights('weights_final.h5')

In [18]:
# load best weights
# Rebuilds the architecture and restores the file written by ModelCheckpoint.
# This runs regardless of LOAD, so best_weights.h5 must already exist on disk.
model = build_model()
model.load_weights('best_weights.h5')

In [19]:
if not LOAD:
    # Save train predictions
    predictions = model.predict(X_train, batch_size=64, verbose=1)
    np.save(PREDICTION_FOLDER + '/train_predictions.npy', predictions)
    # evaluate() returns [loss, acc] because the model is compiled with metrics=['acc']
    train_eval = model.evaluate(x=X_train, y=y_train, batch_size=64, verbose=1)
    print('Training Accuracy: ', train_eval[1])

# Always read the predictions back from disk so the report below works in both modes
predictions = np.load(PREDICTION_FOLDER + '/train_predictions.npy')
from sklearn.metrics import classification_report
print('Train Classification Report:')
# Class k in the report corresponds to LABELS[k]
print(classification_report(np.argmax(y_train, axis=1), np.argmax(predictions, axis=1)))

Train Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.97      0.93       300
           1       0.94      0.97      0.95       300
           2       0.99      0.91      0.95       300
           3       0.93      0.94      0.94       300
           4       0.99      0.98      0.98       300
           5       0.90      0.93      0.92       300
           6       0.98      0.86      0.91       300
           7       0.84      0.92      0.88       300
           8       0.96      0.87      0.91       300
           9       0.96      0.92      0.94       300
          10       0.93      0.96      0.95       300
          11       0.78      0.92      0.85       300
          12       0.94      0.97      0.96       300
          13       0.86      0.92      0.89       300
          14       0.95      0.71      0.81       300
          15       0.91      0.99      0.95       300
          16       0.96      0.92      0.94       30

In [20]:
if not LOAD:
    # Save test predictions
    predictions = model.predict(X_test, batch_size=64, verbose=1)
    np.save(PREDICTION_FOLDER + '/test_predictions.npy', predictions)
    # evaluate() returns [loss, acc] because the model is compiled with metrics=['acc']
    test_eval = model.evaluate(x=X_test, y=y_test, batch_size=64, verbose=1)
    print('Test Accuracy: ', test_eval[1])

# Always read the predictions back from disk; this overwrites the train
# predictions held in `predictions` by the previous cell.
predictions = np.load(PREDICTION_FOLDER + '/test_predictions.npy')

from sklearn.metrics import classification_report
print('Test Classification Report:')
# Class k in the report corresponds to LABELS[k]
print(classification_report(np.argmax(y_test, axis=1), np.argmax(predictions, axis=1)))

Test Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.82      0.74        39
           1       0.80      0.77      0.79       110
           2       0.79      0.73      0.76        37
           3       0.71      0.83      0.77        54
           4       0.76      0.84      0.80        56
           5       0.61      0.79      0.69        34
           6       0.96      0.71      0.82        38
           7       0.56      0.81      0.67        27
           8       0.75      0.60      0.67        30
           9       0.77      0.73      0.75        55
          10       0.92      0.79      0.85        28
          11       0.20      0.21      0.20        29
          12       0.73      0.88      0.80        40
          13       0.67      0.62      0.64        29
          14       1.00      0.25      0.40        32
          15       0.84      1.00      0.91        32
          16       0.61      0.51      0.55        45

In [21]:
# Calculate top3 accuracy
# Reduce `test` to its label columns with a plain integer index. Discarding the
# `fname` index directly and ignoring already-missing columns yields the same frame
# as reset_index() followed by drop(), but the cell no longer fails when re-run.
test = (
    test.reset_index(drop=True)
        .drop(['fname', 'usage', 'freesound_id', 'license'], axis=1, errors='ignore')
)

# Ids of the three highest-scoring classes per clip, best first
top3_idx = np.argsort(-predictions, axis=1)[:, :3]
top_3 = np.array(LABELS)[top3_idx]
predicted_labels = [' '.join(list(x)) for x in top_3]
test['predicted_label'] = predicted_labels

# Score on class ids rather than by splitting the joined string on whitespace,
# so a class name that contains a space cannot be mis-scored.
hits = (top3_idx == test['label_idx'].values[:, None]).any(axis=1)
top3_acc = float(hits.mean())
print('Test Top3 Accuracy: ', top3_acc)

Test Top3 Accuracy:  0.9234507897934386
