In [1]:
# Toggle for the expensive steps: when True, audio loading, normalization and training
# are skipped, and the saved weights and prediction files are read from disk instead.
LOAD = True

In [1]:
import numpy as np
np.random.seed(1001)

import os
import shutil

import IPython
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from tqdm import tqdm_notebook
from sklearn.model_selection import StratifiedKFold

import wave
from scipy.io import wavfile

import librosa
import numpy as np
import scipy

from keras import losses, models, optimizers
from keras.activations import relu, softmax
from keras.callbacks import (EarlyStopping, LearningRateScheduler,
                             ModelCheckpoint, TensorBoard, ReduceLROnPlateau)
from keras.layers import (Convolution1D, Dense, Dropout, GlobalAveragePooling1D, 
                          GlobalMaxPool1D, Input, MaxPool1D, concatenate)
from keras.utils import Sequence, to_categorical

from keras.layers import (Convolution2D, GlobalAveragePooling2D, BatchNormalization, Flatten,
                          GlobalMaxPool2D, MaxPool2D, concatenate, Activation)
from keras.utils import Sequence, to_categorical
from keras import backend as K
from keras.models import Sequential

%matplotlib inline
matplotlib.style.use('ggplot')

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Metadata tables, one row per audio clip: the training set and the labelled
# post-competition test set.
train_csv = "../data/train.csv"
test_csv = "../data/test_post_competition.csv"
train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

In [3]:
# Preview the first rows of the training metadata.
train.head()

Unnamed: 0,fname,label,manually_verified
0,00044347.wav,Hi-hat,0
1,001ca53d.wav,Saxophone,1
2,002d256b.wav,Trumpet,0
3,0033e230.wav,Glockenspiel,1
4,00353774.wav,Cello,1


In [4]:
# Remove unlabeled clips from the test data. The CSV marks them with the literal
# 'None'; depending on the pandas version, read_csv may parse that string as a
# missing value (NaN) instead, so both representations are dropped here. Leaving
# such rows in would break the label -> index lookup performed later.
unlabeled = test.label.isna() | (test.label == 'None')
test.drop(test[unlabeled].index, inplace=True)
test.head()

Unnamed: 0,fname,label,usage,freesound_id,license
4,00326aa9.wav,Oboe,Private,355125,Attribution
5,0038a046.wav,Bass_drum,Private,90621,Creative Commons 0
8,007759c4.wav,Saxophone,Private,13406,Creative Commons 0
9,008afd93.wav,Saxophone,Private,358962,Attribution
12,00ae03f6.wav,Chime,Private,78203,Attribution


In [5]:
# Report how many distinct labels the training set has and how large each split is.
class_count = train.label.unique().size
print('No. of Classes: ', class_count)

sample_counts = (len(train), len(test))
print('No. of Training Samples: {}, No. of Test Samples: {}'.format(*sample_counts))

No. of Classes:  41
No. of Training Samples: 9473, No. of Test Samples: 1600


#### Hyper-parameters

In [6]:
SAMPLING_RATE = 16_000  # sample rate (samples per second) requested from librosa when loading audio
DURATION = 2  # length of the window fed to the model, in seconds
CLASSES = 41  # number of label classes: width of the softmax output and of the one-hot targets
FOLDS = 10  # NOTE(review): not referenced in the visible notebook cells
LEARNING_RATE = .001  # learning rate passed to the Adam optimizer
MAX_EPOCHS = 50  # upper bound on training epochs; early stopping may end training sooner
BATCH_SIZE = 64  # mini-batch size
AUDIO_LENGTH = SAMPLING_RATE * DURATION  # samples per clip after cropping/padding
DIM = (AUDIO_LENGTH,1)  # per-example input shape: (samples, channels)

#### Conv1D Model

In [7]:
def get_1d_conv_model():
    """Build and compile the 1-D convolutional classifier for raw audio.

    The network takes a waveform of shape (AUDIO_LENGTH, 1) and outputs a softmax
    over CLASSES labels. It is compiled with Adam (LEARNING_RATE), categorical
    cross-entropy loss and accuracy ('acc') as the tracked metric.

    Returns:
        The compiled Keras model.
    """
    waveform = Input(shape=(AUDIO_LENGTH, 1))

    # Three pooled stages, each made of two identical convolutions, max-pooling
    # and light dropout. Every tuple is (filters, kernel size, pool size).
    pooled_stages = ((16, 9, 16), (32, 3, 4), (32, 3, 4))
    features = waveform
    for n_filters, kernel_size, pool_size in pooled_stages:
        for _ in range(2):
            features = Convolution1D(n_filters, kernel_size, activation=relu)(features)
        features = MaxPool1D(pool_size)(features)
        features = Dropout(rate=0.1)(features)

    # Last convolutional stage: the time axis is collapsed with a global max.
    for _ in range(2):
        features = Convolution1D(256, 3, activation=relu)(features)
    features = GlobalMaxPool1D()(features)
    features = Dropout(rate=0.2)(features)

    # Dense head. The widths (64, then 1028) are kept exactly as they are so that
    # weights saved from this architecture can still be loaded.
    for units in (64, 1028):
        features = Dense(units, activation=relu)(features)
    class_probs = Dense(CLASSES, activation=softmax)(features)

    model = models.Model(inputs=waveform, outputs=class_probs)
    model.compile(optimizer=optimizers.Adam(LEARNING_RATE),
                  loss=losses.categorical_crossentropy,
                  metrics=['acc'])
    return model

In [9]:
# Optional: uncomment to render the model architecture to model_plot.png.
# from keras.utils.vis_utils import plot_model
# model = get_1d_conv_model()
# plot_model(model, to_file='model_plot.png')

In [9]:
# Map every label name to an integer id, in order of first appearance in the
# training set.
LABELS = train.label.unique().tolist()
label_idx = dict(zip(LABELS, range(len(LABELS))))

# Index both tables by file name and attach the integer class id of each clip.
# A test label that never occurs in the training set raises KeyError.
for frame in (train, test):
    frame.set_index("fname", inplace=True)
    frame["label_idx"] = frame.label.apply(lambda name: label_idx[name])

In [10]:
def prepare_data(df, data_dir):
    """Load every clip listed in ``df`` as a fixed-length waveform array.

    Each file is read with librosa at SAMPLING_RATE. Clips longer than
    AUDIO_LENGTH samples are cropped at a random offset; shorter clips are
    zero-padded, with the signal placed at a random offset inside the window.
    Offsets are drawn from the global NumPy random state.

    Args:
        df: DataFrame whose index holds the audio file names.
        data_dir: Directory containing the audio files, with or without a
            trailing path separator.

    Returns:
        Array of shape (len(df), AUDIO_LENGTH, 1) with one waveform per row.
    """
    input_length = AUDIO_LENGTH
    X = np.empty(shape=(df.shape[0], DIM[0], 1))

    # Wrap the index itself (not the enumerate object) so the progress bar can
    # read its length and show a total.
    for i, fname in enumerate(tqdm_notebook(df.index)):
        # os.path.join works whether or not data_dir ends with a separator.
        file_path = os.path.join(data_dir, fname)
        data, _ = librosa.core.load(file_path, sr=SAMPLING_RATE, res_type="kaiser_fast")

        # Random offset / Padding
        if len(data) > input_length:
            offset = np.random.randint(len(data) - input_length)
            data = data[offset:(input_length + offset)]
        elif len(data) < input_length:
            pad_total = input_length - len(data)
            offset = np.random.randint(pad_total)
            data = np.pad(data, (offset, pad_total - offset), "constant")
        # A clip of exactly input_length samples is used unchanged and, as
        # before, consumes no random number.

        X[i,] = data[:, np.newaxis]
    return X

In [11]:
# Read and crop/pad the raw audio only when saved results are not being reused
# (LOAD is False); with LOAD set, X_train and X_test are never created.
if not LOAD:
    X_train = prepare_data(train, '../data/audio_train/')
    X_test = prepare_data(test, '../data/audio_test/')

In [12]:
# One-hot encode the integer class ids to match the model's softmax output and
# categorical cross-entropy loss.
y_train = to_categorical(train.label_idx, num_classes=CLASSES)
y_test = to_categorical(test.label_idx, num_classes=CLASSES)

In [13]:
# Normalization: standardize every sample position with the mean and standard
# deviation computed across the training clips (axis 0); the test set reuses the
# training statistics.
# NOTE(review): a position whose training std is exactly 0 would yield inf/NaN here.
if not LOAD:
    mean = np.mean(X_train, axis=0)
    std = np.std(X_train, axis=0)

    X_train = (X_train - mean)/std
    X_test = (X_test - mean)/std

### Training

In [14]:
# Folder that receives the saved prediction arrays; the same name is reused as
# the sub-folder of logs/ for this run.
PREDICTION_FOLDER = "predictions"
if not LOAD:
    # Create the predictions folder on first use.
    if not os.path.exists(PREDICTION_FOLDER):
        os.mkdir(PREDICTION_FOLDER)

    # Remove the log directory of a previous run so old and new logs are not mixed.
    if os.path.exists('logs/' + PREDICTION_FOLDER):
        shutil.rmtree('logs/' + PREDICTION_FOLDER)

In [15]:
if not LOAD:
    # Keep the weights of the epoch with the lowest validation loss, stop once the
    # validation loss has not improved for 5 epochs, and log the run for TensorBoard.
    checkpoint = ModelCheckpoint('best_weights.h5', monitor='val_loss', verbose=1, save_best_only=True)
    early = EarlyStopping(monitor="val_loss", mode="min", patience=5)
    tb = TensorBoard(log_dir='./logs/' + PREDICTION_FOLDER, write_graph=True)
    callbacks_list = [checkpoint, early, tb]

    model = get_1d_conv_model()

    # NOTE(review): the test set doubles as the validation set, so checkpoint
    # selection and early stopping are tuned on it and the test metrics reported
    # later in the notebook are optimistic. Consider holding out part of the
    # training data for validation instead.
    history = model.fit(X_train, y_train, validation_data=(X_test, y_test), callbacks=callbacks_list,
                        batch_size=BATCH_SIZE, epochs=MAX_EPOCHS)

    # Weights as of the last epoch that ran (not necessarily the best epoch).
    model.save_weights('weights_final.h5')

In [16]:
# load best weights
# Rebuild the architecture and restore the checkpoint file written by
# ModelCheckpoint (lowest validation loss); this cell runs regardless of LOAD.
model = get_1d_conv_model()
model.load_weights('best_weights.h5')

In [17]:
from sklearn.metrics import classification_report

train_predictions_path = PREDICTION_FOLDER + '/train_predictions.npy'

if not LOAD:
    # Save train predictions
    predictions = model.predict(X_train, batch_size=BATCH_SIZE, verbose=1)
    np.save(train_predictions_path, predictions)
    train_eval = model.evaluate(x=X_train, y=y_train, batch_size=BATCH_SIZE, verbose=1)
    # The model is compiled with one metric ('acc'), so index 1 is the accuracy
    # that follows the loss in evaluate()'s result.
    print('Training Accuracy: ', train_eval[1])

# Always score the predictions stored on disk, so the report is also available
# when LOAD is True and nothing was recomputed above.
predictions = np.load(train_predictions_path)
print('Train Classification Report:')
print(classification_report(np.argmax(y_train, axis=1), np.argmax(predictions, axis=1)))

Train Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.85      0.84       300
           1       0.82      0.64      0.72       300
           2       0.90      0.71      0.80       300
           3       0.58      0.70      0.63        94
           4       0.54      0.63      0.58       300
           5       0.79      0.77      0.78       279
           6       0.73      0.67      0.70       147
           7       0.59      0.69      0.64       300
           8       0.65      0.87      0.74       119
           9       0.67      0.85      0.75       139
          10       0.82      0.87      0.84       300
          11       0.80      0.69      0.74       270
          12       0.85      0.70      0.77       300
          13       0.70      0.82      0.76       300
          14       0.93      0.85      0.89       300
          15       0.74      0.93      0.82       299
          16       0.61      0.59      0.60       30

In [18]:
from sklearn.metrics import classification_report

test_predictions_path = PREDICTION_FOLDER + '/test_predictions.npy'

if not LOAD:
    # Save test predictions
    predictions = model.predict(X_test, batch_size=BATCH_SIZE, verbose=1)
    np.save(test_predictions_path, predictions)
    test_eval = model.evaluate(x=X_test, y=y_test, batch_size=BATCH_SIZE, verbose=1)
    # The model is compiled with one metric ('acc'), so index 1 is the accuracy
    # that follows the loss in evaluate()'s result.
    print('Test Accuracy: ', test_eval[1])

# Always score the predictions stored on disk, so the report is also available
# when LOAD is True and nothing was recomputed above.
predictions = np.load(test_predictions_path)

print('Test Classification Report:')
print(classification_report(np.argmax(y_test, axis=1), np.argmax(predictions, axis=1)))

Test Classification Report:
              precision    recall  f1-score   support

           0       0.64      0.74      0.69        39
           1       0.87      0.65      0.75       110
           2       0.75      0.73      0.74        37
           3       0.54      0.48      0.51        29
           4       0.48      0.52      0.50        54
           5       0.72      0.72      0.72        39
           6       0.74      0.44      0.55        63
           7       0.38      0.66      0.48        56
           8       0.57      0.65      0.61        26
           9       0.62      0.75      0.68        28
          10       0.66      0.85      0.74        34
          11       0.67      0.34      0.45        29
          12       0.88      0.74      0.80        38
          13       0.48      0.93      0.63        27
          14       0.76      0.73      0.75        30
          15       0.51      0.88      0.64        42
          16       0.47      0.42      0.44        55

In [19]:
# Calculate top3 accuracy
# Turn the fname index back into a column and drop the metadata columns. The
# guards keep the cell re-runnable: on a second pass the index is already reset
# and the columns are already gone, which previously raised a KeyError.
if test.index.name == 'fname':
    test.reset_index(inplace=True)
test.drop(['fname', 'usage', 'freesound_id', 'license'], axis=1, inplace=True, errors='ignore')

# Names of the three highest-scoring classes for every clip, best first.
top_3 = np.array(LABELS)[np.argsort(-predictions, axis=1)[:, :3]]
predicted_labels = [' '.join(list(x)) for x in top_3]
test['predicted_label'] = predicted_labels

# Compare each true label with its three candidates directly. Splitting the
# space-joined string back apart would mis-score any label containing whitespace.
hits = sum(label in candidates for label, candidates in zip(test['label'], top_3.tolist()))
top3_acc = hits / len(test)
print('Test Top3 Accuracy: ', top3_acc)

Test Top3 Accuracy:  0.82875
