In [2]:
import os
import pickle
from glob import iglob
import numpy as np
import librosa

In [1]:
import pandas as pd
import sys

In [4]:
from keras.callbacks import ReduceLROnPlateau,ModelCheckpoint
from keras.utils.np_utils import to_categorical
import keras.backend as K
from keras import regularizers
from keras.layers import Lambda
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.layers.core import Activation, Dense
from keras.layers.normalization import BatchNormalization
from keras.models import Sequential
import numpy as np
import pickle
import os
from glob import glob

In [8]:
import keras_models as km
from sklearn.metrics import recall_score,precision_score,f1_score
def rec_pre_f1_MRS(y_test,pred_test):
    """Compute per-class recall, precision and F1 for a set of predictions.

    Every scorer is called with ``average=None``, so each returned value
    holds one score per class instead of a single aggregated number.

    Parameters
    ----------
    y_test : array-like exposing ``.tolist()`` (e.g. a NumPy array)
        Ground-truth labels.
    pred_test : array-like
        Predicted labels, aligned with ``y_test``.

    Returns
    -------
    tuple
        ``(recall, precision, f1)`` in that order.
    """
    truth_as_list = y_test.tolist()
    per_class_recall = recall_score(truth_as_list, pred_test, average=None)
    per_class_precision = precision_score(truth_as_list, pred_test, average=None)
    # F1 receives the labels in their original container, not the list copy.
    per_class_f1 = f1_score(y_test, pred_test, average=None)
    return per_class_recall, per_class_precision, per_class_f1

In [3]:
def read_audio_from_filename(filename, target_sr):
    """Load an audio file as a mono signal shaped as a single column.

    Parameters
    ----------
    filename : str
        Path of the audio file, handed to ``librosa.load``.
    target_sr : int
        Sampling rate requested from ``librosa.load`` (its ``sr`` argument).

    Returns
    -------
    numpy.ndarray
        The samples reshaped to ``(n_samples, 1)``.
    """
    # librosa.load returns (samples, sampling_rate); the rate is not needed
    # here because it was requested explicitly.
    samples = librosa.load(filename, sr=target_sr, mono=True)[0]
    return samples.reshape(-1, 1)

In [5]:
# --- Preprocessing configuration ---------------------------------------------
# Folder holding the per-scene WAV files ("Scene_<n>.wav").
DATA_AUDIO_DIR = "/vol/work3/berhe/MRS_Detection/SceneAudio/"
# Folder receiving the preprocessed pickles (currently the same folder).
OUTPUT_DIR = "/vol/work3/berhe/MRS_Detection/SceneAudio/"
# Sampling rate requested when loading audio.
TARGET_SR = 8000
# Fixed number of samples per clip after padding / cropping.
AUDIO_LENGTH = 200000

# Scene-level labels: the MRS column is binarised by keeping 0 unchanged and
# mapping every other value to 1.
dataset_Df = pd.read_csv("Scene_Dataset_Normalized.csv")
sceneLabels = [mrs if mrs == 0 else 1 for mrs in dataset_Df.MRS.tolist()]
def convert_data(num_scenes=444):
    """Preprocess every scene WAV into a fixed-length, normalised pickle.

    For each scene number ``1..num_scenes`` the file
    ``DATA_AUDIO_DIR/Scene_<n>.wav`` is loaded with
    ``read_audio_from_filename`` at ``TARGET_SR``, standardised to zero mean
    and unit variance, brought to exactly ``AUDIO_LENGTH`` samples, and
    written to ``OUTPUT_DIR/Scene_<n>end.pkl`` as a dict with the keys
    ``'class_id'``, ``'audio'`` and ``'sr'``.

    Clips shorter than ``AUDIO_LENGTH`` are zero-padded at the end; longer
    clips keep only their *last* ``AUDIO_LENGTH`` samples.

    Parameters
    ----------
    num_scenes : int, optional
        Number of scenes to convert. Defaults to 444, the count that used to
        be hard-coded in the loop. Must not exceed ``len(sceneLabels)``,
        otherwise the label lookup raises ``IndexError``.
    """
    for i in range(1, num_scenes + 1):
        # Carriage-return progress indicator, overwritten on each iteration.
        sys.stdout.write('Scene %d\r' % i)
        sys.stdout.flush()
        # os.path.join works whether or not DATA_AUDIO_DIR ends with a slash.
        wav_filename = os.path.join(DATA_AUDIO_DIR, "Scene_" + str(i) + ".wav")
        # Scene files are numbered from 1, the label list is 0-based.
        class_id = sceneLabels[i - 1]
        audio_buf = read_audio_from_filename(wav_filename, target_sr=TARGET_SR)

        # Normalize to mean 0, variance 1. A constant signal (e.g. pure
        # silence) has zero standard deviation; dividing by it would fill the
        # clip with NaN/inf, so such clips are only mean-centred.
        centered = audio_buf - np.mean(audio_buf)
        spread = np.std(audio_buf)
        audio_buf = centered / spread if spread > 0 else centered

        original_length = len(audio_buf)
        print(i, wav_filename, original_length, np.round(np.mean(audio_buf), 4), np.std(audio_buf))
        if original_length < AUDIO_LENGTH:
            # Pad with zeros at the end up to the target length.
            audio_buf = np.concatenate((audio_buf, np.zeros(shape=(AUDIO_LENGTH - original_length, 1))))
            print('PAD New length =', len(audio_buf))
        elif original_length > AUDIO_LENGTH:
            # Keep the tail of the clip (the final AUDIO_LENGTH samples).
            audio_buf = audio_buf[original_length - AUDIO_LENGTH:original_length]
            print('CUT New length =', len(audio_buf))

        output_filename = os.path.join(OUTPUT_DIR, "Scene_" + str(i) + 'end.pkl')

        out = {'class_id': class_id,
               'audio': audio_buf,
               'sr': TARGET_SR}
        with open(output_filename, 'wb') as w:
            pickle.dump(out, w)

In [6]:
#convert_data()

In [9]:
def m5(num_classes=5):
    """Assemble the "M5" 1-D convolutional classifier for raw waveforms.

    The network stacks four identical stages -- Conv1D, BatchNormalization,
    ReLU, MaxPooling1D(4) -- followed by a mean over the time axis and a
    softmax ``Dense`` layer. The first convolution declares the input shape
    ``[AUDIO_LENGTH, 1]``. Every convolution uses ``'same'`` padding, Glorot
    uniform initialisation and L2 kernel regularisation (0.0001).

    Parameters
    ----------
    num_classes : int, optional
        Number of units in the final softmax layer (default 5).

    Returns
    -------
    keras.models.Sequential
        The model, not yet compiled.
    """
    print('Using Model M5')

    # (filters, kernel_size, strides) for each of the four stages, in order.
    stage_specs = (
        (128, 80, 4),
        (128, 3, 1),
        (256, 3, 1),
        (512, 3, 1),
    )

    m = Sequential()
    for stage_index, (n_filters, k_size, stride) in enumerate(stage_specs):
        # Only the very first layer of a Sequential model carries input_shape.
        first_layer_kwargs = {'input_shape': [AUDIO_LENGTH, 1]} if stage_index == 0 else {}
        m.add(Conv1D(n_filters,
                     kernel_size=k_size,
                     strides=stride,
                     padding='same',
                     kernel_initializer='glorot_uniform',
                     kernel_regularizer=regularizers.l2(l=0.0001),
                     **first_layer_kwargs))
        m.add(BatchNormalization())
        m.add(Activation('relu'))
        m.add(MaxPooling1D(pool_size=4, strides=None))

    # Averaging over axis 1 plays the role of global average pooling for a
    # 1-D conv feature map.
    m.add(Lambda(lambda x: K.mean(x, axis=1)))
    m.add(Dense(num_classes, activation='softmax'))
    return m


In [10]:
def get_data(file_list):
    """Load preprocessed pickles and stack them into feature / label arrays.

    Each file is expected to unpickle to a mapping with at least the keys
    ``'audio'`` and ``'class_id'``.

    Parameters
    ----------
    file_list : iterable of str
        Paths of the pickle files to read, processed in the given order.

    Returns
    -------
    tuple of numpy.ndarray
        ``(x, y)`` where ``x`` stacks the ``'audio'`` entries and ``y`` holds
        the ``'class_id'`` entries converted to ``int``.
    """
    samples, labels = [], []
    for path in file_list:
        # NOTE: pickle.load can execute arbitrary code; only feed this
        # function files that were produced by a trusted source.
        with open(path, 'rb') as handle:
            record = pickle.load(handle)
        samples.append(record['audio'])
        labels.append(int(record['class_id']))
    return np.array(samples), np.array(labels)

In [50]:
# Build the M5 network with a 2-unit softmax head (scene labels were binarised to 0/1).
model = m5(num_classes=2)

Using Model M5


In [51]:
# The labels are fed as plain integers (no one-hot encoding is applied), hence
# the sparse variant of categorical cross-entropy.
model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

In [52]:
# Collect every preprocessed scene pickle. glob() returns paths in arbitrary,
# filesystem-dependent order, so the list is sorted to make the sample order
# (and anything downstream that depends on it) reproducible across runs.
# A single '*' is used because '**' without recursive=True behaves the same.
train_files = sorted(glob(os.path.join(OUTPUT_DIR, '*end.pkl')))
x_tr, y_tr = get_data(train_files)
# Labels stay as integer class ids: the model is compiled with
# sparse_categorical_crossentropy, so no one-hot conversion is needed.

In [53]:
# Split the loaded samples into train / test partitions using the project helper.
# NOTE(review): processDataB lives in keras_models and is not shown here --
# testSize=0.33 presumably holds out about a third of the samples; confirm
# whether the split is shuffled, stratified and/or seeded.
x_train, x_test, y_train, y_test=km.processDataB(x_tr,y_tr,testSize=0.33)

(444,) (147,)


In [54]:
# Sanity check: display the shapes of the train and test feature arrays.
x_train.shape,x_test.shape

((297, 200000, 1), (147, 200000, 1))

In [55]:
# Checkpoint callback configured to keep only the best weights in
# 'weights.best.audioFile.hdf5'.
# NOTE(review): the callback is created but never handed to fit() below (the
# callbacks argument is commented out), so no weights file is written.
checkpointer = ModelCheckpoint(filepath='weights.best.audioFile'+'.hdf5',
                                   verbose=1, save_best_only=True)
#callbacks=[checkpointer]
# Train for 10 epochs with mini-batches of 64, reshuffling each epoch.
# NOTE(review): the test split is also used as validation data, so the
# validation metrics are not independent of the final test evaluation.
model.fit(x=x_train,
              y=y_train,
              batch_size=64,
              epochs=10,
              verbose=1,
              shuffle=True,
              validation_data=(x_test, y_test)
              )

Train on 297 samples, validate on 147 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x7f265c41cd68>

In [56]:
# Sequential.predict_classes is deprecated and absent from newer Keras/TF
# releases. For a multi-unit softmax head it is equivalent to taking the argmax
# of the predicted class probabilities, which works on every version.
pred_test = np.argmax(model.predict(x_test), axis=-1)
# Per-class (recall, precision, F1) on the test split.
print(rec_pre_f1_MRS(y_test, pred_test))

(array([0.9453125 , 0.10526316]), array([0.87681159, 0.22222222]), array([0.90977444, 0.14285714]))


In [57]:
# Evaluate on the test split; yields the loss followed by the compiled metric (accuracy).
model.evaluate(x_test, y_test, verbose=0)

[0.6285640147267556, 0.8367347121238708]