In [0]:
!pip install librosa

In [0]:
import scipy.io.wavfile as wav
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D
from keras.utils import to_categorical
import librosa
import os
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
import numpy as np
from tqdm import tqdm

In [0]:
# Class names; the position of a name in this list is its integer class index.
allLabels = [
    'Barood', 'Blast', 'Bum', 'Fire', 'Khoon', 'Maar',
    'Moat', 'Murder', 'Smuggle', 'Taawaan', 'negative',
]


def get_labels(allLabels):
    """Return the label names together with their integer class indices.

    Parameters
    ----------
    allLabels : sequence
        Ordered class names.

    Returns
    -------
    tuple
        ``(allLabels, indices)`` where ``allLabels`` is the object that was
        passed in and ``indices`` is ``np.arange(len(allLabels))``.
    """
    return allLabels, np.arange(len(allLabels))

In [0]:
def wav2mfcc(file_path, max_len=100):
    """Read a WAV file and return its MFCC matrix with exactly ``max_len`` frames.

    Parameters
    ----------
    file_path : str
        Path of the WAV file to read.
    max_len : int, optional
        Number of time frames (columns) in the returned matrix.

    Returns
    -------
    numpy.ndarray
        2-D MFCC array with ``max_len`` columns. Clips with fewer frames are
        zero-padded on the right; clips with more frames are truncated.
    """
    sr, wave = wav.read(file_path)

    # scipy keeps PCM samples as integers, but librosa only accepts
    # floating-point audio. Rescale integer PCM to roughly [-1, 1]; float
    # input is passed through untouched.
    # NOTE(review): if the stored .npy features were produced from integer
    # audio without this scaling, regenerate them so train/inference match.
    if np.issubdtype(wave.dtype, np.integer):
        info = np.iinfo(wave.dtype)
        wave = wave.astype(np.float32)
        if info.min == 0:
            # Unsigned PCM (8-bit WAV) is centred on the midpoint, not on zero.
            midpoint = (info.max + 1) / 2.0
            wave = (wave - midpoint) / midpoint
        else:
            wave = wave / float(-info.min)

    # scipy returns (n_samples, n_channels) for multi-channel files; average
    # the channels so a single 1-D signal is handed to librosa.
    if wave.ndim > 1:
        wave = wave.mean(axis=1)

    # Pass the signal by keyword: recent librosa releases make the arguments
    # of feature.mfcc keyword-only.
    mfcc = librosa.feature.mfcc(y=wave, sr=sr)

    n_frames = mfcc.shape[1]
    if n_frames < max_len:
        # Too short: zero-pad the time axis on the right up to max_len.
        mfcc = np.pad(mfcc, pad_width=((0, 0), (0, max_len - n_frames)), mode='constant')
    else:
        # Long enough: keep only the first max_len frames.
        mfcc = mfcc[:, :max_len]

    return mfcc

In [0]:
def get_train_test(split_ratio=0.8, random_state=42):
    """Load the per-label feature files and split them into train and test sets.

    For every name in ``allLabels`` the file ``<name>.npy`` is loaded from the
    current working directory. Samples from the i-th label get class index
    ``i``. The shape of each loaded array is printed as it is read.

    Parameters
    ----------
    split_ratio : float, optional
        Fraction of the samples used for training; the rest becomes the test set.
    random_state : int, optional
        Seed forwarded to ``train_test_split`` for a reproducible shuffle.

    Returns
    -------
    list
        ``[X_train, X_test, y_train, y_test]`` as returned by
        ``sklearn.model_selection.train_test_split``. Labels are float64
        class indices.
    """
    # Get available labels
    labels, _ = get_labels(allLabels)

    # First label: class index 0
    first = np.load(labels[0] + '.npy')
    print(first.shape)
    feature_parts = [first]
    target_parts = [np.zeros(first.shape[0])]

    # Gather the remaining labels. The pieces are collected in lists and
    # stacked once at the end, instead of re-copying the growing array on
    # every iteration.
    for class_index, label in enumerate(labels[1:], start=1):
        print(label, class_index)
        samples = np.load(label + '.npy')
        print(samples.shape)
        feature_parts.append(samples)
        target_parts.append(np.full(samples.shape[0], fill_value=class_index))

    X = np.vstack(feature_parts)
    # The leading float zeros keep the concatenated labels float64.
    y = np.concatenate(target_parts)

    assert X.shape[0] == len(y)

    return train_test_split(X, y, test_size= (1 - split_ratio), random_state=random_state, shuffle=True)
  
# Loading train set and test set.
# These four module-level arrays are reshaped / one-hot encoded by later cells,
# so this cell must run before them.
X_train, X_test, y_train, y_test = get_train_test()
print(X_train.shape)


(979, 20, 100)
Blast 1
(998, 20, 100)
Bum 2
(1000, 20, 100)
Fire 3
(1000, 20, 100)
Khoon 4
(1000, 20, 100)
Maar 5
(1000, 20, 100)
Moat 6
(1000, 20, 100)
Murder 7
(999, 20, 100)
Smuggle 8
(999, 20, 100)
Taawaan 9
(999, 20, 100)
negative 10
(10000, 20, 100)
(15979, 20, 100)


In [0]:
# First feature dimension (rows of each stored sample; the loaded arrays are (N, 20, 100))
feature_dim_1 = 20
# Second feature dimension (columns of each sample; matches wav2mfcc's default max_len)
feature_dim_2 = 100

channel = 1
epochs = 20
batch_size = 100
verbose = 1
# Derived from the label list so the output layer cannot drift out of sync with it.
num_classes = len(allLabels)

# Reshaping to perform 2D convolution: add a trailing channel axis
X_train = X_train.reshape(X_train.shape[0], feature_dim_1, feature_dim_2, channel)
X_test = X_test.reshape(X_test.shape[0], feature_dim_1, feature_dim_2, channel)

# Pin the one-hot width explicitly. Without num_classes, to_categorical sizes
# the encoding from the largest label present in the array, so a split that
# happens to miss the last class would produce a narrower matrix than the
# network's output layer.
y_train_hot = to_categorical(y_train, num_classes=num_classes)
y_test_hot = to_categorical(y_test, num_classes=num_classes)


def get_model():
    """Build and compile the convolutional classifier.

    The architecture is three 2x2 convolutions (32, 48, 120 filters), one 2x2
    max-pool, then dense layers of 128 and 64 units with dropout in between,
    ending in a softmax over ``num_classes`` outputs. The input shape and the
    number of classes are read from the module-level ``feature_dim_1``,
    ``feature_dim_2``, ``channel`` and ``num_classes``.

    Returns
    -------
    keras.models.Sequential
        Model compiled with categorical cross-entropy, Adadelta and an
        accuracy metric.
    """
    input_shape = (feature_dim_1, feature_dim_2, channel)

    network = Sequential([
        # Convolutional feature extractor
        Conv2D(32, kernel_size=(2, 2), activation='relu', input_shape=input_shape),
        Conv2D(48, kernel_size=(2, 2), activation='relu'),
        Conv2D(120, kernel_size=(2, 2), activation='relu'),
        MaxPooling2D(pool_size=(2, 2)),
        Dropout(0.25),
        # Classifier head
        Flatten(),
        Dense(128, activation='relu'),
        Dropout(0.25),
        Dense(64, activation='relu'),
        Dropout(0.4),
        Dense(num_classes, activation='softmax'),
    ])

    network.compile(
        loss=keras.losses.categorical_crossentropy,
        optimizer=keras.optimizers.Adadelta(),
        metrics=['accuracy'],
    )
    return network

In [0]:
model = get_model()
# Write a checkpoint each time val_loss improves. The deprecated `period`
# argument is omitted: checking every epoch is already the default, and newer
# Keras releases no longer accept it.
CP = keras.callbacks.ModelCheckpoint('model-{epoch:03d}-{val_loss:.2f}.h5', monitor='val_loss', verbose=1,
                                     save_best_only=True, save_weights_only=False, mode='auto')
# NOTE(review): the test split doubles as validation data here, so checkpoint
# selection sees the test set. Hold out a separate validation split if an
# unbiased test score is needed.
model.fit(X_train, y_train_hot, batch_size=batch_size, epochs=epochs, verbose=verbose, callbacks = [CP], 
          validation_data=(X_test, y_test_hot))

Train on 15979 samples, validate on 3995 samples
Epoch 1/20

Epoch 00001: val_loss improved from inf to 0.83062, saving model to model-001-0.83.h5
Epoch 2/20

Epoch 00002: val_loss improved from 0.83062 to 0.52557, saving model to model-002-0.53.h5
Epoch 3/20

Epoch 00003: val_loss improved from 0.52557 to 0.35294, saving model to model-003-0.35.h5
Epoch 4/20

Epoch 00004: val_loss improved from 0.35294 to 0.29544, saving model to model-004-0.30.h5
Epoch 5/20

Epoch 00005: val_loss improved from 0.29544 to 0.24715, saving model to model-005-0.25.h5
Epoch 6/20

Epoch 00006: val_loss did not improve from 0.24715
Epoch 7/20

Epoch 00007: val_loss improved from 0.24715 to 0.24707, saving model to model-007-0.25.h5
Epoch 8/20

Epoch 00008: val_loss improved from 0.24707 to 0.24522, saving model to model-008-0.25.h5
Epoch 9/20

Epoch 00009: val_loss improved from 0.24522 to 0.21354, saving model to model-009-0.21.h5
Epoch 10/20

Epoch 00010: val_loss did not improve from 0.21354
Epoch 11/20


<keras.callbacks.History at 0x7f855960a160>

In [0]:
# Replace the in-memory model with a checkpoint saved during training.
# NOTE(review): the file name is hard-coded and must match a file written by
# the ModelCheckpoint pattern 'model-{epoch:03d}-{val_loss:.2f}.h5'. The
# recorded training log stops before epoch 16, so confirm this file exists
# after a fresh run.
model = keras.models.load_model('model-016-0.20.h5')

In [0]:
# Predicting one sample
def predict(filepath, model):
    """Classify a single WAV file and return the predicted label name.

    The class probabilities returned by the model are printed before the
    label is returned.

    Parameters
    ----------
    filepath : str
        Path of the WAV file to classify; converted with ``wav2mfcc``.
    model
        Object with a ``predict`` method that accepts a
        ``(1, feature_dim_1, feature_dim_2, channel)`` array.

    Returns
    -------
    The entry of ``allLabels`` whose index has the highest predicted score.
    """
    sample = wav2mfcc(filepath)
    sample_reshaped = sample.reshape(1, feature_dim_1, feature_dim_2, channel)

    # Run the network once and reuse the result for both printing and argmax.
    probabilities = model.predict(sample_reshaped)
    print(probabilities)

    return get_labels(allLabels)[0][np.argmax(probabilities)]
    

# Classify one recording with the model currently bound to `model`
print(predict('5.wav', model=model))

[[1.7842256e-02 1.8564482e-04 8.9623392e-01 8.3583879e-04 4.0888963e-03
  1.0611506e-03 1.0204834e-04 4.2228472e-05 4.8055600e-02 3.0836878e-02
  7.1561296e-04]]
1000Bum


In [0]:
# summary() prints the layer table itself and returns None, so wrapping it in
# print() would only add a stray "None" line after the table.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_46 (Conv2D)           (None, 19, 99, 32)        160       
_________________________________________________________________
conv2d_47 (Conv2D)           (None, 18, 98, 48)        6192      
_________________________________________________________________
conv2d_48 (Conv2D)           (None, 17, 97, 120)       23160     
_________________________________________________________________
max_pooling2d_16 (MaxPooling (None, 8, 48, 120)        0         
_________________________________________________________________
dropout_46 (Dropout)         (None, 8, 48, 120)        0         
_________________________________________________________________
flatten_16 (Flatten)         (None, 46080)             0         
_________________________________________________________________
dense_46 (Dense)             (None, 128)               5898368   
__________