In [1]:
# import libraries
import numpy as np
import librosa
import librosa.display
import os
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D, BatchNormalization
from sklearn.metrics import classification_report
from utils import wav2mfcc, model, get_data
import utils
import keras
import test

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
import numpy as np
import librosa
import os
from keras.utils import to_categorical

# wav2mfcc.py

def wav2mfcc(file_path, max_pad_len=20):
    """Load an audio file and return its MFCC matrix with a fixed frame count.

    The file is loaded as mono at its native sampling rate, decimated by
    keeping every third sample, and converted to MFCCs. The frame (column)
    axis is then zero-padded on the right, or truncated, so the result is
    always exactly ``max_pad_len`` columns wide.

    Parameters
    ----------
    file_path : str
        Path of the audio file to load.
    max_pad_len : int, optional
        Number of frames (columns) in the returned matrix. Defaults to 20.

    Returns
    -------
    numpy.ndarray
        2-D array with ``max_pad_len`` columns; the number of rows is
        whatever ``librosa.feature.mfcc`` produces with its defaults.
    """
    # sr=None preserves the file's native sampling rate; mono=True mixes the
    # signal down to one channel. The returned rate is not used below.
    wave, _ = librosa.load(file_path, mono=True, sr=None)

    # Crude decimation: keep every third sample.
    wave = wave[::3]

    # NOTE(review): sr is fixed at 8000 regardless of the file's native rate
    # and of the decimation above -- confirm this is intended before changing
    # it, since any previously trained model depends on these features.
    # The signal is passed by keyword because newer librosa releases no
    # longer accept it positionally.
    mfcc = librosa.feature.mfcc(y=wave, sr=8000)

    n_frames = mfcc.shape[1]
    if n_frames >= max_pad_len:
        # Longer clips used to produce a negative pad width (ValueError in
        # np.pad); keep only the leading frames instead.
        return mfcc[:, :max_pad_len]

    # Zero-pad the frame axis on the right up to the requested width.
    return np.pad(mfcc, pad_width=((0, 0), (0, max_pad_len - n_frames)),
                  mode='constant')


def get_data(data_dir='./recordings'):
    """Build the MFCC feature array and one-hot labels for a folder of WAVs.

    Every file in ``data_dir`` whose name ends in ``.wav`` is converted with
    ``wav2mfcc``. The label of a file is the integer before the first
    underscore in its name (e.g. ``7_speaker_3.wav`` -> 7).

    Parameters
    ----------
    data_dir : str, optional
        Directory containing the recordings. Defaults to ``'./recordings'``.

    Returns
    -------
    tuple
        ``(mfccs, labels)`` where ``mfccs`` is ``np.asarray`` of the
        per-file MFCC matrices and ``labels`` is the result of
        ``to_categorical`` applied to the integer labels.

    Raises
    ------
    ValueError
        If ``data_dir`` contains no ``.wav`` files, or a file name does not
        start with an integer label.
    """
    labels = []
    mfccs = []

    # os.listdir returns entries in arbitrary order; sorting keeps the sample
    # order (and therefore any seeded train/test split) reproducible.
    for f in sorted(os.listdir(data_dir)):
        if not f.endswith('.wav'):
            continue

        mfccs.append(wav2mfcc(os.path.join(data_dir, f)))

        # The text before the first underscore is the class label; convert
        # it to int explicitly rather than handing strings to to_categorical.
        labels.append(int(f.split('_')[0]))

    if not mfccs:
        raise ValueError('no .wav files found in %r' % (data_dir,))

    # to_categorical turns the integer class vector into a one-hot matrix.
    return np.asarray(mfccs), to_categorical(labels)

    # MIKE---why optional below? 

# if __name__ == '__main__':
#     mfccs, labels = get_data()
#     print(mfccs.shape)
#     print(labels.shape)

In [3]:
import utils
from sklearn.model_selection import train_test_split

def get_all():
    """Load the dataset, split it, and build an untrained CNN for it.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, model)``. ``X_*`` are the MFCC
        arrays with a trailing channel axis of size 1, ``y_*`` are the
        matching label rows, and ``model`` is whatever
        ``utils.model.get_cnn_model`` returns for the detected input shape
        and class count.
    """
    # The project's own utils package (not Keras) reads the audio files and
    # converts them to MFCC features plus one-hot labels.
    mfccs, labels = utils.wav2mfcc.get_data()

    # Feature-map dimensions; a single channel because each MFCC matrix is
    # treated as a one-channel image.
    dim_1 = mfccs.shape[1]
    dim_2 = mfccs.shape[2]
    channels = 1

    # Derive the class count from the width of the one-hot label matrix so
    # the output layer always matches the labels actually present, instead
    # of assuming exactly 10 classes.
    classes = labels.shape[1]

    # Add the channel axis expected by Conv2D.
    X = mfccs.reshape((mfccs.shape[0], dim_1, dim_2, channels))
    y = labels

    input_shape = (dim_1, dim_2, channels)

    # Hold out 10% for testing; the fixed random_state makes the split
    # repeatable for a given sample order.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, random_state=1)

    model = utils.model.get_cnn_model(input_shape, classes)

    return X_train, X_test, y_train, y_test, model

In [4]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D, BatchNormalization

#model.py

"""creating a model for the convolutional neural network. i can add more...."""
def get_cnn_model(input_shape, num_classes):
    """Assemble and compile the convolutional classifier.

    The network is three 2x2 Conv2D + BatchNormalization stages, one max-pool
    with dropout, then a stack of dense layers ending in a softmax over
    ``num_classes`` outputs. It is compiled with categorical cross-entropy,
    the Adadelta optimizer, and an accuracy metric.

    Parameters
    ----------
    input_shape : tuple
        Shape of one input sample, passed to the first Conv2D layer.
    num_classes : int
        Number of units in the final softmax layer.

    Returns
    -------
    The compiled ``Sequential`` model.
    """
    net = Sequential()

    # Convolutional feature extractor: only the first layer is told the
    # input shape; each convolution is followed by batch normalisation.
    for position, n_filters in enumerate((32, 48, 120)):
        first_layer_args = {'input_shape': input_shape} if position == 0 else {}
        net.add(Conv2D(n_filters, kernel_size=(2, 2), activation='relu',
                       **first_layer_args))
        net.add(BatchNormalization())

    # Down-sample once, regularise, then flatten for the dense head.
    net.add(MaxPooling2D(pool_size=(2, 2)))
    net.add(Dropout(0.25))
    net.add(Flatten())

    # Dense head.
    net.add(Dense(128, activation='relu'))
    net.add(BatchNormalization())
    net.add(Dropout(0.15))

    net.add(Dense(64, activation='relu'))
    net.add(Dropout(0.15))

    net.add(Dense(16, activation='relu'))
    net.add(BatchNormalization())
    net.add(Dropout(0.4))

    # Class probabilities.
    net.add(Dense(num_classes, activation='softmax'))

    net.compile(loss=keras.losses.categorical_crossentropy,
                optimizer=keras.optimizers.Adadelta(),
                metrics=['accuracy'])

    return net

    # MIKE---why optional below? 


# if __name__ == '__main__':
#     model = get_cnn_model((20, 20, 1), 10)
#     #ann_viz(model, title="Neural Network Model", filename='../images/model.gv')
#     print(model.summary())

In [5]:
import keras
from sklearn.metrics import classification_report
from utils import wav2mfcc, model, get_data
from keras.utils import to_categorical

# test.py
def check_preds(X, y, model_path='trained_model.h5'):
    """Load a saved model, predict on ``X`` and print a classification report.

    Parameters
    ----------
    X : array-like
        Inputs passed to the loaded model's ``predict``.
    y : array-like
        One-hot encoded true labels, one row per sample.
    model_path : str, optional
        Path of the saved Keras model to load. Defaults to
        ``'trained_model.h5'``.

    Returns
    -------
    None
        The report is printed, not returned.
    """
    trained_model = keras.models.load_model(model_path)

    # Sequential.predict_classes is gone from recent Keras releases; taking
    # the argmax of the predicted class scores gives the same class indices.
    predictions = np.argmax(trained_model.predict(X), axis=-1)

    # One-hot encode the predictions with the same width as y. Without
    # num_classes, to_categorical sizes the matrix from the largest predicted
    # index, which is too narrow whenever the highest class is never
    # predicted and makes the report fail on mismatched shapes.
    num_classes = np.asarray(y).shape[1]
    print(classification_report(
        y, to_categorical(predictions, num_classes=num_classes)))

    # MIKE---why optional below? 

# if __name__ == '__main__':
#     _, X_test, _, y_test, _ = get_data.get_all()
#
#     check_preds(X_test, y_test)

In [6]:
from utils import model, wav2mfcc, get_data
import test
import keras

# Build the train/test split and an untrained CNN.
X_train, X_test, y_train, y_test, cnn_model = get_data.get_all()

# summary() prints the layer table itself and returns None, so wrapping it
# in print() only adds a stray "None" line.
cnn_model.summary()

# deleted hyperparameter callbacks=[keras_callback] due to errors
cnn_model.fit(X_train, y_train, batch_size=64, epochs=20,
              verbose=1, validation_split=0.1)

# Save under the file name that check_preds (test.py, as shown in this
# notebook) loads; saving as 'trained_modelv1.h5' meant the evaluation below
# ran against a different, possibly stale or missing, model file.
cnn_model.save('trained_model.h5')

test.check_preds(X_test, y_test)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 19, 19, 32)        160       
_________________________________________________________________
batch_normalization_1 (Batch (None, 19, 19, 32)        128       
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 18, 18, 48)        6192      
_________________________________________________________________
batch_normalization_2 (Batch (None, 18, 18, 48)        192       
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 17, 17, 120)       23160     
_________________________________________________________________

In [None]:
# Plot the MFCC matrix of one held-out recording.
# `mfccs` only exists as a local variable inside get_data()/get_all(), so it
# is undefined at notebook scope; take a sample from X_test instead and drop
# its trailing channel axis to get a 2-D matrix.
import matplotlib.pyplot as plt
mfcc_sample = X_test[0, :, :, 0]
plt.figure(figsize=(10, 4))
# sr matches the rate passed to librosa.feature.mfcc in wav2mfcc so the time
# axis is scaled consistently with the features.
librosa.display.specshow(mfcc_sample, x_axis='time', sr=8000)
plt.colorbar()
plt.title('MFCC')
plt.tight_layout()
plt.show()

Code and articles referenced:
- https://github.com/adhishthite/sound-mnist
- https://github.com/Jakobovski/free-spoken-digit-dataset