# 2D CNN classification of images of raw spectrograms

In [1]:
import os
import pandas as pd
from tqdm import tqdm
import shutil
import librosa
import numpy as np
import librosa.display
import errno
from scipy.misc import imsave
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D, Input
import time
from keras.utils import np_utils
from keras import backend as K
from sklearn.manifold import TSNE
from keras.models import load_model
from keras.callbacks import ModelCheckpoint, EarlyStopping, CSVLogger, ReduceLROnPlateau
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
import seaborn as sns
from sklearn.decomposition import PCA
import math
from keras.preprocessing.image import ImageDataGenerator

# Number of target classes; passed to construct_model() as the size of the softmax output layer.
nb_class = 3

Using TensorFlow backend.


In [2]:
# Folders that receive (and later provide) the spectrogram PNGs, one sub-folder per class.
target_train_folder_images = 'spectrograms/train/fold4/'
target_validation_folder_images = 'spectrograms/validation/fold4/'

# Expected spectrogram shape; every computed spectrogram is compared against it.
rows, cols = 257, 313
# Sample rate handed to librosa.load.
SR = 16000
# FFT size and hop length handed to librosa.stft.
N_FFT = 512
HOP_LEN = 512

## Make sure the output folder exists (create it if missing)

In [None]:
def make_sure_path_exists(path):
    """Create ``path`` (and any missing parent folders) unless it already is a directory.

    Args:
        path: Directory path to create.

    Raises:
        OSError: If the directory cannot be created. This includes the case
            where ``path`` already exists but is not a directory, which would
            otherwise only surface later when files are written into it.
    """
    # exist_ok=True tolerates an existing directory but still raises when a
    # regular file occupies ``path``.
    os.makedirs(path, exist_ok=True)

In [None]:
# Root folders of the source audio; they are walked recursively to build the spectrograms.
train_wavs = 'audio/train/fold4/'
validation_wavs = 'audio/validation/fold4/'

## Training audio to spectrograms

In [None]:
# Convert every file found under ``train_wavs`` into a dB-scaled STFT magnitude
# image and store it as a PNG under ``target_train_folder_images``, mirroring
# the sub-folder layout of the audio tree.
# NOTE(review): ``scipy.misc.imsave`` is not available in recent SciPy releases;
# confirm the pinned SciPy version before re-running this cell.
for walk_dir, _subdirs, files in os.walk(train_wavs):
    # Folder relative to the audio root ("" for the root itself). relpath is
    # used instead of str.replace so that a missing/extra trailing slash in
    # ``train_wavs`` cannot corrupt the joined paths.
    path = os.path.relpath(walk_dir, train_wavs)
    if path == os.curdir:
        path = ""

    print("The total length of audio files in folder:", path, "is:", len(files))

    out_dir = os.path.join(target_train_folder_images, path)
    make_sure_path_exists(out_dir)

    for audio_name in tqdm(files):
        data, _rate = librosa.load(os.path.join(train_wavs, path, audio_name), mono=True, sr=SR)

        X = librosa.stft(data, n_fft=N_FFT, hop_length=HOP_LEN)
        D = librosa.amplitude_to_db(np.abs(X))

        # Flip vertically so that row 0 of the saved image is the last STFT row.
        D = np.flipud(D)

        if D.shape[0] != rows or D.shape[1] != cols:
            # Only report unexpected sizes, as the validation loop does. A
            # blocking input() here would stall "Run All" and fail in
            # non-interactive execution.
            print(audio_name, D.shape)

        imsave(os.path.join(out_dir, audio_name.split(".wav")[0] + '.png'), D)

## Validation audio to spectrograms

In [None]:
# Turn every validation clip into a dB-scaled STFT magnitude image and write
# it as a PNG below ``target_validation_folder_images``.
for path, subdirs, files in os.walk(validation_wavs):
    path = path.replace(validation_wavs, "")
    print("The total length of audio files in folder:", path, "is:", len(files))

    source_dir = validation_wavs + "/" + path
    image_dir = target_validation_folder_images + "/" + path
    make_sure_path_exists(image_dir)

    for audio_name in tqdm(files):
        data, rate = librosa.load(source_dir + "/" + audio_name, mono=True, sr=SR)

        # STFT magnitude -> dB, then flipped vertically before saving.
        X = librosa.stft(data, n_fft=N_FFT, hop_length=HOP_LEN)
        D = np.flipud(librosa.amplitude_to_db(np.abs(X)))

        # Report (but still save) spectrograms whose size is not rows x cols.
        if not (D.shape[0] == rows and D.shape[1] == cols):
            print(audio_name, D.shape)

        png_name = audio_name.split(".wav")[0] + '.png'
        imsave(image_dir + "/" + png_name, D)

## Training the model

In [3]:
# Base name shared by the files written during training.
model_name = 'keras_spectrograms_fold4'
# File the ModelCheckpoint callback writes the best weights to.
best_weights_path = model_name + '.h5'
# File the CSVLogger callback writes to.
log_path = model_name + '.log'

In [4]:
# Quantity watched by the checkpoint, early-stopping and LR-reduction callbacks.
monitor = 'val_acc'

# Single-channel images of the expected spectrogram size. Derived from
# rows/cols (257 x 313) so the network input cannot drift from the size the
# spectrograms are checked against when they are generated.
input_shape = (rows, cols, 1)

batch_size = 32
epochs = 50
# Patience passed to EarlyStopping.
es_patience = 8
# Patience and factor passed to ReduceLROnPlateau.
rlr_patience = 5
rlr_factor = 0.1

In [5]:
def _print_folder_summary(root):
    """Print the file count of every sub-folder of ``root`` and return the total.

    Files lying directly in ``root`` are not counted; only sub-folders are
    reported.

    Args:
        root: Folder to walk recursively.

    Returns:
        int: Number of files found in all sub-folders of ``root``.
    """
    total = 0
    for walk_dir, _subdirs, files in os.walk(root):
        # relpath yields the bare sub-folder name whether or not ``root`` ends
        # with a path separator. Stripping ``root + "/"`` with str.replace
        # never matches when ``root`` already ends in "/".
        rel_dir = os.path.relpath(walk_dir, root)
        if rel_dir == os.curdir:
            continue

        print(rel_dir, len(files))
        total += len(files)
    return total


print("########## - Train folder summary")
train_total_files = _print_folder_summary(target_train_folder_images)

print()
print("########## - Valid folder summary")
valid_total_files = _print_folder_summary(target_validation_folder_images)

print()
print("########## - Total files", train_total_files + valid_total_files)
print("########## - Total classes", nb_class)

########## - Train folder summary
spectrograms/train/fold4/outdoor 1170
spectrograms/train/fold4/vehicle 936
spectrograms/train/fold4/indoor 1404

########## - Valid folder summary
spectrograms/validation/fold4/outdoor 390
spectrograms/validation/fold4/vehicle 312
spectrograms/validation/fold4/indoor 468

########## - Total files 4680
########## - Total classes 3


In [6]:
# Both generators only rescale pixel values by 1/255; no augmentation is applied.
train_datagen = ImageDataGenerator(rescale=1. / 255)
valid_datagen = ImageDataGenerator(rescale=1. / 255)

# Options shared by the training and validation directory iterators.
_flow_options = dict(
    target_size=(input_shape[0], input_shape[1]),
    batch_size=batch_size,
    color_mode='grayscale',
)

print("\n########## - Train folder")
train_generator = train_datagen.flow_from_directory(
    target_train_folder_images, shuffle=True, **_flow_options)

print("########## - Valid folder")
# The validation iterator is built with shuffle=False.
valid_generator = valid_datagen.flow_from_directory(
    target_validation_folder_images, shuffle=False, **_flow_options)

print(valid_generator.class_indices)


########## - Train folder
Found 3510 images belonging to 3 classes.
########## - Valid folder
Found 1170 images belonging to 3 classes.
{'vehicle': 2, 'outdoor': 1, 'indoor': 0}


In [7]:
def f1(y_true, y_pred):
    """F1 score of ``y_pred`` against ``y_true`` built from Keras backend ops.

    Both inputs are clipped to [0, 1] and rounded before counting, so a
    prediction counts as positive when it rounds to 1. ``K.epsilon()`` is added
    to every denominator to avoid division by zero.

    Args:
        y_true: Ground-truth tensor.
        y_pred: Prediction tensor, multiplied element-wise with ``y_true``.

    Returns:
        Scalar tensor ``2 * precision * recall / (precision + recall + eps)``.
    """
    def _count_positives(tensor):
        # Number of entries that round to 1 after clipping to [0, 1].
        return K.sum(K.round(K.clip(tensor, 0, 1)))

    hits = _count_positives(y_true * y_pred)
    precision_value = hits / (_count_positives(y_pred) + K.epsilon())
    recall_value = hits / (_count_positives(y_true) + K.epsilon())

    harmonic_part = (precision_value*recall_value)/(precision_value+recall_value+K.epsilon())
    return 2*harmonic_part


def sklearn_f1(label, pred):
    """Macro-averaged F1 score computed with scikit-learn's ``f1_score``.

    Args:
        label: Ground-truth class indices, forwarded unchanged to ``f1_score``.
        pred: Indexable collection of per-sample score vectors; each entry is
            reduced to the index of its largest score.

    Returns:
        The value returned by ``f1_score`` with ``average='macro'``.
    """
    # One arg-max per sample, collected into an array.
    final_preds = np.array([np.argmax(pred[idx]) for idx in range(len(pred))])

    return f1_score(label, final_preds, labels=None, pos_label=1, average='macro', sample_weight=None)

In [8]:
def construct_model(input_shape=input_shape, num_classes = 3):
    """Build, compile and summarise the spectrogram CNN.

    The network stacks three 2x2 ReLU convolutions (32, 48 and 120 filters),
    with 2x2 max-pooling after the second and third, followed by a flatten
    layer and a softmax layer named ``output_layer``. It is compiled with
    categorical cross-entropy, the Adam optimizer, and the ``accuracy`` and
    ``f1`` metrics.

    Args:
        input_shape: Shape of one input image, given to the first Conv2D layer.
        num_classes: Number of units in the softmax output layer.

    Returns:
        The compiled ``Sequential`` model. Its summary is printed as a side
        effect.
    """
    layer_stack = [
        Conv2D(32, kernel_size=(2, 2), activation='relu', input_shape=input_shape),
        Conv2D(48, kernel_size=(2, 2), activation='relu'),
        MaxPooling2D(pool_size=(2, 2)),
        Conv2D(120, kernel_size=(2, 2), activation='relu'),
        MaxPooling2D(pool_size=(2, 2)),
        Flatten(),
        Dense(num_classes, activation='softmax', name="output_layer"),
    ]

    model = Sequential(layer_stack)
    model.compile(optimizer=keras.optimizers.Adam(),
                  loss=keras.losses.categorical_crossentropy,
                  metrics=['accuracy', f1])
    model.summary()

    return model

In [9]:
# Build the compiled CNN (construct_model also prints the layer summary).
model = construct_model(input_shape=input_shape, num_classes=nb_class)

##############################################################################

# All three monitoring callbacks watch ``monitor`` and state mode='max'
# explicitly. With the default mode='auto' the direction is guessed from the
# metric name, so switching ``monitor`` to a metric whose name does not contain
# "acc" would silently make early stopping and LR reduction minimise it while
# the checkpoint maximises it.
callbacks_list = [ModelCheckpoint(monitor=monitor,
                                  filepath=best_weights_path,
                                  save_best_only=True,
                                  save_weights_only=True,
                                  mode='max',
                                  verbose=1),
                  EarlyStopping(monitor=monitor,
                                patience=es_patience,
                                mode='max',
                                verbose=1),
                  ReduceLROnPlateau(monitor=monitor,
                                    factor=rlr_factor,
                                    patience=rlr_patience,
                                    mode='max',
                                    verbose=1),
                  CSVLogger(filename=log_path)]

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 256, 312, 32)      160       
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 255, 311, 48)      6192      
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 127, 155, 48)      0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 126, 154, 120)     23160     
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 63, 77, 120)       0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 582120)            0         
_________________________________________________________________
output_layer (Dense)         (None, 3)                 1746363   
Total para

In [None]:
print("\n########## - TRAINING\n")

# Number of batches per pass: file count divided by batch size, rounded up.
train_steps = int(math.ceil(float(train_total_files) / float(batch_size)))
valid_steps = int(math.ceil(float(valid_total_files) / float(batch_size)))

history = model.fit_generator(
    train_generator,
    steps_per_epoch=train_steps,
    epochs=epochs,
    validation_data=valid_generator,
    validation_steps=valid_steps,
    callbacks=callbacks_list,
    shuffle=False,
)


########## - TRAINING

Epoch 1/50

Epoch 00001: val_acc improved from -inf to 0.71795, saving model to keras_spectrograms_fold4.h5
Epoch 2/50

Epoch 00002: val_acc improved from 0.71795 to 0.77009, saving model to keras_spectrograms_fold4.h5
Epoch 3/50

Epoch 00003: val_acc improved from 0.77009 to 0.83333, saving model to keras_spectrograms_fold4.h5
Epoch 4/50

Epoch 00004: val_acc improved from 0.83333 to 0.83846, saving model to keras_spectrograms_fold4.h5
Epoch 5/50

Epoch 00005: val_acc improved from 0.83846 to 0.84188, saving model to keras_spectrograms_fold4.h5
Epoch 6/50

Epoch 00006: val_acc did not improve from 0.84188
Epoch 7/50

Epoch 00007: val_acc improved from 0.84188 to 0.88718, saving model to keras_spectrograms_fold4.h5
Epoch 8/50

Epoch 00008: val_acc improved from 0.88718 to 0.88889, saving model to keras_spectrograms_fold4.h5
Epoch 9/50

Epoch 00009: val_acc did not improve from 0.88889
Epoch 10/50

Epoch 00010: val_acc did not improve from 0.88889
Epoch 11/50

Ep

In [None]:
# Show which series were recorded during training.
print(history.history.keys())

# One figure per recorded quantity: training curve first, validation curve second.
for series_key, axis_label in (('acc', 'accuracy'), ('loss', 'loss')):
    plt.plot(history.history[series_key])
    plt.plot(history.history['val_' + series_key])
    plt.title('model ' + axis_label)
    plt.ylabel(axis_label)
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()

In [None]:
# Accuracies recorded for the last epoch that was run, shown as percentages.
final_train_acc = history.history['acc'][-1]
final_valid_acc = history.history['val_acc'][-1]
print("Training accuracy: %.2f%% / Validation accuracy: %.2f%%" %
      (100*final_train_acc, 100*final_valid_acc))

In [None]:
# Predict the whole validation set once, starting from the first batch.
valid_generator.reset()

# ceil(samples / batch_size) batches cover every image exactly once.
# `count // batch_size + 1` would request one batch too many whenever the
# count is an exact multiple of the batch size, yielding more predictions
# than there are labels.
prediction_steps = int(math.ceil(valid_generator.samples / float(batch_size)))
Y_pred = model.predict_generator(valid_generator, prediction_steps)
y_pred = np.argmax(Y_pred, axis=1)

assert len(y_pred) == len(valid_generator.classes), \
    "number of predictions does not match number of validation labels"

# Class names ordered by the index the generator assigned to them, so the
# report labels always match the encoded classes.
target_names = sorted(valid_generator.class_indices, key=valid_generator.class_indices.get)

In [None]:
# Confusion matrix of the validation labels against the predicted classes,
# drawn as an annotated heatmap with integer cell counts.
validation_confusion = confusion_matrix(valid_generator.classes, y_pred)
sns.heatmap(validation_confusion,
            annot=True,
            fmt="d",
            xticklabels=target_names,
            yticklabels=target_names,
            cbar=False)

In [None]:
# Summary metrics on the validation set; F1, precision and recall are macro-averaged.
true_classes = valid_generator.classes

print("Accuracy: ", accuracy_score(true_classes, y_pred))
for caption, scorer in (("F1 Score: ", f1_score),
                        ("Precision Score: ", precision_score),
                        ("Recall Score: ", recall_score)):
    print(caption, scorer(true_classes, y_pred, average="macro"))
print(classification_report(true_classes, y_pred, target_names=target_names))