In [1]:
import librosa
import numpy as np

def extract_features(file_name):
    """Load an audio file and summarise it as a fixed-length MFCC vector.

    Parameters
    ----------
    file_name : str or path-like
        Path of the audio file handed to ``librosa.load``.

    Returns
    -------
    numpy.ndarray or None
        The mean of each of the 40 requested MFCC coefficients, taken over
        the other axis of the MFCC matrix, or ``None`` when the file could
        not be loaded or processed.
    """
    try:
        audio, sample_rate = librosa.load(file_name, res_type='kaiser_fast')
        mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
        # Transpose so that axis 0 is the one being averaged away, leaving
        # one value per MFCC coefficient.
        mfccsscaled = np.mean(mfccs.T, axis=0)
    except Exception as e:
        # Best-effort extraction: report the offending path (and the reason)
        # and let the caller decide what to do with the missing sample.
        print("Error encountered while parsing file: ", file_name, "-", e)
        return None

    return mfccsscaled

In [2]:
# Load various imports 
import pandas as pd
import os
import librosa

# Set the path to the full UrbanSound dataset
datasetpath = 'UrbanSound8K/audio/'

metadata = pd.read_csv('UrbanSound8K/metadata/UrbanSound8K.csv')

# The normalised dataset root does not change between rows, so compute it once.
audio_root = os.path.relpath(datasetpath)

features = []
skipped_files = []

# Iterate through each sound file and extract the features
for index, row in metadata.iterrows():
    file_name = os.path.join(audio_root, 'fold' + str(row["fold"]), str(row["slice_file_name"]))

    class_label = row["class"]
    data = extract_features(file_name)

    # extract_features returns None for files it could not parse. Keeping such
    # rows would put None next to fixed-length vectors and break the later
    # conversion of the feature column into a numeric array.
    if data is None:
        skipped_files.append(file_name)
        continue

    features.append([data, class_label])

# Convert into a pandas DataFrame
featuresdf = pd.DataFrame(features, columns=['feature','class_label'])

if skipped_files:
    print('Skipped', len(skipped_files), 'files that could not be parsed')
print('Finished feature extraction from', len(featuresdf), 'files')



Finished feature extraction from 8732 files


In [3]:
from sklearn.preprocessing import LabelEncoder
from keras.utils.np_utils import to_categorical

# Turn the feature column and the class-name column into numpy arrays
X = np.array(featuresdf['feature'].tolist())
y = np.array(featuresdf['class_label'].tolist())

# Map each class name to an integer id, then expand the ids to one-hot rows
le = LabelEncoder()
encoded_labels = le.fit_transform(y)
yy = to_categorical(encoded_labels)

In [4]:
# split the dataset 
from sklearn.model_selection import train_test_split 

# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    X,
    yy,
    test_size=0.2,
    random_state=42,
)

In [5]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D, MaxPooling2D
from keras.optimizers import Adam
from keras.utils import np_utils
from sklearn import metrics 

# One output unit per column of the one-hot encoded label matrix
num_labels = yy.shape[1]
# Not referenced by any of the layers below
filter_size = 2

# Construct model: two 256-unit ReLU blocks with 50% dropout, then a softmax
# layer over the classes. The input is a 40-value feature vector.
model = Sequential([
    Dense(256, input_shape=(40,)),
    Activation('relu'),
    Dropout(0.5),

    Dense(256),
    Activation('relu'),
    Dropout(0.5),

    Dense(num_labels),
    Activation('softmax'),
])

In [6]:
# Compile the model: Adam optimizer, categorical cross-entropy loss for the
# one-hot targets, and accuracy reported alongside the loss
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [7]:
# Show the layer-by-layer architecture and parameter counts
model.summary()

# Baseline: score the still-untrained model on the test split.
# evaluate() returns [loss, accuracy] because the model was compiled with
# metrics=['accuracy'], so index 1 is the accuracy.
score = model.evaluate(x_test, y_test, verbose=0)
accuracy = score[1] * 100

print("Pre-training accuracy: %.4f%%" % accuracy)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 256)               10496     
_________________________________________________________________
activation (Activation)      (None, 256)               0         
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 256)               65792     
_________________________________________________________________
activation_1 (Activation)    (None, 256)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 10)                2

In [8]:
from keras.callbacks import ModelCheckpoint 
from datetime import datetime 

num_epochs = 100
num_batch_size = 32

# Make sure the checkpoint directory exists before training starts, so the
# first checkpoint write cannot fail on a fresh checkout.
checkpoint_path = 'saved_models/weights.best.basic_mlp.hdf5'
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

# save_best_only=True keeps only the weights from the best epoch so far.
checkpointer = ModelCheckpoint(filepath=checkpoint_path,
                               verbose=1, save_best_only=True)
start = datetime.now()

# NOTE(review): the test split doubles as validation data here, so the "best"
# checkpoint is selected using the same data that is later reported as test
# accuracy. Consider carving out a separate validation split.
model.fit(x_train, y_train, batch_size=num_batch_size, epochs=num_epochs, validation_data=(x_test, y_test), callbacks=[checkpointer], verbose=1)


duration = datetime.now() - start
print("Training completed in time: ", duration)

Epoch 1/100

Epoch 00001: val_loss improved from inf to 2.18556, saving model to saved_models\weights.best.basic_mlp.hdf5
Epoch 2/100

Epoch 00002: val_loss improved from 2.18556 to 2.03110, saving model to saved_models\weights.best.basic_mlp.hdf5
Epoch 3/100

Epoch 00003: val_loss improved from 2.03110 to 1.90238, saving model to saved_models\weights.best.basic_mlp.hdf5
Epoch 4/100

Epoch 00004: val_loss improved from 1.90238 to 1.70706, saving model to saved_models\weights.best.basic_mlp.hdf5
Epoch 5/100

Epoch 00005: val_loss improved from 1.70706 to 1.58203, saving model to saved_models\weights.best.basic_mlp.hdf5
Epoch 6/100

Epoch 00006: val_loss improved from 1.58203 to 1.49279, saving model to saved_models\weights.best.basic_mlp.hdf5
Epoch 7/100

Epoch 00007: val_loss improved from 1.49279 to 1.39706, saving model to saved_models\weights.best.basic_mlp.hdf5
Epoch 8/100

Epoch 00008: val_loss improved from 1.39706 to 1.28440, saving model to saved_models\weights.best.basic_mlp.h


Epoch 00071: val_loss did not improve from 0.44465
Epoch 72/100

Epoch 00072: val_loss did not improve from 0.44465
Epoch 73/100

Epoch 00073: val_loss did not improve from 0.44465
Epoch 74/100

Epoch 00074: val_loss did not improve from 0.44465
Epoch 75/100

Epoch 00075: val_loss did not improve from 0.44465
Epoch 76/100

Epoch 00076: val_loss did not improve from 0.44465
Epoch 77/100

Epoch 00077: val_loss did not improve from 0.44465
Epoch 78/100

Epoch 00078: val_loss did not improve from 0.44465
Epoch 79/100

Epoch 00079: val_loss did not improve from 0.44465
Epoch 80/100

Epoch 00080: val_loss improved from 0.44465 to 0.43275, saving model to saved_models\weights.best.basic_mlp.hdf5
Epoch 81/100

Epoch 00081: val_loss did not improve from 0.43275
Epoch 82/100

Epoch 00082: val_loss improved from 0.43275 to 0.43225, saving model to saved_models\weights.best.basic_mlp.hdf5
Epoch 83/100

Epoch 00083: val_loss did not improve from 0.43225
Epoch 84/100

Epoch 00084: val_loss did not 

In [9]:
# Report accuracy on the training split first, then on the test split.
# evaluate() returns [loss, accuracy]; only the accuracy is printed.
for split_label, split_x, split_y in (
    ("Training Accuracy: ", x_train, y_train),
    ("Testing Accuracy: ", x_test, y_test),
):
    score = model.evaluate(split_x, split_y, verbose=0)
    print(split_label, score[1])

Training Accuracy:  0.9328561425209045
Testing Accuracy:  0.8740698099136353


In [10]:
def print_prediction(file_name):
    """Classify one audio file and print the result.

    Prints the predicted class name followed by the model's probability for
    every class known to the label encoder ``le``.

    Parameters
    ----------
    file_name : str or path-like
        Path of the audio file to classify; passed to ``extract_features``.

    Returns
    -------
    None
        All results are printed. If no features could be extracted, a message
        is printed and nothing is predicted.
    """
    features = extract_features(file_name)
    if features is None:
        # extract_features signals a parsing failure with None; feeding that
        # to the model would only fail later with a less helpful error.
        print("Could not extract features from:", file_name)
        return

    # The model expects a batch, so wrap the single vector in a batch of one.
    prediction_feature = np.array([features])

    # One forward pass yields the softmax probabilities. predict_classes() and
    # predict_proba() were removed from Keras, and the argmax of the
    # probabilities is the predicted class index.
    predicted_proba = model.predict(prediction_feature, verbose=0)[0]

    predicted_index = np.argmax(predicted_proba)
    predicted_class = le.inverse_transform(np.array([predicted_index]))
    print("The predicted class is:", predicted_class[0], '\n')

    # Resolve all class names in a single call instead of once per class.
    categories = le.inverse_transform(np.arange(len(predicted_proba)))
    for category, probability in zip(categories, predicted_proba):
        print(category, "\t\t : ", format(probability, '.32f'))

In [11]:
# Classify one example clip and print the predicted class together with the
# per-class probabilities.
filename = 'Evaluation audio/siren_1.wav'

print_prediction(filename)



The predicted class is: siren 

air_conditioner 		 :  0.00000226412885240279138088226318
car_horn 		 :  0.00024764781119301915168762207031
children_playing 		 :  0.00059285259339958429336547851562
dog_bark 		 :  0.05433553457260131835937500000000
drilling 		 :  0.00010718799603637307882308959961
engine_idling 		 :  0.22807702422142028808593750000000
gun_shot 		 :  0.00064293330069631338119506835938
jackhammer 		 :  0.00010047126852441579103469848633
siren 		 :  0.70871073007583618164062500000000
street_music 		 :  0.00718332361429929733276367187500


