In [1]:
# retrieve the preprocessed data from previous notebook
# `%store -r <name>` is the IPython magic that restores a variable that was
# previously persisted with `%store <name>`.
# NOTE(review): cell In [4] below reassigns x_train, x_test, y_train, y_test,
# yy and le, so the values restored here are overwritten before the model uses
# them -- confirm whether this cell is still needed.

%store -r x_train 
%store -r x_test 
%store -r y_train 
%store -r y_test 
%store -r yy 
%store -r le

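CNNs require all inputs to have the same fixed size. Because the audio clips differ in length, their MFCC matrices have different numbers of frames, so we zero-pad each matrix along the time axis until they all share the same width (`max_pad_len`).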

In [2]:
import numpy as np
max_pad_len = 174

def extract_features(file_name, max_len=None):
    """Load an audio file and return its MFCCs with a fixed number of frames.

    The clip is loaded with librosa, 40 MFCC coefficients are computed per
    frame, and the time axis (axis 1) is then forced to exactly ``max_len``
    columns: shorter clips are zero-padded on the right, longer clips are
    truncated. (Previously a clip longer than the target produced a negative
    pad width, which made ``np.pad`` raise and the file was reported as
    unparseable.)

    Args:
        file_name: Path of the audio file, passed to ``librosa.load``.
        max_len: Target number of frames. Defaults to the notebook-level
            ``max_pad_len`` when omitted.

    Returns:
        A 2-D array with ``max_len`` columns, or ``None`` if the file could
        not be loaded or its MFCCs could not be computed.
    """
    target_len = max_pad_len if max_len is None else max_len

    # Only the librosa calls are best-effort; padding errors should surface.
    try:
        audio, sample_rate = librosa.load(file_name, res_type='kaiser_fast')
        mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
    except Exception as e:
        # Keep the notebook running, but say *why* the file was rejected.
        print("Error encountered while parsing file: ", file_name, "-", repr(e))
        return None

    n_frames = mfccs.shape[1]
    if n_frames >= target_len:
        # Too long (or exact): keep the first target_len frames.
        return mfccs[:, :target_len]

    # Too short: zero-pad on the right of the time axis only.
    missing = target_len - n_frames
    return np.pad(mfccs, pad_width=((0, 0), (0, missing)), mode='constant')

In [3]:
# Load various imports 
import pandas as pd
import os
import librosa

# Set the path to the full UrbanSound dataset 
fulldatasetpath = '../UrbanSound Dataset sample'

# The metadata CSV lives in the dataset root; derive its path from the root
# so the location is only configured once.
metadata = pd.read_csv(os.path.join(fulldatasetpath, 'UrbanSound8K.csv'))

features = []
failed_files = []

# Resolve the dataset root once instead of on every iteration
dataset_root = os.path.abspath(fulldatasetpath)

# Iterate through each sound file and extract the features 
for index, row in metadata.iterrows():

    # Audio files are organised as <root>/fold<N>/<slice_file_name>
    file_name = os.path.join(dataset_root, 'fold' + str(row["fold"]), str(row["slice_file_name"]))

    class_label = row["class_name"]
    data = extract_features(file_name)

    # extract_features returns None on failure. Keeping such rows would make
    # the stacked feature array ragged and break the reshape done before
    # training, so skip them and report them below instead.
    if data is None:
        failed_files.append(file_name)
        continue

    features.append([data, class_label])

# Convert into a Panda dataframe 
featuresdf = pd.DataFrame(features, columns=['feature','class_label'])

print('Finished feature extraction from ', len(featuresdf), ' files') 
if failed_files:
    print('Skipped ', len(failed_files), ' files that could not be parsed')

Finished feature extraction from  8732  files


In [4]:
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical

# Imported here (as in the original cell) so the split helper is in scope
from sklearn.model_selection import train_test_split

# Stack the per-file feature matrices and their labels into numpy arrays
X = np.array(featuresdf['feature'].tolist())
y = np.array(featuresdf['class_label'].tolist())

# Map class names to integers with the label encoder, then one-hot encode
le = LabelEncoder()
yy = to_categorical(
    le.fit_transform(y)
)

# Hold out 20% of the samples for testing; the seed is fixed for repeatability
x_train, x_test, y_train, y_test = train_test_split(
    X,
    yy,
    test_size=0.2,
    random_state=42,
)

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [6]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D, Conv2D, MaxPooling2D, GlobalAveragePooling2D
from keras.optimizers import Adam
from keras.utils import np_utils
from sklearn import metrics 

# Input geometry fed to the network: rows x columns x channels
num_rows, num_columns, num_channels = 40, 174, 1
input_dims = (num_rows, num_columns, num_channels)

# Add the trailing channel axis that Conv2D expects
x_train = x_train.reshape((x_train.shape[0],) + input_dims)
x_test = x_test.reshape((x_test.shape[0],) + input_dims)

num_labels = yy.shape[1]
filter_size = 2

# Construct model: four Conv -> MaxPool -> Dropout stages with a doubling
# number of filters, followed by global average pooling and a softmax layer.
model = Sequential()
for stage, n_filters in enumerate((16, 32, 64, 128)):
    # Only the very first layer has to declare the input shape
    extra_kwargs = {'input_shape': input_dims} if stage == 0 else {}
    model.add(Conv2D(filters=n_filters, kernel_size=filter_size, activation='relu', **extra_kwargs))
    model.add(MaxPooling2D(pool_size=2))
    model.add(Dropout(0.2))

model.add(GlobalAveragePooling2D())

# One output unit per class
model.add(Dense(num_labels, activation='softmax'))

Instructions for updating:
Colocations handled automatically by placer.


In [7]:
# Configure training: Adam optimizer, categorical cross-entropy loss,
# and accuracy as the reported metric
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [8]:
# Print the layer-by-layer architecture
model.summary()

# Baseline: score the still-untrained network on the test split.
# evaluate() returns [loss, accuracy] because 'accuracy' is the only metric.
score = model.evaluate(x_test, y_test, verbose=1)
accuracy = score[1] * 100
print("Pre-training accuracy: %.4f%%" % (accuracy,))

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 39, 173, 16)       80        
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 19, 86, 16)        0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 19, 86, 16)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 18, 85, 32)        2080      
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 9, 42, 32)         0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 9, 42, 32)         0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 8, 41, 64)        

In [9]:
from keras.callbacks import ModelCheckpoint 
from datetime import datetime 

# Training hyperparameters.
# Previously tried values, kept for reference: num_epochs = 12, num_batch_size = 128
num_epochs = 72
num_batch_size = 256

# Make sure the checkpoint folder exists before training starts: depending on
# the Keras version, writing the HDF5 checkpoint into a missing directory
# fails at the end of the first improving epoch.
checkpoint_path = 'saved_models/weights.best.basic_cnn.hdf5'
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

# Keep only the weights of the epoch with the best validation loss so far
checkpointer = ModelCheckpoint(filepath=checkpoint_path,
                               verbose=1, save_best_only=True)
start = datetime.now()

# NOTE(review): the test split doubles as validation data here, so checkpoint
# selection sees the test set -- consider a separate validation split.
model.fit(x_train, y_train, batch_size=num_batch_size, epochs=num_epochs, validation_data=(x_test, y_test), callbacks=[checkpointer], verbose=1)


duration = datetime.now() - start
print("Training completed in time: ", duration)

Instructions for updating:
Use tf.cast instead.
Train on 6985 samples, validate on 1747 samples
Epoch 1/72

Epoch 00001: val_loss improved from inf to 2.08014, saving model to saved_models/weights.best.basic_cnn.hdf5
Epoch 2/72

Epoch 00002: val_loss improved from 2.08014 to 1.94956, saving model to saved_models/weights.best.basic_cnn.hdf5
Epoch 3/72

Epoch 00003: val_loss improved from 1.94956 to 1.72106, saving model to saved_models/weights.best.basic_cnn.hdf5
Epoch 4/72

Epoch 00004: val_loss improved from 1.72106 to 1.59165, saving model to saved_models/weights.best.basic_cnn.hdf5
Epoch 5/72

Epoch 00005: val_loss improved from 1.59165 to 1.48346, saving model to saved_models/weights.best.basic_cnn.hdf5
Epoch 6/72

Epoch 00006: val_loss improved from 1.48346 to 1.37276, saving model to saved_models/weights.best.basic_cnn.hdf5
Epoch 7/72

Epoch 00007: val_loss improved from 1.37276 to 1.32341, saving model to saved_models/weights.best.basic_cnn.hdf5
Epoch 8/72

Epoch 00008: val_loss


Epoch 00033: val_loss improved from 0.69010 to 0.64446, saving model to saved_models/weights.best.basic_cnn.hdf5
Epoch 34/72

Epoch 00034: val_loss did not improve from 0.64446
Epoch 35/72

Epoch 00035: val_loss did not improve from 0.64446
Epoch 36/72

Epoch 00036: val_loss did not improve from 0.64446
Epoch 37/72

Epoch 00037: val_loss did not improve from 0.64446
Epoch 38/72

Epoch 00038: val_loss improved from 0.64446 to 0.58752, saving model to saved_models/weights.best.basic_cnn.hdf5
Epoch 39/72

Epoch 00039: val_loss did not improve from 0.58752
Epoch 40/72

Epoch 00040: val_loss improved from 0.58752 to 0.53986, saving model to saved_models/weights.best.basic_cnn.hdf5
Epoch 41/72

Epoch 00041: val_loss did not improve from 0.53986
Epoch 42/72

Epoch 00042: val_loss did not improve from 0.53986
Epoch 43/72

Epoch 00043: val_loss did not improve from 0.53986
Epoch 44/72

Epoch 00044: val_loss did not improve from 0.53986
Epoch 45/72

Epoch 00045: val_loss did not improve from 0.


Epoch 00072: val_loss improved from 0.41470 to 0.39369, saving model to saved_models/weights.best.basic_cnn.hdf5
Training completed in time:  0:17:04.089576


In [10]:
# Report the trained model's accuracy on both splits
# (index 1 of the evaluate() result is the accuracy metric)
for split_label, split_x, split_y in (
    ("Training Accuracy: ", x_train, y_train),
    ("Testing Accuracy: ", x_test, y_test),
):
    score = model.evaluate(split_x, split_y, verbose=0)
    print(split_label, score[1])

Training Accuracy:  0.9324266314506531
Testing Accuracy:  0.8712077736854553


### Predictions

In [11]:
def print_prediction(file_name):
    """Classify one audio file and print the per-class probabilities.

    Features are extracted with ``extract_features``, reshaped into a batch
    of one sample, and passed through ``model`` once. The most likely class
    is printed first, followed by the probability of every class in label
    encoder order.

    Args:
        file_name: Path of the audio file to classify.

    Returns:
        None. Results are printed. If feature extraction fails, a message is
        printed and nothing is predicted.
    """
    prediction_feature = extract_features(file_name)
    if prediction_feature is None:
        # extract_features already reported the underlying error
        print("Could not extract features from:", file_name)
        return

    prediction_feature = prediction_feature.reshape(1, num_rows, num_columns, num_channels)

    # A single forward pass gives the softmax probabilities; the predicted
    # class is their argmax. This replaces Sequential.predict_classes /
    # predict_proba, which newer Keras releases no longer provide.
    predicted_proba = model.predict(prediction_feature)[0]
    predicted_index = int(np.argmax(predicted_proba))

    # Decode all class indices at once instead of one call per class
    class_names = le.inverse_transform(np.arange(len(predicted_proba)))

    print("The predicted class is:", class_names[predicted_index], '\n')

    for category, probability in zip(class_names, predicted_proba):
        print(category, "\t\t : ", format(probability, '.32f'))

In [12]:
# Class: Air Conditioner

filename = '../UrbanSound Dataset sample/audio/100852-0-0-0.wav' 
print_prediction(filename) 

The predicted class is: air_conditioner 

air_conditioner 		 :  0.99563133716583251953125000000000
car_horn 		 :  0.00020358270558062940835952758789
children_playing 		 :  0.00022027218074072152376174926758
dog_bark 		 :  0.00001087700547941494733095169067
drilling 		 :  0.00207596528343856334686279296875
engine_idling 		 :  0.00026502329274080693721771240234
gun_shot 		 :  0.00004613533019437454640865325928
jackhammer 		 :  0.00087470823200419545173645019531
siren 		 :  0.00000142599765240447595715522766
street_music 		 :  0.00067064189352095127105712890625


In [13]:
# Class: Drilling

filename = '../UrbanSound Dataset sample/audio/103199-4-0-0.wav'
print_prediction(filename) 

The predicted class is: drilling 

air_conditioner 		 :  0.00000042938685851368063595145941
car_horn 		 :  0.00004225774682709015905857086182
children_playing 		 :  0.00000014005190962507185759022832
dog_bark 		 :  0.00000000305991920690473762078909
drilling 		 :  0.99971979856491088867187500000000
engine_idling 		 :  0.00000000353655948970299505162984
gun_shot 		 :  0.00000000008939533141516520231562
jackhammer 		 :  0.00001107652587961638346314430237
siren 		 :  0.00000035053454894296010024845600
street_music 		 :  0.00022583712416235357522964477539


In [14]:
# Class: Street music 

filename = '../UrbanSound Dataset sample/audio/101848-9-0-0.wav'
print_prediction(filename) 

The predicted class is: street_music 

air_conditioner 		 :  0.00833441503345966339111328125000
car_horn 		 :  0.00465157674625515937805175781250
children_playing 		 :  0.04517330974340438842773437500000
dog_bark 		 :  0.00047527611604891717433929443359
drilling 		 :  0.00010076705802930518984794616699
engine_idling 		 :  0.00000989730506262276321649551392
gun_shot 		 :  0.00000000005706900924051794277148
jackhammer 		 :  0.00000185486965165182482451200485
siren 		 :  0.00284131290391087532043457031250
street_music 		 :  0.93841165304183959960937500000000


In [15]:
# Class: Car Horn 

filename = '../UrbanSound Dataset sample/audio/100648-1-0-0.wav'
print_prediction(filename) 

The predicted class is: drilling 

air_conditioner 		 :  0.00284333573654294013977050781250
car_horn 		 :  0.20432898402214050292968750000000
children_playing 		 :  0.00917182397097349166870117187500
dog_bark 		 :  0.15067791938781738281250000000000
drilling 		 :  0.23780646920204162597656250000000
engine_idling 		 :  0.01807190477848052978515625000000
gun_shot 		 :  0.16262696683406829833984375000000
jackhammer 		 :  0.18306092917919158935546875000000
siren 		 :  0.02403151430189609527587890625000
street_music 		 :  0.00738012418150901794433593750000


In [16]:
filename = '../Evaluation audio/dog_bark_1.wav'
print_prediction(filename) 

The predicted class is: dog_bark 

air_conditioner 		 :  0.00006136990123195573687553405762
car_horn 		 :  0.00024994436535052955150604248047
children_playing 		 :  0.00098140211775898933410644531250
dog_bark 		 :  0.99002712965011596679687500000000
drilling 		 :  0.00452040135860443115234375000000
engine_idling 		 :  0.00012656371109187602996826171875
gun_shot 		 :  0.00106266024522483348846435546875
jackhammer 		 :  0.00000228182352657313458621501923
siren 		 :  0.00006977697921684011816978454590
street_music 		 :  0.00289859389886260032653808593750


In [17]:
filename = '../Evaluation audio/drilling_1.wav'

print_prediction(filename) 

The predicted class is: jackhammer 

air_conditioner 		 :  0.02930284291505813598632812500000
car_horn 		 :  0.01064816582947969436645507812500
children_playing 		 :  0.00097187329083681106567382812500
dog_bark 		 :  0.00353058939799666404724121093750
drilling 		 :  0.00866227783262729644775390625000
engine_idling 		 :  0.00768750114366412162780761718750
gun_shot 		 :  0.00006228521669982001185417175293
jackhammer 		 :  0.93893104791641235351562500000000
siren 		 :  0.00011614457616815343499183654785
street_music 		 :  0.00008724664803594350814819335938


In [18]:
filename = '../Evaluation audio/gun_shot_1.wav'

print_prediction(filename) 

The predicted class is: gun_shot 

air_conditioner 		 :  0.00000002894428874355980951804668
car_horn 		 :  0.00007518471829826012253761291504
children_playing 		 :  0.00016089688870124518871307373047
dog_bark 		 :  0.00235235667787492275238037109375
drilling 		 :  0.03010236844420433044433593750000
engine_idling 		 :  0.00256270007230341434478759765625
gun_shot 		 :  0.96351563930511474609375000000000
jackhammer 		 :  0.00000104830974123615305870771408
siren 		 :  0.00107238197233527898788452148438
street_music 		 :  0.00015743463882245123386383056641
