In [1]:
# librosa is a python package for music and audio analysis. It provides the building blocks necessary to create
# music information retrieval systems.
!pip install librosa



In [2]:
#### Extracting MFCCs for every audio file
import pandas as pd
import os
import librosa

# Root folder of the audio clips; the clips are read from 'fold<N>/' sub-folders beneath it.
audio_dataset_path='UrbanSound8K/audio/'
# Metadata table: one row per clip, including its file name, fold and class label.
metadata=pd.read_csv('UrbanSound8K/metadata/UrbanSound8K.csv')
# Preview the first rows.
metadata.head()

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class
0,100032-3-0-0.wav,100032,0.0,0.317551,1,5,3,dog_bark
1,100263-2-0-117.wav,100263,58.5,62.5,1,5,2,children_playing
2,100263-2-0-121.wav,100263,60.5,64.5,1,5,2,children_playing
3,100263-2-0-126.wav,100263,63.0,67.0,1,5,2,children_playing
4,100263-2-0-137.wav,100263,68.5,72.5,1,5,2,children_playing


In [3]:
### Check whether the dataset is imbalanced or not
metadata['class'].value_counts()
# Seven classes have 1000 clips each, but siren (929), car_horn (429) and gun_shot (374)
# have fewer, so the data is only roughly balanced.

street_music        1000
children_playing    1000
engine_idling       1000
jackhammer          1000
dog_bark            1000
air_conditioner     1000
drilling            1000
siren                929
car_horn             429
gun_shot             374
Name: class, dtype: int64

In [4]:
def features_extractor(file, n_mfcc=40):
    """Return the time-averaged MFCC feature vector of a single audio file.

    Parameters
    ----------
    file : str or path-like
        Path of the audio file to load.
    n_mfcc : int, optional
        Number of MFCC coefficients to compute. Defaults to 40.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``n_mfcc``: every coefficient averaged over all frames.
    """
    # Load the file named by the `file` argument, not a notebook-level global,
    # so the result depends only on what the caller passes in.
    audio, sample_rate = librosa.load(file, res_type='kaiser_fast')  # audio samples and sample rate
    mfccs_features = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=n_mfcc)
    # Transpose so frames are on axis 0, then average over them to get one
    # fixed-length vector per clip regardless of clip duration.
    mfccs_scaled_features = np.mean(mfccs_features.T, axis=0)

    return mfccs_scaled_features
# This extracts the features of a single audio file; to process many files, call it in a loop.

In [5]:
import numpy as np
from tqdm import tqdm
### tqdm is a Python library for creating progress meters (progress bars). Its name comes from the
### Arabic word taqaddum, which means ‘progress’. It shows a progress meter below the cell while it runs.
### Now we iterate through every audio file and extract features
### using Mel-Frequency Cepstral Coefficients
extracted_features = []
# Resolve the dataset root once instead of on every iteration.
audio_root = os.path.abspath(audio_dataset_path)
# iterrows() yields (index, row) pairs but has no length, so pass the row count
# explicitly; without it tqdm can only show a running counter, not a bar or ETA.
for index_num, row in tqdm(metadata.iterrows(), total=len(metadata)):
    # Clips are stored as <root>/fold<N>/<slice_file_name>.
    file_name = os.path.join(audio_root, 'fold' + str(row["fold"]), str(row["slice_file_name"]))
    final_class_labels = row["class"]
    data = features_extractor(file_name)
    # Keep the feature vector together with its class label.
    extracted_features.append([data, final_class_labels])

8732it [18:40,  7.79it/s]


In [6]:
### converting extracted_features to Pandas dataframe
# Each entry of extracted_features is a [feature_vector, class_label] pair,
# which becomes one row with the columns 'feature' and 'class'.
extracted_features_df=pd.DataFrame(extracted_features,columns=['feature','class'])
extracted_features_df.head()

Unnamed: 0,feature,class
0,"[-215.79301, 71.66612, -131.81377, -52.09133, ...",dog_bark
1,"[-424.68677, 110.56227, -54.148235, 62.01074, ...",children_playing
2,"[-459.56467, 122.800354, -47.92471, 53.265697,...",children_playing
3,"[-414.55377, 102.896904, -36.66495, 54.18041, ...",children_playing
4,"[-447.397, 115.0954, -53.809113, 61.60859, 1.6...",children_playing


In [7]:
### Split the dataset into independent and dependent dataset
# Pull the two columns out first, then turn each into a NumPy array.
feature_column = extracted_features_df['feature']
label_column = extracted_features_df['class']

# X: one row of MFCC features per clip; y: the matching class names.
X = np.array(feature_column.tolist())
y = np.array(label_column.tolist())

In [8]:
# One row per audio clip and one column per MFCC coefficient (n_mfcc=40).
X.shape

(8732, 40)

In [9]:
# At this point y still holds the class names as strings.
y

array(['dog_bark', 'children_playing', 'children_playing', ...,
       'car_horn', 'car_horn', 'car_horn'], dtype='<U16')

In [10]:
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
# Read the class names from the DataFrame rather than from `y`, which this cell
# overwrites; that way re-running the cell still encodes the string labels
# instead of trying to encode the one-hot matrix.
class_names = np.array(extracted_features_df['class'].tolist())
# Map the class names to integer codes, then one-hot encode those codes.
y = to_categorical(labelencoder.fit_transform(class_names))

In [11]:
# y is now the one-hot encoded label matrix.
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]], dtype=float32)

In [12]:
# One row per clip and one column per class.
y.shape

(8732, 10)

In [13]:
### Train Test Split
from sklearn.model_selection import train_test_split
# Hold out 20% of the clips for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [14]:
# Inspect the training features (NumPy abbreviates the display of large arrays).
X_train

array([[-1.3183614e+02,  1.1397464e+02, -2.3956861e+01, ...,
         3.3314774e+00, -1.4786110e+00,  2.8736601e+00],
       [-1.4074220e+01,  9.1916939e+01, -8.6787214e+00, ...,
        -3.3844023e+00, -5.2119045e+00, -1.5936139e+00],
       [-4.9532028e+01,  1.5521857e-01, -2.0369110e+01, ...,
         2.0491767e+00, -8.0537474e-01,  2.7793028e+00],
       ...,
       [-4.2699332e+02,  9.2890656e+01,  3.0233388e+00, ...,
         8.6335957e-01,  6.4766806e-01,  7.8490508e-01],
       [-1.4607024e+02,  1.3709459e+02, -3.4298344e+01, ...,
         1.3777871e+00, -1.9530845e+00, -8.9652127e-01],
       [-4.2167450e+02,  2.1169032e+02,  2.6820304e+00, ...,
        -5.1484952e+00, -3.6400862e+00, -1.3321609e+00]], dtype=float32)

In [15]:
# Size of the training portion (the split above used test_size=0.2).
X_train.shape

(6985, 40)

In [16]:
# Size of the held-out test portion.
X_test.shape


(1747, 40)

# Model Creation

In [17]:
import tensorflow as tf
# Record the TensorFlow version this notebook was run with.
print(tf.__version__)

2.2.0


In [18]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout,Activation,Flatten
from tensorflow.keras.optimizers import Adam
from sklearn import metrics

In [19]:
### Number of classes
### y is one-hot encoded, so its column count is the number of classes
num_labels=y.shape[1]
num_labels
# We have 10 classes, i.e. 10 output units.

10

In [20]:
# Three hidden blocks of Dense -> ReLU -> Dropout(0.5), with 100, 200 and 100 units.
hidden_units = (100, 200, 100)

model = Sequential()
for position, units in enumerate(hidden_units):
    if position == 0:
        # Only the first layer declares the input size: the training data has 40 features.
        dense_layer = Dense(units, input_shape=(40,))
    else:
        dense_layer = Dense(units)
    model.add(dense_layer)
    model.add(Activation('relu'))
    model.add(Dropout(0.5))

### final layer: one unit per class, softmax to produce class probabilities
model.add(Dense(num_labels))
model.add(Activation('softmax'))

In [21]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 100)               4100      
_________________________________________________________________
activation (Activation)      (None, 100)               0         
_________________________________________________________________
dropout (Dropout)            (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 200)               20200     
_________________________________________________________________
activation_1 (Activation)    (None, 200)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 100)               2

In [22]:
# categorical_crossentropy goes with the one-hot labels built by to_categorical.
# The optimizer is selected by name ('adam'); the Adam class imported earlier is not used here.
model.compile(loss='categorical_crossentropy',metrics=['accuracy'],optimizer='adam')

In [23]:
## Training my model
from tensorflow.keras.callbacks import ModelCheckpoint
from datetime import datetime 

num_epochs = 200
num_batch_size = 32

checkpoint_path = 'saved_models/audio_classification.hdf5'
# Make sure the target folder exists so that saving the checkpoint does not
# depend on the directory having been created by hand beforehand.
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

# save_best_only=True: the file is rewritten only when the monitored value
# (val_loss, as shown in the training log) improves.
checkpointer = ModelCheckpoint(filepath=checkpoint_path,
                               verbose=1, save_best_only=True)
start = datetime.now()

# NOTE(review): the test split doubles as the validation set here, so the
# checkpoint selection is influenced by the test data — consider a separate
# validation split if an unbiased test score is needed.
model.fit(X_train, y_train, batch_size=num_batch_size, epochs=num_epochs, validation_data=(X_test, y_test), callbacks=[checkpointer], verbose=1)


duration = datetime.now() - start
print("Training completed in time: ", duration)

Epoch 1/200
Epoch 00001: val_loss improved from inf to 2.29048, saving model to saved_models/audio_classification.hdf5
Epoch 2/200
Epoch 00002: val_loss improved from 2.29048 to 2.28175, saving model to saved_models/audio_classification.hdf5
Epoch 3/200
Epoch 00003: val_loss improved from 2.28175 to 2.26719, saving model to saved_models/audio_classification.hdf5
Epoch 4/200
Epoch 00004: val_loss improved from 2.26719 to 2.20206, saving model to saved_models/audio_classification.hdf5
Epoch 5/200
Epoch 00005: val_loss improved from 2.20206 to 2.14162, saving model to saved_models/audio_classification.hdf5
Epoch 6/200
Epoch 00006: val_loss improved from 2.14162 to 2.11990, saving model to saved_models/audio_classification.hdf5
Epoch 7/200
Epoch 00007: val_loss improved from 2.11990 to 2.01588, saving model to saved_models/audio_classification.hdf5
Epoch 8/200
Epoch 00008: val_loss improved from 2.01588 to 1.97850, saving model to saved_models/audio_classification.hdf5
Epoch 9/200
Epoch 00

Epoch 00025: val_loss improved from 1.32004 to 1.29171, saving model to saved_models/audio_classification.hdf5
Epoch 26/200
Epoch 00026: val_loss improved from 1.29171 to 1.26222, saving model to saved_models/audio_classification.hdf5
Epoch 27/200
Epoch 00027: val_loss improved from 1.26222 to 1.25887, saving model to saved_models/audio_classification.hdf5
Epoch 28/200
Epoch 00028: val_loss improved from 1.25887 to 1.21282, saving model to saved_models/audio_classification.hdf5
Epoch 29/200
Epoch 00029: val_loss did not improve from 1.21282
Epoch 30/200
Epoch 00030: val_loss did not improve from 1.21282
Epoch 31/200
Epoch 00031: val_loss improved from 1.21282 to 1.17976, saving model to saved_models/audio_classification.hdf5
Epoch 32/200
Epoch 00032: val_loss improved from 1.17976 to 1.16122, saving model to saved_models/audio_classification.hdf5
Epoch 33/200
Epoch 00033: val_loss improved from 1.16122 to 1.12648, saving model to saved_models/audio_classification.hdf5
Epoch 34/200
Epoc

Epoch 51/200
Epoch 00051: val_loss did not improve from 0.94591
Epoch 52/200
Epoch 00052: val_loss did not improve from 0.94591
Epoch 53/200
Epoch 00053: val_loss improved from 0.94591 to 0.94549, saving model to saved_models/audio_classification.hdf5
Epoch 54/200
Epoch 00054: val_loss did not improve from 0.94549
Epoch 55/200
Epoch 00055: val_loss improved from 0.94549 to 0.93682, saving model to saved_models/audio_classification.hdf5
Epoch 56/200
Epoch 00056: val_loss improved from 0.93682 to 0.92903, saving model to saved_models/audio_classification.hdf5
Epoch 57/200
Epoch 00057: val_loss improved from 0.92903 to 0.90582, saving model to saved_models/audio_classification.hdf5
Epoch 58/200
Epoch 00058: val_loss did not improve from 0.90582
Epoch 59/200
Epoch 00059: val_loss improved from 0.90582 to 0.89305, saving model to saved_models/audio_classification.hdf5
Epoch 60/200
Epoch 00060: val_loss improved from 0.89305 to 0.88185, saving model to saved_models/audio_classification.hdf5


Epoch 78/200
Epoch 00078: val_loss did not improve from 0.82863
Epoch 79/200
Epoch 00079: val_loss improved from 0.82863 to 0.82008, saving model to saved_models/audio_classification.hdf5
Epoch 80/200
Epoch 00080: val_loss did not improve from 0.82008
Epoch 81/200
Epoch 00081: val_loss did not improve from 0.82008
Epoch 82/200
Epoch 00082: val_loss improved from 0.82008 to 0.80375, saving model to saved_models/audio_classification.hdf5
Epoch 83/200
Epoch 00083: val_loss did not improve from 0.80375
Epoch 84/200
Epoch 00084: val_loss did not improve from 0.80375
Epoch 85/200
Epoch 00085: val_loss did not improve from 0.80375
Epoch 86/200
Epoch 00086: val_loss did not improve from 0.80375
Epoch 87/200
Epoch 00087: val_loss did not improve from 0.80375
Epoch 88/200
Epoch 00088: val_loss did not improve from 0.80375
Epoch 89/200
Epoch 00089: val_loss improved from 0.80375 to 0.79590, saving model to saved_models/audio_classification.hdf5
Epoch 90/200
Epoch 00090: val_loss did not improve f

Epoch 107/200
Epoch 00107: val_loss did not improve from 0.77403
Epoch 108/200
Epoch 00108: val_loss improved from 0.77403 to 0.76726, saving model to saved_models/audio_classification.hdf5
Epoch 109/200
Epoch 00109: val_loss did not improve from 0.76726
Epoch 110/200
Epoch 00110: val_loss did not improve from 0.76726
Epoch 111/200
Epoch 00111: val_loss did not improve from 0.76726
Epoch 112/200
Epoch 00112: val_loss did not improve from 0.76726
Epoch 113/200
Epoch 00113: val_loss did not improve from 0.76726
Epoch 114/200
Epoch 00114: val_loss improved from 0.76726 to 0.75959, saving model to saved_models/audio_classification.hdf5
Epoch 115/200
Epoch 00115: val_loss did not improve from 0.75959
Epoch 116/200
Epoch 00116: val_loss did not improve from 0.75959
Epoch 117/200
Epoch 00117: val_loss did not improve from 0.75959
Epoch 118/200
Epoch 00118: val_loss did not improve from 0.75959
Epoch 119/200
Epoch 00119: val_loss did not improve from 0.75959
Epoch 120/200
Epoch 00120: val_loss

Epoch 00135: val_loss improved from 0.73796 to 0.73648, saving model to saved_models/audio_classification.hdf5
Epoch 136/200
Epoch 00136: val_loss did not improve from 0.73648
Epoch 137/200
Epoch 00137: val_loss did not improve from 0.73648
Epoch 138/200
Epoch 00138: val_loss improved from 0.73648 to 0.73109, saving model to saved_models/audio_classification.hdf5
Epoch 139/200
Epoch 00139: val_loss did not improve from 0.73109
Epoch 140/200
Epoch 00140: val_loss improved from 0.73109 to 0.72363, saving model to saved_models/audio_classification.hdf5
Epoch 141/200
Epoch 00141: val_loss did not improve from 0.72363
Epoch 142/200
Epoch 00142: val_loss did not improve from 0.72363
Epoch 143/200
Epoch 00143: val_loss did not improve from 0.72363
Epoch 144/200
Epoch 00144: val_loss improved from 0.72363 to 0.71856, saving model to saved_models/audio_classification.hdf5
Epoch 145/200
Epoch 00145: val_loss did not improve from 0.71856
Epoch 146/200
Epoch 00146: val_loss did not improve from 0.

Epoch 163/200
Epoch 00163: val_loss did not improve from 0.70097
Epoch 164/200
Epoch 00164: val_loss did not improve from 0.70097
Epoch 165/200
Epoch 00165: val_loss did not improve from 0.70097
Epoch 166/200
Epoch 00166: val_loss did not improve from 0.70097
Epoch 167/200
Epoch 00167: val_loss did not improve from 0.70097
Epoch 168/200
Epoch 00168: val_loss did not improve from 0.70097
Epoch 169/200
Epoch 00169: val_loss improved from 0.70097 to 0.69996, saving model to saved_models/audio_classification.hdf5
Epoch 170/200
Epoch 00170: val_loss did not improve from 0.69996
Epoch 171/200
Epoch 00171: val_loss did not improve from 0.69996
Epoch 172/200
Epoch 00172: val_loss did not improve from 0.69996
Epoch 173/200
Epoch 00173: val_loss did not improve from 0.69996
Epoch 174/200
Epoch 00174: val_loss did not improve from 0.69996
Epoch 175/200
Epoch 00175: val_loss did not improve from 0.69996
Epoch 176/200
Epoch 00176: val_loss did not improve from 0.69996
Epoch 177/200
Epoch 00177: val

Epoch 192/200
Epoch 00192: val_loss did not improve from 0.68712
Epoch 193/200
Epoch 00193: val_loss did not improve from 0.68712
Epoch 194/200
Epoch 00194: val_loss improved from 0.68712 to 0.68285, saving model to saved_models/audio_classification.hdf5
Epoch 195/200
Epoch 00195: val_loss did not improve from 0.68285
Epoch 196/200
Epoch 00196: val_loss did not improve from 0.68285
Epoch 197/200
Epoch 00197: val_loss did not improve from 0.68285
Epoch 198/200
Epoch 00198: val_loss did not improve from 0.68285
Epoch 199/200
Epoch 00199: val_loss improved from 0.68285 to 0.68026, saving model to saved_models/audio_classification.hdf5
Epoch 200/200
Epoch 00200: val_loss did not improve from 0.68026
Training completed in time:  0:02:25.857856


In [24]:
# The model was compiled with metrics=['accuracy'], so evaluate() returns the loss
# followed by the accuracy; despite its name, test_accuracy holds both values.
# This scores the in-memory model as left by fit(); no saved checkpoint is
# reloaded in the cells visible here.
test_accuracy=model.evaluate(X_test,y_test,verbose=0)
print(test_accuracy[1])  # index 1 is the accuracy

0.7939324378967285


# Testing Some Test Audio Data

Steps

Preprocess the new audio data

predict the classes

Inverse-transform the predicted label

In [25]:
filename="UrbanSound8K/dog_bark.wav"
# Apply the same preprocessing as for the training data:
# load the clip, compute 40 MFCCs and average them over time.
audio, sample_rate = librosa.load(filename, res_type='kaiser_fast')
mfccs_features = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
mfccs_scaled_features = np.mean(mfccs_features.T,axis=0)

print(mfccs_scaled_features)
# The model expects a batch, so reshape the single vector to shape (1, n_features).
mfccs_scaled_features=mfccs_scaled_features.reshape(1,-1)
print(mfccs_scaled_features)
print(mfccs_scaled_features.shape)
# Take the index of the highest softmax probability. This replaces
# Sequential.predict_classes, which is deprecated and absent from newer
# TensorFlow releases, and yields the same class index.
predicted_label = np.argmax(model.predict(mfccs_scaled_features), axis=-1)
print(predicted_label)  # an integer class index; it does not yet tell us the class name
# inverse_transform maps the integer index back to the class name.
prediction_class = labelencoder.inverse_transform(predicted_label)
prediction_class


[-4.01122711e+02  1.53056015e+02  9.15836525e+00 -1.04600391e+01
 -1.64918995e+01  3.58699512e+00 -1.36686192e+01 -8.68370914e+00
 -4.45985365e+00 -7.58018064e+00 -8.65039289e-01 -2.98663425e+00
  4.75616646e+00  8.72414207e+00  9.36257172e+00  1.46945305e+01
  5.44877768e+00 -8.84514973e-02  7.87338853e-01 -3.53762913e+00
  7.84663390e-03 -2.13275695e+00 -4.25562906e+00 -4.67474490e-01
 -4.37626481e-01  2.55075169e+00  2.95378089e+00  3.44757819e+00
  4.33649969e+00  3.61894798e+00  3.66083050e+00  5.44085860e-01
  1.18284881e+00  1.53016174e+00  8.98930550e-01 -5.55400014e-01
 -2.51318526e+00 -1.23046386e+00 -1.45281053e+00  4.28475708e-01]
[[-4.01122711e+02  1.53056015e+02  9.15836525e+00 -1.04600391e+01
  -1.64918995e+01  3.58699512e+00 -1.36686192e+01 -8.68370914e+00
  -4.45985365e+00 -7.58018064e+00 -8.65039289e-01 -2.98663425e+00
   4.75616646e+00  8.72414207e+00  9.36257172e+00  1.46945305e+01
   5.44877768e+00 -8.84514973e-02  7.87338853e-01 -3.53762913e+00
   7.84663390e-03 -

array(['dog_bark'], dtype='<U16')