In [4]:
import pandas as pd
import librosa 
import os

audio_dataset_path = 'UrbanSound8K/audio/'
metadata = pd.read_csv('UrbanSound8K/metadata/UrbanSound8K.csv')
metadata.head()

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class
0,100032-3-0-0.wav,100032,0.0,0.317551,1,5,3,dog_bark
1,100263-2-0-117.wav,100263,58.5,62.5,1,5,2,children_playing
2,100263-2-0-121.wav,100263,60.5,64.5,1,5,2,children_playing
3,100263-2-0-126.wav,100263,63.0,67.0,1,5,2,children_playing
4,100263-2-0-137.wav,100263,68.5,72.5,1,5,2,children_playing


In [5]:
def feature_extractor(file):
    audio,sample_rate = librosa.load(file_name,res_type='kaiser_fast')
    mfccs_features = librosa.feature.mfcc(y=audio,sr=sample_rate,n_mfcc=40)
    mfccs_scaled_features = np.mean(mfccs_features.T,axis=0)
    return mfccs_scaled_features

In [6]:
import numpy as np
from tqdm import tqdm
### Now we iterate through every audio file and extract features 
### using Mel-Frequency Cepstral Coefficients
extracted_features=[]
for index_num,row in tqdm(metadata.iterrows()):
    file_name = os.path.join(os.path.abspath(audio_dataset_path),'fold'+str(row["fold"])+'/',str(row["slice_file_name"]))
    final_class_labels=row["class"]
    data=feature_extractor(file_name)
    extracted_features.append([data,final_class_labels])

8732it [08:07, 17.89it/s]


In [7]:
### converting extracted_features to Pandas dataframe
extracted_features_df=pd.DataFrame(extracted_features,columns=['feature','class'])
extracted_features_df.head()

Unnamed: 0,feature,class
0,"[-217.35526, 70.22339, -130.38527, -53.282898,...",dog_bark
1,"[-424.09818, 109.34077, -52.919525, 60.86475, ...",children_playing
2,"[-458.79114, 121.38419, -46.520657, 52.00812, ...",children_playing
3,"[-413.89984, 101.66371, -35.42945, 53.036358, ...",children_playing
4,"[-446.60352, 113.68541, -52.402218, 60.302044,...",children_playing


In [61]:
## split the dataset into dependent and independent features
X = np.array(extracted_features_df['feature'].tolist())
y=np.array(extracted_features_df['class'].tolist())

In [40]:
import numpy as np
y=np.array(pd.get_dummies(y))


In [62]:
y

array(['dog_bark', 'children_playing', 'children_playing', ...,
       'car_horn', 'car_horn', 'car_horn'], dtype='<U16')

In [53]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)
X_train

array([[-1.31104706e+02,  1.12505905e+02, -2.25746956e+01, ...,
         3.24665213e+00, -1.36902368e+00,  2.75575471e+00],
       [-1.36703424e+01,  9.10850830e+01, -7.79273319e+00, ...,
        -3.25305033e+00, -5.27745295e+00, -1.55697179e+00],
       [-4.98715439e+01,  2.65352994e-01, -2.05009365e+01, ...,
         2.85459447e+00, -1.60920441e+00,  3.52480578e+00],
       ...,
       [-4.27012360e+02,  9.26230469e+01,  3.12939739e+00, ...,
         7.42641211e-01,  7.33490884e-01,  7.11009145e-01],
       [-1.45754608e+02,  1.36265778e+02, -3.35155182e+01, ...,
         1.46811938e+00, -2.00917006e+00, -8.82181883e-01],
       [-4.21031342e+02,  2.10654541e+02,  3.49066091e+00, ...,
        -5.38886738e+00, -3.37136054e+00, -1.56651139e+00]], dtype=float32)

In [55]:
y_test

array(['engine_idling', 'car_horn', 'drilling', ..., 'car_horn',
       'children_playing', 'children_playing'], dtype='<U16')

MODEL CREATION

In [1]:
import tensorflow as tf

In [2]:
print(tf.__version__)

2.17.0


In [3]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout,Flatten,Activation
from tensorflow.keras.optimizers import Adam
from sklearn import metrics

In [12]:
## number of classes
num_labels = y.shape[1]

Dense

In [14]:
model = Sequential()
##first layer
model.add(Dense(100,input_shape=(40,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
## second layer
model.add(Dense(200))
model.add(Activation('relu'))
model.add(Dropout(0.5))
## third layer
model.add(Dense(100))
model.add(Activation('relu'))
model.add(Dropout(0.5))

##final layer
model.add(Dense(num_labels))
model.add(Activation('softmax'))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [15]:
model.summary()

In [16]:
model.compile(loss='categorical_crossentropy',metrics=['accuracy'],optimizer='adam')

In [17]:
## Trianing my model
from tensorflow.keras.callbacks import ModelCheckpoint


In [20]:
## Trianing my model

from datetime import datetime 

num_epochs = 100
num_batch_size = 32

checkpointer = ModelCheckpoint(filepath='saved_models/audio_classification.keras', 
                               verbose=1, save_best_only=True)
start = datetime.now()

model.fit(X_train, y_train, batch_size=num_batch_size, epochs=num_epochs, validation_data=(X_test, y_test), callbacks=[checkpointer], verbose=1)


duration = datetime.now() - start
print("Training completed in time: ", duration)

Epoch 1/100
[1m192/219[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m0s[0m 1ms/step - accuracy: 0.6699 - loss: 0.9692
Epoch 1: val_loss improved from inf to 0.76545, saving model to saved_models/audio_classification.keras
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6716 - loss: 0.9659 - val_accuracy: 0.7573 - val_loss: 0.7654
Epoch 2/100
[1m211/219[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 1ms/step - accuracy: 0.6830 - loss: 0.9514
Epoch 2: val_loss did not improve from 0.76545
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6829 - loss: 0.9518 - val_accuracy: 0.7533 - val_loss: 0.7786
Epoch 3/100
[1m209/219[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 1ms/step - accuracy: 0.6806 - loss: 0.9272
Epoch 3: val_loss did not improve from 0.76545
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6806 - loss: 0.9275 - val_accuracy: 0.7619 - val

In [27]:
test_accuracy = model.evaluate(X_test,y_test,verbose=0)
print(test_accuracy[1])

0.7830566763877869


In [24]:
filename='UrbanSound8K/dog_bark.wav'
prediction_feature = feature_extractor(filename)
prediction_feature = prediction_feature.reshape(1,-1)
model.predict(prediction_feature)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step


array([[3.6020724e-06, 7.3892498e-01, 7.7802606e-04, 4.0840674e-03,
        7.7992002e-04, 1.2102521e-03, 2.3591234e-05, 7.3297988e-06,
        1.6397610e-04, 2.5402433e-01]], dtype=float32)

In [22]:
prediction_feature.reshape(1,-1).shape

array([[-315.6028    ],
       [  94.854805  ],
       [ -37.22234   ],
       [  46.778263  ],
       [  -6.7286925 ],
       [  10.012548  ],
       [  -1.6075525 ],
       [  18.51134   ],
       [ -11.9006195 ],
       [   7.594036  ],
       [  -7.8546596 ],
       [  11.362425  ],
       [ -15.617317  ],
       [   3.3019912 ],
       [ -11.958161  ],
       [   6.35349   ],
       [  -5.587026  ],
       [  20.78539   ],
       [  -0.46922812],
       [   6.0436325 ],
       [ -11.619548  ],
       [   2.8686745 ],
       [ -10.176432  ],
       [   8.332485  ],
       [   1.776561  ],
       [   2.5638974 ],
       [ -14.761061  ],
       [   1.1465563 ],
       [   3.783566  ],
       [   3.1094651 ],
       [ -12.185812  ],
       [  -3.0522912 ],
       [   3.7284145 ],
       [   8.962753  ],
       [   0.93064505],
       [   3.1800797 ],
       [   2.4850492 ],
       [   0.6138646 ],
       [ -11.449189  ],
       [  -6.0105853 ]], dtype=float32)

Testing Some Test Audio Data
Steps

Preprocess the new audio data
predict the classes
Invere transform your Predicted Label

In [26]:
# filename="UrbanSound8K/drilling_1.wav"
# audio, sample_rate = librosa.load(filename, res_type='kaiser_fast') 
# mfccs_features = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
# mfccs_scaled_features = np.mean(mfccs_features.T,axis=0)

# print(mfccs_scaled_features)
# mfccs_scaled_features=mfccs_scaled_features.reshape(1,-1)
# print(mfccs_scaled_features)
# print(mfccs_scaled_features.shape)
# predicted_label=model.predict_classes(mfccs_scaled_features)
# print(predicted_label)
# prediction_class = labelencoder.inverse_transform(predicted_label) 
# prediction_class

In [63]:
y

array(['dog_bark', 'children_playing', 'children_playing', ...,
       'car_horn', 'car_horn', 'car_horn'], dtype='<U16')

In [64]:
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
labelencoder=LabelEncoder()
y=to_categorical(labelencoder.fit_transform(y))

In [65]:
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]])

In [59]:
y.shape

(8732, 10)

Testing ANN Model

In [70]:
filename="UrbanSound8K/audio/fold5/205874-4-7-0.wav"
audio, sample_rate = librosa.load(filename, res_type='kaiser_fast') 
mfccs_features = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
mfccs_scaled_features = np.mean(mfccs_features.T,axis=0)

print(mfccs_scaled_features)
mfccs_scaled_features=mfccs_scaled_features.reshape(1,-1)
print(mfccs_scaled_features)
print(mfccs_scaled_features.shape)
# Get the predicted label (which is a 2D array)
predicted_label = model.predict(mfccs_scaled_features)

# Convert the predicted probabilities to class labels (1D array)
predicted_label = np.argmax(predicted_label, axis=1)
print(predicted_label)
prediction_class = labelencoder.inverse_transform(predicted_label) 
prediction_class

[-101.86136      65.11602      13.641455     13.561497    -17.517298
   25.72726     -28.139944     23.225533    -22.598389     20.933466
  -18.300575     15.490882    -12.228727      0.2866461   -12.393409
    3.3978872    -4.0727506     3.521857    -12.590733     13.094791
  -12.366367      8.652108     -8.519329      8.302772     -3.9202414
    6.3012137    -4.7993317     2.9861689    -3.6358995     0.10202715
   -3.4095497    -0.54275364   -5.8550363     0.7684368    -5.3634825
    2.7812693    -3.61809      -0.7319855    -0.71020836   -0.38015357]
[[-101.86136      65.11602      13.641455     13.561497    -17.517298
    25.72726     -28.139944     23.225533    -22.598389     20.933466
   -18.300575     15.490882    -12.228727      0.2866461   -12.393409
     3.3978872    -4.0727506     3.521857    -12.590733     13.094791
   -12.366367      8.652108     -8.519329      8.302772     -3.9202414
     6.3012137    -4.7993317     2.9861689    -3.6358995     0.10202715
    -3.4095497    

array(['jackhammer'], dtype='<U16')