Data Preprocessing

In [9]:
#### Extracting MFCC's For every audio file
import pandas as pd
import os
import librosa

audio_dataset_path='UrbanSound8K/audio/'
metadata=pd.read_csv('UrbanSound8K/metadata/UrbanSound8K.csv')
metadata.head()

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class
0,100032-3-0-0.wav,100032,0.0,0.317551,1,5,3,dog_bark
1,100263-2-0-117.wav,100263,58.5,62.5,1,5,2,children_playing
2,100263-2-0-121.wav,100263,60.5,64.5,1,5,2,children_playing
3,100263-2-0-126.wav,100263,63.0,67.0,1,5,2,children_playing
4,100263-2-0-137.wav,100263,68.5,72.5,1,5,2,children_playing


In [10]:
def features_extractor(file):
    audio, sample_rate = librosa.load(file_name, res_type='kaiser_fast') 
    mfccs_features = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
    mfccs_scaled_features = np.mean(mfccs_features.T,axis=0)
    
    return mfccs_scaled_features
    

In [11]:
import numpy as np
from tqdm import tqdm
### Now we iterate through every audio file and extract features 
### using Mel-Frequency Cepstral Coefficients
extracted_features=[]
for index_num,row in tqdm(metadata.iterrows()):
    file_name = os.path.join(os.path.abspath(audio_dataset_path),'fold'+str(row["fold"])+'/',str(row["slice_file_name"]))
    final_class_labels=row["class"]
    data=features_extractor(file_name)
    extracted_features.append([data,final_class_labels])

8732it [04:35, 31.65it/s]


In [12]:
### converting extracted_features to Pandas dataframe
extracted_features_df=pd.DataFrame(extracted_features,columns=['feature','class'])
extracted_features_df.head()

Unnamed: 0,feature,class
0,"[-217.35526, 70.22338, -130.38527, -53.282898,...",dog_bark
1,"[-424.09818, 109.34077, -52.919525, 60.86475, ...",children_playing
2,"[-458.79114, 121.38419, -46.520657, 52.00812, ...",children_playing
3,"[-413.89984, 101.66373, -35.42945, 53.036358, ...",children_playing
4,"[-446.60352, 113.68541, -52.402206, 60.302044,...",children_playing


In [13]:
### Split the dataset into independent and dependent dataset
X=np.array(extracted_features_df['feature'].tolist())
y=np.array(extracted_features_df['class'].tolist())

In [14]:
X.shape

(8732, 40)

In [47]:
y

array(['dog_bark', 'children_playing', 'children_playing', ...,
       'car_horn', 'car_horn', 'car_horn'], dtype='<U16')

In [49]:
### Label Encoder
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
labelencoder=LabelEncoder()
y=to_categorical(labelencoder.fit_transform(y))

In [50]:
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]])

In [53]:
### Train Test Split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)

In [54]:
X_train

array([[-1.31104706e+02,  1.12505905e+02, -2.25746956e+01, ...,
         3.24665260e+00, -1.36902368e+00,  2.75575495e+00],
       [-1.36703424e+01,  9.10850830e+01, -7.79273319e+00, ...,
        -3.25305080e+00, -5.27745295e+00, -1.55697155e+00],
       [-4.98715439e+01,  2.65352994e-01, -2.05009365e+01, ...,
         2.85459447e+00, -1.60920465e+00,  3.52480578e+00],
       ...,
       [-4.27012360e+02,  9.26230469e+01,  3.12939739e+00, ...,
         7.42641389e-01,  7.33490884e-01,  7.11009026e-01],
       [-1.45754608e+02,  1.36265778e+02, -3.35155182e+01, ...,
         1.46811938e+00, -2.00917006e+00, -8.82181883e-01],
       [-4.21031342e+02,  2.10654541e+02,  3.49066067e+00, ...,
        -5.38886738e+00, -3.37136054e+00, -1.56651175e+00]], dtype=float32)

In [57]:
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]])

In [58]:
X_train.shape

(6985, 40)

In [59]:
X_test.shape

(1747, 40)

In [60]:
y_train.shape

(6985, 10)

In [61]:
y_test.shape

(1747, 10)

### Model Creation

In [66]:
import tensorflow as tf
print(tf.__version__)

2.16.1


In [69]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, BatchNormalization, Flatten,LeakyReLU
from tensorflow.keras.optimizers import Adam
from sklearn import metrics
from tensorflow.keras.regularizers import l2

### No of classes
num_labels=y.shape[1]

model=Sequential()
###first layer
model.add(Dense(256,input_shape=(40,), kernel_regularizer=l2(0.001)))
model.add(LeakyReLU(alpha=0.1))
model.add(BatchNormalization())
model.add(Dropout(0.5))
###second layer
model.add(Dense(256, kernel_regularizer=l2(0.001)))
model.add(LeakyReLU(alpha=0.1))
model.add(BatchNormalization())
model.add(Dropout(0.5))
###third layer
model.add(Dense(128, kernel_regularizer=l2(0.001)))
model.add(LeakyReLU(alpha=0.1))
model.add(BatchNormalization())
model.add(Dropout(0.5))

###final layer
model.add(Dense(num_labels, kernel_regularizer=l2(0.001)))
model.add(Activation('softmax'))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
2024-05-27 07:38:59.902154: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M3 Pro
2024-05-27 07:38:59.902175: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 18.00 GB
2024-05-27 07:38:59.902181: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 6.00 GB
2024-05-27 07:38:59.902197: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-05-27 07:38:59.902213: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [106]:
model.summary()

In [70]:
## Trianing my model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from datetime import datetime 

optimizer = Adam(learning_rate=0.0001)
model.compile(loss='categorical_crossentropy',metrics=['accuracy'],optimizer=optimizer)


# Define callbacks
checkpointer = ModelCheckpoint(filepath='saved_models/audio_classification.keras', verbose=1, save_best_only=True)
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

num_epochs = 200
num_batch_size = 32

start = datetime.now()

history = model.fit(
    X_train, y_train,
    batch_size=num_batch_size,
    epochs=num_epochs,
    validation_data=(X_test, y_test),
    callbacks=[checkpointer, early_stopping],
    verbose=1
)
# model.fit(X_train, y_train, batch_size=num_batch_size, epochs=num_epochs, validation_data=(X_test, y_test), callbacks=[checkpointer], verbose=1)


duration = datetime.now() - start
print("Training completed in time: ", duration)

Epoch 1/200


2024-05-27 07:39:04.115152: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.1380 - loss: 3.7447
Epoch 1: val_loss improved from inf to 2.39717, saving model to saved_models/audio_classification.keras
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 18ms/step - accuracy: 0.1382 - loss: 3.7439 - val_accuracy: 0.3497 - val_loss: 2.3972
Epoch 2/200
[1m216/219[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 15ms/step - accuracy: 0.2292 - loss: 3.1885
Epoch 2: val_loss improved from 2.39717 to 2.23536, saving model to saved_models/audio_classification.keras
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 16ms/step - accuracy: 0.2294 - loss: 3.1872 - val_accuracy: 0.3973 - val_loss: 2.2354
Epoch 3/200
[1m217/219[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 15ms/step - accuracy: 0.2578 - loss: 2.9962
Epoch 3: val_loss improved from 2.23536 to 2.11169, saving model to saved_models/audio_classification.keras
[1m219/219[0m [32m━

In [73]:
test_accuracy=model.evaluate(X_test,y_test,verbose=0)
print(test_accuracy[1])

0.8591871857643127


In [74]:
X_test[1]

array([-466.17957   ,    1.0950238 ,  -34.01389   ,   35.33935   ,
        -14.88148   ,  -19.12843   ,   -0.581684  ,  -16.130579  ,
        -21.339075  ,    7.673634  ,  -29.16449   ,  -18.950253  ,
         -2.9579992 ,   -8.16233   ,  -15.153101  ,   -6.6048055 ,
         -7.5685983 ,    9.340646  ,   14.4331    ,   21.934181  ,
         20.861397  ,    1.3340123 ,  -19.228804  ,   -4.630231  ,
         -1.0564744 ,    3.215267  ,   -6.984281  ,  -16.414577  ,
        -10.0286455 ,   13.009956  ,    0.5334608 ,  -23.843391  ,
        -15.267321  ,    9.245734  ,   10.367627  ,   -0.58320105,
         -1.2624055 ,   17.700016  ,   13.847463  ,   -5.1862826 ],
      dtype=float32)

In [75]:
# Predict class probabilities
predictions = model.predict(X_test)

# Convert probabilities to class labels
predicted_classes = np.argmax(predictions, axis=1)

print(predicted_classes)


[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
[5 3 4 ... 1 2 9]


In [115]:
### Testing Some Test Audio Data
filename = "UrbanSound8K/audio/fold1/31323-3-0-22.wav"
audio, sample_rate = librosa.load(filename, res_type='kaiser_fast') 
mfccs_features = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
mfccs_scaled_features = np.mean(mfccs_features.T, axis=0)

print(mfccs_scaled_features)
mfccs_scaled_features = mfccs_scaled_features.reshape(1, -1)
print(mfccs_scaled_features)
print(mfccs_scaled_features.shape)

# Predict class probabilities
predicted_label = model.predict(mfccs_scaled_features)

# Convert probabilities to class label
predicted_class = np.argmax(predicted_label, axis=1)
print(predicted_class)

# If you have a label encoder
prediction_class = labelencoder.inverse_transform(predicted_class) 
print(prediction_class)


[-3.81373199e+02  1.51458832e+02  9.18357468e+00 -6.53484249e+00
 -5.11801863e+00  1.79286022e+01  3.17759037e+00  1.02581215e+01
  3.32650518e+00 -3.95141697e+00 -7.83147573e+00  6.71050930e+00
 -1.17115049e+01 -6.31992996e-01  3.79438734e+00  1.87321472e+01
  8.09789658e+00  2.07430315e+00 -3.05020118e+00  5.89708614e+00
  1.34657204e+00 -1.79574883e+00 -4.02999973e+00 -3.23028874e+00
 -2.92616868e+00 -2.03436479e-01 -4.09587562e-01 -2.91133380e+00
  1.70245504e+00 -1.14387095e+00  5.06934023e+00 -2.61036873e+00
 -2.68362854e-02  3.36770952e-01  1.94930220e+00 -8.20713639e-01
 -2.27108192e+00  1.04655361e+00 -2.67646933e+00  1.11031306e+00]
[[-3.81373199e+02  1.51458832e+02  9.18357468e+00 -6.53484249e+00
  -5.11801863e+00  1.79286022e+01  3.17759037e+00  1.02581215e+01
   3.32650518e+00 -3.95141697e+00 -7.83147573e+00  6.71050930e+00
  -1.17115049e+01 -6.31992996e-01  3.79438734e+00  1.87321472e+01
   8.09789658e+00  2.07430315e+00 -3.05020118e+00  5.89708614e+00
   1.34657204e+00 -