In [None]:
# Import necessary libraries
import matplotlib.pyplot as plt
%matplotlib inline
import IPython.display as ipd
import librosa
import librosa.display
from scipy.io import wavfile as wav
import pandas as pd
import os
import numpy as np
from tqdm import tqdm

In [None]:
# Display directory contents
# NOTE: `dir` is the Windows shell command; on Linux/macOS the equivalent is `!ls`.
!dir

EDA

In [None]:
# Load and display audio waveforms for the first file
filename='UrbanSound8K/dog_bark.wav'
plt.figure(figsize=(14,5))
# librosa.load returns the samples and the sample rate they were loaded at
data,sample_rate = librosa.load(filename)
# NOTE(review): `waveplot` was removed in librosa 0.10 in favour of `waveshow` — confirm the installed version
librosa.display.waveplot(data, sr=sample_rate)
# Last expression of the cell, so the audio player widget is rendered as the output
ipd.Audio(filename)

In [None]:
# Load and display audio waveforms for the second file
# (rebinds `filename`, `data` and `sample_rate`; the cells below read these values)
filename='UrbanSound8K/100263-2-0-3.wav'
plt.figure(figsize=(14,5))
data,sample_rate = librosa.load(filename)
# NOTE(review): `waveplot` was removed in librosa 0.10 in favour of `waveshow` — confirm the installed version
librosa.display.waveplot(data, sr=sample_rate)
# Last expression of the cell, so the audio player widget is rendered as the output
ipd.Audio(filename)

In [None]:
# Sample rate details: compare the rate librosa loaded the clip at with the
# rate scipy reads from the same file.
wave_sample_rate, wave_audio = wav.read(filename)
# A bare expression is only echoed when it is the last line of a cell, so the
# original mid-cell `sample_rate` was silently discarded. Print both explicitly.
print("librosa sample rate:", sample_rate)
print("scipy sample rate:", wave_sample_rate)

In [None]:
# Display audio data details
# Only the last expression of a cell is echoed, so the original never showed
# `wave_audio`. display() renders both arrays: the scipy samples first, then
# the librosa samples.
ipd.display(wave_audio, data)

In [None]:
# Read metadata
metadata = pd.read_csv('UrbanSound8K/metadata/UrbanSound8K.csv')
# Preview the first ten rows
metadata.head(10)

In [None]:
# Check whether the dataset is imbalanced
# (number of clips per value of the 'class' column)
metadata['class'].value_counts()

#### Audio Classification Data Preprocessing

In [None]:
# Read and plot a sample audio file using librosa
audio_file_path = 'UrbanSound8K/100263-2-0-3.wav'
librosa_audio_data, librosa_sample_rate = librosa.load(audio_file_path)

plt.figure(figsize=(12, 4))
# x-axis is the sample index, y-axis the amplitude as loaded by librosa
plt.plot(librosa_audio_data)

In [None]:
# Read and plot audio with scipy
# (same file as the librosa cell above, for a side-by-side comparison)
wave_sample_rate, wave_audio = wav.read(audio_file_path)

plt.figure(figsize=(12, 4))
plt.plot(wave_audio)

#### Observation
Here librosa converts the signal to mono, meaning the number of channels will always be 1.

### Extract Features
Here we will be using Mel-Frequency Cepstral Coefficients (MFCCs) extracted from the audio
samples.
The MFCC summarises the frequency distribution across the window size, so it is possible to analyse both the frequency and time characteristics of the sound. These audio representations will allow us to identify features for classification.

In [None]:
# Compute 40 MFCCs for the sample clip loaded above
mfccs = librosa.feature.mfcc(y=librosa_audio_data, sr=librosa_sample_rate, n_mfcc=40)
# Shape is (n_mfcc, number of frames)
print(mfccs.shape)

(40, 173)


In [None]:
mfccs

array([[-4.45197296e+02, -4.47219299e+02, -4.49755127e+02, ...,
        -4.77412781e+02, -4.74241730e+02, -4.82704987e+02],
       [ 1.12513969e+02,  1.11970383e+02,  1.12244164e+02, ...,
         1.12045395e+02,  1.12248581e+02,  1.05560913e+02],
       [-1.58260956e+01, -2.30021858e+01, -3.12500191e+01, ...,
        -9.15441895e+00, -1.03232269e+01, -7.39410734e+00],
       ...,
       [-7.82766485e+00, -5.03880405e+00, -4.48165369e+00, ...,
        -1.90692782e-01,  4.34143972e+00,  1.00339069e+01],
       [-1.91763425e+00, -8.02737713e-01, -1.20930457e+00, ...,
        -1.23640239e-01,  2.90504694e-02,  9.22017097e-01],
       [-3.88130605e-01,  3.09317827e-01,  6.72155714e+00, ...,
        -2.33736587e+00, -4.25179911e+00, -2.31322765e+00]], dtype=float32)

In [None]:
#### Extracting MFCCs for every audio file
# NOTE: pandas, os and librosa are already imported in the first cell; these repeats are redundant.
import pandas as pd
import os
import librosa

# Root folder of the audio clips (joined with 'fold<N>' and the file name in the extraction loop)
audio_dataset_path='UrbanSound8K/audio/'
# Reload the metadata table and preview its first rows
metadata=pd.read_csv('UrbanSound8K/metadata/UrbanSound8K.csv')
metadata.head()

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class
0,100032-3-0-0.wav,100032,0.0,0.317551,1,5,3,dog_bark
1,100263-2-0-117.wav,100263,58.5,62.5,1,5,2,children_playing
2,100263-2-0-121.wav,100263,60.5,64.5,1,5,2,children_playing
3,100263-2-0-126.wav,100263,63.0,67.0,1,5,2,children_playing
4,100263-2-0-137.wav,100263,68.5,72.5,1,5,2,children_playing


In [None]:
def features_extractor(file):
    """Return a fixed-length MFCC feature vector for one audio file.

    Parameters
    ----------
    file : str or path-like
        Path of the audio file to load.

    Returns
    -------
    numpy.ndarray
        The 40 MFCCs averaged over all frames, one value per coefficient.
    """
    # Load from the `file` argument. The original read the global `file_name`,
    # which only worked when the calling cell happened to have set that name.
    audio, sample_rate = librosa.load(file, res_type='kaiser_fast')
    mfccs_features = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
    # Average over the frame axis so every clip yields a vector of the same size.
    mfccs_scaled_features = np.mean(mfccs_features.T, axis=0)

    return mfccs_scaled_features


In [None]:
import numpy as np
from tqdm import tqdm
### Now we iterate through every audio file and extract features
### using Mel-Frequency Cepstral Coefficients
extracted_features=[]
# iterrows() is a generator with no length, so pass `total` explicitly. Without
# it tqdm can only show a running count instead of a progress bar with an ETA.
for index_num,row in tqdm(metadata.iterrows(), total=len(metadata)):
    # Clips are stored as <audio_dataset_path>/fold<N>/<slice_file_name>.
    file_name = os.path.join(os.path.abspath(audio_dataset_path),'fold'+str(row["fold"])+'/',str(row["slice_file_name"]))
    final_class_labels=row["class"]
    data=features_extractor(file_name)
    # Keep each feature vector together with its label; a later cell turns the list into a DataFrame.
    extracted_features.append([data,final_class_labels])

8732it [05:08, 28.33it/s]


In [None]:
### Convert extracted_features to a pandas DataFrame
# Each list entry is [feature vector, class label], which maps onto the two columns.
extracted_features_df=pd.DataFrame(extracted_features,columns=['feature','class'])
extracted_features_df.head()

Unnamed: 0,feature,class
0,"[-215.79301, 71.66612, -131.81377, -52.091335,...",dog_bark
1,"[-424.68677, 110.56227, -54.148235, 62.01074, ...",children_playing
2,"[-459.56467, 122.800354, -47.92471, 53.265697,...",children_playing
3,"[-414.55377, 102.896904, -36.66495, 54.18041, ...",children_playing
4,"[-447.397, 115.0954, -53.809113, 61.60859, 1.6...",children_playing


In [None]:
### Separate the model inputs (X) from the target labels (y)
feature_column = extracted_features_df['feature']
class_column = extracted_features_df['class']

# One row per clip in X; y holds the matching class names as strings.
X = np.array(feature_column.tolist())
y = np.array(class_column.tolist())

In [None]:
X.shape

(8732, 40)

In [None]:
y

array(['dog_bark', 'children_playing', 'children_playing', ...,
       'car_horn', 'car_horn', 'car_horn'], dtype='<U16')

In [None]:
### Label Encoding
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

# Encode from the untouched class column rather than from `y` itself. The
# original overwrote `y` with its own one-hot encoding, so re-running the cell
# fed one-hot rows back into the encoder. Reading from the DataFrame makes the
# cell idempotent.
class_names = np.array(extracted_features_df['class'].tolist())
labelencoder = LabelEncoder()
# String labels -> integer ids -> one-hot rows (one column per class).
y = to_categorical(labelencoder.fit_transform(class_names))

In [None]:
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]], dtype=float32)

In [None]:
### Train Test Split
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for testing; random_state pins the shuffle so the split is repeatable.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)

In [None]:
X_train

array([[[-1.3183614e+02,  1.1397464e+02, -2.3956861e+01, ...,
          3.3314774e+00, -1.4786109e+00,  2.8736601e+00]],

       [[-1.4074220e+01,  9.1916939e+01, -8.6787214e+00, ...,
         -3.3844023e+00, -5.2119045e+00, -1.5936136e+00]],

       [[-4.9532028e+01,  1.5521857e-01, -2.0369110e+01, ...,
          2.0491767e+00, -8.0537486e-01,  2.7793026e+00]],

       ...,

       [[-4.2699332e+02,  9.2890656e+01,  3.0233388e+00, ...,
          8.6335975e-01,  6.4766806e-01,  7.8490508e-01]],

       [[-1.4607024e+02,  1.3709459e+02, -3.4298344e+01, ...,
          1.3777871e+00, -1.9530845e+00, -8.9652127e-01]],

       [[-4.2167450e+02,  2.1169032e+02,  2.6820304e+00, ...,
         -5.1484952e+00, -3.6400862e+00, -1.3321606e+00]]], dtype=float32)

In [None]:
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]], dtype=float32)

In [None]:
X_train.shape

(6985, 1, 40)

In [None]:
X_test.shape

(1747, 1, 40)

In [None]:
y_train.shape

(6985, 10)

In [None]:
y_test.shape

(1747, 10)

### Model Creation

In [None]:
# Record the TensorFlow version this notebook was run with
import tensorflow as tf
print(tf.__version__)

2.3.1


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout,Activation,Flatten
from tensorflow.keras.optimizers import Adam
from sklearn import metrics

In [None]:
### Number of classes: one column per class in the one-hot encoded labels
num_labels=y.shape[1]

In [None]:
# Fully connected classifier: three hidden Dense+ReLU+Dropout stages followed
# by a softmax over the classes. Layers are listed in the order they are stacked.
model = Sequential([
    # first layer: takes the 40-value feature vector
    Dense(100, input_shape=(40,)),
    Activation('relu'),
    Dropout(0.5),
    # second layer
    Dense(200),
    Activation('relu'),
    Dropout(0.5),
    # third layer
    Dense(100),
    Activation('relu'),
    Dropout(0.5),
    # final layer: one output per class
    Dense(num_labels),
    Activation('softmax'),
])

In [None]:
model.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_8 (Dense)              (None, 100)               4100      
_________________________________________________________________
activation_8 (Activation)    (None, 100)               0         
_________________________________________________________________
dropout_10 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_9 (Dense)              (None, 200)               20200     
_________________________________________________________________
activation_9 (Activation)    (None, 200)               0         
_________________________________________________________________
dropout_11 (Dropout)         (None, 200)               0         
_________________________________________________________________
dense_10 (Dense)             (None, 100)              

In [None]:
# Categorical cross-entropy pairs with the one-hot labels and softmax output; accuracy is tracked as the metric
model.compile(loss='categorical_crossentropy',metrics=['accuracy'],optimizer='adam')

In [None]:
## Training the model
from tensorflow.keras.callbacks import ModelCheckpoint
from datetime import datetime

num_epochs = 100
num_batch_size = 32

# Make sure the checkpoint directory exists before Keras tries to write into
# it; on a fresh checkout 'saved_models/' is not guaranteed to be present.
os.makedirs('saved_models', exist_ok=True)

# save_best_only=True: the file is rewritten only when the monitored
# validation loss improves.
checkpointer = ModelCheckpoint(filepath='saved_models/audio_classification.hdf5',
                               verbose=1, save_best_only=True)
start = datetime.now()

# NOTE(review): the test split doubles as validation data here, so checkpoint
# selection sees the test set — consider carving out a separate validation split.
model.fit(X_train, y_train, batch_size=num_batch_size, epochs=num_epochs, validation_data=(X_test, y_test), callbacks=[checkpointer], verbose=1)


duration = datetime.now() - start
print("Training completed in time: ", duration)

Epoch 1/100
Epoch 00001: val_loss improved from inf to 0.65317, saving model to saved_models\audio_classification.hdf5
Epoch 2/100
Epoch 00002: val_loss did not improve from 0.65317
Epoch 3/100
Epoch 00003: val_loss did not improve from 0.65317
Epoch 4/100
Epoch 00004: val_loss did not improve from 0.65317
Epoch 5/100
Epoch 00005: val_loss improved from 0.65317 to 0.64595, saving model to saved_models\audio_classification.hdf5
Epoch 6/100
Epoch 00006: val_loss did not improve from 0.64595
Epoch 7/100
Epoch 00007: val_loss did not improve from 0.64595
Epoch 8/100
Epoch 00008: val_loss did not improve from 0.64595
Epoch 9/100
Epoch 00009: val_loss did not improve from 0.64595
Epoch 10/100
Epoch 00010: val_loss did not improve from 0.64595
Epoch 11/100
Epoch 00011: val_loss did not improve from 0.64595
Epoch 12/100
Epoch 00012: val_loss did not improve from 0.64595
Epoch 13/100
Epoch 00013: val_loss did not improve from 0.64595
Epoch 14/100
Epoch 00014: val_loss did not improve from 0.645

Epoch 30/100
Epoch 00030: val_loss did not improve from 0.63746
Epoch 31/100
Epoch 00031: val_loss did not improve from 0.63746
Epoch 32/100
Epoch 00032: val_loss did not improve from 0.63746
Epoch 33/100
Epoch 00033: val_loss did not improve from 0.63746
Epoch 34/100
Epoch 00034: val_loss did not improve from 0.63746
Epoch 35/100
Epoch 00035: val_loss did not improve from 0.63746
Epoch 36/100
Epoch 00036: val_loss did not improve from 0.63746
Epoch 37/100
Epoch 00037: val_loss did not improve from 0.63746
Epoch 38/100
Epoch 00038: val_loss did not improve from 0.63746
Epoch 39/100
Epoch 00039: val_loss did not improve from 0.63746
Epoch 40/100
Epoch 00040: val_loss did not improve from 0.63746
Epoch 41/100
Epoch 00041: val_loss did not improve from 0.63746
Epoch 42/100
Epoch 00042: val_loss did not improve from 0.63746
Epoch 43/100
Epoch 00043: val_loss did not improve from 0.63746
Epoch 44/100
Epoch 00044: val_loss did not improve from 0.63746
Epoch 45/100
Epoch 00045: val_loss did n

Epoch 00059: val_loss did not improve from 0.63746
Epoch 60/100
Epoch 00060: val_loss did not improve from 0.63746
Epoch 61/100
Epoch 00061: val_loss did not improve from 0.63746
Epoch 62/100
Epoch 00062: val_loss did not improve from 0.63746
Epoch 63/100
Epoch 00063: val_loss did not improve from 0.63746
Epoch 64/100
Epoch 00064: val_loss did not improve from 0.63746
Epoch 65/100
Epoch 00065: val_loss did not improve from 0.63746
Epoch 66/100
Epoch 00066: val_loss did not improve from 0.63746
Epoch 67/100
Epoch 00067: val_loss did not improve from 0.63746
Epoch 68/100
Epoch 00068: val_loss did not improve from 0.63746
Epoch 69/100
Epoch 00069: val_loss did not improve from 0.63746
Epoch 70/100
Epoch 00070: val_loss did not improve from 0.63746
Epoch 71/100
Epoch 00071: val_loss did not improve from 0.63746
Epoch 72/100
Epoch 00072: val_loss did not improve from 0.63746
Epoch 73/100
Epoch 00073: val_loss did not improve from 0.63746
Epoch 74/100
Epoch 00074: val_loss improved from 0.63

Epoch 00088: val_loss did not improve from 0.63103
Epoch 89/100
Epoch 00089: val_loss did not improve from 0.63103
Epoch 90/100
Epoch 00090: val_loss did not improve from 0.63103
Epoch 91/100
Epoch 00091: val_loss did not improve from 0.63103
Epoch 92/100
Epoch 00092: val_loss did not improve from 0.63103
Epoch 93/100
Epoch 00093: val_loss did not improve from 0.63103
Epoch 94/100
Epoch 00094: val_loss did not improve from 0.63103
Epoch 95/100
Epoch 00095: val_loss did not improve from 0.63103
Epoch 96/100
Epoch 00096: val_loss did not improve from 0.63103
Epoch 97/100
Epoch 00097: val_loss did not improve from 0.63103
Epoch 98/100
Epoch 00098: val_loss did not improve from 0.63103
Epoch 99/100
Epoch 00099: val_loss did not improve from 0.63103
Epoch 100/100
Epoch 00100: val_loss did not improve from 0.63103
Training completed in time:  0:00:45.200242


In [None]:
# evaluate() returns the loss followed by the compiled metrics, so index 1 is the accuracy
test_accuracy=model.evaluate(X_test,y_test,verbose=0)
print(test_accuracy[1])

0.8002289533615112


In [None]:
# `prediction_feature` was never assigned anywhere in this notebook (leftover
# kernel state), so the cell raised NameError on a fresh run. Build it from one
# test sample, reshaped to a single-row batch as the model input requires.
prediction_feature = X_test[1].reshape(1, -1)
prediction_feature.shape

(1, 40)

In [None]:
X_test[1]

array([-466.1843    ,    1.5388278 ,  -34.397358  ,   35.715336  ,
        -15.166929  ,  -18.850813  ,   -0.7415805 ,  -15.99989   ,
        -21.354332  ,    7.6506834 ,  -29.031452  ,  -19.142824  ,
         -2.6798913 ,   -8.466884  ,  -14.7660475 ,   -7.004778  ,
         -7.103754  ,    8.887754  ,   14.911873  ,   21.47102   ,
         21.336624  ,    0.9169518 ,  -18.795404  ,   -5.001721  ,
         -0.70152664,    2.91399   ,   -6.7105994 ,  -16.638536  ,
         -9.821647  ,   12.8619585 ,    0.6552978 ,  -23.953394  ,
        -15.200551  ,    9.21079   ,   10.419799  ,   -0.57916117,
         -1.2440346 ,   17.722294  ,   13.837573  ,   -5.164349  ],
      dtype=float32)

In [None]:
# Predicted class index for every test sample. `Sequential.predict_classes` was
# deprecated and later removed from Keras; taking the arg-max over the softmax
# output gives the same class indices and works on old and new versions.
np.argmax(model.predict(X_test), axis=-1)

array([5, 3, 4, ..., 1, 2, 2], dtype=int64)

### Testing Some Test Audio Data

Steps
- Preprocess the new audio data
- predict the classes
- Inverse-transform the predicted label

In [None]:
# Classify one unseen clip: same preprocessing as for training (40 MFCCs
# averaged over time), then predict and map the class index back to its name.
filename="UrbanSound8K/drilling_1.wav"
audio, sample_rate = librosa.load(filename, res_type='kaiser_fast')
mfccs_features = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
mfccs_scaled_features = np.mean(mfccs_features.T,axis=0)

print(mfccs_scaled_features)
# The model expects a batch, so turn the 1-D vector into a single-row 2-D array.
mfccs_scaled_features=mfccs_scaled_features.reshape(1,-1)
print(mfccs_scaled_features)
print(mfccs_scaled_features.shape)
# `Sequential.predict_classes` was deprecated and later removed from Keras;
# arg-max over the softmax output yields the same class index.
predicted_label=np.argmax(model.predict(mfccs_scaled_features), axis=-1)
print(predicted_label)
# Map the integer class id back to the original string label.
prediction_class = labelencoder.inverse_transform(predicted_label)
prediction_class

[-146.34639      52.85859       6.0391283    46.972637      0.48288426
   31.756617     -6.395756     36.949165     -2.2981966     9.0149975
   -8.056831     24.668858    -14.41076       7.5845594    -3.089655
   17.423319    -10.068965      9.606158     -1.4731672     7.745292
   -1.9399884    -1.5998945     3.373213      1.6671567    -4.9514785
    4.8195934    -6.1473813     3.8730834   -10.502274      1.3417107
   -5.616546      4.815169     -6.152183      2.0756485    -1.8508396
   -0.45990178   -4.9980536     2.528911     -0.7446382    -6.4779253 ]
[[-146.34639      52.85859       6.0391283    46.972637      0.48288426
    31.756617     -6.395756     36.949165     -2.2981966     9.0149975
    -8.056831     24.668858    -14.41076       7.5845594    -3.089655
    17.423319    -10.068965      9.606158     -1.4731672     7.745292
    -1.9399884    -1.5998945     3.373213      1.6671567    -4.9514785
     4.8195934    -6.1473813     3.8730834   -10.502274      1.3417107
    -5.616546 

array(['drilling'], dtype='<U16')