Download and Unpack the Urban Sound 8K Dataset

In [None]:
!wget 'https://zenodo.org/record/1203745/files/UrbanSound8K.tar.gz'
!tar xvzf './UrbanSound8K.tar.gz'

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
UrbanSound8K/audio/fold4/17480-2-0-6.wav
UrbanSound8K/audio/fold4/17480-2-0-9.wav
UrbanSound8K/audio/fold4/175904-2-0-11.wav
UrbanSound8K/audio/fold4/175904-2-0-24.wav
UrbanSound8K/audio/fold4/176003-1-0-0.wav
UrbanSound8K/audio/fold4/176638-5-0-0.wav
UrbanSound8K/audio/fold4/177756-2-0-10.wav
UrbanSound8K/audio/fold4/177756-2-0-4.wav
UrbanSound8K/audio/fold4/177756-2-0-5.wav
UrbanSound8K/audio/fold4/177756-2-0-7.wav
UrbanSound8K/audio/fold4/179862-1-0-0.wav
UrbanSound8K/audio/fold4/180977-3-1-1.wav
UrbanSound8K/audio/fold4/180977-3-1-5.wav
UrbanSound8K/audio/fold4/183989-3-1-21.wav
UrbanSound8K/audio/fold4/183989-3-1-23.wav
UrbanSound8K/audio/fold4/185709-0-0-0.wav
UrbanSound8K/audio/fold4/185709-0-0-1.wav
UrbanSound8K/audio/fold4/185709-0-0-6.wav
UrbanSound8K/audio/fold4/185709-0-0-7.wav
UrbanSound8K/audio/fold4/185909-2-0-102.wav
UrbanSound8K/audio/fold4/185909-2-0-13.wav
UrbanSound8K/audio/fold4/185909-2-0-17.wav
Urba

Data Preprocessing

In [None]:
import librosa
import pandas as pd
import os
# Root folder of the extracted audio; clips live in per-fold sub-directories (e.g. fold4).
fulldatasetpath = './UrbanSound8K/audio/'
# One row per audio slice; later cells read its `fold`, `slice_file_name` and `class` columns.
metadata = pd.read_csv('./UrbanSound8K/metadata/UrbanSound8K.csv')

Extract Features Using MFCC

In [None]:
def feature_extractor(filename, n_mfcc=40):
  """Return a fixed-length MFCC summary vector for one audio file.

  Args:
    filename: Path of the audio file, passed straight to ``librosa.load``.
    n_mfcc: Number of MFCC coefficients to compute. Defaults to 40, the
      value the rest of this notebook was built around.

  Returns:
    A 1-D array of length ``n_mfcc``: each coefficient averaged over all
    frames of the clip, so clips of different duration yield equal-sized
    feature vectors.
  """
  audio, sample_rate = librosa.load(filename)
  mfcc = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=n_mfcc)
  # mfcc is (n_mfcc, frames); transpose and average over axis 0 to collapse time.
  mfcc_scaled = np.mean(mfcc.T, axis=0)
  return mfcc_scaled

In [None]:
import numpy as np
from tqdm import tqdm
# Build a list of [mfcc_vector, class_label] pairs, one per row of the metadata table.
features = []
# The dataset root does not change inside the loop, so resolve it once.
audio_root = os.path.abspath(fulldatasetpath)
# iterrows() is a generator with no length; passing total= lets tqdm show
# percentage and ETA instead of a bare iteration counter.
for ind, row in tqdm(metadata.iterrows(), total=len(metadata)):
  # Clips are stored as <audio_root>/fold<N>/<slice_file_name>.
  filename = os.path.join(audio_root, 'fold' + str(row["fold"]), str(row["slice_file_name"]))
  class_label = row["class"]
  data = feature_extractor(filename)
  features.append([data, class_label])

  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
8732it [33:59,  4.28it/s]


Convert into DataFrame

In [None]:
# Tabulate the [feature, label] pairs, then pull each column out as a NumPy array:
# X holds the stacked feature vectors, y the matching class labels.
featuresdf = pd.DataFrame(data=features, columns=['feature', 'class_label'])
X, y = (
    np.array(featuresdf[column].tolist())
    for column in ('feature', 'class_label')
)

Label Encoding

In [None]:
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
# Map class-name strings to integer ids, then one-hot encode them for the softmax output.
le = LabelEncoder()
# Encode from the untouched label column instead of from `y`: the original
# `y = f(y)` form fed the already one-hot `y` back into the encoder when the
# cell was run a second time.
y = to_categorical(le.fit_transform(featuresdf['class_label']))

Train Test Split

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for testing; random_state pins the shuffle so the split is reproducible.
# NOTE(review): the metadata carries a predefined `fold` column; a random split may place
# slices of the same source recording in both sets — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Model Creation

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import accuracy_score
# Take the layers from tensorflow.keras as well (not the standalone `keras`
# package) so they come from the same Keras implementation as Sequential.
from tensorflow.keras.layers import Convolution2D, Conv2D, MaxPooling2D, GlobalAveragePooling2D, Dense, Dropout, Activation, Flatten

# One output unit per one-hot class column.
num_labels = y.shape[1]

# Fully connected classifier over the averaged MFCC vectors:
# two ReLU hidden layers with 50% dropout each, followed by a softmax output.
# (The earlier commented-out Conv2D experiment on 40x174x1 inputs was dropped
# from this cell; it was never part of the model built here.)
model = Sequential()
# Input width follows the feature matrix instead of a hard-coded 40.
model.add(Dense(100, input_shape=(X_train.shape[1],)))
model.add(Activation("relu"))
model.add(Dropout(0.5))
model.add(Dense(200))
model.add(Activation("relu"))
model.add(Dropout(0.5))
model.add(Dense(num_labels))
model.add(Activation("softmax"))
# categorical_crossentropy matches the one-hot labels; 'accuracy' is reported by fit/evaluate.
model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='adam')
model.summary()

Model: "sequential_13"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_9 (Dense)              (None, 100)               4100      
_________________________________________________________________
activation_6 (Activation)    (None, 100)               0         
_________________________________________________________________
dropout_11 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_10 (Dense)             (None, 200)               20200     
_________________________________________________________________
activation_7 (Activation)    (None, 200)               0         
_________________________________________________________________
dropout_12 (Dropout)         (None, 200)               0         
_________________________________________________________________
dense_11 (Dense)             (None, 10)              

Training

In [None]:
from datetime import datetime

# Training hyperparameters.
num_epochs = 72
num_batch_size = 256

# Time the fit call with wall-clock timestamps; the held-out split is passed
# as validation data so per-epoch validation metrics are reported.
start = datetime.now()
model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=num_epochs,
    batch_size=num_batch_size,
    verbose=1,
)
duration = datetime.now() - start
print("Training completed in time: ", duration)

Epoch 1/72
Epoch 2/72
Epoch 3/72
Epoch 4/72
Epoch 5/72
Epoch 6/72
Epoch 7/72
Epoch 8/72
Epoch 9/72
Epoch 10/72
Epoch 11/72
Epoch 12/72
Epoch 13/72
Epoch 14/72
Epoch 15/72
Epoch 16/72
Epoch 17/72
Epoch 18/72
Epoch 19/72
Epoch 20/72
Epoch 21/72
Epoch 22/72
Epoch 23/72
Epoch 24/72
Epoch 25/72
Epoch 26/72
Epoch 27/72
Epoch 28/72
Epoch 29/72
Epoch 30/72
Epoch 31/72
Epoch 32/72
Epoch 33/72
Epoch 34/72
Epoch 35/72
Epoch 36/72
Epoch 37/72
Epoch 38/72
Epoch 39/72
Epoch 40/72
Epoch 41/72
Epoch 42/72
Epoch 43/72
Epoch 44/72
Epoch 45/72
Epoch 46/72
Epoch 47/72
Epoch 48/72
Epoch 49/72
Epoch 50/72
Epoch 51/72
Epoch 52/72
Epoch 53/72
Epoch 54/72
Epoch 55/72
Epoch 56/72
Epoch 57/72
Epoch 58/72
Epoch 59/72
Epoch 60/72
Epoch 61/72
Epoch 62/72
Epoch 63/72
Epoch 64/72
Epoch 65/72
Epoch 66/72
Epoch 67/72
Epoch 68/72
Epoch 69/72
Epoch 70/72
Epoch 71/72
Epoch 72/72
Training completed in time:  0:00:21.982763


Accuracy Score

In [None]:
# evaluate() returns [loss, accuracy] here because the model was compiled with
# metrics=['accuracy'], so index 1 is the accuracy.
score = model.evaluate(X_train, y_train, verbose=0)
print("Training Accuracy: ", score[1])
# Note: X_test/y_test were also passed as validation_data during fit.
score = model.evaluate(X_test, y_test, verbose=0)
print("Testing Accuracy: ", score[1])

Training Accuracy:  0.7620615363121033
Testing Accuracy:  0.7447052001953125


Predict a Sound File

In [None]:
def print_prediction(file_name):
    """Classify one audio file and print the predicted class and all class probabilities.

    Args:
        file_name: Path of the audio file to classify.

    Prints the most probable class name, followed by one line per class with
    its softmax probability. Relies on the notebook-level ``feature_extractor``,
    ``model`` and ``le`` objects.
    """
    # Use the same extractor as training (the original called an undefined
    # `extract_features`) and shape its 1-D MFCC vector as a batch of one
    # sample, which is what the Dense input layer expects.
    prediction_feature = feature_extractor(file_name).reshape(1, -1)

    # predict_classes / predict_proba no longer exist on Keras models;
    # predict() already returns the softmax probabilities.
    predicted_proba = model.predict(prediction_feature, verbose=0)[0]
    predicted_index = np.argmax(predicted_proba)
    predicted_class = le.inverse_transform(np.array([predicted_index]))
    print("The predicted class is:", predicted_class[0], '\n')

    # Decode every class index once, then list the probabilities in index order.
    categories = le.inverse_transform(np.arange(len(predicted_proba)))
    for category, probability in zip(categories, predicted_proba):
        print(category, "\t\t : ", format(probability, '.32f'))