In [1]:
#Load Dataset
import os
import numpy as np
import librosa
import scipy
import math
import pickle

In [2]:
#Preprocess audio for vggish model input

In [3]:
#A few Parameters
VGGISH_CHECKPOINT_PATH = "./vggish/model_parameters/vggish_weights.ckpt"
VGGISH_PCA_PARAMS_PATH = "./vggish/model_parameters/vggish_pca_params.npz"

DATASET_16BIT_PATH = "D:/Thesis/UrbanSound8K-16bit/audio-classified"


In [4]:
#optional: Augment the datasets

In [5]:
#Extract Features from Vggish Model
from __future__ import print_function
import sys
sys.path.append("./vggish")

import vggish_input
import vggish_params
import vggish_postprocess
import vggish_keras
import mel_features
print('\nTesting your install of VGGish\n')

# Paths to downloaded VGGish files.
checkpoint_path = VGGISH_CHECKPOINT_PATH
pca_params_path = VGGISH_PCA_PARAMS_PATH

# Relative tolerance of errors in mean and standard deviation of embeddings.
rel_error = 0.1  # Up to 10%

# Generate a 1 kHz sine wave at 44.1 kHz (we use a high sampling rate
# to test resampling to 16 kHz during feature extraction).
num_secs = 3
freq = 1000
sr = 44100
t = np.linspace(0, num_secs, int(num_secs * sr))
x = np.sin(2 * np.pi * freq * t)

# Produce a batch of log mel spectrogram examples.
input_batch = vggish_input.waveform_to_examples(x, sr)
print('Log Mel Spectrogram example: ', input_batch[0])
np.testing.assert_equal(
    input_batch.shape,
    [num_secs, vggish_params.NUM_FRAMES, vggish_params.NUM_BANDS])

# Define VGGish, load the checkpoint, and run the batch through the model to
# produce embeddings.
model = vggish_keras.get_vggish_keras()
model.load_weights(checkpoint_path)
embedding_batch = model.predict(input_batch[:,:,:,None])
print('VGGish embedding: ', embedding_batch[0])
expected_embedding_mean = 0.131
expected_embedding_std = 0.238
np.testing.assert_allclose(
  [np.mean(embedding_batch), np.std(embedding_batch)],
  [expected_embedding_mean, expected_embedding_std],
  rtol=rel_error)

# Postprocess the results to produce whitened quantized embeddings.
pproc = vggish_postprocess.Postprocessor(pca_params_path)
postprocessed_batch = pproc.postprocess(embedding_batch)
print('Postprocessed VGGish embedding: ', postprocessed_batch[0])
expected_postprocessed_mean = 123.0
expected_postprocessed_std = 75.0
np.testing.assert_allclose(
    [np.mean(postprocessed_batch), np.std(postprocessed_batch)],
    [expected_postprocessed_mean, expected_postprocessed_std],
    rtol=rel_error)

print('\nLooks Good To Me!\n')

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.



Testing your install of VGGish

Log Mel Spectrogram example:  [[-4.47297436 -4.29457354 -4.14940631 ... -3.9747003  -3.94774997
  -3.78687669]
 [-4.48589533 -4.28825497 -4.139964   ... -3.98368686 -3.94976505
  -3.7951698 ]
 [-4.46158065 -4.29329706 -4.14905953 ... -3.96442484 -3.94895483
  -3.78619839]
 ...
 [-4.46152626 -4.29365061 -4.14848608 ... -3.96638113 -3.95057575
  -3.78538167]
 [-4.46152595 -4.2936572  -4.14848104 ... -3.96640507 -3.95059567
  -3.78537143]
 [-4.46152565 -4.29366386 -4.14847603 ... -3.96642906 -3.95061564
  -3.78536116]]
VGGish embedding:  [0.         0.         0.         0.         0.         0.
 0.         0.16137296 0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.8069576
 0.         0.         0.         0.         0.         0.
 0.         0.3679275  0.03582403 0.         0.         0.
 0.         0.38027024 0.13755944 0.91747046 0.8065635  0.
 0.         0.         0.         0.04036278 0.7076244  0.
 0.497

In [6]:
#Looks like we are going to use the PCA whitened embeddings as input features to the next model
x.shape

(132300,)

In [7]:
# Feed data into the model
def getListOfFiles(dirpath):
    # create a list of file and sub directories 
    # names in the given directory 
    listOfFile = os.listdir(dirpath)
    allFiles = list()
    # Iterate over all the entries
    for entry in listOfFile:
        # Create full path
        fullPath = os.path.join(dirpath, entry)
        # If entry is a directory then get the list of files in this directory 
        if os.path.isdir(fullPath):
            allFiles = allFiles + getListOfFiles(fullPath)
        else:
            allFiles.append(fullPath)        
    
    return allFiles

def urban_labels(Y, fpaths):
    """urban sound dataset labels."""
    urban_label = lambda path: int(os.path.split(path)[-1].split('-')[1])
    for p in fpaths:
        Y = np.append(Y, [urban_label(p)])
    return Y

def load_sound_files(file_paths):
    raw_sounds = []
    X,sr = librosa.load(file_paths)
    time = librosa.get_duration(X, sr)
    factor = math.ceil(4/time)
    scaled_by_factor = np.tile(X,factor)
    adjusted_to_4 = scaled_by_factor[0:(sr*4)]
#    X = np.asarray(scipy.signal.resample(X, sr),dtype=np.float32)
    raw_sounds = np.append(raw_sounds, adjusted_to_4)
    return raw_sounds


In [83]:
#Loading filepaths and labels
import pickle

filepaths = np.asarray(getListOfFiles(DATASET_16BIT_PATH))
print("No of entries in our Dataset: ", filepaths.shape)

Y = np.ndarray(0)
labels = urban_labels(Y, filepaths)
print("Labels : ",labels.shape)

with open("filepaths.txt", "wb") as fp:   #Pickling
    pickle.dump(filepaths, fp)
    
with open("labels.txt", "wb") as fp:   #Pickling
    pickle.dump(labels, fp)

No of entries in our Dataset:  (9712,)
Labels :  (9712,)


In [10]:
#loading raw sounds 
raw = []
for i in range(len(filepaths)):
    #len(filepaths) when ready
    raw.append(load_sound_files(filepaths[i]))
    if i%100 == 0 :
        print("Raw Sound loaded for :", filepaths[i])

#An example file
i = 5
print("Filepath : ", filepaths[i])
print("Label : ", labels[i])
print("raw sound: ", raw[i][0])
print("raw sound length: ", len(raw))
#resampling all files to 22khz

np.save("test",np.asarray(raw))
del raw

In [3]:
raw = np.load("test.npy")

In [100]:
# checking raw files
raw.shape

(9712, 88200)

In [8]:
#Feature extractor model
model1 = vggish_keras.get_vggish_keras()
model1.load_weights(checkpoint_path)
pproc = vggish_postprocess.Postprocessor(pca_params_path)

# Produce a batch of log mel spectrogram examples.
def feature_extractor(x, sr):
    input_batch = vggish_input.waveform_to_examples(x, sr)
    embedding_batch = np.asarray(model1.predict(input_batch[:,:,:,None]))
    postprocessed_batch = pproc.postprocess(embedding_batch)

    return np.asarray(postprocessed_batch)


In [44]:
#######EXTRACT THE RAW SOUNDS FILE FROM PICKLE###################
PCA_embeddings = []
for i in range(len(filepaths)):
    #len(filepaths) when ready
#   try:
    sr = raw[i].size/4
    PCA_embedding = feature_extractor(raw[i], sr)
    PCA_embeddings.append(PCA_embedding)
#    except:
#        print("Error at i =", i)
    if i%100 ==0:
        print(" Features extracted from : ", filepaths[i])
        
print(len(PCA_embeddings))

 Features extracted from :  D:/Thesis/UrbanSound8K-16bit/audio-classified\air_conditioner\100852-0-0-0.wav
 Features extracted from :  D:/Thesis/UrbanSound8K-16bit/audio-classified\air_conditioner\13230-0-0-22.wav
 Features extracted from :  D:/Thesis/UrbanSound8K-16bit/audio-classified\air_conditioner\146714-0-0-41.wav
 Features extracted from :  D:/Thesis/UrbanSound8K-16bit/audio-classified\air_conditioner\162103-0-0-14.wav
 Features extracted from :  D:/Thesis/UrbanSound8K-16bit/audio-classified\air_conditioner\177726-0-0-15.wav
 Features extracted from :  D:/Thesis/UrbanSound8K-16bit/audio-classified\air_conditioner\178686-0-0-43.wav
 Features extracted from :  D:/Thesis/UrbanSound8K-16bit/audio-classified\air_conditioner\189982-0-0-20.wav
 Features extracted from :  D:/Thesis/UrbanSound8K-16bit/audio-classified\air_conditioner\204240-0-0-23.wav
 Features extracted from :  D:/Thesis/UrbanSound8K-16bit/audio-classified\air_conditioner\57320-0-0-5.wav
 Features extracted from :  D:/T

 Features extracted from :  D:/Thesis/UrbanSound8K-16bit/audio-classified\siren\156868-8-2-0.wav
 Features extracted from :  D:/Thesis/UrbanSound8K-16bit/audio-classified\siren\157868-8-0-9.wav
 Features extracted from :  D:/Thesis/UrbanSound8K-16bit/audio-classified\siren\159748-8-0-7.wav
 Features extracted from :  D:/Thesis/UrbanSound8K-16bit/audio-classified\siren\165166-8-0-7.wav
 Features extracted from :  D:/Thesis/UrbanSound8K-16bit/audio-classified\siren\24347-8-0-19.wav
 Features extracted from :  D:/Thesis/UrbanSound8K-16bit/audio-classified\siren\34952-8-0-6.wav
 Features extracted from :  D:/Thesis/UrbanSound8K-16bit/audio-classified\siren\74364-8-1-22.wav
 Features extracted from :  D:/Thesis/UrbanSound8K-16bit/audio-classified\siren\94636-8-0-4.wav
 Features extracted from :  D:/Thesis/UrbanSound8K-16bit/audio-classified\street_music\115415-9-0-5.wav
 Features extracted from :  D:/Thesis/UrbanSound8K-16bit/audio-classified\street_music\14385-9-0-10.wav
 Features extracte

In [45]:
#save PCA embeddings to file--- CAREFUL
with open("PCA_embeddings.txt", "wb") as fp:   #Pickling
    pickle.dump(PCA_embeddings, fp)

In [2]:
with open("filepaths.txt", "rb") as fp:   # Unpickling
    filepaths = pickle.load(fp)


with open("labels.txt", "rb") as fp:   # Unpickling
    labels = pickle.load(fp)


with open("PCA_embeddings.txt", "rb") as fp:   # Unpickling
    PCA_embeddings = pickle.load(fp)
PCA_embeddings

[array([[154,  27, 143,  52, 255,  91, 129,  87,  98, 213, 171,  72, 104,
         153, 115, 111,  78, 155, 156, 113,  36, 228,  14,  84, 129, 179,
         118, 172, 182,  68, 156, 148, 117, 115, 128, 238, 178,  28, 118,
         206, 138, 172,   0, 158,  31, 222, 187,  94, 150, 234, 129,  90,
         115, 188, 104, 204,  97,   0, 233, 232, 214, 101, 106, 255, 116,
         110, 201,  14, 121,  34,  80, 255, 252, 123, 103,  28,  25, 137,
          63, 213,   0, 124, 143,   0,   8,  36,  99, 214, 192, 182,   0,
         124, 152, 184,  81, 198,  49,   0, 106, 169, 201,   0, 123,  45,
          54, 154,  48, 162, 135, 255, 248, 168,  50,   0, 111,  14, 142,
         215,  10, 178, 107,  47,   0,  86, 246,  62,   0, 255],
        [152,  22, 156,  51, 255,  68, 136, 103, 119, 209, 112, 132,  66,
         197, 136, 138,  56, 141,  97,  18,   0, 236,  42,  68,  78, 255,
          48, 130, 177,  63, 255, 136, 167,  96, 104, 255, 108,   0, 132,
         181, 191, 152,   2,   0, 142, 252, 121

In [25]:
# Cosolidate Datasets, test train val split
print(filepaths[4000])
print(PCA_embeddings[4000])
print(labels[4000])

D:/Thesis/UrbanSound8K-16bit/audio-classified\dog_bark\31323-3-0-22.wav
[[165  11 175 159 147   0 157  43 157 156  23  19 176 121 135  88  87 117
  172 164  75 255 106  31   0  73  64 114  84   0   3  79 137  97 149 129
  213 142  31 184 150 255  42 198   0 171   6 199 164   0 233 141  89 102
  150  61  23 103 255 164 119  72  76  32 255 168 255 167 255  38 203 165
  207 255   0   0  30  89 235 211 157 212 255 202 240 112 199 241 168  14
    0 217 157 255 255 200 139   0 106 255   0   0 158 255   0 146 255 115
   31 160 255  56 223 106 176  81 255 255  37 252 171 119   0 255 255 125
  255 255]
 [157  16 157  87 231  70 141 102 191 174  72  70 193 119  50 100 134 117
  184 237  32 221 123 117 116 111 213 169 114   0  86  53  54 187 201 156
   73 102  84 108  91 255   0 188 146  85   0 224  52 242 144 105  51  69
   72  25 171  93  33 193 188  71  81 181 206  91  73  99 240 136  62  70
  255 255 132 180 106  76 109 236  19 255 255 166 131 227  94  89  45   0
   31 255  97 255  65 216 165

In [6]:
#Build a network based on the extracted features
import numpy as np
from keras.models import Sequential, Model
from keras.layers import Input, Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D, MaxPooling2D, GlobalAveragePooling2D
from keras import optimizers
from keras.utils import np_utils
from sklearn import metrics 


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [13]:
F = model1.output
#F=MaxPooling2D()(F)
F=Dense(128,activation='relu')(F) #we add dense layers so that the model can learn more complex functions and classify for better results.
preds=Dense(11,activation='softmax')(F) #final layer with softmax activation

In [91]:
full_model = Model(model1.input, preds)
for i in range(len(model1.layers)):
    full_model.layers[i].trainable = False
    
full_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 96, 64, 1)         0         
_________________________________________________________________
conv1 (Conv2D)               (None, 96, 64, 64)        640       
_________________________________________________________________
pool1 (MaxPooling2D)         (None, 48, 32, 64)        0         
_________________________________________________________________
conv2 (Conv2D)               (None, 48, 32, 128)       73856     
_________________________________________________________________
pool2 (MaxPooling2D)         (None, 24, 16, 128)       0         
_________________________________________________________________
conv3_1 (Conv2D)             (None, 24, 16, 256)       295168    
_________________________________________________________________
conv3_2 (Conv2D)             (None, 24, 16, 256)       590080    
__________

In [20]:
OBJECTIVE_FUNCTION = 'categorical_crossentropy'

LOSS_METRICS = ['accuracy']

#sgd = optimizers.SGD(lr = 0.01, decay = 1e-6, momentum = 0.9, nesterov = True)
#full_model.compile(optimizer = adam, loss = OBJECTIVE_FUNCTION, metrics = LOSS_METRICS)
full_model.compile(loss='categorical_crossentropy', optimizer='adam')

In [14]:
from sklearn.model_selection import train_test_split
XTrain,XTest,YTrain,YTest=train_test_split(PCA_embeddings,labels,test_size=0.2)

from keras.utils import to_categorical

YTrain = to_categorical(YTrain, num_classes=11)
YTest = to_categorical(YTest, num_classes=11)

In [15]:
# a = Input(shape=(4,128))
# b = Dense(128)(a)
# c = Dense(11, activation = 'softmax')(b)
# top_model = Model(inputs=a, outputs=c)

top_model = Sequential()
top_model.add(Dense(128, input_shape=(4, 128)))
top_model.add(Dense(128))
top_model.add(Flatten())
top_model.add(Activation('relu'))
top_model.add(Dense(11))

In [16]:
top_model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='adam')
top_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_8 (Dense)              (None, 4, 128)            16512     
_________________________________________________________________
dense_9 (Dense)              (None, 4, 128)            16512     
_________________________________________________________________
flatten_2 (Flatten)          (None, 512)               0         
_________________________________________________________________
activation_6 (Activation)    (None, 512)               0         
_________________________________________________________________
dense_10 (Dense)             (None, 11)                5643      
Total params: 38,667
Trainable params: 38,667
Non-trainable params: 0
_________________________________________________________________


In [17]:
top_model.fit(np.array(XTrain), np.array(YTrain), batch_size=4, epochs=5, validation_data=(np.array(XTest), np.array(YTest)))

Train on 7769 samples, validate on 1943 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1fe06a5afd0>

In [7]:
# copied 
try_model = Sequential()

try_model.add(Dense(1024, input_shape=(88200,)))
try_model.add(Activation('relu'))
try_model.add(Dropout(0.5))

try_model.add(Dense(256))
try_model.add(Activation('relu'))
try_model.add(Dropout(0.5))

try_model.add(Dense(256))
try_model.add(Activation('relu'))
try_model.add(Dropout(0.5))

try_model.add(Dense(11))
try_model.add(Activation('softmax'))

try_model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='adam')

In [8]:
from sklearn.model_selection import train_test_split
XTrain,XTest,YTrain,YTest=train_test_split(raw,labels,test_size=0.2)

from keras.utils import to_categorical

YTrain = to_categorical(YTrain, num_classes=11)
YTest = to_categorical(YTest, num_classes=11)
del raw

In [10]:
try_model.fit(np.array(XTrain), np.array(YTrain), batch_size=32, epochs=5, validation_data=(np.array(XTest), np.array(YTest)))

Train on 7769 samples, validate on 1943 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1ff525ab2e8>

In [9]:
#seems to be working; Raw audio data working better than Vggish embeddings