In [1]:
#Imports used throughout the notebook
import os
import numpy as np
import librosa
import scipy
import math

In [2]:
#Preprocess audio for vggish model input

In [3]:
#A few Parameters
# Locations of the pretrained VGGish weights and of the PCA parameters used for
# embedding post-processing, relative to the notebook's working directory.
VGGISH_CHECKPOINT_PATH = "./vggish/model_parameters/vggish_weights.ckpt"
VGGISH_PCA_PARAMS_PATH = "./vggish/model_parameters/vggish_pca_params.npz"

# Root directory of the audio dataset; getListOfFiles walks it recursively.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
DATASET_16BIT_PATH = "D:/Thesis/UrbanSound8K-16bit/audio-classified"


In [4]:
#optional: Augment the datasets

In [5]:
#Extract Features from Vggish Model
# This cell is an install check: it pushes a synthetic sine wave through the whole
# VGGish pipeline (log-mel examples -> embeddings -> post-processing) and compares the
# mean/std of the results with reference values.
from __future__ import print_function
import sys
# Make the local ./vggish directory importable for the modules below.
sys.path.append("./vggish")

import vggish_input
import vggish_params
import vggish_postprocess
import vggish_keras
import mel_features
print('\nTesting your install of VGGish\n')

# Paths to downloaded VGGish files.
checkpoint_path = VGGISH_CHECKPOINT_PATH
pca_params_path = VGGISH_PCA_PARAMS_PATH

# Relative tolerance of errors in mean and standard deviation of embeddings.
rel_error = 0.1  # Up to 10%

# Generate a 1 kHz sine wave at 44.1 kHz (we use a high sampling rate
# to test resampling to 16 kHz during feature extraction).
# NOTE: num_secs, freq, sr, t and x are assigned at notebook (global) scope, so
# `sr` keeps the value 44100 after this cell has run.
num_secs = 3
freq = 1000
sr = 44100
t = np.linspace(0, num_secs, int(num_secs * sr))
x = np.sin(2 * np.pi * freq * t)

# Produce a batch of log mel spectrogram examples.
# The assertion expects one example per second of input, each of shape
# (vggish_params.NUM_FRAMES, vggish_params.NUM_BANDS).
input_batch = vggish_input.waveform_to_examples(x, sr)
print('Log Mel Spectrogram example: ', input_batch[0])
np.testing.assert_equal(
    input_batch.shape,
    [num_secs, vggish_params.NUM_FRAMES, vggish_params.NUM_BANDS])

# Define VGGish, load the checkpoint, and run the batch through the model to
# produce embeddings.
# input_batch[:,:,:,None] appends a trailing axis of length 1 before prediction.
model = vggish_keras.get_vggish_keras()
model.load_weights(checkpoint_path)
embedding_batch = model.predict(input_batch[:,:,:,None])
print('VGGish embedding: ', embedding_batch[0])
expected_embedding_mean = 0.131
expected_embedding_std = 0.238
np.testing.assert_allclose(
  [np.mean(embedding_batch), np.std(embedding_batch)],
  [expected_embedding_mean, expected_embedding_std],
  rtol=rel_error)

# Postprocess the results to produce whitened quantized embeddings.
pproc = vggish_postprocess.Postprocessor(pca_params_path)
postprocessed_batch = pproc.postprocess(embedding_batch)
print('Postprocessed VGGish embedding: ', postprocessed_batch[0])
expected_postprocessed_mean = 123.0
expected_postprocessed_std = 75.0
np.testing.assert_allclose(
    [np.mean(postprocessed_batch), np.std(postprocessed_batch)],
    [expected_postprocessed_mean, expected_postprocessed_std],
    rtol=rel_error)

# Only reached when every assertion above passed.
print('\nLooks Good To Me!\n')

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.



Testing your install of VGGish

Log Mel Spectrogram example:  [[-4.47297436 -4.29457354 -4.14940631 ... -3.9747003  -3.94774997
  -3.78687669]
 [-4.48589533 -4.28825497 -4.139964   ... -3.98368686 -3.94976505
  -3.7951698 ]
 [-4.46158065 -4.29329706 -4.14905953 ... -3.96442484 -3.94895483
  -3.78619839]
 ...
 [-4.46152626 -4.29365061 -4.14848608 ... -3.96638113 -3.95057575
  -3.78538167]
 [-4.46152595 -4.2936572  -4.14848104 ... -3.96640507 -3.95059567
  -3.78537143]
 [-4.46152565 -4.29366386 -4.14847603 ... -3.96642906 -3.95061564
  -3.78536116]]
VGGish embedding:  [0.         0.         0.         0.         0.         0.
 0.         0.16137299 0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.8069577
 0.         0.         0.         0.         0.         0.
 0.         0.3679273  0.035824   0.         0.         0.
 0.         0.38027024 0.1375595  0.9174706  0.80656356 0.
 0.         0.         0.         0.04036269 0.70762444 0.
 0.497

In [6]:
#Looks like we are going to use the PCA whitened embedding version. WHY?

In [7]:
# Feed data into the model
def getListOfFiles(dirpath):
    """Recursively collect the paths of all files below ``dirpath``.

    Args:
        dirpath: Directory to scan.

    Returns:
        A list of paths (``dirpath`` joined with the relative location of each
        file).  Directories themselves are not included.  Entries are visited
        in sorted name order, with sub-directories expanded in place, so the
        result is identical from run to run and from machine to machine.

    Raises:
        OSError: If ``dirpath`` (or a nested directory) cannot be listed.
    """
    allFiles = []
    # os.listdir() makes no ordering guarantee; sorting keeps the file order (and
    # therefore anything derived from it, such as labels) reproducible.
    for entry in sorted(os.listdir(dirpath)):
        fullPath = os.path.join(dirpath, entry)
        if os.path.isdir(fullPath):
            # extend() grows the list in place instead of copying it for every
            # sub-directory as ``allFiles = allFiles + ...`` would.
            allFiles.extend(getListOfFiles(fullPath))
        else:
            allFiles.append(fullPath)
    return allFiles

def urban_labels(Y, fpaths):
    """Append the class id encoded in each file name to ``Y``.

    The id is the second ``-``-separated field of the file name, e.g.
    ``24347-8-0-19.wav`` yields ``8``.

    Args:
        Y: Existing label array that the new labels are appended to.
        fpaths: Iterable of file paths whose names carry the class id.

    Returns:
        A 1-D array holding the flattened contents of ``Y`` followed by one
        label per path.  When ``fpaths`` is empty, ``Y`` itself is returned
        unchanged.

    Raises:
        IndexError: If a file name contains no ``-`` separator.
        ValueError: If the second field of a file name is not an integer.
    """
    new_labels = [int(os.path.basename(p).split('-')[1]) for p in fpaths]
    if not new_labels:
        return Y
    # One append for the whole batch: calling np.append once per file copies the
    # growing array every time, which is quadratic in the number of files.
    return np.append(Y, new_labels)

def load_sound_files(file_paths, target_seconds=4, dtype=np.float64):
    """Load one audio file and loop/trim it to a fixed-length clip.

    The file is decoded with ``librosa.load`` using its default settings.
    Recordings shorter than ``target_seconds`` are repeated end-to-end until
    they are long enough; the result is then cut to exactly
    ``sample_rate * target_seconds`` samples.

    Args:
        file_paths: Path of a single audio file (the plural name is kept for
            backward compatibility).
        target_seconds: Length of the returned clip in seconds.
        dtype: dtype of the returned array.  The default keeps the historical
            float64 output; pass ``np.float32`` to halve the memory needed when
            many clips are held at once.

    Returns:
        1-D array with ``int(sample_rate * target_seconds)`` samples.

    Raises:
        ValueError: If the file decodes to zero samples (there is nothing to
            repeat).
    """
    samples, sample_rate = librosa.load(file_paths)
    if len(samples) == 0:
        raise ValueError(
            "Audio file contains no samples: {}".format(file_paths))

    target_len = int(sample_rate * target_seconds)
    # Ceiling division on sample counts.  This avoids librosa.get_duration(),
    # whose positional-argument form is rejected by newer librosa releases, and
    # cannot under-count repeats through floating point rounding.
    repeats = -(-target_len // len(samples))
    clip = np.tile(samples, repeats)[:target_len]
    return np.asarray(clip, dtype=dtype)

# y, sr = librosa.load(path_name)

# for i in range(len(sound_file_paths)):
#     filepath, sound_name = path_class(sound_file_paths[i])
#     raw_sounds = load_sound_files(filepath)


In [8]:
#Loading raw sounds
import pickle

# Every file below the dataset root, as an array of path strings.
filepaths = np.asarray(getListOfFiles(DATASET_16BIT_PATH))
print("No of entries in our Dataset: ", filepaths.shape)

# Labels are parsed from the file names, in the same order as `filepaths`.
Y = np.empty(0)
labels = urban_labels(Y, filepaths)
print("Labels : ",labels.shape)

# Keep the clips as float32.  librosa decodes audio to float32 by default, so
# casting the loaded clip back to float32 loses no information, while holding
# float64 copies of ~9.7k four-second clips (88200 samples each) needs roughly
# 6.9 GB instead of 3.4 GB and is a likely cause of the MemoryError seen here.
raw = []
for i in range(len(filepaths)):
    raw.append(np.asarray(load_sound_files(filepaths[i]), dtype=np.float32))
    if i%100 == 0 :
        print("Raw Sound loaded for :", filepaths[i])

#An example file
i = 5
print("Filepath : ", filepaths[i])
print("Label : ", labels[i])
# First sample of the example clip.
print("raw sound: ", raw[i][0])
# Number of clips that were loaded (not the number of samples in one clip).
print("raw sound length: ", len(raw))
#librosa.load (used by load_sound_files) resamples every file to 22050 Hz by default


#del raw

No of entries in our Dataset:  (9712,)
Labels :  (9712,)
Raw Sound loaded for : D:/Thesis/UrbanSound8K-16bit/audio-classified\air_conditioner\100852-0-0-0.wav
Raw Sound loaded for : D:/Thesis/UrbanSound8K-16bit/audio-classified\air_conditioner\13230-0-0-22.wav
Raw Sound loaded for : D:/Thesis/UrbanSound8K-16bit/audio-classified\air_conditioner\146714-0-0-41.wav
Raw Sound loaded for : D:/Thesis/UrbanSound8K-16bit/audio-classified\air_conditioner\162103-0-0-14.wav
Raw Sound loaded for : D:/Thesis/UrbanSound8K-16bit/audio-classified\air_conditioner\177726-0-0-15.wav
Raw Sound loaded for : D:/Thesis/UrbanSound8K-16bit/audio-classified\air_conditioner\178686-0-0-43.wav
Raw Sound loaded for : D:/Thesis/UrbanSound8K-16bit/audio-classified\air_conditioner\189982-0-0-20.wav
Raw Sound loaded for : D:/Thesis/UrbanSound8K-16bit/audio-classified\air_conditioner\204240-0-0-23.wav
Raw Sound loaded for : D:/Thesis/UrbanSound8K-16bit/audio-classified\air_conditioner\57320-0-0-5.wav
Raw Sound loaded for

Raw Sound loaded for : D:/Thesis/UrbanSound8K-16bit/audio-classified\siren\24347-8-0-19.wav
Raw Sound loaded for : D:/Thesis/UrbanSound8K-16bit/audio-classified\siren\34952-8-0-6.wav
Raw Sound loaded for : D:/Thesis/UrbanSound8K-16bit/audio-classified\siren\74364-8-1-22.wav
Raw Sound loaded for : D:/Thesis/UrbanSound8K-16bit/audio-classified\siren\94636-8-0-4.wav
Raw Sound loaded for : D:/Thesis/UrbanSound8K-16bit/audio-classified\street_music\115415-9-0-5.wav
Raw Sound loaded for : D:/Thesis/UrbanSound8K-16bit/audio-classified\street_music\14385-9-0-10.wav
Raw Sound loaded for : D:/Thesis/UrbanSound8K-16bit/audio-classified\street_music\155217-9-1-58.wav
Raw Sound loaded for : D:/Thesis/UrbanSound8K-16bit/audio-classified\street_music\171406-9-0-183.wav
Raw Sound loaded for : D:/Thesis/UrbanSound8K-16bit/audio-classified\street_music\186336-9-0-3.wav
Raw Sound loaded for : D:/Thesis/UrbanSound8K-16bit/audio-classified\street_music\26270-9-0-19.wav
Raw Sound loaded for : D:/Thesis/Urba

MemoryError: 

In [10]:
# Persist the loaded clips in NumPy's binary format (np.save appends ".npy" to the
# name when it is missing, so this writes "test.npy").
# NOTE(review): np.asarray(raw) builds a single array out of the whole list before
# saving, so this presumably needs room for a second copy of the audio -- confirm
# it fits in memory.
np.save("test",np.asarray(raw))
# Earlier pickle-based alternative, kept for reference:
# with open("raw_sounds_dump.txt", "wb") as fp:   #Pickling
#     pickle.dump(raw, fp)
# with open("raw_sounds_dump.txt", "rb") as fp:   # Unpickling
#     raw_arr = np.asarray(pickle.load(fp))


In [17]:
# X, sr = librosa.load("D:/Thesis/UrbanSound8K-16bit/audio-classified/car_horn/100648-1-0-0.wav")
# time = librosa.get_duration(X, sr)
# factor = math.ceil(4/time)

# Y = np.tile(X,factor)
# Y = Y[0:(sr*4)]
# Y.shape
#raw_arr[0].shape


(88200,)

In [18]:
#Feature extractor model

# VGGish instance used by feature_extractor; it is built and loaded from the same
# checkpoint as `model` in the install-check cell.
model1 = vggish_keras.get_vggish_keras()
model1.load_weights(checkpoint_path)
# Rebinds `pproc` (also created in the install-check cell) from the same PCA file.
pproc = vggish_postprocess.Postprocessor(pca_params_path)

# Waveform -> log mel examples -> VGGish embeddings -> post-processed embeddings.
def feature_extractor(x, sr):
    """Run one waveform through the VGGish feature pipeline.

    Args:
        x: Audio samples, handed to ``vggish_input.waveform_to_examples``.
        sr: Sample rate of ``x``; it must describe how ``x`` was sampled.

    Returns:
        ``np.ndarray`` with the output of ``pproc.postprocess`` for the
        embeddings that ``model1`` predicts for ``x``.
    """
    examples = vggish_input.waveform_to_examples(x, sr)
    # The model is fed the examples with an extra trailing axis of length 1.
    model_input = examples[:, :, :, np.newaxis]
    embeddings = np.asarray(model1.predict(model_input))
    return np.asarray(pproc.postprocess(embeddings))



In [20]:
# Extract post-processed VGGish embeddings for every loaded clip.
# `raw` must already be populated (by the raw-sound loading cell, or restored from
# a saved dump) before this cell is run.

# The clips in `raw` come from librosa.load() with its default settings, i.e. they
# are sampled at 22050 Hz.  The notebook-level `sr` must not be used here: after a
# fresh "Run All" it still holds the 44100 Hz of the install-check sine wave, which
# would make VGGish treat every clip as half as long and an octave higher.
RAW_SAMPLE_RATE = 22050

# Fail fast instead of crashing (or silently mis-pairing clips and paths) after a
# long extraction run.
if len(raw) != len(filepaths):
    raise ValueError(
        "raw holds {} clips but filepaths has {} entries".format(
            len(raw), len(filepaths)))

PCA_embeddings = []
for i in range(len(raw)):
    PCA_embedding = feature_extractor(raw[i], RAW_SAMPLE_RATE)
    PCA_embeddings.append(PCA_embedding)
    if i%100 ==0:
        print(" Features extracted from : ", filepaths[i])

print(len(PCA_embeddings))
      
      


 Features extracted from :  D:/Thesis/UrbanSound8K-16bit/audio-classified\air_conditioner\100852-0-0-0.wav
 Features extracted from :  D:/Thesis/UrbanSound8K-16bit/audio-classified\air_conditioner\13230-0-0-22.wav
 Features extracted from :  D:/Thesis/UrbanSound8K-16bit/audio-classified\air_conditioner\146714-0-0-41.wav
 Features extracted from :  D:/Thesis/UrbanSound8K-16bit/audio-classified\air_conditioner\162103-0-0-14.wav
 Features extracted from :  D:/Thesis/UrbanSound8K-16bit/audio-classified\air_conditioner\177726-0-0-15.wav
 Features extracted from :  D:/Thesis/UrbanSound8K-16bit/audio-classified\air_conditioner\178686-0-0-43.wav
 Features extracted from :  D:/Thesis/UrbanSound8K-16bit/audio-classified\air_conditioner\189982-0-0-20.wav
 Features extracted from :  D:/Thesis/UrbanSound8K-16bit/audio-classified\air_conditioner\204240-0-0-23.wav
 Features extracted from :  D:/Thesis/UrbanSound8K-16bit/audio-classified\air_conditioner\57320-0-0-5.wav
 Features extracted from :  D:/T

 Features extracted from :  D:/Thesis/UrbanSound8K-16bit/audio-classified\siren\156868-8-2-0.wav
 Features extracted from :  D:/Thesis/UrbanSound8K-16bit/audio-classified\siren\157868-8-0-9.wav
 Features extracted from :  D:/Thesis/UrbanSound8K-16bit/audio-classified\siren\159748-8-0-7.wav
 Features extracted from :  D:/Thesis/UrbanSound8K-16bit/audio-classified\siren\165166-8-0-7.wav
 Features extracted from :  D:/Thesis/UrbanSound8K-16bit/audio-classified\siren\24347-8-0-19.wav
 Features extracted from :  D:/Thesis/UrbanSound8K-16bit/audio-classified\siren\34952-8-0-6.wav
 Features extracted from :  D:/Thesis/UrbanSound8K-16bit/audio-classified\siren\74364-8-1-22.wav
 Features extracted from :  D:/Thesis/UrbanSound8K-16bit/audio-classified\siren\94636-8-0-4.wav
 Features extracted from :  D:/Thesis/UrbanSound8K-16bit/audio-classified\street_music\115415-9-0-5.wav
 Features extracted from :  D:/Thesis/UrbanSound8K-16bit/audio-classified\street_music\14385-9-0-10.wav
 Features extracte

In [21]:
#save PCA embeddings to file
# NOTE: the file contains binary pickle data despite its ".txt" extension, and only
# the embeddings list is written (no file paths or labels).
with open("PCA_embeddings.txt", "wb") as fp:   #Pickling
    pickle.dump(PCA_embeddings, fp)

In [None]:
#Consolidate data values and labels

In [None]:
#Build a network based on the extracted features

In [None]:
# Classification head on top of two merged feature branches.
# NOTE(review): `concatenate`, `Dense`, `Model`, `feature_model`, `model2_out`,
# `model1_in` and `model2_in` are not defined in the visible part of the notebook;
# they have to be imported / built before this cell can run.

# UrbanSound8K class ids run from 0 to 9 (second field of every file name).
NUM_CLASSES = 10

concatenated = concatenate([feature_model, model2_out])
# One output unit per class: a softmax over a single unit always yields 1.0, so
# Dense(1, activation='softmax') could never discriminate between classes.
out = Dense(NUM_CLASSES, activation='softmax', name='output_layer')(concatenated)

merged_model = Model([model1_in, model2_in], out)