In [80]:
#Load Dataset
import os
import numpy as np
import librosa


In [2]:
#Preprocess audio for vggish model input

In [9]:
# Notebook-wide parameters.

# Locations of the VGGish weight checkpoint and PCA parameter file, relative
# to the notebook's working directory.
VGGISH_CHECKPOINT_PATH = "./vggish/model_parameters/vggish_weights.ckpt"
VGGISH_PCA_PARAMS_PATH = "./vggish/model_parameters/vggish_pca_params.npz"

# Root folder of the audio dataset. The default is an absolute path that only
# exists on the original author's machine; set the URBANSOUND8K_16BIT_PATH
# environment variable to point the notebook at another copy without editing
# this cell. An unset or empty variable falls back to the default.
DATASET_16BIT_PATH = (
    os.environ.get("URBANSOUND8K_16BIT_PATH")
    or "D:/Thesis/UrbanSound8K-16bit/audio-classified"
)


In [4]:
#optional: Augment the datasets

In [5]:
# VGGish install smoke test.
# Runs a synthetic sine wave through the whole VGGish pipeline
# (waveform -> log mel examples -> embeddings -> postprocessed embeddings)
# and asserts that the summary statistics of each stage match reference
# values. No dataset audio is processed in this cell.
from __future__ import print_function
import sys
# Make the modules in the local ./vggish directory importable.
sys.path.append("./vggish")

import vggish_input
import vggish_params
import vggish_postprocess
import vggish_keras

print('\nTesting your install of VGGish\n')

# Paths to downloaded VGGish files.
checkpoint_path = VGGISH_CHECKPOINT_PATH
pca_params_path = VGGISH_PCA_PARAMS_PATH

# Relative tolerance of errors in mean and standard deviation of embeddings.
rel_error = 0.1  # Up to 10%

# Generate a 1 kHz sine wave at 44.1 kHz (we use a high sampling rate
# to test resampling to 16 kHz during feature extraction).
num_secs = 3
freq = 1000
sr = 44100
t = np.linspace(0, num_secs, int(num_secs * sr))
x = np.sin(2 * np.pi * freq * t)

# Produce a batch of log mel spectrogram examples.
input_batch = vggish_input.waveform_to_examples(x, sr)
print('Log Mel Spectrogram example: ', input_batch[0])
# The batch is expected to hold one example per second of audio, each of
# size NUM_FRAMES x NUM_BANDS.
np.testing.assert_equal(
    input_batch.shape,
    [num_secs, vggish_params.NUM_FRAMES, vggish_params.NUM_BANDS])

# Define VGGish, load the checkpoint, and run the batch through the model to
# produce embeddings.
model = vggish_keras.get_vggish_keras()
model.load_weights(checkpoint_path)
# Indexing with None appends a trailing axis of length 1 to the batch --
# presumably the channel dimension the Keras model expects; TODO confirm
# against vggish_keras.
embedding_batch = model.predict(input_batch[:,:,:,None])
print('VGGish embedding: ', embedding_batch[0])
# Reference statistics for this input; the check below fails if the mean or
# standard deviation is off by more than rel_error.
expected_embedding_mean = 0.131
expected_embedding_std = 0.238
np.testing.assert_allclose(
  [np.mean(embedding_batch), np.std(embedding_batch)],
  [expected_embedding_mean, expected_embedding_std],
  rtol=rel_error)

# Postprocess the results to produce whitened quantized embeddings.
pproc = vggish_postprocess.Postprocessor(pca_params_path)
postprocessed_batch = pproc.postprocess(embedding_batch)
print('Postprocessed VGGish embedding: ', postprocessed_batch[0])
# Reference statistics of the postprocessed output, checked with the same
# relative tolerance.
expected_postprocessed_mean = 123.0
expected_postprocessed_std = 75.0
np.testing.assert_allclose(
    [np.mean(postprocessed_batch), np.std(postprocessed_batch)],
    [expected_postprocessed_mean, expected_postprocessed_std],
    rtol=rel_error)

# Only reached when every assertion above passed.
print('\nLooks Good To Me!\n')

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.



Testing your install of VGGish

Log Mel Spectrogram example:  [[-4.47297436 -4.29457354 -4.14940631 ... -3.9747003  -3.94774997
  -3.78687669]
 [-4.48589533 -4.28825497 -4.139964   ... -3.98368686 -3.94976505
  -3.7951698 ]
 [-4.46158065 -4.29329706 -4.14905953 ... -3.96442484 -3.94895483
  -3.78619839]
 ...
 [-4.46152626 -4.29365061 -4.14848608 ... -3.96638113 -3.95057575
  -3.78538167]
 [-4.46152595 -4.2936572  -4.14848104 ... -3.96640507 -3.95059567
  -3.78537143]
 [-4.46152565 -4.29366386 -4.14847603 ... -3.96642906 -3.95061564
  -3.78536116]]
VGGish embedding:  [0.         0.         0.         0.         0.         0.
 0.         0.16137299 0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.8069577
 0.         0.         0.         0.         0.         0.
 0.         0.3679273  0.035824   0.         0.         0.
 0.         0.38027024 0.1375595  0.9174706  0.80656356 0.
 0.         0.         0.         0.04036269 0.70762444 0.
 0.497

In [10]:
#Looks like we are going to use the PCA whitened embedding version

In [189]:
# Helper functions: list the dataset files, derive class labels, load audio
def getListOfFiles(dirpath):
    """Recursively collect the paths of all files below ``dirpath``.

    The entries of every directory are visited in sorted name order, and the
    files of a sub-directory are emitted at the position where that
    sub-directory appears. ``os.listdir`` on its own returns entries in
    arbitrary order, so sorting keeps the index -> file mapping reproducible
    across runs and platforms.

    Args:
        dirpath: Directory to scan.

    Returns:
        list of str: One path per non-directory entry, each built by joining
        ``dirpath`` with the entry's relative location. Empty directories
        contribute nothing.

    Raises:
        OSError: If ``dirpath`` or one of its sub-directories cannot be
            listed.
    """
    allFiles = []
    for entry in sorted(os.listdir(dirpath)):
        fullPath = os.path.join(dirpath, entry)
        if os.path.isdir(fullPath):
            # extend() grows the list in place; the previous
            # ``allFiles = allFiles + ...`` copied everything collected so
            # far once per sub-directory.
            allFiles.extend(getListOfFiles(fullPath))
        else:
            allFiles.append(fullPath)
    return allFiles

def urban_labels(Y, fpaths):
    """Append the class ID encoded in each file name of ``fpaths`` to ``Y``.

    The class ID is the second ``-``-separated field of the file name, e.g.
    ``100852-0-0-13.wav`` -> ``0``.

    Args:
        Y: Existing label array to extend; it is not modified.
        fpaths: Iterable of audio file paths.

    Returns:
        ``Y`` followed by one label per path, in input order, as produced by
        ``np.append`` (so a float ``Y`` such as ``np.ndarray(0)`` yields
        float labels). When ``fpaths`` is empty, ``Y`` itself is returned
        unchanged.

    Raises:
        IndexError: If a file name contains no ``-`` separator.
        ValueError: If the class-ID field is not an integer.
    """
    class_ids = [int(os.path.basename(p).split('-')[1]) for p in fpaths]
    if not class_ids:
        return Y
    # Append the whole batch at once: np.append copies the full array on
    # every call, so appending file by file is quadratic in the file count.
    return np.append(Y, class_ids)

def load_sound_files(file_paths):
    """Load one audio file and return its samples wrapped in a list.

    Despite the plural name, ``file_paths`` is passed straight to
    ``librosa.load`` as a single path. The sample rate that ``librosa.load``
    returns alongside the samples is discarded.

    Returns:
        list: A one-element list holding the loaded waveform.
    """
    waveform, _ = librosa.load(file_paths)
    return [waveform]

# y, sr = librosa.load(path_name)

# for i in range(len(sound_file_paths)):
#     filepath, sound_name = path_class(sound_file_paths[i])
#     raw_sounds = load_sound_files(filepath)


In [190]:

# Collect the path of every file found (recursively) under the dataset root.
filepaths = getListOfFiles(DATASET_16BIT_PATH)
print("No of entries in our Dataset: ", len(filepaths))

# Start from an empty array and let urban_labels append one class ID per
# file. np.ndarray(0) is a float array, so the labels come out as floats.
Y = np.ndarray(0)
labels = urban_labels(Y, filepaths)
print("Labels : ",labels.shape)

# Load only the first 10 files for now. Each raw[k] is the one-element list
# returned by load_sound_files, so the waveform itself is raw[k][0].
raw = []
for i in range(10):
    # TODO: switch the range to len(filepaths) when ready to load everything
    raw.append(load_sound_files(filepaths[i]))

# Spot-check a single entry: its path, label, samples and sample count.
i = 5
print("Filepath : ", filepaths[i])
print("Label : ", labels[i])
print("raw sound: ", raw[i][0])
print("raw sound: ", len(raw[i][0]))
# Note: librosa.load resamples to 22.05 kHz by default at load time.

No of entries in our Dataset:  9712
Labels :  (9712,)
Filepath :  D:/Thesis/UrbanSound8K-16bit/audio-classified\air_conditioner\100852-0-0-13.wav
Label :  0.0
raw sound:  [ 0.04521289  0.02396493 -0.01914914 ... -0.0242601  -0.01266464
 -0.02522417]
raw sound:  88200


In [37]:
#Build a network based on the extracted features