In [1]:
# !pip install librosa

In [1]:
import os
import pickle
import numpy as np
import pandas as pd
from glob import glob
from tqdm import tqdm_notebook # progress bars
from ipywidgets import IntProgress # bug fix with tqdm

import librosa # calculates mfccs

# vggish
import tensorflow as tf
from audioset import vggish_input
from audioset import vggish_slim
from audioset import vggish_postprocess
from audioset import vggish_params
from audioset import mel_features

import matplotlib.pyplot as plt
%matplotlib inline

Instructions for updating:
Use the retry module or similar alternatives.


<IPython.core.display.Javascript object>

In [2]:
plt.rcParams['figure.figsize'] = 15, 8

# mapping between class index and label
classes = ['air_conditioner', 'car_horn', 'children_playing', 'dog_bark', 'drilling',
           'engine_idling', 'gun_shot', 'jackhammer', 'siren', 'street_music']
class_index = lambda c: classes.index(c) # ('car_horn') returns 1

# get the glob file pattern for audio files in a specific class
wav_file_pattern = '../Data/UrbanSound8k/audio/fold*/*-{}-*-*.wav'
class_fname = lambda c: wav_file_pattern.format(class_index(c)) # ('car_horn') returns '.../*-1-*-*.wav'

# Defining the CNN
We are loading the pretrained VGGish model from the audioset directory. The original repo can be found [here](https://github.com/tensorflow/models/tree/master/research/audioset).

### Don't run this cell twice cuz it'll give you an error !!

In [4]:
# don't run twice
# VGG-ish model parameters
PCA_PARAMS = 'audioset/vggish_pca_params.npz'
CHECKPOINT = 'audioset/vggish_model.ckpt'

# Prepare a postprocessor to munge the model embeddings
pproc = vggish_postprocess.Postprocessor(PCA_PARAMS)

# initialize tensorflow stuff
tf.Graph().as_default()
sess = tf.Session()

# Define the model in inference mode, load the checkpoint, and
# locate input and output tensors.
vggish_slim.define_vggish_slim(training=False)
vggish_slim.load_vggish_slim_checkpoint(sess, CHECKPOINT)
features_tensor = sess.graph.get_tensor_by_name(vggish_params.INPUT_TENSOR_NAME)
embedding_tensor = sess.graph.get_tensor_by_name(vggish_params.OUTPUT_TENSOR_NAME)

INFO:tensorflow:Restoring parameters from audioset/vggish_model.ckpt


# Load files for a specific class

In [3]:
# choose which class you're extracting embeddings for
CLASS = 'air_conditioner'

# we can extract data for each class to separate files and combine them after
OUTPUT_FILE = '../Data/features/output_{}.pkl'.format(CLASS)

# get all wave files for a specific class
wav_files = glob(class_fname(CLASS))
OUTPUT_FILE, len(wav_files), wav_files[:5]

('../Data/features/output_air_conditioner.pkl',
 1000,
 ['../Data/UrbanSound8k/audio/fold1/127873-0-0-0.wav',
  '../Data/UrbanSound8k/audio/fold1/134717-0-0-0.wav',
  '../Data/UrbanSound8k/audio/fold1/134717-0-0-1.wav',
  '../Data/UrbanSound8k/audio/fold1/134717-0-0-12.wav',
  '../Data/UrbanSound8k/audio/fold1/134717-0-0-13.wav'])

# Run the model and get embeddings
The embeddings are extracted and then the PCA is taken of the embeddings.

In [6]:
postprocessed_batch = []
for wav_file in tqdm_notebook(wav_files, desc='VGGish'):
    # load the file and calculate mel spectrogram
    example = vggish_input.wavfile_to_examples(wav_file)
    
    if example.size:
        # get embedding
        [embedding] = sess.run([embedding_tensor], feed_dict={features_tensor: example})
        # get pca of embedding
        pca = pproc.postprocess(embedding)
    else:
        pca = None
    postprocessed_batch.append(pca)
    

HBox(children=(IntProgress(value=0, description=u'VGGish', max=1000), HTML(value=u'')))




<IPython.core.display.Javascript object>

# Compute MFCCs
MFCCs are a representation of the timbre of a sound (as opposed to pitch). This is a large part of how we differentiate different sounds from each other. For a thorough explanation of MFCCs you can go [here](http://www.speech.cs.cmu.edu/15-492/slides/03_mfcc.pdf), but only if you're interested. Otherwise feel free to just run the code. :D

In [8]:
512 / 22050.

0.023219954648526078

In [4]:
mfcc_batch = []
for wav_file in tqdm_notebook(wav_files, desc='MFCCs'):
    # load the audio files
    data, sample_rate = librosa.load(wav_file)
    print(sample_rate)
    # compute the mfccs
    mfccs = librosa.feature.mfcc(y=data, sr=sample_rate).T
    mfcc_batch.append(mfccs)

HBox(children=(IntProgress(value=0, description=u'MFCCs', max=1000), HTML(value=u'')))

22050
22050
22050
22050
22050
22050
22050
22050
22050
22050
22050
22050
22050
22050
22050
22050
22050
22050
22050
22050
22050
22050
22050
22050
22050
22050
22050
22050
22050
22050
22050
22050
22050
22050
22050
22050
22050
22050
22050


KeyboardInterrupt: 

# Save features to file

### Info from [UrbanSound8k website](https://serv.cusp.nyu.edu/projects/urbansounddataset/urbansound8k.html)
* slice_file_name: 
The name of the audio file. The name takes the following format: [fsID]-[classID]-[occurrenceID]-[sliceID].wav, where:
[fsID] = the Freesound ID of the recording from which this excerpt (slice) is taken
[classID] = a numeric identifier of the sound class (see description of classID below for further details)
[occurrenceID] = a numeric identifier to distinguish different occurrences of the sound within the original recording
[sliceID] = a numeric identifier to distinguish different slices taken from the same occurrence

* fsID:
The Freesound ID of the recording from which this excerpt (slice) is taken

* start
The start time of the slice in the original Freesound recording

* end:
The end time of slice in the original Freesound recording

* salience:
A (subjective) salience rating of the sound. 1 = foreground, 2 = background.

* fold:
The fold number (1-10) to which this file has been allocated.

* classID:
A numeric identifier of the sound class:
0 = air_conditioner
1 = car_horn
2 = children_playing
3 = dog_bark
4 = drilling
5 = engine_idling
6 = gun_shot
7 = jackhammer
8 = siren
9 = street_music

* class:
The class name: air_conditioner, car_horn, children_playing, dog_bark, drilling, engine_idling, gun_shot, jackhammer, 
siren, street_music.

In [5]:
# create the output file if it doesn't exist
if not os.path.isfile(OUTPUT_FILE):
    df = pd.read_csv('../Data/UrbanSound8K/metadata/UrbanSound8K.csv').set_index('slice_file_name')
    df = df[df['class'] == CLASS]
    df['embedding'] = None
    df['mfcc'] = None
    df.to_csv(OUTPUT_FILE)

# load data for that class
df = pd.read_pickle(OUTPUT_FILE).set_index('slice_file_name')
df.head()

Unnamed: 0_level_0,fsID,start,end,salience,fold,classID,class,embedding,mfcc
slice_file_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
100852-0-0-0.wav,100852,0.0,4.0,1,5,0,air_conditioner,cnumpy.core.multiarray\n_reconstruct\np0\n(cnu...,cnumpy.core.multiarray\n_reconstruct\np0\n(cnu...
100852-0-0-1.wav,100852,0.5,4.5,1,5,0,air_conditioner,cnumpy.core.multiarray\n_reconstruct\np0\n(cnu...,cnumpy.core.multiarray\n_reconstruct\np0\n(cnu...
100852-0-0-10.wav,100852,5.0,9.0,1,5,0,air_conditioner,cnumpy.core.multiarray\n_reconstruct\np0\n(cnu...,cnumpy.core.multiarray\n_reconstruct\np0\n(cnu...
100852-0-0-11.wav,100852,5.5,9.5,1,5,0,air_conditioner,cnumpy.core.multiarray\n_reconstruct\np0\n(cnu...,cnumpy.core.multiarray\n_reconstruct\np0\n(cnu...
100852-0-0-12.wav,100852,6.0,10.0,1,5,0,air_conditioner,cnumpy.core.multiarray\n_reconstruct\np0\n(cnu...,cnumpy.core.multiarray\n_reconstruct\np0\n(cnu...


In [11]:
# Compile computed features so they can be added to the existing dataframe
# use pickle to convert to string so that we can reconstitute them later
new_features = pd.DataFrame({
    'embedding': postprocessed_batch, 
    'mfcc': mfcc_batch,
    # isolate the filename
    'slice_file_name': [os.path.basename(f) for f in wav_files]
}).set_index('slice_file_name')
new_features.head()

Unnamed: 0_level_0,embedding,mfcc
slice_file_name,Unnamed: 1_level_1,Unnamed: 2_level_1
127873-0-0-0.wav,cnumpy.core.multiarray\n_reconstruct\np0\n(cnu...,cnumpy.core.multiarray\n_reconstruct\np0\n(cnu...
134717-0-0-0.wav,cnumpy.core.multiarray\n_reconstruct\np0\n(cnu...,cnumpy.core.multiarray\n_reconstruct\np0\n(cnu...
134717-0-0-1.wav,cnumpy.core.multiarray\n_reconstruct\np0\n(cnu...,cnumpy.core.multiarray\n_reconstruct\np0\n(cnu...
134717-0-0-12.wav,cnumpy.core.multiarray\n_reconstruct\np0\n(cnu...,cnumpy.core.multiarray\n_reconstruct\np0\n(cnu...
134717-0-0-13.wav,cnumpy.core.multiarray\n_reconstruct\np0\n(cnu...,cnumpy.core.multiarray\n_reconstruct\np0\n(cnu...


In [12]:
# insert the new features into the dataframe
df.update(new_features)
df.head()

Unnamed: 0_level_0,fsID,start,end,salience,fold,classID,class,embedding,mfcc
slice_file_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
100852-0-0-0.wav,100852,0.0,4.0,1,5,0,air_conditioner,cnumpy.core.multiarray\n_reconstruct\np0\n(cnu...,cnumpy.core.multiarray\n_reconstruct\np0\n(cnu...
100852-0-0-1.wav,100852,0.5,4.5,1,5,0,air_conditioner,cnumpy.core.multiarray\n_reconstruct\np0\n(cnu...,cnumpy.core.multiarray\n_reconstruct\np0\n(cnu...
100852-0-0-10.wav,100852,5.0,9.0,1,5,0,air_conditioner,cnumpy.core.multiarray\n_reconstruct\np0\n(cnu...,cnumpy.core.multiarray\n_reconstruct\np0\n(cnu...
100852-0-0-11.wav,100852,5.5,9.5,1,5,0,air_conditioner,cnumpy.core.multiarray\n_reconstruct\np0\n(cnu...,cnumpy.core.multiarray\n_reconstruct\np0\n(cnu...
100852-0-0-12.wav,100852,6.0,10.0,1,5,0,air_conditioner,cnumpy.core.multiarray\n_reconstruct\np0\n(cnu...,cnumpy.core.multiarray\n_reconstruct\np0\n(cnu...


In [13]:
# save data
df.to_pickle(OUTPUT_FILE)