In [1]:
import numpy as np
import librosa
from librosa.feature import melspectrogram
import pandas as pd
import soundfile
import os
from os import listdir
from os.path import isfile, join  
from librosa.core import stft
from sklearn.preprocessing import MinMaxScaler
import h5py
import pandas as pd

## Functions

In [2]:
def normalize_amplitude(y, tolerance=0.005):
    """
    Remove the DC offset of a signal and scale it by its peak amplitude.

    :param y: audio samples (array-like of numbers, any number of dimensions)
    :param tolerance: head room added to the peak before dividing; it keeps
                      the result strictly inside (-1, 1) and prevents a
                      division by zero for an all-zero signal
    :return: new array with zero mean, divided by (peak + tolerance);
             the input is not modified
    """
    # Build a new array instead of subtracting in place: the caller's buffer
    # stays untouched and integer input is promoted to float.
    centered = np.asarray(y) - np.mean(y)
    # np.max/np.abs (unlike the builtin max/abs pair) also handle 2-D input.
    max_value = np.max(np.abs(centered)) + tolerance
    # The original computed max_value but returned the unscaled signal.
    return centered / max_value

In [3]:
def convert2mel(audio,base_path,fs, n_fft,fmax,n_mels,hop_length_samples, window_lenght,ii):
    """
    Convert one raw audio file to a flattened, normalized mel spectrogram.

    :param audio: file name of the clip inside base_path
    :param base_path: directory that contains the clip
    :param fs: sample rate the clip is resampled to when it differs
    :param n_fft: FFT size used for the STFT and the mel filter bank
    :param fmax: highest frequency of the mel filter bank
    :param n_mels: number of mel bands
    :param hop_length_samples: hop between STFT frames, in samples
    :param window_lenght: STFT window length, in samples
    :param ii: index of the clip; only used for the progress message
    :return: 1-D array holding the (n_mels, frames) spectrogram flattened
             row by row (assumes a mono clip -- TODO confirm behaviour for
             multi-channel files). Every value is NaN when the spectrogram
             is all zeros, because the normalization then divides 0 by 0.
    """
    if ii % 500 == 0:
        print("Processing audio number {}".format(ii))

    path = os.path.join(base_path, audio)
    data, source_fs = soundfile.read(file=path)
    data = data.T

    # Resample if the source_fs is different from expected.
    # The sample rates are passed by keyword: they are keyword-only in
    # librosa >= 0.10 and keywords are accepted by older releases as well.
    if fs != source_fs:
        data = librosa.core.resample(data, orig_sr=source_fs, target_sr=fs)

    # Amplitude normalization adapted from Eduardo Fonseca's code. No length
    # check for corrupted clips is performed here; clips that end up as NaN
    # rows have to be filtered out by the caller.
    data = normalize_amplitude(data)

    # Power spectrogram; n_fft is keyword-only in librosa >= 0.10 as well.
    powSpectrum = np.abs(stft(data, n_fft=n_fft, hop_length=hop_length_samples,
                              win_length=window_lenght, window='hamming',
                              center=True, pad_mode='reflect'))**2

    # Transposed so that axis 0 is time and axis 1 is the mel band.
    mels = melspectrogram(y=None, n_fft=n_fft, sr=fs, S=powSpectrum,
                          hop_length=hop_length_samples,
                          n_mels=n_mels, fmax=fmax, fmin=0.0).T

    # Centre every mel band on its mean over time, then scale by the global
    # maximum of the un-centred spectrogram.
    mel_normalized = (mels - np.mean(mels, axis=0)) / np.amax(mels)
    return mel_normalized.T.flatten()

def normalize_and_save(list_mels,list_audio_name,which_set,file_name,n_mels = 96):
    """
    Scale flattened mel spectrograms with MinMaxScaler (default range, fitted
    on the given data) and save them with their file names and labels to
    DataProcessed/<file_name>.hdf5.

    :param list_mels: 2-D array, one flattened mel spectrogram per row
    :param list_audio_name: audio file names, in the same order as the rows
                            of list_mels
    :param which_set: train, valid or test; selects
                      FSDnoisy18k.meta/<which_set>_set.csv for the labels
    :param file_name: name of the file to be stored, without extension;
                      falls back to processed_<which_set>_set when empty
    :param n_mels: number of mel bands used to reshape each row
    :raises KeyError: if a file name is not listed in the metadata csv
    """
    import warnings

    if len(list_mels) != len(list_audio_name):
        # Saving would still succeed, but spectrograms, file names and
        # targets would no longer refer to the same clips.
        warnings.warn(
            "normalize_and_save: {} spectrogram rows but {} audio names; "
            "rows, file names and targets will not line up".format(
                len(list_mels), len(list_audio_name)))

    # normalize using MinMaxScaler
    data_normalized = MinMaxScaler().fit_transform(list_mels)

    # reshape to (data entries, number of channels, n_mels, frames)
    n_entries, n_features = data_normalized.shape
    data_reshape = np.reshape(data_normalized,
                              (n_entries, 1, n_mels, n_features // n_mels))

    # get labels from the metadata csv of this set
    meta_path = os.path.join("FSDnoisy18k.meta", "{}_set.csv".format(which_set))
    df_meta = pd.read_csv(meta_path)
    label_by_fname = dict(zip(df_meta.fname, df_meta.label))

    # Encode to save to h5
    list_targets_encoded = [label_by_fname[audio_name].encode('utf8')
                            for audio_name in list_audio_name]
    list_audio_encoded = [audio_name.encode('utf8') for audio_name in list_audio_name]

    # Honour the file_name argument (it used to be overwritten unconditionally).
    if not file_name:
        file_name = "processed_{}_set".format(which_set)
    os.makedirs("DataProcessed", exist_ok=True)
    hdf5_name = os.path.join("DataProcessed", file_name) + '.hdf5'

    dt = h5py.special_dtype(vlen=str)
    # The context manager closes the file even if a dataset cannot be written.
    with h5py.File(hdf5_name, "w") as hdf5_store:
        hdf5_store.create_dataset("all_inputs", data=data_reshape,
                                  compression="gzip")
        hdf5_store.create_dataset("file_name", data=list_audio_encoded,
                                  dtype=dt, compression="gzip")
        hdf5_store.create_dataset("targets", data=list_targets_encoded,
                                  dtype=dt, compression="gzip")

    print("{} set has been processed and saved".format(which_set))
    

## Parameters

In [4]:
# --- Audio ---
fs = 32000                    # target sample rate in Hz (clips at another rate are resampled)
audio_duration = 2000         # clip length in milliseconds (2 seconds)
number_of_frames = 2 * fs     # deprecated: the database already holds short clips

# --- STFT framing ---
n_fft = 2048
windows_size_s = 30                              # window length in milliseconds
windows_size_f = windows_size_s * fs // 1000     # window length in samples (960)
hop_length_samples = windows_size_f // 2         # half-window hop (480 samples)

# --- Mel filter bank ---
n_mels = 96
fmin = 0
fmax = fs // 2                # Nyquist frequency
spectrogram_type = 'power'
maximum_mel = 0

## Processing test data

In [18]:
# Names of the test clips. os.listdir returns entries in arbitrary order, so
# they are sorted to make the row order of the saved set reproducible.
audio_files = sorted(f for f in listdir("AudioClipsCut/test_audio_cut")
                     if isfile(join("AudioClipsCut/test_audio_cut", f)))

In [19]:
list_mels = np.asarray([convert2mel(audio,"AudioClipsCut/test_audio_cut",fs, n_fft,fmax,
                                    n_mels,hop_length_samples, windows_size_f,ii) 
                        for ii,audio in enumerate(audio_files)])

# Same clean-up as for the train and valid sets: drop rows that contain NaN
# and drop the matching file names so rows, names and labels stay aligned.
valid_rows = ~np.isnan(list_mels).any(axis=1)
list_cleaned = list_mels[valid_rows]
list_audio_name = [audio for audio, keep in zip(audio_files, valid_rows) if keep]

normalize_and_save(list_cleaned,list_audio_name,'test','processed_test_set')

Processing audio number 0
Processing audio number 500
test set has been processed and saved


## Processing train data

In [8]:
# Names of the train clips. os.listdir returns entries in arbitrary order, so
# they are sorted to make the row order of the saved set reproducible.
audio_files = sorted(f for f in listdir("AudioClipsCut/train_audio_cut")
                     if isfile(join("AudioClipsCut/train_audio_cut", f)))

In [9]:
# One flattened mel spectrogram per training clip, stacked into a 2-D array.
list_mels = np.asarray([
    convert2mel(clip_name, "AudioClipsCut/train_audio_cut", fs, n_fft, fmax,
                n_mels, hop_length_samples, windows_size_f, clip_index)
    for clip_index, clip_name in enumerate(audio_files)
])

# Keep only the rows in which no value is NaN.
list_cleaned = list_mels[np.logical_not(np.any(np.isnan(list_mels), axis=1))]

Processing audio number 0
Processing audio number 500
Processing audio number 1000
Processing audio number 1500
Processing audio number 2000
Processing audio number 2500
Processing audio number 3000
Processing audio number 3500
Processing audio number 4000
Processing audio number 4500
Processing audio number 5000
Processing audio number 5500




Processing audio number 6000
Processing audio number 6500
Processing audio number 7000
Processing audio number 7500
Processing audio number 8000
Processing audio number 8500
Processing audio number 9000
Processing audio number 9500
Processing audio number 10000
Processing audio number 10500
Processing audio number 11000
Processing audio number 11500
Processing audio number 12000
Processing audio number 12500
Processing audio number 13000
Processing audio number 13500
Processing audio number 14000
Processing audio number 14500
Processing audio number 15000
Processing audio number 15500
Processing audio number 16000
Processing audio number 16500
Processing audio number 17000


In [10]:
# list_cleaned had its NaN rows removed, so the matching file names must be
# removed too; otherwise every name and label after a dropped row would be
# paired with the wrong spectrogram.
valid_rows = ~np.isnan(list_mels).any(axis=1)
list_audio_name = [audio for audio, keep in zip(audio_files, valid_rows) if keep]

normalize_and_save(list_cleaned,list_audio_name,'train','processed_train_set')

train set has been processed and saved


## Processing valid data

In [5]:
# Names of the valid clips. os.listdir returns entries in arbitrary order, so
# they are sorted to make the row order of the saved set reproducible.
audio_files = sorted(f for f in listdir("AudioClipsCut/valid_audio_cut")
                     if isfile(join("AudioClipsCut/valid_audio_cut", f)))

In [6]:
# One flattened mel spectrogram per validation clip, stacked into a 2-D array.
list_mels = np.asarray([
    convert2mel(clip_name, "AudioClipsCut/valid_audio_cut", fs, n_fft, fmax,
                n_mels, hop_length_samples, windows_size_f, clip_index)
    for clip_index, clip_name in enumerate(audio_files)
])

# Keep only the rows in which no value is NaN.
list_cleaned = list_mels[np.logical_not(np.any(np.isnan(list_mels), axis=1))]

Processing audio number 0


In [7]:
# list_cleaned had its NaN rows removed, so the matching file names must be
# removed too; otherwise every name and label after a dropped row would be
# paired with the wrong spectrogram.
valid_rows = ~np.isnan(list_mels).any(axis=1)
list_audio_name = [audio for audio, keep in zip(audio_files, valid_rows) if keep]

normalize_and_save(list_cleaned,list_audio_name,'valid','processed_valid_set')

valid set has been processed and saved
