In [1]:
import librosa
import numpy as np
import scipy
import os
import pandas as pd
import h5py
from sklearn.preprocessing import MinMaxScaler


ModuleNotFoundError: No module named 'librosa'

In [2]:
# Load the per-split metadata tables (train / test / validation) in one pass.
df_train, df_test, df_valid = map(
    pd.read_csv,
    (
        "FSDnoisy18k.meta/train_set.csv",
        "FSDnoisy18k.meta/test_set.csv",
        "FSDnoisy18k.meta/valid_set.csv",
    ),
)

In [3]:
class Config(object):
    """Preprocessing settings and helpers that turn audio clips into saved features.

    Attributes set by the constructor:
        sampling_rate:  rate (Hz) every clip is resampled to on load.
        audio_duration: clip length in seconds after cropping/padding.
        n_classes:      stored for callers; not used by the methods below.
        use_mfcc:       when True, clips are converted to MFCCs.
        n_mfcc:         number of MFCC coefficients per frame.
        audio_length:   ``sampling_rate * audio_duration`` (samples per clip).
        n_frames:       ``1 + floor(audio_length / 512)``.
        dim:            per-clip feature size: the int ``n_mfcc * n_frames``
                        when ``use_mfcc`` is True, else ``(audio_length, 1)``.
    """

    # Hop length (samples) behind the frame-count formula; it is also passed
    # to librosa explicitly so the two can never drift apart.
    _HOP_LENGTH = 512

    def __init__(self,
                 sampling_rate=16000, audio_duration=2, n_classes=20,
                 use_mfcc=False, n_mfcc=20):
        self.sampling_rate = sampling_rate
        self.audio_duration = audio_duration
        self.n_classes = n_classes
        self.use_mfcc = use_mfcc
        self.n_mfcc = n_mfcc
        self.audio_length = self.sampling_rate * self.audio_duration
        self.n_frames = 1 + int(np.floor(self.audio_length / self._HOP_LENGTH))
        if self.use_mfcc:
            self.dim = self.n_mfcc * self.n_frames
        else:
            self.dim = (self.audio_length, 1)

    def _fit_length(self, data):
        """Randomly crop or zero-pad ``data`` to exactly ``self.audio_length`` samples."""
        input_length = self.audio_length
        if len(data) > input_length:
            # Longer than the target: keep a random contiguous window.
            offset = np.random.randint(len(data) - input_length)
            return data[offset:(input_length + offset)]
        if input_length > len(data):
            # Shorter than the target: place the clip at a random position.
            offset = np.random.randint(input_length - len(data))
        else:
            offset = 0
        return np.pad(data, (offset, input_length - len(data) - offset), "constant")

    @staticmethod
    def _drop_nan_rows(features, labels, names):
        """Drop rows containing NaN and the labels/names at the same positions."""
        features = np.asarray(features)
        keep = ~np.isnan(features).any(axis=1)
        kept_labels = [label for label, ok in zip(labels, keep) if ok]
        kept_names = [name for name, ok in zip(names, keep) if ok]
        return features[keep], kept_labels, kept_names

    def prepare_data(self, df, data_dir):
        """Load every clip listed in ``df`` and build the feature matrix.

        :param df: table with ``fname`` and ``label`` columns
        :param data_dir: prefix concatenated directly with each ``fname``
                         (so it must end with a path separator)
        :return: ``(X, y, f_name)``: the feature array with one entry per row
                 of ``df``, the list of labels, and the list of file names

        Crop/pad offsets come from ``np.random``; seed it beforehand for
        reproducible output.
        """
        # ``dim`` is an int in MFCC mode and a tuple in raw-audio mode;
        # normalise it so both modes can allocate and reshape.
        feature_shape = self.dim if isinstance(self.dim, tuple) else (self.dim,)
        X = np.empty(shape=(df.shape[0],) + feature_shape)
        y = []
        f_name = []
        for i, (fname, label) in enumerate(zip(df.fname, df.label)):
            file_path = data_dir + fname
            data, _ = librosa.core.load(file_path, sr=self.sampling_rate, res_type="kaiser_fast")

            # Random offset / Padding
            data = self._fit_length(data)

            if self.use_mfcc:
                # The signal is passed by keyword: recent librosa releases
                # reject it as a positional argument.
                data = librosa.feature.mfcc(y=data, sr=self.sampling_rate,
                                            n_mfcc=self.n_mfcc,
                                            hop_length=self._HOP_LENGTH)
            X[i,] = data.reshape(feature_shape)
            y.append(label)
            f_name.append(fname)
        return X, y, f_name

    def normalize_and_save(self, list_mels, y, list_fname, which_set):
        """
        Scale the flattened features to [0, 1] and save them to an h5 file.

        Rows containing NaN are removed together with their label and file
        name, so the three saved datasets stay aligned. The scaler is fit on
        the data passed in, so each split saved through this method is scaled
        by its own per-feature min/max.

        :param list_mels: 2-D array of flattened features, one row per clip
                          (as returned by ``prepare_data`` in MFCC mode)
        :param y: labels (str), one per row of ``list_mels``
        :param list_fname: audio file names (str), one per row of ``list_mels``
        :param which_set: split name used in the output file name and message
        """
        # remove weird values (and the matching labels / file names)
        list_mels, y, list_fname = self._drop_nan_rows(list_mels, y, list_fname)

        # normalize using MinMaxScaler
        data_normalized = MinMaxScaler().fit_transform(list_mels)

        # reshape to (data entries, number of channels, image width, image height)
        image_shape = (len(data_normalized), 1, self.n_mfcc, self.n_frames)
        data_reshape = np.reshape(data_normalized, image_shape)

        # Encode to save to h5
        list_targets_encoded = [y_.encode('utf8') for y_ in y]
        list_audio_encoded = [audio_name.encode('utf8') for audio_name in list_fname]

        # Save it to h5 file
        out_dir = "DataProcessed"
        os.makedirs(out_dir, exist_ok=True)
        file_name = "processed_{}_set".format(which_set)
        hdf5_name = os.path.join(out_dir, file_name) + '.hdf5'
        dt = h5py.special_dtype(vlen=str)
        # The context manager closes the file even if a write fails.
        with h5py.File(hdf5_name, "w") as hdf5_store:
            hdf5_store.create_dataset("all_inputs", data=data_reshape,
                                      compression="gzip")
            hdf5_store.create_dataset("file_name", data=list_audio_encoded,
                                      dtype=dt, compression="gzip")
            hdf5_store.create_dataset("targets", data=list_targets_encoded,
                                      dtype=dt, compression="gzip")
        # Saved

        print("{} set has been processed and saved".format(which_set))


## Valid

In [4]:
# Validation split: 2-second clips at 44.1 kHz, 40 MFCC coefficients.
config_valid = Config(
    sampling_rate=44100,
    audio_duration=2,
    use_mfcc=True,
    n_mfcc=40,
)
X, y, f_name = config_valid.prepare_data(
    df=df_valid,
    data_dir='../AudioClipsCut/valid_audio_cut/',
)
config_valid.normalize_and_save(list_mels=X, y=y, list_fname=f_name, which_set='valid')

valid set has been processed and saved


## Test

In [5]:
# Test split: 2-second clips at 44.1 kHz, 40 MFCC coefficients.
config_test = Config(
    sampling_rate=44100,
    audio_duration=2,
    use_mfcc=True,
    n_mfcc=40,
)
X, y, f_name = config_test.prepare_data(
    df=df_test,
    data_dir='../AudioClipsCut/test_audio_cut/',
)
config_test.normalize_and_save(list_mels=X, y=y, list_fname=f_name, which_set='test')

test set has been processed and saved


## Train

In [6]:
# Training split: 2-second clips at 44.1 kHz, 40 MFCC coefficients.
config_train = Config(
    sampling_rate=44100,
    audio_duration=2,
    use_mfcc=True,
    n_mfcc=40,
)
X, y, f_name = config_train.prepare_data(
    df=df_train,
    data_dir='../AudioClipsCut/train_audio_cut/',
)
config_train.normalize_and_save(list_mels=X, y=y, list_fname=f_name, which_set='train')

train set has been processed and saved


In [2]:
import h5py
# Open read-only: this cell only inspects the saved file, and an explicit mode
# avoids depending on h5py's version-specific default (older releases opened
# in append mode, which can create or modify the file).
file = h5py.File("DataProcessed/processed_valid_set.hdf5", "r")

  from ._conv import register_converters as _register_converters


In [3]:
# Take the shape from the dataset object itself; slicing with [:] first would
# load the entire array into memory only to discard it.
e = file['all_inputs'].shape

In [4]:
# Display the shape tuple stored in `e`.
e

(275, 1, 40, 173)

In [3]:
# Release the HDF5 file handle held in `file`.
file.close()