# Generate Dataset

In [22]:
import os
import librosa
import librosa.display
import IPython.display as ipd
import matplotlib.pyplot as plt
import numpy as np
from math import sqrt
import soundfile as sf
from audio_utilities import FeatureExtraction

In [15]:
def get_filepaths(dataset_dir, filter_format="wav", get_duration=False):
    """Recursively collect the audio files under ``dataset_dir``.

    Parameters
    ----------
    dataset_dir : str
        Root directory that is walked recursively.
    filter_format : str
        File extension to keep, with or without a leading dot. The comparison
        is done on the real extension (case-insensitively), so a name such as
        ``wavelet.txt`` or ``take.wav.bak`` is no longer picked up for "wav".
    get_duration : bool
        When True every matching file is opened with ``soundfile`` to add up
        the total duration, which is then printed in whole minutes.

    Returns
    -------
    list of str
        Matching file paths. Directories and files are visited in sorted
        order so repeated runs build the dataset in the same order.
    """
    wanted_extension = "." + filter_format.lstrip(".").lower()

    filepath_array = []
    speech_duration = 0

    for path, dir_names, file_names in os.walk(dataset_dir):
        # Sorting in place makes os.walk descend in a deterministic order.
        dir_names.sort()

        for file in sorted(file_names):
            if os.path.splitext(file)[1].lower() != wanted_extension:
                continue

            filepath = os.path.join(path, file)
            filepath_array.append(filepath)

            if get_duration:
                # Context manager closes the handle; the original leaked one
                # open file per audio clip.
                with sf.SoundFile(filepath) as audio:
                    speech_duration += len(audio) / audio.samplerate

    print("Got {} .{} Files".format(len(filepath_array), filter_format))
    if get_duration:
        print("Total Duration: {}m".format(int(speech_duration/60)))

    return filepath_array

def add_noise_speech(speech, noise, snr=5):
    """Mix ``noise`` into ``speech`` at the requested signal-to-noise ratio.

    Parameters
    ----------
    speech : numpy.ndarray
        Clean signal.
    noise : numpy.ndarray
        Noise signal, same length as ``speech``.
    snr : float
        Target SNR in dB, i.e. 10*log10(mean(speech**2) / mean(noise_scaled**2)).

    Returns
    -------
    numpy.ndarray
        ``speech`` plus the rescaled noise.
    """
    # Accumulate in float64 so integer or float32 inputs do not overflow/lose
    # precision while measuring power; the returned mix keeps the input dtype.
    rms_speech = sqrt(np.mean(np.square(speech, dtype=np.float64)))
    rms_noise = sqrt(np.mean(np.square(noise, dtype=np.float64)))

    # RMS the noise must have for the mix to sit at `snr` dB.
    rms_noise_req = sqrt(rms_speech ** 2 / 10 ** (snr / 10))

    if rms_noise == 0:
        # Silent noise cannot be rescaled; the mix is just the speech.
        return speech + noise

    # Scale by required/current RMS. The original used the inverse ratio,
    # which only gives the right level when the two happen to be equal.
    noise_mod = noise * (rms_noise_req / rms_noise)

    return speech + noise_mod

def get_melbands_gain(clean_speech_stft, noisy_speech_stft, melbands=22):
    """Per-band gain sqrt(clean_mel / noisy_mel) between two spectrograms.

    Both STFTs are converted to mel spectrograms with ``melbands`` bands via
    the notebook-level ``audio_utils`` object, and the element-wise ratio is
    square-rooted.

    Bands whose noisy energy is not positive get a gain of 0 instead of the
    NaN/inf the plain division would produce, so silent frames do not poison
    the training targets. The result is not clipped, so values above 1 can
    occur wherever the clean energy exceeds the noisy energy.

    NOTE(review): assumes get_melspectrogram returns non-negative energies as
    array-likes of equal shape - confirm against FeatureExtraction.
    """
    clean_mel = np.asarray(audio_utils.get_melspectrogram(audio_stft=clean_speech_stft,
                                                          number_of_melbands=melbands))
    noisy_mel = np.asarray(audio_utils.get_melspectrogram(audio_stft=noisy_speech_stft,
                                                          number_of_melbands=melbands))

    # Pre-filled with zeros: positions skipped by `where` keep that value.
    ratio = np.zeros(np.broadcast(clean_mel, noisy_mel).shape,
                     dtype=np.result_type(clean_mel, noisy_mel, np.float32))
    np.divide(clean_mel, noisy_mel, out=ratio, where=noisy_mel > 0)

    gains_speech = np.sqrt(ratio)

    return gains_speech

def get_features(clean_speech, noisy_speech, melbands=22):
    """Compute the model inputs and target gains for one clean/noisy pair.

    All spectral work is delegated to the notebook-level ``audio_utils``
    object. Note that ``melbands`` only controls the gain resolution: the
    MFCC call always asks for 22 bands and the delta call for 9.

    Returns a 6-tuple, in this order: MFCCs, MFCC deltas, MFCC delta-deltas,
    spectral centroid, spectral bandwidth (all from the noisy signal) and the
    clean/noisy mel-band gains.
    """
    # Everything on the input side is derived from the noisy spectrogram.
    noisy_stft = audio_utils.stft(noisy_speech)

    mfcc = audio_utils.get_mfccs_from_spectrogram(noisy_stft, number_of_melbands=22)
    mfcc_delta, mfcc_delta2 = audio_utils.get_mfccs_delta(mfcc, number_of_melbands=9)

    centroid = audio_utils.get_spectral_centroid(audio_stft=noisy_stft)
    bandwidth = audio_utils.get_spectral_bandwidth(audio_stft=noisy_stft)

    # Targets: how much each mel band of the noisy signal must be scaled.
    clean_stft = audio_utils.stft(clean_speech)
    band_gains = get_melbands_gain(clean_speech_stft=clean_stft,
                                   noisy_speech_stft=noisy_stft,
                                   melbands=melbands)

    return (mfcc, mfcc_delta, mfcc_delta2, centroid, bandwidth, band_gains)


In [16]:

# --- Analysis settings handed to FeatureExtraction ---
sampling_rate = 16000
frame_length = 1024
hop_length = 512
window_length = 1024
window_function = "vorbis"

# --- Dataset layout ---
number_of_melbands = 22   # rows of the gain (target) matrix
number_of_features = 42   # rows of the input feature matrix
snr_req = [-5, 0, 5]      # SNR levels (dB) at which every noise file is mixed in

# Accumulators with zero columns; the generation loop appends blocks along axis 1.
features_speech = np.empty((number_of_features, 0))
features_gain = np.empty((number_of_melbands, 0))

audio_utils = FeatureExtraction(
    sampling_rate=sampling_rate,
    frame_length=frame_length,
    hop_length=hop_length,
    window_length=window_length,
    window_function=window_function,
)

# --- Input recordings ---
noise_database_path = "Dataset Structure/Dataset/Noise"
speech_database_path = "Dataset Structure/Dataset/Speech"

noise_file_paths = get_filepaths(dataset_dir=noise_database_path, get_duration=True)
speech_file_paths = get_filepaths(dataset_dir=speech_database_path, get_duration=True)


Got 4 .wav Files
Total Duration: 4m
Got 136 .wav Files
Total Duration: 6m


In [None]:
# Build the dataset: for every SNR and every noise recording, consecutive
# speech files are glued together until they cover the noise, the result is
# truncated to the noise length, mixed, and turned into one block of features.
for snr in snr_req:
    print("\n========== SNR {} ============".format(snr))

    for noise_file_path in noise_file_paths:

        # Load Noise
        noise_file = audio_utils.load_audiofile(noise_file_path)
        print("\nCurrently Used Noise:", noise_file_path, len(noise_file))

        speech_file_iterator = 0
        while speech_file_iterator < len(speech_file_paths):

            # Remember where this chunk started so we can tell below whether
            # the inner loop consumed any further file.
            chunk_start = speech_file_iterator

            speech_file = audio_utils.load_audiofile(speech_file_paths[speech_file_iterator])
            speech_concat = speech_file

            # Concat Speech Till Size of Noise
            while len(speech_concat) < len(noise_file):

                speech_file_iterator += 1

                # Break when file ends
                if speech_file_iterator >= len(speech_file_paths):
                    break

                speech_file = audio_utils.load_audiofile(speech_file_paths[speech_file_iterator])
                speech_concat = np.concatenate((speech_concat, speech_file))
                print("Audio To Be Added: ", speech_file_paths[speech_file_iterator])

            # Ran out of speech before covering the noise: the short leftover
            # is dropped, exactly as before.
            if len(speech_concat) < len(noise_file):
                break

            # Truncate Speech Array to Noise Length
            speech_concat = speech_concat[:len(noise_file)]

            # Add Noise to Speech
            noisy_speech = add_noise_speech(speech_concat, noise_file, snr=snr)

            # Get Features
            mfcc, mfcc_d, mfcc_d2, \
            spec_centroid, spec_bandwidth, gains = get_features(clean_speech=speech_concat,
                                                                noisy_speech=noisy_speech)

            # Add Features to Array (bandwidth is stacked before centroid)
            features = np.concatenate((mfcc, mfcc_d, mfcc_d2,
                                       spec_bandwidth, spec_centroid), axis=0)

            features_speech = np.concatenate((features_speech, features), axis=1)
            features_gain = np.concatenate((features_gain, gains), axis=1)

            print("Added Noise to Speech: ", features_speech.shape, features_gain.shape, "\n")

            # The next chunk deliberately starts again at the last file that
            # was appended (unchanged behaviour). If a single file already
            # covered the noise, nothing was appended and the iterator never
            # moved: advance it here, otherwise this loop would never end.
            if speech_file_iterator == chunk_start:
                speech_file_iterator += 1

### Save to File

In [None]:
print("Saving To File")
# One compressed archive holding both matrices under the keys the loading
# cell reads back ("speech_features" and "gains").
np.savez_compressed(
    "feature_dataset.npz",
    speech_features=features_speech,
    gains=features_gain,
)

### Loading From File

In [60]:
from sklearn.model_selection import train_test_split
print("Loading From File")

filename = "feature_dataset.npz"

# Indexing the archive reads each array fully into memory, so the file is
# only needed while the two matrices are pulled out. They are stored with one
# column per frame and are flipped here to one row per frame.
with np.load(filename) as data:
    speech_features = data["speech_features"].T
    gains = data["gains"].T

print(speech_features.shape, gains.shape)

# Cut the frame stream into fixed-length sequences for the recurrent model;
# frames that do not fill a whole window at the end are discarded.
# No train/test split and no clipping of the gains is applied here.
window_size = 2000
number_of_sequences = len(speech_features) // window_size
frames_kept = number_of_sequences * window_size

x = speech_features[:frames_kept]
y = gains[:frames_kept]

print(x.shape, y.shape)

x_train = x.reshape((number_of_sequences, window_size, x.shape[1]))
y_train = y.reshape((number_of_sequences, window_size, y.shape[1]))

print(x_train.shape, y_train.shape)


Loading From File
(152154, 42) (152154, 22)
(152000, 42) (152000, 22)
(76, 2000, 42) (76, 2000, 22)


In [61]:
import keras
from keras.models import Sequential
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import GRU
from keras.layers import SimpleRNN
from keras.layers import Dropout
from keras.layers import concatenate
from keras import losses
from keras import regularizers
from keras.constraints import min_max_norm
import h5py

from keras.constraints import Constraint
from keras import backend as K
import numpy as np

#import tensorflow as tf
#from keras.backend.tensorflow_backend import set_session
#config = tf.ConfigProto()
#config.gpu_options.per_process_gpu_memory_fraction = 0.42
#set_session(tf.Session(config=config))


def my_crossentropy(y_true, y_pred):
    """Binary cross-entropy weighted by how far the label is from 0.5.

    The weight 2*|y_true - 0.5| is 1 for labels of 0 or 1 and 0 for a label
    of exactly 0.5. The result is averaged over the last axis.
    """
    # The Keras 2 backend signature is binary_crossentropy(target, output);
    # the arguments were previously passed the other way round.
    return K.mean(2*K.abs(y_true-0.5) * K.binary_crossentropy(y_true, y_pred), axis=-1)

def mymask(y_true):
    """Per-element loss weight: min(y_true + 1, 1).

    Evaluates to 1 for targets >= 0 and to y_true + 1 below that, so a target
    of -1 gets weight 0.
    """
    shifted = y_true + 1.
    return K.minimum(shifted, 1.)

def msse(y_true, y_pred):
    """Masked mean squared error between sqrt(y_pred) and sqrt(y_true).

    Each squared difference is weighted with mymask(y_true) before the mean
    over the last axis is taken.
    """
    sqrt_error = K.sqrt(y_pred) - K.sqrt(y_true)
    weighted = mymask(y_true) * K.square(sqrt_error)
    return K.mean(weighted, axis=-1)

def mycost(y_true, y_pred):
    """Masked training loss on square-rooted values.

    With d = sqrt(y_pred) - sqrt(y_true), every element contributes
    10*d**4 + d**2 + 0.01*binary_crossentropy, weighted by mymask(y_true),
    and the result is averaged over the last axis.
    """
    squared_error = K.square(K.sqrt(y_pred) - K.sqrt(y_true))

    # The Keras 2 backend signature is binary_crossentropy(target, output);
    # the arguments were previously passed the other way round.
    crossentropy = K.binary_crossentropy(y_true, y_pred)

    per_element = 10*K.square(squared_error) + squared_error + 0.01*crossentropy
    return K.mean(mymask(y_true) * per_element, axis=-1)

def my_accuracy(y_true, y_pred):
    """Accuracy of the rounded prediction, weighted by 2*|y_true - 0.5|.

    A prediction counts as a hit when round(y_pred) equals y_true; the mean
    is taken over the last axis.
    """
    # K.equal yields a boolean tensor; cast it to the backend float type so
    # it can be multiplied with the float weight on every backend.
    hits = K.cast(K.equal(y_true, K.round(y_pred)), K.floatx())
    return K.mean(2*K.abs(y_true-0.5) * hits, axis=-1)

class WeightClip(Constraint):
    '''Clips the weights incident to each hidden unit to be inside a range

    Every weight is limited to the interval [-c, c].
    '''
    def __init__(self, c=2):
        self.c = c

    def __call__(self, p):
        return K.clip(p, -self.c, self.c)

    def get_config(self):
        return {'name': self.__class__.__name__,
            'c': self.c}

    @classmethod
    def from_config(cls, config):
        '''Rebuild the constraint from the dict produced by get_config.

        get_config also emits a 'name' entry that __init__ does not accept,
        so it is dropped here before the constructor is called.
        NOTE(review): Keras is expected to use from_config when a saved model
        is reloaded - confirm for the installed version.
        '''
        params = dict(config)
        params.pop('name', None)
        return cls(**params)

# L2 weight penalty and hard clipping range shared by all layers.
reg = 0.000001
constraint = WeightClip(0.499)

print('Build model...')
# Input: sequences of any length with 42 features per frame.
main_input = Input(shape=(None, 42), name='main_input')
tmp = Dense(24, activation='tanh', name='input_dense', kernel_constraint=constraint, bias_constraint=constraint)(main_input)
vad_gru = GRU(24, activation='tanh', recurrent_activation='sigmoid', return_sequences=True, name='vad_gru', kernel_regularizer=regularizers.l2(reg), recurrent_regularizer=regularizers.l2(reg), kernel_constraint=constraint, recurrent_constraint=constraint, bias_constraint=constraint)(tmp)
# The VAD head is still defined but no longer part of the trained model, see below.
vad_output = Dense(1, activation='sigmoid', name='vad_output', kernel_constraint=constraint, bias_constraint=constraint)(vad_gru)
noise_input = keras.layers.concatenate([tmp, vad_gru, main_input])
noise_gru = GRU(48, activation='relu', recurrent_activation='sigmoid', return_sequences=True, name='noise_gru', kernel_regularizer=regularizers.l2(reg), recurrent_regularizer=regularizers.l2(reg), kernel_constraint=constraint, recurrent_constraint=constraint, bias_constraint=constraint)(noise_input)
denoise_input = keras.layers.concatenate([vad_gru, noise_gru, main_input])

denoise_gru = GRU(96, activation='tanh', recurrent_activation='sigmoid', return_sequences=True, name='denoise_gru', kernel_regularizer=regularizers.l2(reg), recurrent_regularizer=regularizers.l2(reg), kernel_constraint=constraint, recurrent_constraint=constraint, bias_constraint=constraint)(denoise_gru_input) if False else GRU(96, activation='tanh', recurrent_activation='sigmoid', return_sequences=True, name='denoise_gru', kernel_regularizer=regularizers.l2(reg), recurrent_regularizer=regularizers.l2(reg), kernel_constraint=constraint, recurrent_constraint=constraint, bias_constraint=constraint)(denoise_input)

# Output: 22 band gains per frame.
denoise_output = Dense(22, activation='sigmoid', name='denoise_output', kernel_constraint=constraint, bias_constraint=constraint)(denoise_gru)

# The saved dataset only contains band gains and training passes a single
# target array, so there are no labels for the VAD head. With two outputs,
# Keras either rejects the single target or (NOTE(review): version dependent)
# reuses the gain targets for the 1-unit VAD output. The model is therefore
# built and compiled on the denoise output only; the weight of 10 on the
# denoise loss is kept.
model = Model(inputs=main_input, outputs=denoise_output)

model.compile(loss=mycost,
              metrics=[msse],
              optimizer='adam', loss_weights=[10])


batch_size = 32

Build model...


In [64]:
print("Train Model")
try:
    # 120 epochs with a fifth of the sequences held out for validation;
    # interrupting the kernel stops training without raising.
    model.fit(
        x=x_train,
        y=y_train,
        epochs=120,
        batch_size=batch_size,
        validation_split=0.2,
    )
except KeyboardInterrupt:
    print("Training Interrupted")

Train Model
Epoch 1/120
Epoch 2/120
Epoch 3/120
Epoch 4/120
Training Interrupted
