## In this notebook, we will fine-tune the Denoising Autoencoder (DAE) on our dataset of fake and real voices.

Again, see this repo for the autoencoder architecture: https://github.com/vbelz/Speech-enhancement

In [2]:
# tensorflow and GPU is very buggy on Windows
# however, this cell solves the problem of tensorflow not detecting any GPUs
# https://github.com/tensorflow/tensorflow/issues/48868 provides the solution
import os
os.add_dll_directory("C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.2/bin") # tf can now see the CUDA directory

import tensorflow as tf
tf.config.list_physical_devices() # GPU should now appear

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [3]:
import denoising_AE as DAE # contains useful functions from repo

In [4]:
# grab the model to fine tune
dae = DAE.load_pretrained_model()
# dae.summary()

# we want to denoise our examples
# however, we have to feed the clean example into the network and try to reconstruct that
# so we will use this layer for adding the noise: https://www.tensorflow.org/api_docs/python/tf/keras/layers/GaussianNoise
# we use the standard deviation of 0.01 as done in the autoencoder paper
noisy_input = tf.keras.layers.GaussianNoise(stddev=0.01)(dae.input)
denoised_output = dae(noisy_input)
fine_tune_model = tf.keras.Model(inputs=[noisy_input], outputs=[denoised_output])
fine_tune_model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 128, 128, 1)]     0         
                                                                 
 model_2 (Functional)        (None, 128, 128, 1)       1941093   
                                                                 
Total params: 1,941,093
Trainable params: 1,941,093
Non-trainable params: 0
_________________________________________________________________


### To fine-tune this model, we want to denoise our real voices.

So, load the real voices from the hard drive.

In [5]:
AUDIO_LENGTH = 64000 # 16000 per second, since that is the sample rate
SAMPLE_RATE = 16000
BATCH_SIZE = 64

In [6]:
# get the training data (only use real voices)
# need to look at reconstruction errors
train_ds = tf.keras.utils.audio_dataset_from_directory(
    directory="D:/for-norm/training/real",
    batch_size=BATCH_SIZE,
    validation_split=0,
    seed=0,
    labels=None,
    output_sequence_length=AUDIO_LENGTH # 16000 => all truncated to 1 second, None => all padded to length of longest file
)

Found 26941 files belonging to 1 classes.


In [7]:
def squeeze(audio):
    """
    Converts data to mono (for-norm is mono), this just means one channel.
    """
    audio = tf.squeeze(audio, axis=-1)
    return audio

train_ds = train_ds.map(squeeze, tf.data.AUTOTUNE)

In [8]:
# need to see the shapes of the dataset elements
for example_audio in train_ds.take(1):  
    print(example_audio[0].shape)

(64000,)


In [9]:
# convert stuff to spectrograms
def get_spectrogram(waveform):
    # Convert the waveform to a spectrogram via a STFT (Short-Time Fourier Transform).
    spectrogram = tf.signal.stft(waveform, frame_length=255, frame_step=128)
    # spectrogram = tf.signal.stft(waveform, frame_length=HOP_LENGTH_FRAME, frame_step=FRAME_LENGTH)

    # Obtain the magnitude of the STFT.
    spectrogram = tf.abs(spectrogram)
    
    # Add a `channels` dimension, so that the spectrogram can be used
    # as image-like input data with convolution layers (which expect
    # shape (`batch_size`, `height`, `width`, `channels`).
    # needed to update this for the shape!
    spectrogram = spectrogram[..., tf.newaxis]
    spectrogram = tf.concat([spectrogram, spectrogram], 1)[:, 0:128, 0:128, :]
    print(spectrogram.shape)
    return spectrogram

In [10]:
def make_spec_ds(ds):
    return ds.map(
        map_func=lambda audio: (get_spectrogram(audio)),
        num_parallel_calls=tf.data.AUTOTUNE
    )
train_spectrogram_ds = make_spec_ds(train_ds)

(None, 128, 128, 1)


In [11]:
train_spectrogram_ds = train_spectrogram_ds.cache().shuffle(10000).prefetch(tf.data.AUTOTUNE)

In [12]:
# see the repo for decisions on loss, optimizers, and metrics
fine_tune_model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.Huber(),
    metrics=['accuracy', 'mae'],
)

In [15]:
# need val data
val_ds = tf.keras.utils.audio_dataset_from_directory(
    directory="D:/for-norm/validation/real",
    batch_size=BATCH_SIZE,
    validation_split=0,
    seed=0,
    output_sequence_length=AUDIO_LENGTH
)
val_ds = val_ds.map(squeeze, tf.data.AUTOTUNE)
val_spectrogram_ds = make_spec_ds(val_ds)
val_spectrogram_ds = val_spectrogram_ds.cache().prefetch(tf.data.AUTOTUNE)

Found 0 files belonging to 0 classes.


ValueError: No audio files found in directory D:/for-norm/validation/real. Allowed format(s): ('.wav',)

In [17]:
import os
os.listdir(path="D:/for-norm/validation/real")[:10]

['file10004.wav_16k.wav_norm.wav_mono.wav_silence.wav',
 'file10014.wav_16k.wav_norm.wav_mono.wav_silence.wav',
 'file10016.wav_16k.wav_norm.wav_mono.wav_silence.wav',
 'file10022.wav_16k.wav_norm.wav_mono.wav_silence.wav',
 'file10034.wav_16k.wav_norm.wav_mono.wav_silence.wav',
 'file1005.wav_16k.wav_norm.wav_mono.wav_silence.wav',
 'file1006.wav_16k.wav_norm.wav_mono.wav_silence.wav',
 'file10063.wav_16k.wav_norm.wav_mono.wav_silence.wav',
 'file10067.wav_16k.wav_norm.wav_mono.wav_silence.wav',
 'file10071.wav_16k.wav_norm.wav_mono.wav_silence.wav']

### Let's play some of the audio clips as a demo for the class presentation tomorrow.

Maybe you can put this into a different file and play the real and fake voices to emphasize that the reconstructions are different?

In [29]:
# play some of the audio clips
# start with the clips directly from the dataset
funny_audio = example_audio[0]
from IPython import display
display.display(display.Audio(funny_audio, rate=SAMPLE_RATE))

In [None]:
# now, add some noise
import numpy as np
noisy_funny_audio = funny_audio + np.random.normal(loc=0.0, scale=0.01, size=funny_audio.shape)