In [1]:
import os
import sys
import numpy as np
import tensorflow as tf
import IPython.display as ipd

# Put the parent of the current working directory on the import path so the
# project-local `utils` / `models` imports that follow can be resolved
# (presumably the notebook lives one level below the repository root).
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    # Guarded so that re-running the cell does not append duplicates.
    sys.path.append(module_path)
from utils.helper import wav_clips_to_spectrogram, wav_clips_to_log_spectrogram, rebuild_audio_from_spectro_clips
from utils.dataset import create_samples
from models.autoencoder_conv2d import AutoencoderConv2d

In [2]:
# Dimensions of a single spectrogram clip (frequency bins x time frames).
FREQ_BINS = 2049
TIME_FRAMES = 87

# Load the 'Dev' samples and train on the first one only.
# NOTE(review): the exact schema of a sample is defined in utils.dataset;
# here it is indexed by 'mix' and by the four stem names below.
samples = create_samples('Dev')
train_sample = samples[0]

# Model input: spectrogram clips of the full mix.
x_train = wav_clips_to_spectrogram(train_sample['mix'])

# Model targets: one spectrogram set per stem, keyed by stem name.
y_train = {
    stem: wav_clips_to_spectrogram(train_sample[stem])
    for stem in ('vocals', 'bass', 'drums', 'other')
}

In [16]:
# Separator model.
# Built from the shared FREQ_BINS / TIME_FRAMES constants instead of repeating
# the literals, so the model input shape cannot drift from the data cell.
# NOTE(review): the third argument (3, 3) is presumably the conv kernel size —
# confirm against models.autoencoder_conv2d.
separator = AutoencoderConv2d(FREQ_BINS, TIME_FRAMES, (3, 3))
model = separator.get_model()
model.summary()


# BEGIN TRAINING
# One independent MSE loss per stem; the dict keys must match the model's
# output names. `learning_rate` replaces the deprecated `lr` alias, which
# newer Keras releases no longer accept.
model.compile(optimizer=tf.keras.optimizers.Adadelta(learning_rate=0.3),
              loss={stem: tf.keras.losses.MeanSquaredError()
                    for stem in ('vocals', 'bass', 'drums', 'other')})

# Fit on the single training sample, one clip per batch; verbose=2 prints one
# summary line per epoch.
history = model.fit(x_train, y_train,
                    batch_size=1,
                    epochs=100,
                    verbose=2,
                    callbacks=None)

Model: "conv_spectrogram_unet"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
mix (InputLayer)                [(None, 2049, 87)]   0                                            
__________________________________________________________________________________________________
tf_op_layer_ExpandDims_5 (Tenso [(None, 2049, 87, 1) 0           mix[0][0]                        
__________________________________________________________________________________________________
conv2d_85 (Conv2D)              (None, 2049, 87, 4)  40          tf_op_layer_ExpandDims_5[0][0]   
__________________________________________________________________________________________________
layer_normalization_45 (LayerNo (None, 2049, 87, 4)  8           conv2d_85[0][0]                  
______________________________________________________________________________

Epoch 4/100
105/105 - 5s - loss: 12.1390 - vocals_loss: 4.0252 - bass_loss: 2.1590 - drums_loss: 3.3684 - other_loss: 2.5863
Epoch 5/100
105/105 - 5s - loss: 10.2739 - vocals_loss: 3.5802 - bass_loss: 1.8078 - drums_loss: 2.6071 - other_loss: 2.2787
Epoch 6/100
105/105 - 5s - loss: 8.9032 - vocals_loss: 3.1047 - bass_loss: 1.6373 - drums_loss: 2.1260 - other_loss: 2.0352
Epoch 7/100
105/105 - 5s - loss: 7.9428 - vocals_loss: 2.7588 - bass_loss: 1.4353 - drums_loss: 1.9437 - other_loss: 1.8050
Epoch 8/100
105/105 - 5s - loss: 7.2481 - vocals_loss: 2.5253 - bass_loss: 1.3361 - drums_loss: 1.7510 - other_loss: 1.6357
Epoch 9/100
105/105 - 5s - loss: 6.5176 - vocals_loss: 2.1968 - bass_loss: 1.1723 - drums_loss: 1.6272 - other_loss: 1.5213
Epoch 10/100
105/105 - 5s - loss: 5.9287 - vocals_loss: 1.8868 - bass_loss: 1.0810 - drums_loss: 1.5401 - other_loss: 1.4208
Epoch 11/100
105/105 - 5s - loss: 5.6915 - vocals_loss: 1.8940 - bass_loss: 1.0216 - drums_loss: 1.4952 - other_loss: 1.2806
Epoc

Epoch 70/100
105/105 - 5s - loss: 1.4207 - vocals_loss: 0.3764 - bass_loss: 0.1702 - drums_loss: 0.5378 - other_loss: 0.3364
Epoch 71/100
105/105 - 5s - loss: 1.3559 - vocals_loss: 0.3381 - bass_loss: 0.1679 - drums_loss: 0.5307 - other_loss: 0.3192
Epoch 72/100
105/105 - 5s - loss: 1.4887 - vocals_loss: 0.4099 - bass_loss: 0.1736 - drums_loss: 0.5477 - other_loss: 0.3575
Epoch 73/100
105/105 - 5s - loss: 1.3862 - vocals_loss: 0.3578 - bass_loss: 0.1716 - drums_loss: 0.5156 - other_loss: 0.3412
Epoch 74/100
105/105 - 5s - loss: 1.3284 - vocals_loss: 0.3395 - bass_loss: 0.1579 - drums_loss: 0.5098 - other_loss: 0.3211
Epoch 75/100
105/105 - 5s - loss: 1.3991 - vocals_loss: 0.3816 - bass_loss: 0.1652 - drums_loss: 0.5084 - other_loss: 0.3439
Epoch 76/100
105/105 - 5s - loss: 1.3253 - vocals_loss: 0.3418 - bass_loss: 0.1597 - drums_loss: 0.5032 - other_loss: 0.3205
Epoch 77/100
105/105 - 5s - loss: 1.4024 - vocals_loss: 0.4118 - bass_loss: 0.1615 - drums_loss: 0.5047 - other_loss: 0.3244


In [17]:
# Run the trained separator on the training mix. `x_train` already holds
# wav_clips_to_spectrogram(train_sample['mix']), so reuse it rather than
# recomputing the spectrograms.
pred = model.predict(x_train)

In [None]:
# Listen to the separated vocals.
# NOTE(review): assumes the first model output is the 'vocals' head — confirm
# against the output order defined in AutoencoderConv2d.
pred_vocal = pred[0]
separated_vocals = rebuild_audio_from_spectro_clips(pred_vocal)

# Bare last expression so the notebook renders an audio player.
ipd.Audio(data=separated_vocals, rate=44100)

In [None]:
# Reference playback: rebuild audio from the ground-truth vocals spectrograms
# to compare against the model's output. `y_train['vocals']` already holds
# wav_clips_to_spectrogram(train_sample['vocals']), so reuse it rather than
# recomputing the spectrograms.
reconstructed_vocal = rebuild_audio_from_spectro_clips(y_train['vocals'])
ipd.Audio(reconstructed_vocal, rate=44100)