In [1]:
import numpy as np
import tensorflow as tf
import librosa
import os
import itertools
from jupyterplot import ProgressPlot
import IPython

# Hack to get it to work with RTX 2000 cards
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
  tf.config.experimental.set_memory_growth(gpu, True)


2021-11-22 13:38:34.999417: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-22 13:38:35.038637: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-22 13:38:35.039270: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero


In [2]:
# Enumerate dataset files
dataset_dir = './Baggins_Music/'
music_filenames = os.listdir(dataset_dir)

In [3]:
# Load a file to play with
song_np, sr = librosa.load(dataset_dir+music_filenames[0], mono=False)
song_np = np.transpose(song_np)



In [None]:
# Papers and resources
# WaveNet paper https://arxiv.org/pdf/1609.03499.pdf
# WaveNet AutoEncoder paper (instrument decomp): https://arxiv.org/pdf/1704.01279.pdf
# NSynth: https://magenta.tensorflow.org/nsynth
# WaveNet implementation: https://github.com/ibab/tensorflow-wavenet

In [4]:
def dilconv_outlen(input_len, dil_rate, kernel_size):
    return input_len - (kernel_size-1)*dil_rate

def calc_num_layers(tgt_output_len, input_len, dil_rate, kernel_size):
    '''
    Calculates how many encoder layers are required to get to the target output length
    from a given input length, given a few conv1d params. 
    '''
    num_layers = (input_len - tgt_output_len)/((kernel_size-1)*dil_rate)
    return num_layers

# Define the audio autoencoder 
class AudioAutoEncoder(tf.keras.Model):

    def __init__(self, input_dur=1, sample_rate=22050):
        '''
        input_dur: duration of input (in seconds)
        sample_rate: sampling frequency of input audio (in hz) 
        '''
        super(AudioAutoEncoder, self).__init__()
        self.input_len = input_dur*sample_rate
        self.sample_rate = sample_rate
        self.act = 'sigmoid'

        # A loss function
        self.mse = tf.keras.losses.MeanSquaredError()
        # An optimizer
        #self.optim = tf.keras.optimizers.SGD(learning_rate=0.1, momentum=0.0, nesterov=False, name='SGD')
        self.optim = tf.keras.optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-07, amsgrad=False,name='Adam')


        # Calculate how many layers we need for a receptive field to cover the whole sample
        self.dil_rate = 100
        self.z_len = 1024
        kern_size = 16
        self.num_filts = 16
        self.num_enc_lyrs = int(calc_num_layers(self.z_len, self.input_len, self.dil_rate, kern_size))
        print('Num Layers: ', self.num_enc_lyrs)
        
        # Setup encoder layers
        self.enc_lyrs = []
        for lyr_num in range(self.num_enc_lyrs):
            if(lyr_num+1 >= self.num_enc_lyrs):
                num_filts = 2
            else:
                num_filts = self.num_filts
            self.enc_lyrs += [tf.keras.layers.Conv1D(filters=num_filts, 
                                                     kernel_size=kern_size, 
                                                     strides=1, 
                                                     dilation_rate=self.dil_rate, 
                                                     activation=self.act)]
        
        # Setup encoder layers
        self.dec_lyrs = []
        for lyr_num in range(self.num_enc_lyrs):
            if(lyr_num+1 >= self.num_enc_lyrs):
                num_filts = 2
            else:
                num_filts = self.num_filts
            self.dec_lyrs += [tf.keras.layers.Conv1DTranspose(filters=num_filts, 
                                                     kernel_size=kern_size, 
                                                     strides=1, 
                                                     dilation_rate=self.dil_rate, 
                                                     activation=self.act)]
            
    def encoder(self, x):
        for idx,lyr in enumerate(self.enc_lyrs):
            #print('Layer: ', idx, '\t', x.shape, idx, '/', len(self.enc_lyrs))
            x = lyr(x)
        return x
    
    def decoder(self, x):
        for idx,lyr in enumerate(self.dec_lyrs):
            #print('Layer: ', idx, '\t', x.shape, idx, '/', len(self.dec_lyrs))
            x = lyr(x)
        return x
    
    def calc_loss(self, x_hat, x):
        loss = self.mse(x, x_hat)
        return loss
        
    def call(self, inputs):
        z = self.encoder(inputs)
        #print('Z shape: ', z.shape)
        x_hat = self.decoder(z)
        
        return x_hat

# Instantiate one
auto_encoder = AudioAutoEncoder(sample_rate=sr)

Num Layers:  14


2021-11-22 13:40:47.341220: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-11-22 13:40:47.341685: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-22 13:40:47.342114: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-22 13:40:47.342485: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so retur

In [5]:
# Prep the dataset
ds_num_samples = int(len(song_np)/auto_encoder.input_len)
song_ds = song_np[0:auto_encoder.input_len*ds_num_samples]
song_ds = np.reshape(song_ds, (ds_num_samples, -1, 2))
song_ds.shape

(3111, 22050, 2)

In [6]:
# Smooths input using mean window (should change to exponential smoothing soon)
class Smoother:
    def __init__(self, buf_len=15):
        self.buffer = []
        self.max_len = buf_len
    def smooth(self, x):
        self.buffer += [x]
        if(len(self.buffer) > self.max_len):
            self.buffer.pop(0)
            
        return np.mean(np.array(self.buffer))

In [None]:
# Create the dataset

print(ds_num_samples)
batch_size = 16
#batch_size = 64

# For plotting loss
pp = ProgressPlot()
# Smoother for smoothing
smoother = Smoother(buf_len=15)

# Train the autoencoder
for step in itertools.count():
    # Get the batch
    batch_idxs = np.random.randint(0, ds_num_samples, size=batch_size)
    x = song_ds[batch_idxs]

    # Put batch through model
    
    with tf.GradientTape() as t:
        x_hat = auto_encoder(x)
        # Get Loss
        loss_recon = auto_encoder.calc_loss(x_hat, x)
        pp.update(float(smoother.smooth(loss_recon.numpy())))
        print('Step {0:8d}\tLoss: {1:3.4f}'.format(step, loss_recon.numpy()))
        grads = t.gradient(loss_recon, auto_encoder.variables)
        
    # Backprop grads
    auto_encoder.optim.apply_gradients(zip(grads, auto_encoder.variables))
    
    #if(step >= 2000):
    #    break
    
    
    

3111


<IPython.core.display.Javascript object>

2021-11-22 13:40:49.158850: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8201
2021-11-22 13:40:50.072144: I tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2021-11-22 13:40:50.072476: I tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2021-11-22 13:40:50.072510: W tensorflow/stream_executor/gpu/asm_compiler.cc:77] Couldn't get ptxas version string: Internal: Couldn't invoke ptxas --version
2021-11-22 13:40:50.072864: I tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2021-11-22 13:40:50.072941: W tensorflow/stream_executor/gpu/redzone_allocator.cc:314] Internal: Failed to launch ptxas
Relying on driver to perform ptx compilation. 
Modify $PATH to customize ptxas location.
This message will be only logged once.


Step        0	Loss: 0.3044
Step        1	Loss: 0.2582
Step        2	Loss: 0.2198
Step        3	Loss: 0.1837
Step        4	Loss: 0.1428
Step        5	Loss: 0.1252
Step        6	Loss: 0.0962
Step        7	Loss: 0.0893
Step        8	Loss: 0.0753
Step        9	Loss: 0.0660
Step       10	Loss: 0.0594
Step       11	Loss: 0.0483
Step       12	Loss: 0.0420
Step       13	Loss: 0.0407
Step       14	Loss: 0.0452
Step       15	Loss: 0.0461
Step       16	Loss: 0.0510
Step       17	Loss: 0.0453
Step       18	Loss: 0.0374
Step       19	Loss: 0.0377
Step       20	Loss: 0.0330
Step       21	Loss: 0.0391
Step       22	Loss: 0.0277
Step       23	Loss: 0.0377
Step       24	Loss: 0.0363
Step       25	Loss: 0.0373
Step       26	Loss: 0.0334
Step       27	Loss: 0.0409
Step       28	Loss: 0.0380
Step       29	Loss: 0.0379
Step       30	Loss: 0.0348
Step       31	Loss: 0.0348
Step       32	Loss: 0.0335
Step       33	Loss: 0.0479
Step       34	Loss: 0.0331
Step       35	Loss: 0.0327
Step       36	Loss: 0.0333
S

Step      304	Loss: 0.0286
Step      305	Loss: 0.0219
Step      306	Loss: 0.0301
Step      307	Loss: 0.0226
Step      308	Loss: 0.0324
Step      309	Loss: 0.0295
Step      310	Loss: 0.0266
Step      311	Loss: 0.0322
Step      312	Loss: 0.0376
Step      313	Loss: 0.0238
Step      314	Loss: 0.0335
Step      315	Loss: 0.0221
Step      316	Loss: 0.0236
Step      317	Loss: 0.0251
Step      318	Loss: 0.0372
Step      319	Loss: 0.0259
Step      320	Loss: 0.0271
Step      321	Loss: 0.0318
Step      322	Loss: 0.0291
Step      323	Loss: 0.0243
Step      324	Loss: 0.0224
Step      325	Loss: 0.0298
Step      326	Loss: 0.0207
Step      327	Loss: 0.0288
Step      328	Loss: 0.0289
Step      329	Loss: 0.0309
Step      330	Loss: 0.0269
Step      331	Loss: 0.0225
Step      332	Loss: 0.0212
Step      333	Loss: 0.0299
Step      334	Loss: 0.0274
Step      335	Loss: 0.0203
Step      336	Loss: 0.0265
Step      337	Loss: 0.0256
Step      338	Loss: 0.0315
Step      339	Loss: 0.0288
Step      340	Loss: 0.0211
S

Step      608	Loss: 0.0276
Step      609	Loss: 0.0239
Step      610	Loss: 0.0228
Step      611	Loss: 0.0285
Step      612	Loss: 0.0289
Step      613	Loss: 0.0218
Step      614	Loss: 0.0263
Step      615	Loss: 0.0308
Step      616	Loss: 0.0261
Step      617	Loss: 0.0233
Step      618	Loss: 0.0217
Step      619	Loss: 0.0293
Step      620	Loss: 0.0263
Step      621	Loss: 0.0284
Step      622	Loss: 0.0272
Step      623	Loss: 0.0320
Step      624	Loss: 0.0215
Step      625	Loss: 0.0315
Step      626	Loss: 0.0186
Step      627	Loss: 0.0295
Step      628	Loss: 0.0219
Step      629	Loss: 0.0253
Step      630	Loss: 0.0213
Step      631	Loss: 0.0286
Step      632	Loss: 0.0171
Step      633	Loss: 0.0287
Step      634	Loss: 0.0328
Step      635	Loss: 0.0234
Step      636	Loss: 0.0302
Step      637	Loss: 0.0171
Step      638	Loss: 0.0308
Step      639	Loss: 0.0196
Step      640	Loss: 0.0262
Step      641	Loss: 0.0273
Step      642	Loss: 0.0220
Step      643	Loss: 0.0230
Step      644	Loss: 0.0247
S

Step      912	Loss: 0.0219
Step      913	Loss: 0.0217
Step      914	Loss: 0.0229
Step      915	Loss: 0.0241
Step      916	Loss: 0.0281
Step      917	Loss: 0.0186
Step      918	Loss: 0.0265
Step      919	Loss: 0.0262
Step      920	Loss: 0.0230
Step      921	Loss: 0.0231
Step      922	Loss: 0.0284
Step      923	Loss: 0.0321
Step      924	Loss: 0.0220
Step      925	Loss: 0.0196
Step      926	Loss: 0.0234
Step      927	Loss: 0.0285
Step      928	Loss: 0.0180
Step      929	Loss: 0.0254
Step      930	Loss: 0.0224
Step      931	Loss: 0.0289
Step      932	Loss: 0.0267
Step      933	Loss: 0.0294
Step      934	Loss: 0.0289
Step      935	Loss: 0.0224
Step      936	Loss: 0.0207
Step      937	Loss: 0.0264
Step      938	Loss: 0.0280
Step      939	Loss: 0.0269
Step      940	Loss: 0.0211
Step      941	Loss: 0.0246
Step      942	Loss: 0.0186
Step      943	Loss: 0.0304
Step      944	Loss: 0.0204
Step      945	Loss: 0.0295
Step      946	Loss: 0.0220
Step      947	Loss: 0.0190
Step      948	Loss: 0.0261
S

Step     1216	Loss: 0.0221
Step     1217	Loss: 0.0192
Step     1218	Loss: 0.0278
Step     1219	Loss: 0.0246
Step     1220	Loss: 0.0252
Step     1221	Loss: 0.0266
Step     1222	Loss: 0.0164
Step     1223	Loss: 0.0279
Step     1224	Loss: 0.0264
Step     1225	Loss: 0.0227
Step     1226	Loss: 0.0293
Step     1227	Loss: 0.0220
Step     1228	Loss: 0.0261
Step     1229	Loss: 0.0239
Step     1230	Loss: 0.0191
Step     1231	Loss: 0.0247
Step     1232	Loss: 0.0240
Step     1233	Loss: 0.0279
Step     1234	Loss: 0.0317
Step     1235	Loss: 0.0302
Step     1236	Loss: 0.0323
Step     1237	Loss: 0.0299
Step     1238	Loss: 0.0230
Step     1239	Loss: 0.0273
Step     1240	Loss: 0.0246
Step     1241	Loss: 0.0199
Step     1242	Loss: 0.0294
Step     1243	Loss: 0.0302
Step     1244	Loss: 0.0293
Step     1245	Loss: 0.0229
Step     1246	Loss: 0.0234
Step     1247	Loss: 0.0192
Step     1248	Loss: 0.0365
Step     1249	Loss: 0.0260
Step     1250	Loss: 0.0324
Step     1251	Loss: 0.0262
Step     1252	Loss: 0.0182
S

Step     1520	Loss: 0.0310
Step     1521	Loss: 0.0258
Step     1522	Loss: 0.0259
Step     1523	Loss: 0.0244
Step     1524	Loss: 0.0297
Step     1525	Loss: 0.0232
Step     1526	Loss: 0.0220
Step     1527	Loss: 0.0204
Step     1528	Loss: 0.0219
Step     1529	Loss: 0.0169
Step     1530	Loss: 0.0313
Step     1531	Loss: 0.0217
Step     1532	Loss: 0.0264
Step     1533	Loss: 0.0292
Step     1534	Loss: 0.0266
Step     1535	Loss: 0.0247
Step     1536	Loss: 0.0253
Step     1537	Loss: 0.0325
Step     1538	Loss: 0.0267
Step     1539	Loss: 0.0196
Step     1540	Loss: 0.0309
Step     1541	Loss: 0.0246
Step     1542	Loss: 0.0291
Step     1543	Loss: 0.0270
Step     1544	Loss: 0.0180
Step     1545	Loss: 0.0321
Step     1546	Loss: 0.0307
Step     1547	Loss: 0.0315
Step     1548	Loss: 0.0310
Step     1549	Loss: 0.0224
Step     1550	Loss: 0.0252
Step     1551	Loss: 0.0205
Step     1552	Loss: 0.0244
Step     1553	Loss: 0.0306
Step     1554	Loss: 0.0188
Step     1555	Loss: 0.0254
Step     1556	Loss: 0.0212
S

In [74]:
# Get some music from the dataset and test it
start_time = 200
duration = 20
start_idx = int(start_time*sr)
end_idx = int((start_time+duration)*sr)

x = song_np[start_idx:end_idx, :]
x_hat = auto_encoder(np.reshape(x, (duration, -1, 2)))
x_hat = tf.reshape(x_hat, (-1, 2))

# Format them for playback
x = np.transpose((x*32768).astype(np.int16))
x_hat = np.transpose((x_hat.numpy()*32768).astype(np.int16))
        

In [75]:
# Play original
IPython.display.Audio(data=x, rate=sr)

In [76]:
print(x_hat.shape)
IPython.display.Audio(data=x_hat, rate=sr)

(2, 441000)


In [77]:
x_hat

array([[3184, 3184, 3184, ..., 2978, 2978, 2978],
       [2992, 2992, 2992, ..., 2839, 2839, 2839]], dtype=int16)