In [31]:
import numpy as np
import tensorflow as tf
import librosa
import os
import itertools
from jupyterplot import ProgressPlot
import IPython
import math

# Hack to get it to work with RTX 2000 cards
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
  tf.config.experimental.set_memory_growth(gpu, True)


In [2]:
# Enumerate dataset files
dataset_dir = './Baggins_Music/'
music_filenames = os.listdir(dataset_dir)

In [3]:
# Load a file to play with
song_np, sr = librosa.load(dataset_dir+music_filenames[0], mono=False)
song_np = np.transpose(song_np)



In [None]:
# Papers and resources
# WaveNet paper https://arxiv.org/pdf/1609.03499.pdf
# WaveNet AutoEncoder paper (instrument decomp): https://arxiv.org/pdf/1704.01279.pdf
# NSynth: https://magenta.tensorflow.org/nsynth
# WaveNet implementation: https://github.com/ibab/tensorflow-wavenet

In [12]:
def dilconv_outlen(input_len, dil_rate, kernel_size):
    return input_len - (kernel_size-1)*dil_rate

def calc_num_layers(tgt_output_len, input_len, dil_rate, kernel_size):
    '''
    Calculates how many encoder layers are required to get to the target output length
    from a given input length, given a few conv1d params. 
    '''
    num_layers = (input_len - tgt_output_len)/((kernel_size-1)*dil_rate)
    return num_layers

# Define the audio autoencoder 
class AudioAutoEncoder(tf.keras.Model):

    def __init__(self, input_dur=1, sample_rate=22050):
        '''
        input_dur: duration of input (in seconds)
        sample_rate: sampling frequency of input audio (in hz) 
        '''
        super(AudioAutoEncoder, self).__init__()
        self.input_len = input_dur*sample_rate
        self.sample_rate = sample_rate
        self.act = 'sigmoid'

        # A loss function
        #self.mse = tf.keras.losses.MeanSquaredError()
        self.mse = tf.keras.losses.MeanAbsoluteError()
        # An optimizer
        #self.optim = tf.keras.optimizers.SGD(learning_rate=0.1, momentum=0.0, nesterov=False, name='SGD')
        self.optim = tf.keras.optimizers.Adam(learning_rate=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-07, amsgrad=False,name='Adam')


        # Calculate how many layers we need for a receptive field to cover the whole sample
        self.dil_rate = 100
        self.z_len = 1024
        kern_size = 16
        self.num_filts = 16
        self.num_enc_lyrs = int(calc_num_layers(self.z_len, self.input_len, self.dil_rate, kern_size))
        print('Num Layers: ', self.num_enc_lyrs)
        
        # Setup encoder layers
        self.enc_lyrs = []
        for lyr_num in range(self.num_enc_lyrs):
            if(lyr_num+1 >= self.num_enc_lyrs):
                num_filts = 2
            else:
                num_filts = self.num_filts
            self.enc_lyrs += [tf.keras.layers.Conv1D(filters=num_filts, 
                                                     kernel_size=kern_size, 
                                                     strides=1, 
                                                     dilation_rate=self.dil_rate, 
                                                     activation=self.act)]
        
        # Setup encoder layers
        self.dec_lyrs = []
        for lyr_num in range(self.num_enc_lyrs):
            if(lyr_num+1 >= self.num_enc_lyrs):
                num_filts = 2
            else:
                num_filts = self.num_filts
            self.dec_lyrs += [tf.keras.layers.Conv1DTranspose(filters=num_filts, 
                                                     kernel_size=kern_size, 
                                                     strides=1, 
                                                     dilation_rate=self.dil_rate, 
                                                     activation=self.act)]
            
    def encoder(self, x):
        for idx,lyr in enumerate(self.enc_lyrs):
            #print('Layer: ', idx, '\t', x.shape, idx, '/', len(self.enc_lyrs))
            x = lyr(x)
        return x
    
    def decoder(self, x):
        for idx,lyr in enumerate(self.dec_lyrs):
            #print('Layer: ', idx, '\t', x.shape, idx, '/', len(self.dec_lyrs))
            x = lyr(x)
        return x
    
    def calc_loss(self, x_hat, x):
        loss = self.mse(x, x_hat)
        return loss
        
    def call(self, inputs):
        z = self.encoder(inputs)
        #print('Z shape: ', z.shape)
        x_hat = self.decoder(z)
        # Zero center and scale 
        x_hat = 2.0*x_hat - 1.0
    
        return x_hat

# Instantiate one
auto_encoder = AudioAutoEncoder(sample_rate=sr)

Num Layers:  14


In [13]:
# Prep the dataset
ds_num_samples = int(len(song_np)/auto_encoder.input_len)
song_ds = song_np[0:auto_encoder.input_len*ds_num_samples]
song_ds = np.reshape(song_ds, (ds_num_samples, -1, 2))
song_ds.shape

(3111, 22050, 2)

In [14]:
# Smooths input using mean window (should change to exponential smoothing soon)
class Smoother:
    def __init__(self, alpha=0.90, skip_steps=1):
        self.step_count = 0
        self.s_last = None
        self.alpha = alpha
        self.skip_steps = skip_steps
    def smooth(self, s_now):
        if(self.s_last):
            self.s_last = s_now*self.alpha + self.s_last*(1-self.alpha)
        else:
            self.s_last = s_now
        
        self.step_count += 1
        
        if(self.step_count % self.skip_steps == 0):
            return self.s_last

In [88]:
# Create the dataset

print(ds_num_samples)
batch_size = 8
#batch_size = 16
#batch_size = 64

# For plotting loss
pp = ProgressPlot()
# Smoother for smoothing
smoother = Smoother(alpha=0.05, skip_steps=1)

# Train the autoencoder
for step in itertools.count():
    # Get the batch
    batch_idxs = np.random.randint(0, ds_num_samples, size=batch_size)
    x = song_ds[batch_idxs]

    # Put batch through model
    
    with tf.GradientTape() as t:
        x_hat = auto_encoder(x)
        # Get Loss
        print('X vs X_HAT: ', x.shape, x_hat.shape)
        loss_recon = auto_encoder.calc_loss(x_hat, x)
        loss_smooth = smoother.smooth(loss_recon.numpy())
        if(loss_smooth):
            pp.update(float(loss_smooth))
        #print('Step {0:8d}\tLoss: {1:3.4f}'.format(step, loss_recon.numpy()))
        grads = t.gradient(loss_recon, auto_encoder.variables)
        
    # Backprop grads
    auto_encoder.optim.apply_gradients(zip(grads, auto_encoder.variables))
    
    #if(step >= 2000):
    #    break
    
    
    

3111


<IPython.core.display.Javascript object>

Layer:  0 	 (8, 22050, 2) 0 / 8
idx: 0	Shape: (8, 22050)
idx: 1	Shape: (8, 22050)
idx: 2	Shape: (8, 22050)
idx: 3	Shape: (8, 22050)
idx: 4	Shape: (8, 22050)
idx: 5	Shape: (8, 22050)
idx: 6	Shape: (8, 22050)
idx: 7	Shape: (8, 22050)
idx: 0	Shape: (8, 11025, 8)
Layer:  1 	 (8, 11025, 8) 1 / 8
idx: 0	Shape: (8, 11025)
idx: 1	Shape: (8, 11025)
idx: 2	Shape: (8, 11025)
idx: 3	Shape: (8, 11025)
idx: 4	Shape: (8, 11025)
idx: 5	Shape: (8, 11025)
idx: 6	Shape: (8, 11025)
idx: 7	Shape: (8, 11025)
idx: 0	Shape: (8, 5512, 8)
Layer:  2 	 (8, 5512, 8) 2 / 8
idx: 0	Shape: (8, 5512)
idx: 1	Shape: (8, 5512)
idx: 2	Shape: (8, 5512)
idx: 3	Shape: (8, 5512)
idx: 4	Shape: (8, 5512)
idx: 5	Shape: (8, 5512)
idx: 6	Shape: (8, 5512)
idx: 7	Shape: (8, 5512)
idx: 0	Shape: (8, 2756, 8)
Layer:  3 	 (8, 2756, 8) 3 / 8
idx: 0	Shape: (8, 2756)
idx: 1	Shape: (8, 2756)
idx: 2	Shape: (8, 2756)
idx: 3	Shape: (8, 2756)
idx: 4	Shape: (8, 2756)
idx: 5	Shape: (8, 2756)
idx: 6	Shape: (8, 2756)
idx: 7	Shape: (8, 2756)
idx: 0	S

2021-11-22 21:31:20.396041: W tensorflow/core/framework/op_kernel.cc:1680] Invalid argument: required broadcastable shapes


InvalidArgumentError: required broadcastable shapes [Op:Sub]

In [19]:
# Get some music from the dataset and test it
start_time = 500
duration = 20
start_idx = int(start_time*sr)
end_idx = int((start_time+duration)*sr)

x = song_np[start_idx:end_idx, :]
x_hat = auto_encoder(np.reshape(x, (duration, -1, 2)))
x_hat = tf.reshape(x_hat, (-1, 2))

# Format them for playback
x = np.transpose((x*32768).astype(np.int16))
x_hat = np.transpose((x_hat.numpy()*32768).astype(np.int16))
        

In [20]:
# Play original
IPython.display.Audio(data=x, rate=sr)

In [21]:
print(x_hat.shape)
IPython.display.Audio(data=x_hat, rate=sr)

(2, 441000)


In [77]:
x_hat

array([[3184, 3184, 3184, ..., 2978, 2978, 2978],
       [2992, 2992, 2992, ..., 2839, 2839, 2839]], dtype=int16)

In [79]:
class AudioLayer(tf.keras.layers.Layer):
    def __init__(self, powers=8, activation='sigmoid', halve=True):
        '''
        Halves the temporal dimension
        Expects tensor of shape (bs, T, chans)
        '''
        super(AudioLayer, self).__init__()
        self.sub_lyrs = []
        self.act = activation
        self.halve = halve
        for lyr_idx in range(powers):
            dil_rate = 2**lyr_idx
            self.sub_lyrs += [tf.keras.layers.Conv1D(filters=1, 
                                                     kernel_size=5, 
                                                     strides=1, 
                                                     padding='same', 
                                                     dilation_rate=dil_rate, 
                                                     activation=self.act)]
        self.pool_lyr = tf.keras.layers.MaxPool1D()

    def call(self, x):
        outputs = []
        for idx,lyr in enumerate(self.sub_lyrs):
            outputs += [tf.squeeze(lyr(x))]
            print(f'idx: {idx}\tShape: {outputs[0].shape}')
            
        stacked = tf.stack(outputs, axis=-1)
        if(self.halve):
            pooled = self.pool_lyr(stacked)
        else:
            pooled = stacked
        print(f'idx: {0}\tShape: {pooled.shape}')
        #x_interlaced = tf.concat([x[:, 0::2], x[:, 1::2]], axis=-1)
        x_interlaced = None
        return pooled, x_interlaced
        
    

In [53]:
block = AudioLayer()
x = tf.zeros((16, 1000, 2))

block(x)[0].shape

idx: 0	Shape: (16, 1000)
idx: 1	Shape: (16, 1000)
idx: 2	Shape: (16, 1000)
idx: 3	Shape: (16, 1000)
idx: 4	Shape: (16, 1000)
idx: 5	Shape: (16, 1000)
idx: 6	Shape: (16, 1000)
idx: 7	Shape: (16, 1000)
idx: 0	Shape: (16, 500, 8)


TensorShape([16, 500, 8])

In [87]:
def dilconv_outlen(input_len, dil_rate, kernel_size):
    return input_len - (kernel_size-1)*dil_rate

def calc_num_layers(tgt_output_len, input_len, dil_rate, kernel_size):
    '''
    Calculates how many encoder layers are required to get to the target output length
    from a given input length, given a few conv1d params. 
    '''
    num_layers = (input_len - tgt_output_len)/((kernel_size-1)*dil_rate)
    return num_layers

# Define the audio autoencoder 
class AudioAutoEncoder(tf.keras.Model):

    def __init__(self, input_dur=1, sample_rate=22050):
        '''
        input_dur: duration of input (in seconds)
        sample_rate: sampling frequency of input audio (in hz) 
        '''
        super(AudioAutoEncoder, self).__init__()
        self.sample_rate = sample_rate
        # Length of the hidden representation
        self.z_len = 128
        # Calculate the size of the input
        self.num_splits = int(math.log2(sample_rate*input_dur/self.z_len) + 1.5)
        self.input_len = self.z_len*2**self.num_splits
        print('Making model w/ num_splits: ', self.num_splits, '\tand input len: ', self.input_len)
        # Activation function to use
        self.act = 'sigmoid'

        # A loss function
        #self.loss_fn = tf.keras.losses.MeanSquaredError()
        self.loss_fn = tf.keras.losses.MeanAbsoluteError()
        # An optimizer
        #self.optim = tf.keras.optimizers.SGD(learning_rate=0.1, momentum=0.0, nesterov=False, name='SGD')
        self.optim = tf.keras.optimizers.Adam(learning_rate=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-07, amsgrad=False,name='Adam')
        
        self.enc_lyrs = []
        for split_idx in range(self.num_splits):
            self.enc_lyrs += [AudioLayer()]
        
        self.dec_lyrs = []
        for dec_idx in range(self.num_splits):
            self.dec_lyrs += [AudioLayer(halve=False)]
        self.last_lyr = tf.keras.layers.Conv1D(filters=2, 
                                                 kernel_size=1, 
                                                 strides=1, 
                                                 padding='same', 
                                                 dilation_rate=1, 
                                                 activation=self.act)
        

    def encoder(self, x):
        for idx,lyr in enumerate(self.enc_lyrs):
            print('Layer: ', idx, '\t', x.shape, idx, '/', len(self.enc_lyrs))
            x, x_bypass = lyr(x)
        return x
    
    def decoder(self, x):
        # Blow it up back to the original size
        print('z before', x.shape)
        repeat_num = int(self.input_len/self.z_len)
        x = tf.repeat(x, repeat_num, axis=1)
        print('z after', x.shape, '\twith repeat num: ', repeat_num)
        for idx,lyr in enumerate(self.dec_lyrs):
            print('Layer: ', idx, '\t', x.shape, idx, '/', len(self.dec_lyrs))
            x, x_bypass = lyr(x)
        x = self.last_lyr(x)
        return x
    
    def calc_loss(self, x_hat, x):
        loss = self.loss_fn(x, x_hat)
        return loss
        
    def call(self, inputs):
        z = self.encoder(inputs)
        #print('Z shape: ', z.shape)
        x_hat = self.decoder(z)
        # Zero center and scale 
        x_hat = 2.0*x_hat - 1.0
    
        return x_hat

# Instantiate one
auto_encoder = AudioAutoEncoder(sample_rate=sr)

Making model w/ num_splits:  8 	and input len:  32768


In [81]:
auto_encoder(tf.zeros((16, auto_encoder.input_len, 2)))

Layer:  0 	 (16, 32768, 2) 0 / 8
idx: 0	Shape: (16, 32768)
idx: 1	Shape: (16, 32768)
idx: 2	Shape: (16, 32768)
idx: 3	Shape: (16, 32768)
idx: 4	Shape: (16, 32768)
idx: 5	Shape: (16, 32768)
idx: 6	Shape: (16, 32768)
idx: 7	Shape: (16, 32768)
idx: 0	Shape: (16, 16384, 8)
Layer:  1 	 (16, 16384, 8) 1 / 8
idx: 0	Shape: (16, 16384)
idx: 1	Shape: (16, 16384)
idx: 2	Shape: (16, 16384)
idx: 3	Shape: (16, 16384)
idx: 4	Shape: (16, 16384)
idx: 5	Shape: (16, 16384)
idx: 6	Shape: (16, 16384)
idx: 7	Shape: (16, 16384)
idx: 0	Shape: (16, 8192, 8)
Layer:  2 	 (16, 8192, 8) 2 / 8
idx: 0	Shape: (16, 8192)
idx: 1	Shape: (16, 8192)
idx: 2	Shape: (16, 8192)
idx: 3	Shape: (16, 8192)
idx: 4	Shape: (16, 8192)
idx: 5	Shape: (16, 8192)
idx: 6	Shape: (16, 8192)
idx: 7	Shape: (16, 8192)
idx: 0	Shape: (16, 4096, 8)
Layer:  3 	 (16, 4096, 8) 3 / 8
idx: 0	Shape: (16, 4096)
idx: 1	Shape: (16, 4096)
idx: 2	Shape: (16, 4096)
idx: 3	Shape: (16, 4096)
idx: 4	Shape: (16, 4096)
idx: 5	Shape: (16, 4096)
idx: 6	Shape: (16, 

<tf.Tensor: shape=(16, 32768, 2), dtype=float32, numpy=
array([[[ 0.18860364, -0.07849425],
        [ 0.20197368, -0.08248645],
        [ 0.20930636, -0.09856153],
        ...,
        [ 0.13424647, -0.12228614],
        [ 0.14966226, -0.1437558 ],
        [ 0.16191053, -0.14838505]],

       [[ 0.18860364, -0.07849425],
        [ 0.20197368, -0.08248645],
        [ 0.20930636, -0.09856153],
        ...,
        [ 0.13424647, -0.12228614],
        [ 0.14966226, -0.1437558 ],
        [ 0.16191053, -0.14838505]],

       [[ 0.18860364, -0.07849425],
        [ 0.20197368, -0.08248645],
        [ 0.20930636, -0.09856153],
        ...,
        [ 0.13424647, -0.12228614],
        [ 0.14966226, -0.1437558 ],
        [ 0.16191053, -0.14838505]],

       ...,

       [[ 0.18860364, -0.07849425],
        [ 0.20197368, -0.08248645],
        [ 0.20930636, -0.09856153],
        ...,
        [ 0.13424647, -0.12228614],
        [ 0.14966226, -0.1437558 ],
        [ 0.16191053, -0.14838505]],

       

In [43]:
tf.repeat(np.arange(3), 3)

<tf.Tensor: shape=(9,), dtype=int64, numpy=array([0, 0, 0, 1, 1, 1, 2, 2, 2])>