In [1]:
import numpy as np
import tensorflow as tf
import librosa
import os

In [2]:
# Hack to get it to work with RTX 2000 cards
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
  tf.config.experimental.set_memory_growth(gpu, True)

In [3]:
# Load a file
dataset_dir = './Baggins_Music/'
music_filenames = os.listdir(dataset_dir)

In [4]:
y, sr = librosa.load(dataset_dir+music_filenames[0], mono=False)
y = np.transpose(y)



In [7]:
y.shape

(68618189, 2)

In [8]:
# Papers and resources
# WaveNet paper https://arxiv.org/pdf/1609.03499.pdf
# WaveNet AutoEncoder paper (instrument decomp): https://arxiv.org/pdf/1704.01279.pdf
# NSynth: https://magenta.tensorflow.org/nsynth
# WaveNet implementation: https://github.com/ibab/tensorflow-wavenet

22050

In [28]:
# Define the audio autoencoder 
class AudioAutoEncoder(tf.keras.Model):

    def __init__(self, input_len=1024):
        super(AudioAutoEncoder, self).__init__()
        self.input_len = input_len
        
        # Setup encoder layers
        self.conv1 = tf.keras.layers.Conv1D(filters=32, kernel_size=16, strides=1, dilation_rate=10, activation='relu')
        self.conv2 = tf.keras.layers.Conv1D(filters=32, kernel_size=16, strides=1, dilation_rate=10, activation='relu')
        self.conv3 = tf.keras.layers.Conv1D(filters=32, kernel_size=16, strides=1, dilation_rate=10, activation='relu')
        self.conv4 = tf.keras.layers.Conv1D(filters=32, kernel_size=16, strides=1, dilation_rate=10, activation='relu')
        self.conv5 = tf.keras.layers.Conv1D(filters=32, kernel_size=16, strides=1, dilation_rate=10, activation='relu')
        self.flatten = tf.keras.layers.Flatten()
        self.fc1 = tf.keras.layers.Dense(units=256, activation='relu')
        self.fc2 = tf.keras.layers.Dense(units=128, activation='relu')
        
        # Setup decoder layers
        # Transpose Convolution: https://www.tensorflow.org/api_docs/python/tf/keras/layers/Conv1DTranspose
        

    def encoder(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = self.conv5(x)
        # Flatten the layer
        x = self.flatten(x)
        x = self.fc1(x)
        x = self.fc2(x)
        return x
    
    def decoder(self, z):
        return z
        
    def call(self, inputs):
        z = self.encoder(inputs)
        x_hat = self.decoder(z)
        
        return x_hat

    

In [29]:
auto_encoder = AudioAutoEncoder()
y = np.random.rand(1, 1000, 2)
print(y.shape)
print(auto_encoder(y).shape)

(1, 1000, 2)


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

(1, 128)
