## Generative Sequencers with Neural Networks

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, losses
from tensorflow.keras.models import Model

import librosa
import aubio

### Defining and Checking the AE Architecture

In [None]:

# Encoder: compresses a 16-value step pattern down to a 4-value latent vector.
# NOTE: the input shape must be a tuple. `(16)` is just the integer 16;
# `(16,)` is the one-element tuple Keras expects.
encoder = tf.keras.Sequential([
    tf.keras.layers.InputLayer(input_shape=(16,)),
    tf.keras.layers.Dense(8, activation='relu'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(4, activation='relu')
    ])

# Decoder: mirrors the encoder, expanding the 4-value latent vector back to
# 16 values. The sigmoid keeps every output in the 0-1 range.
decoder = tf.keras.Sequential([
    tf.keras.layers.InputLayer(input_shape=(4,)),
    tf.keras.layers.Dense(8, activation='relu'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(16, activation='sigmoid')
    ])

# Chaining the two sub-models lets them be trained together while remaining
# individually callable (encoder.predict / decoder.predict).
autoencoder = tf.keras.Sequential([encoder, decoder])

In [None]:
# Configure training (Adam optimiser, binary cross-entropy loss) and print
# the layer-by-layer summary of the stacked model.
autoencoder.compile(loss='binary_crossentropy', optimizer='adam')
autoencoder.summary()

In [None]:
# One 16-step on/off pattern, wrapped in an outer list so the array has
# shape (1, 16), i.e. a batch containing a single sample.
test_pattern = [1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0]
test_vector = np.array([test_pattern])
print(test_vector)

In [None]:
# Smoke test: push the test vector through each model and print the raw output.

# 1) Full autoencoder (encoder followed by decoder)
ff = autoencoder.predict(test_vector)
print('AE output: ', ff)

# 2) Encoder on its own, giving the latent representation
encoder_ff = encoder.predict(test_vector)
print('Encoder output: ', encoder_ff)

# 3) Decoder on its own, fed with the latent representation from step 2
decoder_ff = decoder.predict(encoder_ff)
print('Decoder output: ', decoder_ff)



In [None]:
print('Test vector: ', test_vector)

# Threshold the raw network outputs into binary step values.
# A single comparison with an else-branch guarantees exactly one step per
# output value. Two separate `> 0.5` / `< 0.5` tests would silently drop a
# value of exactly 0.5 and produce a pattern with fewer steps than the input.

#Threshold AE output
op = [1 if value > 0.5 else 0 for value in ff[0]]
print('AE output: ', op)

#Threshold decoder output
op = [1 if value > 0.5 else 0 for value in decoder_ff[0]]
print('Decoder output: ', op)

The test vector is used here only to check that the NN architecture is built successfully and that all of its parts are connected and can be accessed. A stacked AE is deterministic, so feeding the same values through it will always produce the same outputs. Because the weights and biases are initialised to random values, the outputs for the test vector are currently arbitrary but consistent. Training the AE will eventually result in accurate reconstruction of the data.

In [None]:
# Train the autoencoder to reproduce the single test vector (input == target).
autoencoder.fit(x=test_vector, y=test_vector, batch_size=1, epochs=100)

In [None]:
# Repeat the feed-forward check with the current (now trained) weights.

# 1) Full autoencoder (encoder followed by decoder)
ff = autoencoder.predict(test_vector)
print('AE output: ', ff)

# 2) Encoder on its own, giving the latent representation
encoder_ff = encoder.predict(test_vector)
print('Encoder output: ', encoder_ff)

# 3) Decoder on its own, fed with the latent representation from step 2
decoder_ff = decoder.predict(encoder_ff)
print('Decoder output: ', decoder_ff)



In [None]:
print('Test vector: ', test_vector)

# Threshold the raw network outputs into binary step values.
# A single comparison with an else-branch guarantees exactly one step per
# output value. Two separate `> 0.5` / `< 0.5` tests would silently drop a
# value of exactly 0.5 and produce a pattern with fewer steps than the input.

#Threshold AE output
op = [1 if value > 0.5 else 0 for value in ff[0]]
print('AE output: ', op)

#Threshold decoder output
op = [1 if value > 0.5 else 0 for value in decoder_ff[0]]
print('Decoder output: ', op)

## Encoding Audio Files

Here is the main code for the data pipeline.  This encodes the data, firstly as many-hot encoded 16th note step on/off binary values, secondly as 48-PPQN offset values for micro-timing 'groove'.

48-PPQN = 48 Pulses per Quarter Note

therefore:
- 48 divisions between 1/4 notes
- 24 divisions between 1/8 notes
- 12 divisions between 1/16 notes
- 6 divisions between 1/32 notes

FYI: MIDI beat clock runs at 24-PPQN. MIDI tick resolution, by contrast, is set per file or sequencer; 960 ticks per quarter note is a common choice.

In [None]:

filename = '/usr/home/folder/file.wav'

#Setting variables for time-base etc.
per_quarter_note = 48                 # 48-PPQN micro-timing resolution
timebase = per_quarter_note * 4       # pulses in one bar (4 quarter notes)
sr = 44100
win_s = 512
hop_length = win_s // 2
dur = None
bar_length = 1

# Load the audio file (mono) and detect onsets, reported as sample positions
signal_full, sr = librosa.load(filename, sr=sr, mono=True, duration=dur)
onsets = librosa.onset.onset_detect(y=signal_full, sr=sr, backtrack=False, units='samples')

# Double the assumed length in bars until there are at least as many
# 16th-note steps as detected onsets
desired_range = 16 * bar_length
while len(onsets) > desired_range:
    bar_length *= 2
    desired_range = 16 * bar_length

#set 16ths and ppqn values based on bar length
num_ppqn = timebase * bar_length
sixteenths_div = 16 * bar_length

#Define ppqn and sixteenth note sizes (both in samples)
ppqn_timebase = round(signal_full.shape[0] / num_ppqn)
sixteenths = round(signal_full.shape[0] / sixteenths_div)

# Grid positions in samples, used by the plotting cell.
# Built in one step instead of growing an array with np.append in a loop.
steps = np.arange(sixteenths_div, dtype=float) * sixteenths
ppqn_slices = np.arange(num_ppqn, dtype=float) * ppqn_timebase

# Make sure onsets is an integer numpy array
onsets = np.round(onsets).astype(int)

# Snap every onset to its NEAREST 16th-note step index and discard double
# triggers: an onset landing on the same step as the one before it is dropped.
onset_steps = np.round(onsets / sixteenths).astype(int)
keep_mask = np.ones(len(onsets), dtype=bool)
keep_mask[1:] = onset_steps[1:] != onset_steps[:-1]

# Keep only the first onset on each step
onsets = onsets[keep_mask]
onset_steps = onset_steps[keep_mask]
onset_points_rounded = onset_steps.tolist()

# Floor each remaining onset onto the PPQN grid (sample position of the pulse)
ppqn_onsets = (onsets // ppqn_timebase) * ppqn_timebase

# Many-hot encode the step indices: 1 where an onset falls on that 16th note.
# An onset that rounds up past the final step has no slot and is ignored.
num_sixteenths = sixteenths_div
ohe_sixteenths = np.zeros(num_sixteenths)
ohe_sixteenths[onset_steps[onset_steps < num_sixteenths]] = 1

#Get distances of substeps from nearest 16th note
# Offset in whole PPQN pulses between the onset and the 16th note it was
# snapped to, then scaled from the nominal range -6..+6 onto 0..1.
quantize_points_rounded_sixteenths = onset_steps * sixteenths
substep_offsets = (ppqn_onsets - quantize_points_rounded_sixteenths) // ppqn_timebase
# NOTE(review): the original code had an `if s >= 1` test whose two branches
# were identical; no clipping is applied here either -- confirm whether
# values outside 0..1 should be clamped.
substeps = (substep_offsets - -6) / (6 - -6)

#Match substep values to corresponding event
# Walk the step grid; each active step takes the next substep value in
# order, inactive steps get 0.
substeps_full = []
j = 0  # Index for substeps array
for binary_value in ohe_sixteenths:
    if binary_value == 1 and j < len(substeps):
        substeps_full.append(substeps[j])
        j += 1
    else:
        substeps_full.append(0)

ss_arr = np.array(substeps_full)

print(bar_length, ' bars long')

if bar_length == 1:
    # Single bar: one 32-value vector (16 on/off values + 16 offsets)
    op = np.concatenate((ohe_sixteenths, substeps_full))
    print(op)

else:
    # Several bars: one 32-value row per bar. The rows are stacked directly;
    # seeding the stack with np.empty(32) would leave an uninitialised
    # garbage row at the top of the result.
    split1 = np.split(ohe_sixteenths, bar_length)
    split2 = np.split(ss_arr, bar_length)
    stack = np.vstack([np.concatenate((bar_steps, bar_offsets))
                       for bar_steps, bar_offsets in zip(split1, split2)])
    print(stack)



In [None]:
# Waveform with detected onsets (red) against the 16th-note grid (yellow)
plt.figure(figsize=(20, 5))
plt.plot(signal_full, label='Waveform', zorder=0)
plt.vlines(onsets, -1, 1, alpha=1, color='r',
           linestyle='-', label='onsets', zorder=1)
plt.vlines(steps, -1, 1, alpha=1, color='y',
           linestyle='--', label='16th', zorder=1)
plt.legend(loc='upper right')  # the labels above are only visible through a legend
plt.show()  # must be called; a bare `plt.show` only references the function

# Waveform with detected onsets (red) against the 48-PPQN grid (blue)
plt.figure(figsize=(20, 5))
plt.plot(signal_full, label='Waveform', zorder=0)
plt.vlines(onsets, -1, 1, alpha=1, color='r',
           linestyle='-', label='onsets', zorder=1)
plt.vlines(ppqn_slices, -1, 1, alpha=1, color='b',
           linestyle='--', label='ppqn', zorder=1)
plt.legend(loc='upper right')
plt.show()

In [None]:

filename = '/usr/home/folder/file.wav'

#Setting variables for time-base etc.
per_quarter_note = 48                 # 48-PPQN micro-timing resolution
timebase = per_quarter_note * 4       # pulses in one bar (4 quarter notes)
sr = 44100
win_s = 512
hop_length = win_s // 2
dur = None
bar_length = 1

# Load the audio file with librosa (used for the signal length and plotting)
signal_full, sr = librosa.load(filename, sr=sr, mono=True, duration=dur)

# Open the same file with aubio for onset detection
s = aubio.source(filename, sr, hop_length)
sr = s.get_samplerate()

# aubio onset detector with a detection threshold of 0.8
o = aubio.onset("default", win_s, hop_length, sr)
o.set_threshold(0.8)

# list of onsets, in samples
onsets = []

# Read the file hop by hop; a short read marks the end of the file
total_frames = 0
while True:
    samples, read = s()
    if o(samples):
        onsets.append(int(o.get_last()))
    total_frames += read
    if read < hop_length:
        break

# Double the assumed length in bars until there are at least as many
# 16th-note steps as detected onsets
desired_range = 16 * bar_length
while len(onsets) > desired_range:
    bar_length *= 2
    desired_range = 16 * bar_length

#set 16ths and ppqn values based on bar length
num_ppqn = timebase * bar_length
sixteenths_div = 16 * bar_length

#Define ppqn and sixteenth note sizes (both in samples)
ppqn_timebase = round(signal_full.shape[0] / num_ppqn)
sixteenths = round(signal_full.shape[0] / sixteenths_div)

# Grid positions in samples, used by the plotting cell.
# Built in one step instead of growing an array with np.append in a loop.
steps = np.arange(sixteenths_div, dtype=float) * sixteenths
ppqn_slices = np.arange(num_ppqn, dtype=float) * ppqn_timebase

# Convert the onset list to an integer numpy array
onsets = np.round(onsets).astype(int)

# Snap every onset to its NEAREST 16th-note step index and discard double
# triggers: an onset landing on the same step as the one before it is dropped.
onset_steps = np.round(onsets / sixteenths).astype(int)
keep_mask = np.ones(len(onsets), dtype=bool)
keep_mask[1:] = onset_steps[1:] != onset_steps[:-1]

# Keep only the first onset on each step
onsets = onsets[keep_mask]
onset_steps = onset_steps[keep_mask]
onset_points_rounded = onset_steps.tolist()

# Floor each remaining onset onto the PPQN grid (sample position of the pulse)
ppqn_onsets = (onsets // ppqn_timebase) * ppqn_timebase

# Many-hot encode the step indices: 1 where an onset falls on that 16th note.
# An onset that rounds up past the final step has no slot and is ignored.
num_sixteenths = sixteenths_div
ohe_sixteenths = np.zeros(num_sixteenths)
ohe_sixteenths[onset_steps[onset_steps < num_sixteenths]] = 1

#Get distances of substeps from nearest 16th note
# Offset in whole PPQN pulses between the onset and the 16th note it was
# snapped to, then scaled from the nominal range -6..+6 onto 0..1.
quantize_points_rounded_sixteenths = onset_steps * sixteenths
substep_offsets = (ppqn_onsets - quantize_points_rounded_sixteenths) // ppqn_timebase
# NOTE(review): the original code had an `if s >= 1` test whose two branches
# were identical; no clipping is applied here either -- confirm whether
# values outside 0..1 should be clamped.
substeps = (substep_offsets - -6) / (6 - -6)

#Match substep values to corresponding event
# Walk the step grid; each active step takes the next substep value in
# order, inactive steps get 0.
substeps_full = []
j = 0  # Index for substeps array
for binary_value in ohe_sixteenths:
    if binary_value == 1 and j < len(substeps):
        substeps_full.append(substeps[j])
        j += 1
    else:
        substeps_full.append(0)

ss_arr = np.array(substeps_full)

print('Length in bars: ', bar_length)

if bar_length == 1:
    # Single bar: one 32-value vector (16 on/off values + 16 offsets)
    op = np.concatenate((ohe_sixteenths, substeps_full))
    print(op)

else:
    # Several bars: one 32-value row per bar. The rows are stacked directly;
    # seeding the stack with np.empty(32) would leave an uninitialised
    # garbage row at the top of the result.
    split1 = np.split(ohe_sixteenths, bar_length)
    split2 = np.split(ss_arr, bar_length)
    stack = np.vstack([np.concatenate((bar_steps, bar_offsets))
                       for bar_steps, bar_offsets in zip(split1, split2)])
    print(stack)



In [None]:
#Plot figures of waveform and onset detection with 16th divisions and 48-PPQN

# Waveform with detected onsets (red) against the 16th-note grid (yellow)
plt.figure(figsize=(20, 5))
plt.plot(signal_full, label='Waveform', zorder=0)
plt.vlines(onsets, -1, 1, alpha=1, color='r',
           linestyle='-', label='onsets', zorder=1)
plt.vlines(steps, -1, 1, alpha=1, color='y',
           linestyle='--', label='16th', zorder=1)
plt.legend(loc='upper right')  # the labels above are only visible through a legend
plt.show()  # must be called; a bare `plt.show` only references the function

# Waveform with detected onsets (red) against the 48-PPQN grid (blue)
plt.figure(figsize=(20, 5))
plt.plot(signal_full, label='Waveform', zorder=0)
plt.vlines(onsets, -1, 1, alpha=1, color='r',
           linestyle='-', label='onsets', zorder=1)
plt.vlines(ppqn_slices, -1, 1, alpha=1, color='b',
           linestyle='--', label='ppqn', zorder=1)
plt.legend(loc='upper right')
plt.show()

## Dataset

Encoding the audio corpus runs in a loop to create the dataset array.  In Deep Steps the processed dataset is stored as a CSV file, so the processing step does not have to be repeated when re-training.

In [None]:

# Read the pre-processed dataset from a comma-separated text file and report
# its dimensions.
dataset = np.loadtxt(fname='dataset.csv', delimiter=',')
print(dataset.shape)

## Re-Defining Autoencoder for step on/off & micro-timing offsets

In [None]:

# Encoder: 32 inputs (16 step on/off values + 16 micro-timing offsets)
# compressed through 16 and 8 units down to a 4-value latent vector.
# NOTE: the input shape must be a tuple. `(32)` is just the integer 32;
# `(32,)` is the one-element tuple Keras expects.
encoder = tf.keras.Sequential([
    tf.keras.layers.InputLayer(input_shape=(32,)),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(8, activation='relu'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(4, activation='relu')
    ])

# Decoder: mirrors the encoder, expanding the latent vector back to 32
# values. The sigmoid keeps every output in the 0-1 range.
decoder = tf.keras.Sequential([
    tf.keras.layers.InputLayer(input_shape=(4,)),
    tf.keras.layers.Dense(8, activation='relu'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(32, activation='sigmoid')
    ])

autoencoder = tf.keras.Sequential([encoder, decoder])

# Compile the autoencoder model (mean squared error, since half of each
# vector holds continuous offset values rather than binary steps)
autoencoder.compile(optimizer='adam', loss=losses.MeanSquaredError())

# Train the autoencoder model to reproduce its own input; the returned
# history holds the per-epoch loss
history = autoencoder.fit(dataset, dataset, epochs=100, batch_size=16)

In [None]:
# Training-loss curve: one value per epoch, taken from the fit history
loss_per_epoch = history.history['loss']

plt.plot(loss_per_epoch)
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.title('AE Loss')
plt.show()

In [None]:
#Sending training data through TRAINED encoder to get vectors to visualise latent space
# The whole dataset is encoded in one batched call, giving one latent vector
# per row. Calling predict separately for every row gives the same vectors
# but pays Keras' per-call overhead once per sample, and needed the input
# width hard-coded for the reshape.
latent_vectors = np.asarray(encoder.predict(dataset))



In [None]:
# Show the 4 latent dimensions as two 2-D scatter plots:
# dimensions 1 vs 2, then dimensions 3 vs 4.
for x_dim, y_dim in ((0, 1), (2, 3)):
    plt.scatter(latent_vectors[:, x_dim], latent_vectors[:, y_dim])
    plt.xlabel(f'Latent Dimension {x_dim + 1}')
    plt.ylabel(f'Latent Dimension {y_dim + 1}')
    plt.title('Latent Vectors Visualization')
    plt.show()

## Testing trained AE reconstructing training data

In [None]:
# Take row 100 of the dataset as a test vector, copied into a 2-D array with
# a leading batch axis so it can be passed straight to predict()
test_vector = np.array(dataset[100], ndmin=2)
print(test_vector)

In [None]:
# Encode the test vector and print its compressed (latent) representation
encoder_test = encoder.predict(x=test_vector)
print(encoder_test)

In [None]:
#Feed encoded data through decoder to check reconstruction
decoder_test = decoder.predict(encoder_test)

#Split into step on/off and offsets
# First half of the output = step values, second half = scaled offsets
split = np.split(decoder_test[0], 2)
steps, substeps = split

seq = []
tolerance = 0.5

#Use threshold and scaling to convert generated rhythm to binary and substeps
# Step on/off: anything above the tolerance counts as an active step
output1 = [1 if step > tolerance else 0 for step in steps]

# Offsets: invert the 0..1 scaling back to -6..+6 pulses and round to the
# NEAREST pulse. int() alone truncates toward zero, so a value that is a
# hair below a whole pulse (e.g. 0.9999996) would decode one pulse short.
output2 = [int(round(float(s) * (6 - -6) + -6)) for s in substeps]

print('Test Vector: ', test_vector)
print('AE: Raw Generation ', decoder_test)
print('Step on/off: ', output1)
print('Substeps raw: ', substeps)
print('48-PPQN substeps ', output2)

## Ex Nihilo Generation

Here we use various methods of creating new latent vectors to feed through decoder to generate new parts

In [None]:
#Manually creating latent vector

a_value = 0.1
b_value = 1.5
c_value = 2.3
d_value = 0.2

# Shape (1, 4): a batch holding one 4-value latent vector
latent_vector = np.array([[a_value, b_value, c_value, d_value]])

# Decode the latent vector; first half of the output = step values,
# second half = scaled offsets
generated_rhythm = decoder.predict(latent_vector)
split = np.split(generated_rhythm[0], 2)
steps, substeps = split

seq = []
tolerance = 0.5

# Step on/off: anything above the tolerance counts as an active step
output1 = [1 if step > tolerance else 0 for step in steps]

# Offsets: invert the 0..1 scaling back to -6..+6 pulses and round to the
# NEAREST pulse. int() alone truncates toward zero, so a value that is a
# hair below a whole pulse (e.g. 0.9999996) would decode one pulse short.
output2 = [int(round(float(s) * (6 - -6) + -6)) for s in substeps]

print('AE: Raw Generation ', generated_rhythm)
print('Step on/off: ', output1)
print('48-PPQN substeps ', output2)

In [None]:
#Random latent vector with values between 0-1
# Unseeded, so every run of this cell generates a different pattern
latent_vector = np.random.rand(1, 4)

print(latent_vector)

# Decode the latent vector; first half of the output = step values,
# second half = scaled offsets
generated_rhythm = decoder.predict(latent_vector)
split = np.split(generated_rhythm[0], 2)
steps, substeps = split

seq = []
tolerance = 0.5

# Step on/off: anything above the tolerance counts as an active step
output1 = [1 if step > tolerance else 0 for step in steps]

# Offsets: invert the 0..1 scaling back to -6..+6 pulses and round to the
# NEAREST pulse. int() alone truncates toward zero, so a value that is a
# hair below a whole pulse (e.g. 0.9999996) would decode one pulse short.
output2 = [int(round(float(s) * (6 - -6) + -6)) for s in substeps]

print('AE: Raw Generation ', generated_rhythm)
print('Step on/off: ', output1)
print('48-PPQN substeps ', output2)

In [None]:
# Random latent vector using a Gaussian distribution with mean 3 and standard deviation 2.5
# Unseeded, so every run of this cell generates a different pattern
latent_vector = 3 + 2.5 * np.random.randn(1, 4)

print(latent_vector)

# Decode the latent vector; first half of the output = step values,
# second half = scaled offsets
generated_rhythm = decoder.predict(latent_vector)
split = np.split(generated_rhythm[0], 2)
steps, substeps = split

seq = []
tolerance = 0.5

# Step on/off: anything above the tolerance counts as an active step
output1 = [1 if step > tolerance else 0 for step in steps]

# Offsets: invert the 0..1 scaling back to -6..+6 pulses and round to the
# NEAREST pulse. int() alone truncates toward zero, so a value that is a
# hair below a whole pulse (e.g. 0.9999996) would decode one pulse short.
output2 = [int(round(float(s) * (6 - -6) + -6)) for s in substeps]

print('AE: Raw Generation ', generated_rhythm)
print('Step on/off: ', output1)
print('48-PPQN substeps ', output2)