# ChordGAN

This notebook re-implements `chordGAN` using TensorFlow 2, since the original implementation relies on an older, deprecated version of TensorFlow.

In [1]:
import pretty_midi
import reverse_pianoroll
import convert
import librosa

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import glob

import tensorflow as tf
from tensorflow import keras
from keras.models import Model, Sequential
from keras.layers import Dense, Input, Concatenate
from keras.initializers import GlorotNormal
from keras.activations import sigmoid
from keras.losses import Loss

%load_ext lab_black

# python script, in github repo
# Record the library versions this notebook was last run against.
for label, module in (
    ("TF version:", tf),
    ("Keras version:", keras),
    ("Numpy version:", np),
):
    print(label, module.__version__)

TF version: 2.6.0
Keras version: 2.6.0
Numpy version: 1.21.2


Use the following to avoid GPU errors (if needed)

In [2]:
# physical_devices = tf.config.experimental.list_physical_devices('GPU')

# for device in physical_devices:
#     tf.config.experimental.set_memory_growth(device, True)

## Setup helper functions

In [20]:
# add songs to data
def get_songs(path):
    """Load every MIDI file found directly inside ``path``.

    Each file is parsed with ``pretty_midi.PrettyMIDI``, turned into a piano
    roll with ``get_piano_roll(fs=16)`` and then passed through
    ``convert.forward``.

    Parameters
    ----------
    path : str
        Directory searched (non-recursively) for files matching ``*.mid*``.

    Returns
    -------
    list
        One converted song per matching file, in sorted path order. Empty if
        no file matches.

    Raises
    ------
    Exception
        Any error raised while parsing or converting a file propagates to the
        caller unchanged.
    """
    # glob returns matches in arbitrary, filesystem-dependent order; sorting
    # keeps downstream slices such as ``songs[:10]`` reproducible.
    files = sorted(glob.glob("{}/*.mid*".format(path)))
    songs = []
    for f in files:
        data = pretty_midi.PrettyMIDI(f)
        song = data.get_piano_roll(fs=16)
        song = convert.forward(song)
        # song = np.transpose(song) - if your code matrices aren't working, try uncommenting this. the convert.py file might not be updated
        songs.append(song)
    return songs

# custom function to extract chroma features from song
def get_chromas(songs, note_range=78):
    """Compute a per-timestep pitch-class count (chromagram) for each song.

    For every timestep, each of the first ``note_range`` columns holding a
    value greater than zero adds one to bin ``column % 12``.

    Parameters
    ----------
    songs : iterable of array-like
        2-D arrays indexed as ``song[timestep][column]``.
    note_range : int, optional
        Number of leading columns to scan. Defaults to 78, the value that was
        previously hard-coded here.

    Returns
    -------
    list of numpy.ndarray
        One float array of shape ``(timesteps, 12)`` per input song.

    Raises
    ------
    IndexError
        If a song has fewer than ``note_range`` columns.
    """
    chromas = []
    for song in songs:
        song = np.asarray(song)
        if song.shape[1] < note_range:
            raise IndexError(
                "song has {} columns but note_range is {}".format(
                    song.shape[1], note_range
                )
            )
        # Boolean mask of the active notes within the scanned range.
        active = song[:, :note_range] > 0
        chroma = np.zeros(shape=(song.shape[0], 12))
        for pitch_class in range(12):
            # Columns pitch_class, pitch_class + 12, ... all fall in this bin.
            chroma[:, pitch_class] = active[:, pitch_class::12].sum(axis=1)
        chromas.append(chroma)

    return chromas

In [4]:
# Keep only the first ten songs, then derive one chromagram per song.
songs = get_songs("../data/chordGan/Pop/Pop_Music_Midi")[:10]
chromas = get_chromas(songs)
# The same message is printed for both lists: the first line counts the songs,
# the second counts the chromagrams derived from them.
for processed in (songs, chromas):
    print("{} songs processed".format(len(processed)))



10 songs processed
10 songs processed


## Setup GAN model

Note:

The discriminator takes both the song time series and the chromagram as inputs, which is why it is built with two separate input layers.

In [7]:
# xavier_init = GlorotNormal(shape=[X_dim + Z_dim, 512])# AKA xavier init


def xavier_init(size, dtype=None):
    """Kernel initializer: zero-mean normal samples scaled by the input size.

    The standard deviation is ``sqrt(2 / size[0])`` (equivalently
    ``1 / sqrt(size[0] / 2)``), i.e. it depends on the input dimension only.

    Parameters
    ----------
    size : sequence of int
        Shape of the weight tensor to create; ``size[0]`` is used as the
        input dimension.
    dtype : optional
        Dtype requested by the caller (Keras passes this when creating
        weights). ``None`` falls back to ``tf.float32``.

    Returns
    -------
    tf.Tensor
        Random tensor of shape ``size`` and the requested dtype.
    """
    input_dim = size[0]
    # Plain-Python arithmetic keeps the stddev dtype-agnostic, so it can be
    # used with whatever dtype the caller requests.
    xavier_stddev = (2.0 / input_dim) ** 0.5
    dtype = tf.float32 if dtype is None else tf.dtypes.as_dtype(dtype)
    return tf.random.normal(shape=size, stddev=xavier_stddev, dtype=dtype)


def build_generator(Z_dim, n_units=128, output_dim=None):
    """Build the generator: a two-layer dense network.

    Parameters
    ----------
    Z_dim : int
        Size of the last axis of the input.
    n_units : int, optional
        Width of the hidden ReLU layer.
    output_dim : int, optional
        Size of the last axis of the sigmoid output. When omitted, the
        notebook-level ``X_dim`` is used, which must already be defined at
        call time (this matches the previous behaviour).

    Returns
    -------
    keras.Model
        Model named ``"generator"`` with one input of shape
        ``(batch, None, Z_dim)`` and one output of shape
        ``(batch, None, output_dim)``.
    """
    if output_dim is None:
        # Backward-compatible fallback to the notebook-level constant.
        output_dim = X_dim

    inputs = Input(shape=[None, Z_dim])
    z = Dense(n_units, kernel_initializer=xavier_init, activation="relu")(inputs)
    output = Dense(output_dim, kernel_initializer=xavier_init, activation="sigmoid")(
        z
    )

    return Model(inputs=[inputs], outputs=[output], name="generator")


def build_discriminator(X_dim, Z_dim, n_units=512):
    """Build the discriminator, conditioned on a second input.

    The two inputs are concatenated along their last axis, passed through one
    hidden ReLU layer and reduced to a single logit per timestep.

    Parameters
    ----------
    X_dim : int
        Size of the last axis of the ``data_input`` tensor.
    Z_dim : int
        Size of the last axis of the ``chroma_input`` tensor.
    n_units : int, optional
        Width of the hidden ReLU layer.

    Returns
    -------
    keras.Model
        Model named ``"discriminator"`` taking ``[data_input, chroma_input]``
        and returning ``[logits, probas]``, where ``probas`` is the sigmoid of
        ``logits``.
    """
    data_inputs = Input(shape=[None, X_dim], name="data_input")
    chroma_inputs = Input(shape=[None, Z_dim], name="chroma_input")

    x = Concatenate(axis=2)([data_inputs, chroma_inputs])
    x = Dense(n_units, kernel_initializer=xavier_init, activation="relu")(x)
    logits = Dense(1, kernel_initializer=xavier_init)(x)
    probas = sigmoid(logits)

    # Named for parity with the generator, so summaries and checkpoints are
    # easy to tell apart.
    return Model(
        inputs=[data_inputs, chroma_inputs],
        outputs=[logits, probas],
        name="discriminator",
    )

In [8]:
def generator_loss(
    D_fake_logits,
    fake_samples,
    z,
    lambda_=100,
    gen_loss=keras.losses.MeanSquaredError(),
):
    """
    Generator objective: adversarial term plus a weighted reconstruction term.

    Note: While the paper describes L1 loss (which is MAE) the code uses MSE.

    Parameters
    ----------
    D_fake_logits:
        The logits output by the discriminator given the fake sample.
    fake_samples :
        The sample data created by the generator.
    z:
        The target that ``fake_samples`` is compared against by ``gen_loss``.
        NOTE(review): this was previously described as the real chromagram
        data, but ``gen_loss`` compares it directly with ``fake_samples`` -
        confirm that callers pass a target with a compatible shape.
    lambda_ : float, Optional
        Weight applied to the ``gen_loss`` term.
    gen_loss :
        The type of loss to use for the generated fake samples.

    Returns
    -------
    The scalar ``adversarial_loss + lambda_ * gen_loss(z, fake_samples)``.
    """
    # The Keras class is spelled ``BinaryCrossentropy``; the previous
    # ``BinaryCrossEntropy`` spelling does not exist and raised AttributeError.
    binary_cross_entropy = keras.losses.BinaryCrossentropy(from_logits=True)

    # Cross-entropy of the discriminator's verdict on the fake samples against
    # all-ones labels: small when the discriminator labels the fakes as real.
    G_fooling = tf.reduce_mean(
        binary_cross_entropy(tf.ones_like(D_fake_logits), D_fake_logits)
    )
    G_loss = tf.reduce_mean(gen_loss(z, fake_samples))
    return G_fooling + lambda_ * G_loss


def discriminator_loss(discriminator, true_samples, fake_samples):
    """
    Discriminator objective: label true samples as 1 and fake samples as 0.

    Parameters
    ----------
    discriminator :
        Callable returning a ``(logits, probas)`` pair for its input.
    true_samples :
        Input passed to ``discriminator`` for the real data.
    fake_samples :
        Input passed to ``discriminator`` for the generated data.

    Returns
    -------
    tuple
        ``(loss, D_fake_logits)``: the summed true/fake cross-entropy and the
        logits computed on the fake samples.
    """
    # The Keras class is spelled ``BinaryCrossentropy``.
    binary_cross_entropy = keras.losses.BinaryCrossentropy(from_logits=True)

    D_true_logits, _ = discriminator(true_samples)
    D_fake_logits, _ = discriminator(fake_samples)

    # Discriminator should identify the true samples as 1s
    D_true_loss = tf.reduce_mean(
        binary_cross_entropy(tf.ones_like(D_true_logits), D_true_logits)
    )
    # And the fake samples as 0s
    D_fake_loss = tf.reduce_mean(
        binary_cross_entropy(tf.zeros_like(D_fake_logits), D_fake_logits)
    )
    return D_true_loss + D_fake_loss, D_fake_logits

In [21]:
# Piano-roll geometry.
low_note = 0  # index of the lowest note on the piano roll
high_note = 78  # index of the highest note on the piano roll
note_range = high_note - low_note

# Window geometry: consecutive timesteps are flattened into one sample.
n_timesteps = 4  # number of timesteps created at a time
X_dim = 2 * note_range * n_timesteps  # size of the visible layer
Z_dim = 12 * n_timesteps  # 12 chroma bins per timestep
n_hidden = 50  # size of the hidden layer

print(f"X_dim = {X_dim}")
print(f"Z_dim = {Z_dim}")

# Networks, each with its own Adam optimizer (default settings).
discriminator = build_discriminator(X_dim, Z_dim)
generator = build_generator(Z_dim)

D_optimizer = keras.optimizers.Adam()
G_optimizer = keras.optimizers.Adam()

X_dim = 624
Z_dim = 48


In [31]:
# Scratch check: complete n_timesteps-sized windows in the first song (float result).
np.floor(np.shape(songs[0])[0] / n_timesteps)

64.0

In [32]:
# Same count using integer floor division, which yields an int.
np.shape(songs[0])[0] // n_timesteps

64

In [38]:
num_epochs = 1
batch_size = 10

for epoch in range(num_epochs):
    print(f"Epoch {epoch}/{num_epochs}", flush=True)
    for song, chroma in zip(songs, chromas):
        # Group consecutive timesteps into windows of n_timesteps, so a
        # (T, features) array becomes (T // n_timesteps, features * n_timesteps).
        song_timesteps = song.shape[0] // n_timesteps  # number of complete windows
        usable_steps = song_timesteps * n_timesteps  # trailing timesteps are dropped

        song_width = song.shape[1] * n_timesteps
        song = song[:usable_steps].reshape([song_timesteps, song_width])

        # The chroma is windowed the same way so it stays aligned with the song.
        chroma_width = chroma.shape[1] * n_timesteps
        chroma = chroma[:usable_steps].reshape([song_timesteps, chroma_width])

        # TODO: write training loop

Epoch 0/1
