First, we import everything we will need in our project

In [375]:
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
# Useful parameters for reproducibility
# SEED = 1234
# torch.manual_seed(SEED)
# torch.backends.cudnn.deterministic = True
from sklearn.model_selection import train_test_split

import random, os
from tqdm import tqdm

from PIL import Image
from torchvision import datasets, transforms, utils
from os import walk
from os.path import join, normpath
import pretty_midi

from math import ceil
import time
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
import pandas as pd

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(DEVICE)

cuda


This cell is used so that the ploted graph are oppened in a nex window instead of beeing printed in the notebook

In [376]:
%matplotlib qt

This is our <font color="red"> dataloader </font>. This is the object used to load everything we need to train, test and evaluate our model. <font color="green"> This dataloader load and return the whole song.</font>

In [377]:
class FixedSizePianoMusic(data.Dataset):
  def __init__(self, size=1000, midi_dir=None, transform=None):
    super().__init__()
    if midi_dir is None: # Empty dataset
      self.musics = []
    else: # Non-empty dataset
      file_list = list([f"{root}\{f}" for root,d_names,f_names in os.walk(midi_dir) for f in f_names])
      # Early loading! splits the musics into blocks of size 'size'
      self.musics = []
      for f in file_list:
        long_music = torch.Tensor(pretty_midi.PrettyMIDI(f).get_piano_roll(fs=100))
        long_music = torch.transpose(long_music, 0, 1)
        new_size = (len(long_music) // size) * size
        long_music = long_music[:new_size] # Cutting off the rest
        blocks = torch.reshape(long_music, (len(long_music) // size, size, 128))
        for block in blocks:
          self.musics.append(block)
    self.transform = transform

  def __len__(self):
      return len(self.musics)


  def __getitem__(self, index):
      try:
        return self.musics[index] if self.transform is None else self.transform(self.musics[index])
      except IndexError:
        raise IndexError("Item does not exist, have you loaded the MIDI files correctly?")

  def _create_from_self(self, musics):
    new_dataset = FixedSizePianoMusic()
    new_dataset.musics = musics
    new_dataset.transform = self.transform
    return new_dataset

  def splits(self, test_size=0.3):
    train_musics, test_musics = train_test_split(self.musics, test_size=test_size)
    return self._create_from_self(train_musics), self._create_from_self(test_musics)

This is the function used to apply all the <font color="red"> transformation </font> we need to apply on the pianorolls before outputing them.

In [378]:
def transform(element): # Turns all velocity values to 0-1, and extract piano notes
  element = torch.Tensor(element)
  element = torch.where(element != 0, 1, 0)
  element = element[:, 21:109] # 109-21 = 88 => the piano notes of a piano roll
  element = element.float()
  totals = element.sum(dim=1).reshape((-1,1)) # We want vectors to sum to one
  totals[totals == 0] = 1
  element /= totals
  return torch.transpose(element, 0, 1)

This is another <font color="red">transform function</font> that will separate each song in 10 seconds subpassages

In [379]:
def sub_transform(element):
    element = torch.Tensor(element)
    element = torch.where(element != 0, 1, 0)
    element = element[:, 21:109] # 109-21 = 88 => the piano notes of a piano roll
    element = element.float()
    totals = element.sum(dim=1).reshape((-1,1)) # We want vectors to sum to one
    totals[totals == 0] = 1
    element /= totals
    element = torch.transpose(element, 0, 1)
    return element

In the next cell, we will <font color="red"> initiate both the train/test dataset</font> as well as the <font color="green"> train/test loader</font>.

In [380]:
'''train_dataset = FixedSizePianoMusic(midi_dir="js/all/", size=NBR_TIME_STEPS, transform=sub_transform)
test_dataset = FixedSizePianoMusic(midi_dir="js/all/", size=NBR_TIME_STEPS, transform=sub_transform)

train_loader = data.DataLoader(train_dataset, batch_size=1, shuffle=True)
test_loader = data.DataLoader(test_dataset, batch_size=1, shuffle=True)'''

'train_dataset = FixedSizePianoMusic(midi_dir="js/all/", size=NBR_TIME_STEPS, transform=sub_transform)\ntest_dataset = FixedSizePianoMusic(midi_dir="js/all/", size=NBR_TIME_STEPS, transform=sub_transform)\n\ntrain_loader = data.DataLoader(train_dataset, batch_size=1, shuffle=True)\ntest_loader = data.DataLoader(test_dataset, batch_size=1, shuffle=True)'

Let's test what we've got so far (the execution of this cell is not mandatory)

In [381]:
'''first_train_pianoroll = next(iter(train_loader))
first_test_pianoroll = next(iter(test_loader))'''

'first_train_pianoroll = next(iter(train_loader))\nfirst_test_pianoroll = next(iter(test_loader))'

In the next cell, we write a <font color="red">function able to plot the pianorolls</font> that we can use to test different parts of the code

In [382]:
def plotPianoRoll(pianoroll, path=None):
    fig, ax = plt.subplots(figsize=(160, 60))
    ax.imshow(pianoroll, cmap='binary', interpolation='nearest')
    if(path != None):
        ax.set_title(path)
    ax.invert_yaxis()
    ax.set_xlabel('Nbr Timesteps')
    ax.set_ylabel('Note value')
    plt.show()

Let's test if everything is all right (again, this cell is to be used only for test purposes)

In [383]:
#plotPianoRoll(first_train_pianoroll[0])

Now, we can get our hands dirty by implementing the <font color="red"> neural network </font> we will use. A possible choice would be a <font color="red">recurent neural network</font>

In [384]:
class SimpleAE(nn.Module):
    def __init__(self, input_size, latent_dim):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Flatten(),
            nn.Linear(88*input_size, 1024),
            nn.ReLU(),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Linear(512, latent_dim),

        )

        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 512),
            nn.ReLU(),
            nn.Linear(512, 1024),
            nn.ReLU(),
            nn.Linear(1024, 88*input_size),
            nn.Unflatten(1, (88, input_size)),
            nn.Softmax(dim=1)
        )
    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded
    
    def decode(self, encoded):
        decoded =  self.decoder(encoded)
        return decoded

class AEModel(nn.Module):
    def __init__(self, input_size, latent_dim):
        super().__init__()

        #these functions are the one used in the encoder
        self.maxPool1 = nn.MaxPool1d(kernel_size=2, return_indices=True)
        self.maxPool2 = nn.MaxPool1d(kernel_size=2, return_indices=True)
        self.maxPool3 = nn.MaxPool1d(kernel_size=2, return_indices=True)
        self.conv1 = nn.Sequential(
            nn.Conv1d(in_channels=88, out_channels=64, kernel_size=3, padding="same"),
            nn.ReLU()
        )
        self.conv2 = nn.Sequential(
            nn.Conv1d(in_channels=64, out_channels=32, kernel_size=3, padding="same"),
            nn.ReLU()
        )
        self.conv3 = nn.Sequential(
            nn.Conv1d(in_channels=32, out_channels=16, kernel_size=3, padding="same"),
            nn.ReLU()
        )
        self.reduce = nn.Sequential(
            nn.Flatten(),
            #16*125=[2000]
            nn.Linear(16*(input_size//8), 512),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(512, latent_dim),
            nn.ReLU()
        )

        #These functions will be used in the decoder
        self.maxUnpool1 = nn.MaxUnpool1d(kernel_size=2)
        self.maxUnpool2 = nn.MaxUnpool1d(kernel_size=2)
        self.maxUnpool3 = nn.MaxUnpool1d(kernel_size=2)
        self.convTrans1 = nn.Sequential(
            nn.ReLU(),
            nn.ConvTranspose1d(in_channels=16, out_channels=32, kernel_size=3, padding=1),    
        )
        self.convTrans2 = nn.Sequential(
            nn.ReLU(),
            nn.ConvTranspose1d(in_channels=32, out_channels=64, kernel_size=3, padding=1),    
        )
        self.convTrans3 = nn.Sequential(
            nn.ReLU(),
            nn.ConvTranspose1d(in_channels=64, out_channels=88, kernel_size=3, padding=1),    
        )
        self.expend = nn.Sequential(
            nn.Linear(latent_dim, 512),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(512, 16*(input_size//8)),
            nn.ReLU(),
            nn.Unflatten(1, (16, input_size//8))
        )
        self.out = nn.Softmax(dim=1)

    def forward(self, x):
        #The first step is to encode x
        self.indice1 = None
        self.indice2 = None
        self.indice3 = None
        x = self.conv1(x)
        x, self.indice1 = self.maxPool1(x)
        x = self.conv2(x)
        x, self.indice2 = self.maxPool2(x)
        x = self.conv3(x)
        x, self.indice3 = self.maxPool3(x)
        x = self.reduce(x)

        #The next step is to decode everything
        x = self.expend(x)
        x = self.maxUnpool1(x, self.indice3)
        x = self.convTrans1(x)
        x = self.maxUnpool2(x, self.indice2)
        x = self.convTrans2(x)
        x = self.maxUnpool3(x, self.indice1)
        x = self.convTrans3(x)
        x = self.out(x)
        return x

    def decode(self, latent_space):
        x = self.expend(latent_space)
        x = self.maxUnpool1(x, self.indice3[0:1])
        x = self.convTrans1(x)
        x = self.maxUnpool2(x, self.indice2[0:1])
        x = self.convTrans2(x)
        x = self.maxUnpool3(x, self.indice1[0:1])
        x = self.convTrans3(x)
        x = self.out(x)
        return x

class BetterAE(nn.Module):
    def __init__(self, input_size, lattent_dim):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Conv1d(in_channels=88, out_channels=128, kernel_size=3, padding='same'),
            nn.ReLU(),
            nn.Conv1d(in_channels=128, out_channels=64, kernel_size=3, padding='same'),
            nn.ReLU(),
            nn.Flatten(),
            nn.Linear(64*input_size, 512),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(512, lattent_dim),
        )

        self.decoder = nn.Sequential(
            nn.Linear(lattent_dim, 512),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(512, 64*input_size),
            nn.ReLU(),
            nn.Unflatten(1, (64, input_size)),
            nn.ConvTranspose1d(in_channels=64, out_channels=128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.ConvTranspose1d(in_channels=128, out_channels=88, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Softmax(dim=1)
        )
    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded
    
    def decode(self, encoded):
        decoded =  self.decoder(encoded)
        return decoded

In [385]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [386]:
def train(model, dataloader, optimizer, criterion, step_size=4):
    epoch_loss = 0
    loop_count = 0
    model.train()

    for i, batch in tqdm(enumerate(dataloader), total=len(dataloader), leave=False):
        musics = batch.to(DEVICE)
        optimizer.zero_grad()
        # The length of a music is the minimum between its unpadded length and the current loop
        predictions = model(musics)
        loss = criterion(predictions, musics)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        loop_count += 1

    return epoch_loss / loop_count

def evaluate(model, dataloader, criterion, step_size=4):
    epoch_loss = 0
    loop_count = 0

    model.eval()

    with torch.no_grad():
        for i, batch in tqdm(enumerate(dataloader), total=len(dataloader), leave=False):
            musics = batch.to(DEVICE)
            # The length of a music is the minimum between its unpadded length and the current loop
            predictions = model(musics)
            loss = criterion(predictions, musics)
            epoch_loss += loss.item()
            loop_count += 1

    return epoch_loss / loop_count

In [393]:
# Define the length of each sub_song. One second = 100 time_steps.
# If NBR_TIME_STEPS = 1000, each sub_songs will last 10 seconds
NBR_TIME_STEPS = 500
BATCH_SIZE = 128

NOTE_DIM = 88
HIDDEN_DIM = 1024
OUTPUT_DIM = 88
N_LAYERS = 3
DROPOUT = 0.2

In [388]:
def create_midi(filename, pianoroll, instrument):
    pianoroll = torch.transpose(pianoroll, 0, 1)
    pm = pretty_midi.PrettyMIDI()
    instrument = pretty_midi.instrument_name_to_program(instrument)
    piano = pretty_midi.Instrument(program=instrument)
    velocity = 90
    current_time = 0
    note_list = [0] * 88
    for measure in pianoroll:
        for current_note in range(len(measure)):
            if measure[current_note] != 0 and note_list[current_note] == 0:
                note_list[current_note] = current_time
            if measure[current_note] == 0 and note_list[current_note] != 0:
                note = pretty_midi.Note(
                    velocity=velocity,
                    pitch=current_note,
                    start=note_list[current_note],
                    end=current_time
                )
                note_list[current_note] = 0
                piano.notes.append(note)
        current_time += 1/100
    pm.instruments.append(piano)
    pm.write(filename)

    return pianoroll.detach()

In [435]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs
  
def compute_epochs(model, train_dataloader, test_dataloader, criterion, optimizer, n_epochs, best_path=None, verbose=True):
  best_test_loss = float('inf')
  criterion = criterion.to(DEVICE)
  
  train_losses = []
  test_losses = []
  epoch_times = []

  for epoch in range(n_epochs):
    start_time = time.time()
    train_loss = train(model, train_dataloader, optimizer, criterion)
    test_loss = evaluate(model, test_dataloader, criterion)

    end_time = time.time()
    

    if best_path and test_loss < best_test_loss:
      best_test_loss = test_loss
      torch.save(model.state_dict(), best_path)
    
    train_losses.append(train_loss)
    test_losses.append(test_loss)
    epoch_times.append(end_time - start_time)

    if verbose:
      epoch_mins, epoch_secs = epoch_time(start_time, end_time)
      print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
      print(f'\tTrain Loss: {train_loss}')
      print(f'\t Test Loss: {test_loss}')
    
  return model, train_losses, test_losses, epoch_times

class ModelProvider():
  def __init__(self):
    self.AEs= {}
  
  def convAE(self, index):
    return self._fetch_model_(self.AEs, BetterAE, index)
  def poolAE(self, index):
    return self._fetch_model_(self.AEs, AEModel, index)
  def linearAE(self, index):
    return self._fetch_model_(self.AEs, SimpleAE, index)
  
  def _fetch_model_(self, models, ModelClass, index, **kwargs):
    if index in models:
      return models[index]
    else:
      models[index] = ModelClass(NBR_TIME_STEPS, HIDDEN_DIM)
      models[index].to(DEVICE)
      return models[index]

def generate_models(train_dataloader, test_dataloader, n_epochs=8):
  mp = ModelProvider()
  params_list = []
  # model_funcs = {'RNN-tanh': mp.RNNtanh, 'RNN-relu': mp.RNNrelu, 'LSTM': mp.LSTM, 'GRU': mp.GRU}
  model_funcs = {'CAE': mp.convAE}
  # criterions = {'BCE': nn.BCELoss(), 'MSE': nn.MSELoss(), 'CE': nn.CrossEntropyLoss()}
  criterions = {'MSE': nn.MSELoss()}
  optimizer_funcs = {'ADAM': optim.Adam}
  
  index = 0
  for k0, model_func in model_funcs.items():
    for k1, criterion in criterions.items():
      for k2, optimizer_func in optimizer_funcs.items():
        params_list.append({'name': f'{k0}-{k1}-{k2}', 'model': model_func(index), 'criterion': criterion, 'optimizer': optimizer_func(model_func(index).parameters())})
        index += 1

  directory = f'models-generation-{n_epochs}epochs-{BATCH_SIZE}batchs-{time.time()}'
  os.makedirs(directory)

  for i, params_row in enumerate(params_list):
    print(f'Processing model {i+1}/{len(params_list)} {params_row["name"]}')
    os.makedirs(f'{directory}/{params_row["name"]}')
    model, train_losses, test_losses, epoch_times = compute_epochs(params_row['model'], train_dataloader, test_dataloader, params_row['criterion'], params_row['optimizer'], n_epochs, 
                                                                   best_path=f'{directory}/{params_row["name"]}/{params_row["name"]}.pt', 
                                                                   verbose=False)
    
    # Saving losses and times to CSV
    pd.DataFrame.from_dict({'train_loss':train_losses, 'test_loss': test_losses, 'epoch_time': epoch_times}).to_csv(f'{directory}/{params_row["name"]}/{params_row["name"]}.csv')
    # Reloading the model with its best one (best test loss)
    model = params_row['model']
    model.load_state_dict(torch.load(f'{directory}/{params_row["name"]}/{params_row["name"]}.pt'))
    # Sampling a music for generation
    dataset = FixedSizePianoMusic(midi_dir="./js/bigMix", size=NBR_TIME_STEPS, transform=sub_transform)
    random_sample = dataset[random.randint(0, len(dataset))]
    create_midi(f'original-sample.mid', random_sample, 'Acoustic Grand Piano')
    random_sample = random_sample.cuda()
    print(random_sample.get_device())
    #print(model.get_device())
    random_sample = torch.reshape(random_sample, (1, 88, 500))
    # Generating a music based on a few notes of the sample
    print(random_sample.shape)
    roll = model.forward(random_sample)
    print(roll[0])
    roll = torch.where(roll >= 1/10, 1, 0)
    create_midi(f'uncompressed-sample.mid', roll[0], 'Acoustic Grand Piano')
    latent_space = torch.randn(1,HIDDEN_DIM)
    model = model.to(DEVICE)
    latent_space = latent_space.to(DEVICE)
    #latent_space = latent_space.to(DEVICE)
    result = model.decode(latent_space)
    # Cleaning the result
    pianoroll = torch.where(result >= 1/10, 1, 0)
    model = model.cpu()
    pianoroll = pianoroll.to('cpu')
    x_np = pianoroll.detach().numpy()[0]
    print(x_np.shape)
    x_df = pd.DataFrame(x_np)
    # Saving the generation as a CSV
    x_df.to_csv(f'{directory}/{params_row["name"]}/{params_row["name"]}-sample.csv')
    # Saving the generation as a MIDI
    print(pianoroll[0][5])
    print(pianoroll.shape)
    create_midi(f'{directory}/{params_row["name"]}/{params_row["name"]}-sample.mid', pianoroll[0], 'Acoustic Grand Piano')
    #plotPianoRoll(pianoroll[0])


In [395]:
dataset = FixedSizePianoMusic(midi_dir="./js/bigMix", size=NBR_TIME_STEPS, transform=sub_transform)

train_dataset, test_dataset = dataset.splits()


train_loader = data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
test_loader = data.DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)

In [436]:
generate_models(train_loader, test_loader, n_epochs=1)

Processing model 1/1 CAE-MSE-ADAM


                                               

0
torch.Size([1, 88, 500])
tensor([[0.0074, 0.0058, 0.0059,  ..., 0.0060, 0.0056, 0.0069],
        [0.0074, 0.0058, 0.0059,  ..., 0.0060, 0.0056, 0.0069],
        [0.0074, 0.0058, 0.0059,  ..., 0.0060, 0.0056, 0.0069],
        ...,
        [0.0074, 0.0058, 0.0059,  ..., 0.0060, 0.0056, 0.0069],
        [0.0074, 0.0058, 0.0059,  ..., 0.0060, 0.0056, 0.0069],
        [0.0074, 0.0058, 0.0059,  ..., 0.0060, 0.0056, 0.0069]],
       device='cuda:0', grad_fn=<SelectBackward0>)
(88, 500)
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1,

In [392]:
torch.cuda.empty_cache()