# Dataset preparation

In [2]:
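# Uncomment on Colab to install the FluidSynth system library
# (the leading "!" runs the line as a shell command):
#!sudo apt install -y fluidsynth

# On macOS, uncomment this line instead:
#!brew install fluidsynth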

In [3]:
!pip3 install --upgrade pyfluidsynth



In [4]:
!pip3 install pretty_midi



In [5]:
!pip3 install mido



In [6]:
!pip3 install miditok



In [7]:
import pathlib
import glob
import os
import shutil
import math

# Directory that holds the raw MIDI files (relative to the working directory).
dir_path = pathlib.Path('midis')
# NOTE(review): `count` is not used by any of the visible cells.
count = 0

# glob returns matches in arbitrary, filesystem-dependent order. The
# train/validation/test split is done by slicing this list by index, so sort
# it to make the split reproducible across runs and machines.
filenames = sorted(glob.glob(str(dir_path/'*.mid*')))
print('Number of files:', len(filenames))

Number of files: 10855


In [8]:
import pretty_midi

# Load the second matched file (index 1) as a sample PrettyMIDI object.
# Raises IndexError if fewer than two files were found in `filenames`.
pm = pretty_midi.PrettyMIDI(filenames[1])

In [9]:
import fluidsynth
from IPython import display

# Sampling rate (Hz) used both for synthesis and for playback.
_SAMPLING_RATE = 16000
def display_audio(pm: pretty_midi.PrettyMIDI, seconds=30):
  """Synthesize a MIDI object and return a playable audio widget.

  Args:
    pm: Object whose ``fluidsynth(fs=...)`` method renders it to a waveform.
    seconds: Length of the excerpt to keep, counted from the start.
      May be fractional (e.g. ``2.5``).

  Returns:
    An ``IPython.display.Audio`` object wrapping the first ``seconds``
    seconds of the waveform at ``_SAMPLING_RATE`` Hz.
  """
  waveform = pm.fluidsynth(fs=_SAMPLING_RATE)
  # Slice bounds must be integers; casting lets fractional durations work
  # instead of raising TypeError. Integer inputs behave exactly as before.
  n_samples = int(seconds * _SAMPLING_RATE)
  waveform_short = waveform[:n_samples]
  return display.Audio(waveform_short, rate=_SAMPLING_RATE)

In [10]:
#display_audio(pm)

In [11]:
def make_subset(dir=None, start_index=0, end_index=0) -> None:
    """Copy ``filenames[start_index:end_index]`` into a fresh ``dataset/<dir>`` folder.

    An existing ``dataset/<dir>`` directory is deleted and recreated, so the
    call is destructive for that folder. Entries of the slice that are not
    regular files are skipped. Finally the folder name and the size of the
    slice are printed.

    Args:
        dir: Name of the subset folder under ``dataset/`` (e.g. ``"training"``).
            Required in practice: the ``None`` default fails on concatenation.
        start_index: Index of the first entry of the module-level ``filenames``
            list to copy (inclusive).
        end_index: Index at which to stop (exclusive).
    """
    dir = "dataset/" + dir
    if os.path.exists(dir):
        shutil.rmtree(dir)
    os.makedirs(dir)
    files = filenames[start_index:end_index]
    for filename in files:
        if os.path.isfile(filename):
            # Build the destination from the base name rather than by
            # replacing the "midis/" prefix: with a different source folder or
            # path separator the replace is a no-op and the copy would target
            # the source file itself.
            destination = os.path.join(dir, os.path.basename(filename))
            shutil.copyfile(src=filename, dst=destination)

    print(f"{dir}: {len(files)}")


# Fractions of the file list assigned to each subset.
train_size = 0.7
validation_size = 0.1
test_size = 0.2

train_path = pathlib.Path('dataset/training')
test_path = pathlib.Path('dataset/test')
validation_path = pathlib.Path('dataset/validation')

# Compute every boundary exactly once so adjacent subsets share the same index
# and can neither overlap nor leave a gap.
n_files = len(filenames)
train_end = math.floor(n_files * train_size)
validation_end = math.floor(n_files * (train_size + validation_size))

# Summing float fractions can land a hair below 1.0 (e.g. 0.9999999999999999),
# which would floor to n_files - 1 and silently drop the last file. When the
# fractions are meant to cover everything, end the test subset at n_files.
total_fraction = train_size + validation_size + test_size
if math.isclose(total_fraction, 1.0):
    test_end = n_files
else:
    test_end = math.floor(n_files * total_fraction)

make_subset("training", 0, train_end)
make_subset("validation", train_end, validation_end)
make_subset("test", validation_end, test_end)

dataset/training: 7598
dataset/validation: 1086
dataset/test: 2171


In [12]:
from miditok import REMI

# Configuration of the REMI tokenizer.
pitch_range = range(21, 109)  # pitches 21..108 inclusive
nb_velocities = 32
additional_tokens = {
    'Chord': False,
    'Rest': True,
    'Tempo': False,
    'Program': False,
    'TimeSignature': False,
    'rest_range': (2, 8),  # (half, 8 beats)
    'nb_tempos': 64,  # nb of tempo bins
}

tokenizer = REMI(
    pitch_range=pitch_range,
    nb_velocities=nb_velocities,
    additional_tokens=additional_tokens,
    pad=True,
    sos_eos=True,
)

In [13]:
# Preview the beginning of the vocabulary: print entries until the one at
# position 101 has been shown (102 entries at most), then stop.
for position, token in enumerate(tokenizer.vocab):
    print(token)
    if position > 100:
        break

PAD_None
SOS_None
EOS_None
Bar_None
Pitch_21
Pitch_22
Pitch_23
Pitch_24
Pitch_25
Pitch_26
Pitch_27
Pitch_28
Pitch_29
Pitch_30
Pitch_31
Pitch_32
Pitch_33
Pitch_34
Pitch_35
Pitch_36
Pitch_37
Pitch_38
Pitch_39
Pitch_40
Pitch_41
Pitch_42
Pitch_43
Pitch_44
Pitch_45
Pitch_46
Pitch_47
Pitch_48
Pitch_49
Pitch_50
Pitch_51
Pitch_52
Pitch_53
Pitch_54
Pitch_55
Pitch_56
Pitch_57
Pitch_58
Pitch_59
Pitch_60
Pitch_61
Pitch_62
Pitch_63
Pitch_64
Pitch_65
Pitch_66
Pitch_67
Pitch_68
Pitch_69
Pitch_70
Pitch_71
Pitch_72
Pitch_73
Pitch_74
Pitch_75
Pitch_76
Pitch_77
Pitch_78
Pitch_79
Pitch_80
Pitch_81
Pitch_82
Pitch_83
Pitch_84
Pitch_85
Pitch_86
Pitch_87
Pitch_88
Pitch_89
Pitch_90
Pitch_91
Pitch_92
Pitch_93
Pitch_94
Pitch_95
Pitch_96
Pitch_97
Pitch_98
Pitch_99
Pitch_100
Pitch_101
Pitch_102
Pitch_103
Pitch_104
Pitch_105
Pitch_106
Pitch_107
Pitch_108
Velocity_3
Velocity_7
Velocity_11
Velocity_15
Velocity_19
Velocity_23
Velocity_27
Velocity_31
Velocity_35
Velocity_39


In [14]:
from miditoolkit import MidiFile
from tqdm import tqdm

def tokenize_directory(dir=None):
    """Tokenize every MIDI file found directly inside ``dataset/<dir>``.

    Args:
        dir: Name of the subset folder under ``dataset/`` (e.g. ``"training"``).
            Required in practice: the ``None`` default fails on concatenation.

    Returns:
        A list with one entry per matched file, in sorted path order; each
        entry is whatever ``tokenizer.midi_to_tokens`` returns for that file.
    """
    path = pathlib.Path('dataset/' + dir)
    # glob yields matches in arbitrary order; sort so the position of each
    # file's tokens in the result is the same on every run.
    files_to_tokenize = sorted(glob.glob(str(path/'*.mid*')))
    return [
        tokenizer.midi_to_tokens(MidiFile(file))
        for file in tqdm(files_to_tokenize)
    ]

# Tokenize the whole training subset. This is a long-running cell: the
# recorded run was interrupted after roughly 22 minutes at 59% of the files.
training_tokens = tokenize_directory("training")
len(training_tokens)

 59%|█████▉    | 4474/7598 [21:56<15:19,  3.40it/s]  


KeyboardInterrupt: 

In [None]:
import numpy as np
import tensorflow as tf

# NOTE(review): presumably the token sequences differ in length from file to
# file; a nested Python list like that cannot be converted to a dense tensor,
# so from_tensor_slices would fail on it. Build a RaggedTensor first so each
# dataset element keeps its own length - confirm against the real data.
training_ragged = tf.ragged.constant(training_tokens)
training_ds = tf.data.Dataset.from_tensor_slices(training_ragged)
training_ds.element_spec

In [None]:
# Persist the dataset to the tf_dataset/training/ directory.
# NOTE(review): Dataset.save is only available in recent TensorFlow releases;
# older ones expose tf.data.experimental.save instead - confirm the version.
training_ds.save("tf_dataset/training/")