# Tokenization

In [None]:
%%capture
# Install the FluidSynth system package and its Python bindings; the audio
# preview helper further down renders MIDI through pretty_midi's fluidsynth call.
!sudo apt install -y fluidsynth
!pip install --upgrade pyfluidsynth
!pip install pretty_midi
# NOTE(review): miditok is installed unpinned; the tokenizer/BPE calls in this
# notebook presumably target the release current when it was written — confirm and pin.
!pip install miditok
!pip install split-folders

In [None]:
import sys
import pathlib
import glob
import os
import shutil
import math
import collections
import pretty_midi
import fluidsynth
import splitfolders

from miditok import TSD
from IPython import display
from pathlib import Path
from miditoolkit import MidiFile
from miditok.utils import get_midi_programs

All the MIDI files are stored in a shared Google Drive folder. To access them, add a shortcut to the shared project folder in your own Drive.

In [2]:
# Mount Google Drive at /content/drive so the shared project folder can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


We copy all the midi zips to the Colab storage.

In [3]:
# Copy both MIDI archives from the mounted Drive into /content, then extract them there.
!cp /content/drive/MyDrive/__Baldi_Cecchetti/midis/jazz_midis.zip /content
!cp /content/drive/MyDrive/__Baldi_Cecchetti/midis/piano_midis.zip /content
!unzip -qq /content/jazz_midis.zip -d /content
!unzip -qq /content/piano_midis.zip -d /content

In [None]:
# Folders holding the extracted MIDI files, relative to the working directory.
# NOTE(review): the names differ in plurality ('jazz_midi' vs 'piano_midis');
# presumably they mirror the folder names inside the archives — confirm.
jazz_midi_path = Path('jazz_midi')
piano_midi_path = Path('piano_midis')

We set the Tokenizer params.

*   `pitch_range`: the MIDI norm can represent pitch values from 0 to 127, but the GM2 specification recommends 21 to 108 for piano, which covers the recommended pitch values for all MIDI programs. Notes with pitches below or above this range can be discarded or clipped to the limits.
*   `nb_velocities`: the number of velocity values to represent.
* `beat_res`: the number of samples within a beat.


`midi_valid` is a function that discards short MIDIs and MIDIs with a time signature different from 4/*.


In [None]:
# Tokenizer parameters shared by both datasets.
pitch_range = range(21, 109)  # pitches 21..108 inclusive
nb_velocities = 32  # number of velocity values to represent
beat_res = {(0, 4): 16}  # 16 samples per beat for the (0, 4) beat range

def midi_valid(midi) -> bool:
    """Tell whether a MIDI file should be kept for tokenization.

    A file is rejected when any of its time-signature changes has a numerator
    other than 4, or when its last tick lies before 10 beats' worth of ticks.

    Args:
        midi: Object exposing `time_signature_changes` (items with a
            `numerator`), `max_tick` and `ticks_per_beat`.

    Returns:
        True if the file passes both checks, False otherwise.
    """
    four_beats_per_bar = all(
        signature.numerator == 4 for signature in midi.time_signature_changes
    )
    if not four_beats_per_bar:
        return False
    # Anything shorter than 10 beats is considered too short to be useful.
    return midi.max_tick >= 10 * midi.ticks_per_beat

Utility function used to play MIDI files inside a Colab notebook

In [None]:
_SAMPLING_RATE = 32000  # sampling rate used for both synthesis and playback

def display_audio(midi_path: 'str | os.PathLike', seconds=30):
    """Synthesize a MIDI file and return an inline player for its beginning.

    Args:
        midi_path: Location of the MIDI file, given as a string or as an
            `os.PathLike` object such as `pathlib.Path`.
        seconds: Length of the excerpt to keep, in seconds. Fractional values
            are accepted and truncated to a whole number of samples.

    Returns:
        An `IPython.display.Audio` object playing the first `seconds` seconds
        of the synthesized waveform at `_SAMPLING_RATE`.
    """
    # pretty_midi may treat a non-string argument as an open file object, so
    # path objects are converted to plain strings; anything else passes through.
    if isinstance(midi_path, os.PathLike):
        midi_path = os.fspath(midi_path)
    pm = pretty_midi.PrettyMIDI(midi_path)
    waveform = pm.fluidsynth(fs=_SAMPLING_RATE)
    # Slice bounds must be integers; a float `seconds` would otherwise raise TypeError.
    n_samples = int(seconds * _SAMPLING_RATE)
    # Take a sample of the generated waveform to mitigate kernel resets
    waveform_short = waveform[:n_samples]
    return display.Audio(waveform_short, rate=_SAMPLING_RATE)

# Jazz Dataset

In [None]:
# Collect the jazz files matching '*.mid*' (covers both .mid and .midi) as path strings.
midi_files = glob.glob(str(jazz_midi_path/'*.mid*'))
print('Number of files:', len(midi_files))

Number of files: 456


In [None]:
# Root folder for the jazz token files; the cells below write 'tokens' and 'tokens_bpe' under it.
jazz_tokens_path = Path('tokens_jazz')

In [None]:
# Listen to one jazz sample (display_audio keeps the first 30 seconds by default).
display_audio('jazz_midi/3.mid')

Output hidden; open in https://colab.research.google.com to view.

## Standard Tokenization

The Jazz Dataset only contains Jazz solos, so we don't have Chords. We only set `Rest` to True, which is used whenever a segment of time is silent.

In [None]:
# Only rest tokens are enabled for the jazz solos; every other additional token is off.
# NOTE(review): the previous comment read "(quarter, 2 beats)". MidiTok's own
# example annotates (2, 8) as "(half, 8 beats)", so (2, 2) is presumably
# (half, 2 beats) — confirm against the installed miditok version.
additional_tokens = {'Chord': False, 
                     'Rest': True, 
                     'Tempo': False, 
                     'Program': False, 
                     'TimeSignature': False,
                     'rest_range': (2, 2)}

# TSD tokenizer built from the shared pitch, beat-resolution and velocity settings.
tokenizer = TSD(pitch_range=pitch_range, 
                beat_res=beat_res, 
                nb_velocities=nb_velocities, 
                additional_tokens=additional_tokens,
                pad=False) # we do not need the pad token

Checking that the tokenizer does not alter the original MIDI file.

In [None]:
# Round trip: MIDI -> tokens -> MIDI, written to disk and played back for a listening check.
midi = MidiFile('jazz_midi/3.mid')
tokens = tokenizer(midi)
# The original file's programs are passed along when decoding the tokens.
converted_back_midi = tokenizer(tokens, get_midi_programs(midi))
converted_back_midi.dump('converted_midi.midi')
display_audio('converted_midi.midi')

Output hidden; open in https://colab.research.google.com to view.

`tokenize_midi_dataset` tokenizes the whole dataset discarding invalid MIDI files.

In [None]:
# Tokenize the collected jazz files into tokens_jazz/tokens, using midi_valid as the filter.
tokenizer.tokenize_midi_dataset(midi_files, jazz_tokens_path / 'tokens', midi_valid)

Tokenizing MIDIs (tokens_jazz/tokens): 100%|██████████| 456/456 [00:16<00:00, 28.25it/s]


In [None]:
# Show the vocabulary summary of the jazz tokenizer.
tokenizer.vocab

Vocabulary - 251 tokens of 5 types

## BPE Tokenization

We use Byte Pair Encoding, which merges the tokens that are most frequently adjacent.

In [None]:
# Rebuild a tokenizer from the configuration saved next to the jazz token files.
tokenizer_bpe = TSD(params=jazz_tokens_path / 'tokens' / 'config.txt')

In [None]:
# Learn a BPE vocabulary of 500 tokens from the plain jazz token files
tokenizer_bpe.learn_bpe(
    tokens_path=jazz_tokens_path / 'tokens',
    vocab_size=500,
    out_dir=jazz_tokens_path / 'tokens_bpe',
)

# Re-encode the tokenized pieces with the learned BPE vocabulary
tokenizer_bpe.apply_bpe_to_dataset(
    jazz_tokens_path / 'tokens',
    jazz_tokens_path / 'tokens_bpe',
)

Loading token files: 100%|██████████| 428/428 [00:00<00:00, 3898.66it/s]
Learning byte pair encoding: 100%|██████████| 249/249 [02:13<00:00,  1.87it/s, seq_len_variation=-41.96, avg_nb_token_combs=2.04, max_nb_token_combs=3]


Mean of original lengths: 1758.9205607476636
Mean length after BPE: 1020.9322429906542
Variation from original: -41.96 %


Applying BPE to dataset: 100%|██████████| 428/428 [00:28<00:00, 14.85it/s]


We split the tokenized files into train (70%), validation (10%) and test (20%) directories.

In [None]:
# Output folder of the split. It gets its own name so the built-in `dir` is
# not shadowed for the rest of the notebook session.
jazz_dataset_dir = "dataset_jazz"

# Start from a clean folder so files from an earlier split cannot linger.
if os.path.isdir(jazz_dataset_dir):
    shutil.rmtree(jazz_dataset_dir)

# Split the contents of tokens_jazz with a fixed seed for reproducibility.
# The ratio is given in the order (train, val, test).
splitfolders.ratio(
    jazz_tokens_path,
    output=jazz_dataset_dir,
    seed=1337,
    ratio=(0.7, 0.1, 0.2),
)

Copying files: 858 files [00:00, 6531.25 files/s]


We then copy the Tokenizer configuration files to the train directory in order to use them in the other notebooks.

In [None]:
# Put each tokenizer configuration next to its training tokens.
# NOTE(review): the split reported 858 copied files (2 x 428 token files + 2), so
# each config.txt was presumably also placed in one of the splits — confirm that
# downstream loaders ignore it.
!cp /content/tokens_jazz/tokens/config.txt /content/dataset_jazz/train/tokens/
!cp /content/tokens_jazz/tokens_bpe/config.txt /content/dataset_jazz/train/tokens_bpe/

We save the zipped dataset folder to the drive.

In [None]:
# Zip the split jazz dataset and copy the archive to the shared Drive folder.
!zip -r -qq /content/dataset_jazz.zip /content/dataset_jazz
!cp -r /content/dataset_jazz.zip /content/drive/MyDrive/__Baldi_Cecchetti/datasets

# Piano Dataset

In [None]:
# Collect the piano files matching '*.mid*'; this rebinds `midi_files`, replacing the jazz list.
midi_files = glob.glob(str(piano_midi_path/'*.mid*'))
print('Number of files:', len(midi_files))

Number of files: 775


In [None]:
# Root folder for the piano token files; the cells below write 'tokens' and 'tokens_bpe' under it.
piano_tokens_path = Path('tokens_piano')

In [None]:
# Listen to one piano sample (display_audio keeps the first 30 seconds by default).
display_audio('piano_midis/003.midi')

Output hidden; open in https://colab.research.google.com to view.

## Standard Tokenization

For this dataset we also need Chords and Tempo, because the tempo of the song changes throughout its duration.

In [None]:
# Chord, rest and tempo tokens are enabled for the piano dataset.
# NOTE(review): the earlier "(quarter, 2 beats)" comment sat on the tempo_range
# line and did not match (2, 8); MidiTok's own example annotates this value as
# "(half, 8 beats)" — confirm against the installed miditok version.
additional_tokens = {'Chord': True, 
                     'Rest': True, 
                     'Tempo': True, 
                     'Program': False, 
                     'TimeSignature': False,
                     'rest_range': (2, 8),  # presumably (half, 8 beats)
                     'nb_tempos': 32,  # presumably the number of tempo bins
                     'tempo_range': (40, 250)}  # presumably (min, max) tempo

# This rebinds `tokenizer`, replacing the jazz tokenizer defined earlier.
tokenizer = TSD(pitch_range=pitch_range, 
                beat_res=beat_res, 
                nb_velocities=nb_velocities, 
                additional_tokens=additional_tokens,
                pad=False)

In [None]:
# Round trip: MIDI -> tokens -> MIDI, written to disk and played back for a listening check.
midi = MidiFile('piano_midis/003.midi')
tokens = tokenizer(midi)
# The original file's programs are passed along when decoding the tokens.
converted_back_midi = tokenizer(tokens, get_midi_programs(midi))
# Overwrites the file written by the jazz round-trip check (same output name).
converted_back_midi.dump('converted_midi.midi')
display_audio('converted_midi.midi')

Output hidden; open in https://colab.research.google.com to view.

In [None]:
# Tokenize the collected piano files into tokens_piano/tokens, using midi_valid as the filter.
tokenizer.tokenize_midi_dataset(midi_files, piano_tokens_path / 'tokens', midi_valid)

Tokenizing MIDIs (tokens_piano/tokens): 100%|██████████| 775/775 [02:03<00:00,  6.26it/s]


In [None]:
# Show the vocabulary summary of the piano tokenizer.
tokenizer.vocab

Vocabulary - 306 tokens of 7 types

## BPE Tokenization

We apply Byte Pair Encoding to the dataset.

In [None]:
# Rebuild a tokenizer from the configuration saved next to the piano token files.
tokenizer_bpe = TSD(params=piano_tokens_path / 'tokens' / 'config.txt')

# Learn a BPE vocabulary of 500 tokens from the plain piano token files
tokenizer_bpe.learn_bpe(
    tokens_path=piano_tokens_path / 'tokens',
    vocab_size=500,
    out_dir=piano_tokens_path / 'tokens_bpe',
)

# Re-encode the tokenized pieces with the learned BPE vocabulary
tokenizer_bpe.apply_bpe_to_dataset(
    piano_tokens_path / 'tokens',
    piano_tokens_path / 'tokens_bpe',
)

Loading token files: 100%|██████████| 775/775 [00:00<00:00, 1528.16it/s]
Learning byte pair encoding: 100%|██████████| 258/258 [15:10<00:00,  3.53s/it, seq_len_variation=-29.89, avg_nb_token_combs=2.17, max_nb_token_combs=3]


Mean of original lengths: 5441.8387096774195
Mean length after BPE: 3815.3767741935485
Variation from original: -29.89 %


Applying BPE to dataset: 100%|██████████| 775/775 [03:30<00:00,  3.68it/s]


We split the dataset into train (70%), validation (10%) and test (20%) sets.

In [None]:
# Output folder of the split. It gets its own name so the built-in `dir` is
# not shadowed for the rest of the notebook session.
piano_dataset_dir = "dataset_piano"

# Start from a clean folder so files from an earlier split cannot linger.
if os.path.isdir(piano_dataset_dir):
    shutil.rmtree(piano_dataset_dir)

# Split the contents of tokens_piano with a fixed seed for reproducibility.
# The ratio is given in the order (train, val, test).
splitfolders.ratio(
    piano_tokens_path,
    output=piano_dataset_dir,
    seed=1337,
    ratio=(0.7, 0.1, 0.2),
)

Copying files: 1552 files [00:00, 5356.66 files/s]


We save the datasets to the drive with the tokenizer configuration.

In [None]:
# Put each tokenizer configuration next to its training tokens so other notebooks can load it.
!cp /content/tokens_piano/tokens/config.txt /content/dataset_piano/train/tokens/
!cp /content/tokens_piano/tokens_bpe/config.txt /content/dataset_piano/train/tokens_bpe/

In [None]:
# Zip the split piano dataset and copy the archive to the shared Drive folder.
!zip -r -qq /content/dataset_piano.zip /content/dataset_piano
!cp -r /content/dataset_piano.zip /content/drive/MyDrive/__Baldi_Cecchetti/datasets