In [None]:
from pathlib import Path
from random import shuffle
from evaluate import load as load_metric
from miditok import REMI, TokenizerConfig
from miditok.pytorch_data import DatasetMIDI, DataCollator, split_midis_for_training
from miditok.data_augmentation import augment_midi_dataset

In [None]:
# Rebuild the REMI tokenizer from the parameters saved in "tokenizer.json"
# (path is relative to the current working directory).
tokenizer_params_path = Path("tokenizer.json")
tokenizer = REMI(params=tokenizer_params_path)

In [None]:
from random import Random  # private RNG: seeding it does not disturb the global one

# Fixed seed so that re-running this cell always yields the same split. The
# subset directories are keyed only by subset name ("Maestro_train", ...), so
# a different shuffle on a later run could leave chunks of one piece in two
# subsets at once.
SPLIT_SEED = 777


def split_train_valid_test(paths, valid_ratio=0.15, test_ratio=0.15, seed=SPLIT_SEED):
    """Partition *paths* into reproducible train/valid/test lists.

    The paths are sorted before being shuffled with a dedicated, seeded RNG,
    so the result depends neither on the order in which the file system
    listed the files nor on the state of the global ``random`` module.

    Args:
        paths: iterable of sortable items (file paths) to distribute.
        valid_ratio: fraction of the items assigned to the validation set.
        test_ratio: fraction of the items assigned to the test set.
        seed: seed of the private random generator used for shuffling.

    Returns:
        A ``(train, valid, test)`` tuple of lists. The three lists are
        disjoint and together contain every input item exactly once.
    """
    shuffled = sorted(paths)
    Random(seed).shuffle(shuffled)

    num_valid = round(len(shuffled) * valid_ratio)
    num_test = round(len(shuffled) * test_ratio)
    valid = shuffled[:num_valid]
    test = shuffled[num_valid:num_valid + num_test]
    train = shuffled[num_valid + num_test:]
    return train, valid, test


# Sorted so that the listing itself is deterministic across runs/machines.
midi_paths = sorted(Path("maestro").glob("**/*.midi"))

# Split MIDI paths into train/valid/test sets (70% / 15% / 15%)
midi_paths_train, midi_paths_valid, midi_paths_test = split_train_valid_test(midi_paths)
total_num_files = len(midi_paths)
num_files_valid = len(midi_paths_valid)
num_files_test = len(midi_paths_test)

# Each subset gets its own output directory, named "Maestro_<subset>".
paths_per_subset = {
    "train": midi_paths_train,
    "valid": midi_paths_valid,
    "test": midi_paths_test,
}

for split_label, split_paths in paths_per_subset.items():
    chunks_dir = Path(f"Maestro_{split_label}")

    # Split the MIDIs into chunks with a maximum sequence length of 1024
    # tokens; `num_overlap_bars=2` presumably makes consecutive chunks share
    # two bars (verify against the installed miditok version).
    split_midis_for_training(
        files_paths=split_paths,
        tokenizer=tokenizer,
        save_dir=chunks_dir,
        max_seq_len=1024,
        num_overlap_bars=2,
    )

    # Data augmentation of the chunks just written, with pitch, velocity and
    # duration offsets. It is applied to every subset, including valid/test.
    # NOTE(review): no output path is given, so the augmented files are
    # expected to land next to the chunks in `chunks_dir` (confirm).
    augment_midi_dataset(
        chunks_dir,
        pitch_offsets=[-12, 12],
        velocity_offsets=[-4, 4],
        duration_offsets=[-0.5, 0.5],
    )