# Symbolic Music Tokenizers

This notebook shows how symbolic music tokenizers can be used to convert symbolic music data (like MIDI files) into token sequences suitable for machine learning models, and vice versa.

For MIDI and ABC files, we will use the `MidiTok` library; for MusicXML files, we will use the `linearize-musicxml` library.

In [None]:
import os
from miditok import (
    REMI,
    CPWord,
    Event,
    TokenizerConfig,
)

# from linearize_musicxml import MusicXMLTokenizer,
from pathlib import Path
from glob import glob

In [None]:
# Dataset locations: one sub-folder of the data directory per file format.
DATA_PATH = Path("../data")
MIDI_PATH = DATA_PATH.joinpath("midi")
MUSICXML_PATH = DATA_PATH.joinpath("musicxml")
ABC_PATH = DATA_PATH.joinpath("abc")

# Collect the files of each format as sorted lists of path strings.
midi_files = glob(str(MIDI_PATH.joinpath("*.mid")))
midi_files.sort()

# Only compressed MusicXML (.mxl) is collected here; the same folder also
# contains the non-compressed .musicxml versions.
musicxml_files = glob(str(MUSICXML_PATH.joinpath("*.mxl")))
musicxml_files.sort()

abc_files = glob(str(ABC_PATH.joinpath("*.abc")))
abc_files.sort()

# Report how many files of each format were discovered.
print(
    f"Found {len(midi_files)} MIDI files."
    f"\nFound {len(musicxml_files)} MusicXML files."
    f"\nFound {len(abc_files)} ABC files."
)

In [None]:
# Tokenizer settings shared by the tokenizers built in this notebook.
# Every entry is forwarded as a keyword argument to `TokenizerConfig`.
TOKENIZER_PARAMS = dict(
    pitch_range=(21, 109),
    beat_res={(0, 4): 8, (4, 12): 4},
    num_velocities=32,
    special_tokens=["PAD", "BOS", "EOS", "MASK"],
    # Optional token types: chords and tempos are enabled, the rest disabled.
    use_chords=True,
    use_rests=False,
    use_tempos=True,
    use_time_signatures=False,
    use_programs=False,
    # Tempo quantization: number of bins and the (min, max) range they cover.
    num_tempos=32,
    tempo_range=(40, 250),
)
config = TokenizerConfig(**TOKENIZER_PARAMS)

In [None]:
# Tokenize the first example MIDI file with the REMI tokenizer.
# Fail with an explicit message (instead of a bare IndexError) when the data
# folder is missing or contains no MIDI files.
if not midi_files:
    raise FileNotFoundError(f"No MIDI files (*.mid) found in {MIDI_PATH}")

remi_tokenizer = REMI(tokenizer_config=config)
remi_tokens = remi_tokenizer(midi_files[0])

# Depending on its configuration, a MidiTok tokenizer returns either a single
# TokSequence or a list of TokSequence objects (one per track). Calling len()
# or slicing directly on the list form would count/slice tracks rather than
# tokens, so normalize to a list and inspect the tokens of the first sequence.
remi_sequences = remi_tokens if isinstance(remi_tokens, list) else [remi_tokens]
print(f"Number of REMI token sequences (one per track): {len(remi_sequences)}")

if remi_sequences:
    remi_first_sequence = remi_sequences[0]
    print(f"REMI tokenized sequence length: {len(remi_first_sequence.tokens)}")
    print(f"First 20 REMI tokens: {remi_first_sequence.tokens[:20]}")
else:
    print("The MIDI file produced no token sequence (no tracks to tokenize).")