In [69]:
from typing import List
from pathlib import Path
import midifile_binding as mf 
import symusic as sm 
import miditoolkit as mtk
import pretty_midi as pm
import music21 as m21
import timeit
import os
from tqdm import tqdm
from time import sleep


In [70]:
MIDI_DATASET_NAMES = ('maestro', 'musicnet', 'POP909')
ABC_DATASET_NAMES = ('nottingham',)
DATASET_ROOT = "./symusic_benchmark_datasets"

def is_valid(f: str):
    try:
        sm.Score(f)
        return True
    except:
        return False

print("Scanning datasets to filter out invalid files...")
sleep(0.1)
MIDI_DATASET = {
    name: sorted(list(filter(
        is_valid, 
        tqdm(list(map(str, Path(DATASET_ROOT).joinpath(name).rglob('*.mid*'))), desc=name)
    )))  for name in MIDI_DATASET_NAMES
}

ABC_DATASET = {
    name: sorted(list(filter(
        is_valid, 
        tqdm(list(map(str, Path(DATASET_ROOT).joinpath(name).rglob('*.abc'))), desc=name)
    ))) for name in ABC_DATASET_NAMES
}

Scanning datasets to filter out invalid files...


maestro: 100%|██████████| 1276/1276 [00:01<00:00, 890.92it/s]
musicnet: 100%|██████████| 330/330 [00:00<00:00, 2775.41it/s]
POP909: 100%|██████████| 2898/2898 [00:01<00:00, 1843.90it/s]
nottingham: 100%|██████████| 14/14 [00:00<00:00, 45.63it/s]


In [71]:
# show file numbers and average file size(in KB) for each dataset
# show in pandas dataframe
import pandas as pd
dataset_stat = pd.DataFrame()
for name, files in MIDI_DATASET.items():
    dataset_stat.loc[name, 'format'] = 'midi'
    dataset_stat.loc[name, 'file_num'] = len(files)
    dataset_stat.loc[name, 'avg_size(KB)'] = sum(Path(f).stat().st_size for f in files) / len(files) / 1024

for name, files in ABC_DATASET.items():
    dataset_stat.loc[name, 'format'] = 'abc'
    dataset_stat.loc[name, 'file_num'] = len(files)
    dataset_stat.loc[name, 'avg_size(KB)'] = sum(Path(f).stat().st_size for f in files) / len(files) / 1024

# set file number to integer
dataset_stat['file_num'] = dataset_stat['file_num'].astype(int)
dataset_stat

Unnamed: 0,format,file_num,avg_size(KB)
maestro,midi,1276,64.187625
musicnet,midi,322,30.557414
POP909,midi,2898,13.598484
nottingham,abc,14,34.595843


In [72]:
# MAX_FILES_PER_DATASET = int(dataset_stat['file_num'].max())  
# REPEAT_TIMES = 5

MAX_FILES_PER_DATASET = 2 # for testing
REPEAT_TIMES = 2 # for testing

print(f"MAX_FILES_PER_DATASET: {MAX_FILES_PER_DATASET}")
print(f"REPEAT_TIMES: {REPEAT_TIMES}")

MAX_FILES_PER_DATASET: 2
REPEAT_TIMES: 2


In [73]:
def bench_midi(files: List[str], repeat=10):
    def bench_read(load, _files):
        for f in _files:
            load(f)

    def bench_rw(load, dump, _files):
        for f in _files:
            score = load(f)
            dump(score, './tmp')
    
    env = globals()
    env.update(locals())
    
    read_time = {
        'symusic': timeit.timeit('bench_read(sm.Score, files)', number=repeat, globals=env),
        'midifile': timeit.timeit('bench_read(mf.load, files)', number=repeat, globals=env),
        'miditoolkit': timeit.timeit('bench_read(mtk.MidiFile, files)', number=repeat, globals=env),
        'prettymidi': timeit.timeit('bench_read(pm.PrettyMIDI, files)', number=repeat, globals=env),
        'music21': timeit.timeit('bench_read(m21.converter.parse, files)', number=repeat, globals=env),
    }
    write_time = {
        'symusic': timeit.timeit('bench_rw(sm.Score, lambda x,y: x.dump_midi(y), files)', number=repeat, globals=env),
        'midifile': timeit.timeit('bench_rw(mf.load, lambda x,y: x.dump_midi(y), files)', number=repeat, globals=env),
        'miditoolkit': timeit.timeit('bench_rw(mtk.MidiFile, lambda x,y: x.dump(y), files)', number=repeat, globals=env),
        'prettymidi': timeit.timeit('bench_rw(pm.PrettyMIDI, lambda x,y: x.write(y), files)', number=repeat, globals=env),
        'music21': timeit.timeit('bench_rw(m21.converter.parse, lambda x,y: x.write("midi", y), files)', number=repeat, globals=env),
    }
    os.remove('./tmp')
    read_time = {
        k: v / repeat
        for k, v in read_time.items()
    }
    write_time = {
        k: v / repeat - read_time[k]
        for k, v in write_time.items()
    }
    return read_time, write_time
from collections import defaultdict
midi_read_benchmark = defaultdict(list)
midi_write_benchmark = defaultdict(list)

for name, files in MIDI_DATASET.items():
    print(f"benchmarking {name}...")
    read_time, write_time = bench_midi(files[:MAX_FILES_PER_DATASET], repeat=3)
    for k, v in read_time.items():
        midi_read_benchmark[k].append(v)
    for k, v in write_time.items():
        midi_write_benchmark[k].append(v)

benchmarking maestro...
benchmarking musicnet...
benchmarking POP909...
defaultdict(<class 'list'>, {'symusic': [0.0006487999999080785, 0.0011542000005041093, 0.0005798333331767935], 'midifile': [0.0021046000001661014, 0.010774399999111969, 0.001722866666871899], 'miditoolkit': [0.05599586666721734, 0.27994786666628596, 0.044650833333435], 'prettymidi': [0.05486363333329791, 0.3070209666669446, 0.05871616666627233], 'music21': [0.10975473333261714, 0.55689646666724, 0.21955876666591698]})
defaultdict(<class 'list'>, {'symusic': [0.0016698999994938881, 0.0026175333332503214, 0.001306999999845478], 'midifile': [0.0019405666656287695, 0.0024893666680630595, 0.0009777999997216587], 'miditoolkit': [0.10585419999915757, 0.3077287333338366, 0.04239503333398413], 'prettymidi': [0.12580063333249805, 0.4858014333318958, 0.05566176666737495], 'music21': [2.1206434666673886, 9.36407663333254, 3.453902733333962]})


In [74]:
midi_read_pd = pd.DataFrame(dict(midi_read_benchmark), index=MIDI_DATASET.keys())
midi_write_pd = pd.DataFrame(dict(midi_write_benchmark), index=MIDI_DATASET.keys())
print("read midi files:")
print(midi_read_pd)
print("\nwrite midi files:")
print(midi_write_pd)

read midi files:
           symusic  midifile  miditoolkit  prettymidi   music21
maestro   0.000649  0.002105     0.055996    0.054864  0.109755
musicnet  0.001154  0.010774     0.279948    0.307021  0.556896
POP909    0.000580  0.001723     0.044651    0.058716  0.219559

write midi files:
           symusic  midifile  miditoolkit  prettymidi   music21
maestro   0.001670  0.001941     0.105854    0.125801  2.120643
musicnet  0.002618  0.002489     0.307729    0.485801  9.364077
POP909    0.001307  0.000978     0.042395    0.055662  3.453903


In [75]:
def bench_abc(files: List[str], repeat=10):
    def bench_read(load, _files):
        for f in _files:
            load(f)

    def bench_rw(load, dump, _files):
        for f in _files:
            score = load(f)
            dump(score, './tmp')
    
    env = globals()
    env.update(locals())
    
    read_time = {
        'symusic': timeit.timeit('bench_read(sm.Score, files)', number=repeat, globals=env),
        'music21': timeit.timeit('bench_read(m21.converter.parse, files)', number=repeat, globals=env),
    }
    write_time = {
        'symusic': timeit.timeit('bench_rw(sm.Score, lambda x,y: x.dump_abc(y), files)', number=repeat, globals=env),
        # music21 for NA
        'music21': float('nan'),
    }
    os.remove('./tmp')

    read_time = {
        k: v / repeat
        for k, v in read_time.items()
    }
    write_time = {
        k: v / repeat - read_time[k]
        for k, v in write_time.items()
    }
    return read_time, write_time

abc_read_benchmark = defaultdict(list)
abc_write_benchmark = defaultdict(list)

for name, files in ABC_DATASET.items():
    print(f"benchmarking {name}...")
    read_time, write_time = bench_abc(files[:MAX_FILES_PER_DATASET], repeat=3)
    for k, v in read_time.items():
        abc_read_benchmark[k].append(v)
    for k, v in write_time.items():
        abc_write_benchmark[k].append(v)


benchmarking nottingham...
defaultdict(<class 'list'>, {'symusic': [0.035935033333468404], 'music21': [0.7204875666660276]})
defaultdict(<class 'list'>, {'symusic': [0.03777630000089024], 'music21': [nan]})


In [76]:
abc_read_pd = pd.DataFrame(dict(abc_read_benchmark), index=ABC_DATASET.keys())
abc_write_pd = pd.DataFrame(dict(abc_write_benchmark), index=ABC_DATASET.keys())
print("read abc files:")
print(abc_read_pd)
print("\nwrite abc files:")
print(abc_write_pd)

read abc files:
             symusic   music21
nottingham  0.035935  0.720488

write abc files:
             symusic  music21
nottingham  0.037776      NaN
