In [20]:
from typing import List
from pathlib import Path
import midifile_binding as mf 
import symusic as sm 
import miditoolkit as mtk
import pretty_midi as pm
import music21 as m21
import timeit
import os
from tqdm import tqdm
from time import sleep


In [21]:
MIDI_DATASET_NAMES = ('maestro', 'musicnet', 'POP909')
ABC_DATASET_NAMES = ('nottingham',)
DATASET_ROOT = "./symusic_benchmark_datasets"

def is_valid(f: str):
    try:
        sm.Score(f)
        return True
    except:
        return False

print("Scanning datasets to filter out invalid files...")
sleep(0.1)
MIDI_DATASET = {
    name: sorted(list(filter(
        is_valid, 
        tqdm(list(map(str, Path(DATASET_ROOT).joinpath(name).rglob('*.mid*'))), desc=name)
    )))  for name in MIDI_DATASET_NAMES
}

ABC_DATASET = {
    name: sorted(list(filter(
        is_valid, 
        tqdm(list(map(str, Path(DATASET_ROOT).joinpath(name).rglob('*.abc'))), desc=name)
    ))) for name in ABC_DATASET_NAMES
}

Scanning datasets to filter out invalid files...


maestro: 100%|██████████| 1276/1276 [00:00<00:00, 1746.27it/s]
musicnet: 100%|██████████| 330/330 [00:00<00:00, 3237.00it/s]
POP909: 100%|██████████| 2898/2898 [00:00<00:00, 4048.59it/s]
nottingham: 100%|██████████| 14/14 [00:00<00:00, 49.06it/s]


In [22]:
# show file numbers and average file size(in KB) for each dataset
# show in pandas dataframe
import pandas as pd
dataset_stat = pd.DataFrame()
for name, files in MIDI_DATASET.items():
    dataset_stat.loc[name, 'format'] = 'midi'
    dataset_stat.loc[name, 'file_num'] = len(files)
    dataset_stat.loc[name, 'avg_size(KB)'] = sum(Path(f).stat().st_size for f in files) / len(files) / 1024

for name, files in ABC_DATASET.items():
    dataset_stat.loc[name, 'format'] = 'abc'
    dataset_stat.loc[name, 'file_num'] = len(files)
    dataset_stat.loc[name, 'avg_size(KB)'] = sum(Path(f).stat().st_size for f in files) / len(files) / 1024

# set file number to integer
dataset_stat['file_num'] = dataset_stat['file_num'].astype(int)
dataset_stat

Unnamed: 0,format,file_num,avg_size(KB)
maestro,midi,1276,64.187625
musicnet,midi,322,30.557414
POP909,midi,2898,13.598484
nottingham,abc,14,34.595843


In [68]:
MAX_FILES_PER_DATASET = int(dataset_stat['file_num'].max())  
REPEAT_TIMES = 5

# MAX_FILES_PER_DATASET = 2 # for testing
# REPEAT_TIMES = 2 # for testing

print(f"MAX_FILES_PER_DATASET: {MAX_FILES_PER_DATASET}")
print(f"REPEAT_TIMES: {REPEAT_TIMES}")

MAX_FILES_PER_DATASET: 2898
REPEAT_TIMES: 5


In [50]:
def bench_midi(files: List[str], repeat=10):
    def bench_read(load, _files):
        for f in _files:
            load(f)

    def bench_rw(load, dump, _files):
        for f in _files:
            score = load(f)
            dump(score, './tmp')
    
    env = globals()
    env.update(locals())
    
    read_time = {
        'symusic': timeit.timeit('bench_read(sm.Score, files)', number=repeat, globals=env),
        'midifile': timeit.timeit('bench_read(mf.load, files)', number=repeat, globals=env),
        'miditoolkit': timeit.timeit('bench_read(mtk.MidiFile, files)', number=repeat, globals=env),
        'prettymidi': timeit.timeit('bench_read(pm.PrettyMIDI, files)', number=repeat, globals=env),
        'music21': timeit.timeit('bench_read(m21.converter.parse, files)', number=repeat, globals=env),
    }
    write_time = {
        'symusic': timeit.timeit('bench_rw(sm.Score, lambda x,y: x.dump_midi(y), files)', number=repeat, globals=env),
        'midifile': timeit.timeit('bench_rw(mf.load, lambda x,y: x.dump_midi(y), files)', number=repeat, globals=env),
        'miditoolkit': timeit.timeit('bench_rw(mtk.MidiFile, lambda x,y: x.dump(y), files)', number=repeat, globals=env),
        'prettymidi': timeit.timeit('bench_rw(pm.PrettyMIDI, lambda x,y: x.write(y), files)', number=repeat, globals=env),
        'music21': timeit.timeit('bench_rw(m21.converter.parse, lambda x,y: x.write("midi", y), files)', number=repeat, globals=env),
    }
    os.remove('./tmp')
    read_time = {
        k: v / repeat
        for k, v in read_time.items()
    }
    write_time = {
        k: v / repeat - read_time[k]
        for k, v in write_time.items()
    }
    return read_time, write_time
from collections import defaultdict
midi_read_benchmark = defaultdict(list)
midi_write_benchmark = defaultdict(list)

for name, files in MIDI_DATASET.items():
    print(f"benchmarking {name}...")
    read_time, write_time = bench_midi(files[:MAX_FILES_PER_DATASET], repeat=3)
    for k, v in read_time.items():
        midi_read_benchmark[k].append(v)
    for k, v in write_time.items():
        midi_write_benchmark[k].append(v)

print(midi_read_benchmark)
print(midi_write_benchmark)

benchmarking maestro...
benchmarking musicnet...
benchmarking POP909...
defaultdict(<class 'list'>, {'symusic': [0.0010693999999299801, 0.001066233333403943, 0.0012448333339610447], 'midifile': [0.001969166666337211, 0.010396899999856638, 0.0016550666672022392], 'miditoolkit': [0.057998033333812295, 0.2578369999998055, 0.03902103333408983], 'prettymidi': [0.057695866666714814, 0.2720793666667305, 0.044580600000093305], 'music21': [0.42597036666726734, 0.5057103666670931, 0.7620659333321479]})
defaultdict(<class 'list'>, {'symusic': [0.0008527000002989855, 0.002921166667268456, 0.0009103666658726677], 'midifile': [0.0010294000009404653, 0.004542199999074608, 0.0019098666671197864], 'miditoolkit': [0.06206126666681181, 0.28408510000008386, 0.06564246666554632], 'prettymidi': [0.09847189999951905, 0.454017066667196, 0.08055553333421508], 'music21': [1.837374733332884, 9.21867523333276, 2.909512000001996]})


In [63]:
midi_read_pd = pd.DataFrame(dict(midi_read_benchmark), index=MIDI_DATASET.keys())
midi_write_pd = pd.DataFrame(dict(midi_write_benchmark), index=MIDI_DATASET.keys())
print("read midi files:")
print(midi_read_pd)
print("\nwrite midi files:")
print(midi_write_pd)

read midi files:
           symusic  midifile  miditoolkit  prettymidi   music21
maestro   0.001069  0.001969     0.057998    0.057696  0.425970
musicnet  0.001066  0.010397     0.257837    0.272079  0.505710
POP909    0.001245  0.001655     0.039021    0.044581  0.762066

write midi files:
           symusic  midifile  miditoolkit  prettymidi   music21
maestro   0.000853  0.001029     0.062061    0.098472  1.837375
musicnet  0.002921  0.004542     0.284085    0.454017  9.218675
POP909    0.000910  0.001910     0.065642    0.080556  2.909512


In [64]:
def bench_abc(files: List[str], repeat=10):
    def bench_read(load, _files):
        for f in _files:
            load(f)

    def bench_rw(load, dump, _files):
        for f in _files:
            score = load(f)
            dump(score, './tmp')
    
    env = globals()
    env.update(locals())
    
    read_time = {
        'symusic': timeit.timeit('bench_read(sm.Score, files)', number=repeat, globals=env),
        'music21': timeit.timeit('bench_read(m21.converter.parse, files)', number=repeat, globals=env),
    }
    write_time = {
        'symusic': timeit.timeit('bench_rw(sm.Score, lambda x,y: x.dump_abc(y), files)', number=repeat, globals=env),
        # music21 for NA
        'music21': float('nan'),
    }
    os.remove('./tmp')

    read_time = {
        k: v / repeat
        for k, v in read_time.items()
    }
    write_time = {
        k: v / repeat - read_time[k]
        for k, v in write_time.items()
    }
    return read_time, write_time

abc_read_benchmark = defaultdict(list)
abc_write_benchmark = defaultdict(list)

for name, files in ABC_DATASET.items():
    print(f"benchmarking {name}...")
    read_time, write_time = bench_abc(files[:MAX_FILES_PER_DATASET], repeat=3)
    for k, v in read_time.items():
        abc_read_benchmark[k].append(v)
    for k, v in write_time.items():
        abc_write_benchmark[k].append(v)

print(abc_read_benchmark)
print(abc_write_benchmark)


benchmarking nottingham...
defaultdict(<class 'list'>, {'symusic': [0.051845133332487116], 'music21': [3.208246866666741]})
defaultdict(<class 'list'>, {'symusic': [0.074953733333435], 'music21': [nan]})


In [65]:
abc_read_pd = pd.DataFrame(dict(abc_read_benchmark), index=ABC_DATASET.keys())
abc_write_pd = pd.DataFrame(dict(abc_write_benchmark), index=ABC_DATASET.keys())
print("read abc files:")
print(abc_read_pd)
print("\nwrite abc files:")
print(abc_write_pd)

read abc files:
             symusic   music21
nottingham  0.051845  3.208247

write abc files:
             symusic  music21
nottingham  0.074954      NaN
