### Create CSV from midi sources

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import glob
import os
from tqdm import tqdm
from IPython.display import Image, Audio
import traceback

In [3]:
# parallel
from functools import partial
from pathlib import Path

In [4]:
from data_sources import *
from midi_data import *

In [5]:
version = 'v4'
data_path = Path('data/midi')
version_path = data_path/version
orig_path = version_path/'midi_sources'
metapath = version_path/'metadata'
all_csv = metapath/'midi_sources.csv'
metapath.mkdir(parents=True, exist_ok=True)

In [6]:
def create_paths(dirname):
    "Standardize midi_source paths"
    dir_path = orig_path/dirname
    json_path = metapath/f'{dirname}_metadata.json'
    csv_path = metapath/f'{dirname}_metadata.csv'
    return dir_path, json_path, csv_path

In [7]:
sources = ['hooktheory', 'freemidi', 'midiworld', 'ecomp', 'cprato', 'classic_piano', 'wikifonia']

In [8]:
version_path.relative_to(data_path)

PosixPath('v4')

In [9]:
def relative_path(filepath):
    return str(Path(filepath).relative_to(version_path))

### Remove corrupted file - this causes deadlock with music21 processing

In [10]:
corrupted_files = [
    'midiworld/named_midi/NITRO_BRO_-_IT_WONT_DIE.mid',
    'ecomp/2004/MORET02.mid',
    'ecomp/2006/Mordvinov9.MID',
    'ecomp/2006/Na06.MID',
    'ecomp/2008/Cui01.MID',
    'ecomp/2008/Cui02.MID',
    'ecomp/2008/Cui03.MID',
    'ecomp/2008/Cui04.MID',
    'ecomp/2008/Cui05.MID',
    'ecomp/2008/Cui06.MID',
    'ecomp/2008/Cui07.MID',
    'ecomp/2008/Cui08.MID',
    'ecomp/2008/Tan01.MID',
    'ecomp/2008/Tan02.MID',
    'ecomp/2008/Tan03.MID',
    'ecomp/2018/KaszoS14.MID',
    'midiworld/named_midi/Rob_Zombie_-_Demonoid_Phenomenon.mid',
    'midiworld/named_midi/Rob_Zombie_-_Demonoid_Phenomenon.mxl',
]
for f in corrupted_files:
    fp = orig_path/f
    if fp.exists(): fp.unlink()

### Hooktheory

In [11]:
ht_path, ht_json, ht_csv = create_paths('hooktheory')
ht_song_list = metapath/'hooktheory_key2info.json'

In [12]:
def song_key(s): return '_'.join(s.parts[-3:-1])

In [14]:
ht_key2info = load_json(ht_song_list)

if ht_key2info is None:
    song_info = list((ht_path/'xml').glob('*/*/*/*.json'))
    ht_key2info = {song_key(s):json.load(open(s, 'r')) for s in song_info}
    save_json(ht_key2info, ht_song_list)
len(ht_key2info)

11873

In [16]:
ht_midi_list = list((ht_path/'pianoroll').glob('*/*/*/*_key.mid')); 
len(ht_midi_list)

19876

In [23]:
def get_ht_jsonfile(midi_file): # using json instead of midi for metadata
    return str(midi_file.with_suffix('.json')).replace('pianoroll', 'event').replace('_key', '_symbol_key')

In [24]:
def get_hooktheory_attr(fp):
    song_info = ht_key2info[song_key(fp)]
    song_json = json.load(open(get_ht_jsonfile(fp), 'r'))
    metadata = song_json['metadata']
    artist = fp.parts[-3]
    title = fp.parts[-2]
    section = fp.name.split('_')[0]
    
    ht_key = metadata['key']
    ht_mode = metadata['mode']
    if ht_mode is None: ht_mode = 'major'
    ht_offset = keyc_offset(ht_key, ht_mode)
    
    # convert stream here
    return {
        'artist': artist,
        'title': title,
        'midi': relative_path(fp),
        'section': section,
        'parts': song_info['section'],
        'song_url': song_info['song_url'],
        'genres': song_info['genres'],
        'midi_title': metadata['title'],
        'source': 'hooktheory',
        'ht_bpm': metadata['BPM'],
        'ht_mode': metadata['mode'],
        'ht_key': metadata['key'],
        'ht_offset': ht_offset,
        'ht_time_signature': metadata['beats_in_measure']
    }

In [25]:
# sanity check
# hook_out = get_hooktheory_attr(song_json[1000]); hook_out

In [26]:
ht_metadata = parse_midi_dir(ht_midi_list, ht_json, base_path=version_path, 
                             meta_func=get_hooktheory_attr)

Midi Exeption: data/midi/v4/midi_sources/hooktheory/pianoroll/w/weezer/beverly-hills/intro-and-verse_key.mid list index out of range


Midi Exeption: data/midi/v4/midi_sources/hooktheory/pianoroll/w/wolfgang-amadeus-mozart/symphony-no-25-in-g-minor/intro-and-verse_key.mid list index out of range
Midi Exeption: data/midi/v4/midi_sources/hooktheory/pianoroll/y/ylvis/the-fox---what-does-the-fox-say/verse-and-pre-chorus_key.mid list index out of range
Midi Exeption: data/midi/v4/midi_sources/hooktheory/pianoroll/j/jay-z/so-ambitious-feat-pharrel-williams/intro_key.mid list index out of range
Midi Exeption: data/midi/v4/midi_sources/hooktheory/pianoroll/j/johnny-cash/folsom-prison-blues/verse_key.mid list index out of range
Midi Exeption: data/midi/v4/midi_sources/hooktheory/pianoroll/j/jack-johnson/do-you-remember/intro-and-verse_key.mid list index out of range
Midi Exeption: data/midi/v4/midi_sources/hooktheory/pianoroll/j/justin-moore/if-heaven-wasnt-so-far-awat/intro-and-verse_key.mid list index out of range
Midi Exeption: data/midi/v4/midi_sources/hooktheory/pianoroll/j/john-cage/433/solo_key.mid list index out of ran

Midi Exeption: data/midi/v4/midi_sources/hooktheory/pianoroll/q/queen/bohemian-rhapsody/instrumental_key.mid list index out of range
Midi Exeption: data/midi/v4/midi_sources/hooktheory/pianoroll/a/adventure-club/wonder/bridge_key.mid list index out of range
Midi Exeption: data/midi/v4/midi_sources/hooktheory/pianoroll/a/antonio-vivaldi/the-four-seasons-concerto-no-4-winter/intro_key.mid list index out of range
Midi Exeption: data/midi/v4/midi_sources/hooktheory/pianoroll/a/adventure-club/wonder/outro_key.mid list index out of range
Midi Exeption: data/midi/v4/midi_sources/hooktheory/pianoroll/a/adelina-tahiri/heart-on-fire/intro_key.mid list index out of range
Midi Exeption: data/midi/v4/midi_sources/hooktheory/pianoroll/a/antonim/melancholy-soldier/bridge_key.mid list index out of range
Midi Exeption: data/midi/v4/midi_sources/hooktheory/pianoroll/a/asf/asdfas/intro-and-verse_key.mid list index out of range


#### Save song_list

In [27]:
ht_metadata = load_json(ht_json)
arr2csv(ht_metadata.values(), ht_csv)
df = pd.read_csv(ht_csv); df.head()

Unnamed: 0,genres,ht_offset,source,song_url,parts,seconds,inferred_offset,title,time_signature,ht_key,...,ht_time_signature,section,instruments,ht_mode,midi_title,ht_bpm,bpm,midi,inferred_key,quarter_length
0,,0,hooktheory,https://www.hooktheory.com/theorytab/view/wayn...,"intro,chorus",25.411765,0.0,yu-gi-oh-theme-song,3/4,C,...,3,intro,Piano,1.0,yu-gi-oh,85,85.0,midi_sources/hooktheory/pianoroll/w/wayne-shar...,C major,36.0
1,Holiday,-1,hooktheory,https://www.hooktheory.com/theorytab/view/wham...,"intro,verse,chorus",17.777778,-1.0,last-christmas,4/4,Db,...,4,chorus,"Piano,Piano",1.0,Last Christmas Chorus,108,108.0,midi_sources/hooktheory/pianoroll/w/wham/last-...,B- minor,32.0
2,,6,hooktheory,https://www.hooktheory.com/theorytab/view/wolf...,verse,15.11811,1.0,love-and-war,4/4,G#,...,4,verse,"Piano,Piano",2.0,wolfgang gartner love and war,127,127.0,midi_sources/hooktheory/pianoroll/w/wolfgang-g...,G# minor,32.0
3,Electronic,3,hooktheory,https://www.hooktheory.com/theorytab/view/wood...,"intro,bridge",12.972973,-2.0,rainbow-factory,4/4,F#,...,4,intro,"Piano,Piano",6.0,Rainbow Factory,148,148.0,midi_sources/hooktheory/pianoroll/w/woodentoas...,D major,32.0
4,,8,hooktheory,https://www.hooktheory.com/theorytab/view/wolf...,pre-chorus,15.0,1.0,space-junk,4/4,F#,...,4,pre-chorus,"Piano,Piano",2.0,Space Junk,128,128.0,midi_sources/hooktheory/pianoroll/w/wolfgang-g...,B major,32.0


In [28]:
df.shape

(19876, 21)

## FreeMidi

In [29]:
fm_path, _, fm_csv = create_paths('freemidi')
fm_dance_path = metapath/f'freemidi_dance_metadata.json'
fm_pop_path = metapath/f'freemidi_pop_metadata.json'
list(fm_path.glob('*'))

[PosixPath('data/midi/v4/midi_sources/freemidi/genre-disco'),
 PosixPath('data/midi/v4/midi_sources/freemidi/genre-pop'),
 PosixPath('data/midi/v4/midi_sources/freemidi/genre-dance-eletric'),
 PosixPath('data/midi/v4/midi_sources/freemidi/genre-punk'),
 PosixPath('data/midi/v4/midi_sources/freemidi/genre-hip-hop-rap'),
 PosixPath('data/midi/v4/midi_sources/freemidi/genre-rock')]

In [30]:
def parse_freemidi_songs(fp, genres=None, source=None):
    name = fp.with_suffix('').name.split(' - ')
    artist = name[0]
    title = name[-1]
    return {
        'artist': artist.strip(),
        'title': title.strip(),
        'midi': relative_path(fp),
        'genres': genres,
        'source': source
    }

In [31]:
d_parse_func = partial(parse_freemidi_songs, genres='dance', source='freemidi')
dir_path = fm_path/'genre-dance-eletric'
file_list = get_files(dir_path, extensions=['.mid'], recurse=True)
fm_dance_list = parse_midi_dir(file_list, fm_dance_path,
                               base_path=version_path, meta_func=d_parse_func)

  warn("Your generator is empty.")


In [32]:
p_parse_func = partial(parse_freemidi_songs, genres='pop', source='freemidi')
dir_path = fm_path/'genre-pop'
file_list = get_files(dir_path, extensions=['.mid'], recurse=True)
fm_pop_list = parse_midi_dir(file_list, fm_pop_path,
                             base_path=version_path, meta_func=p_parse_func)

In [33]:
fmd = load_json(fm_dance_path)
fmp = load_json(fm_pop_path)
fm_all = list(fmd.values())+list(fmp.values())
arr2csv(fm_all, fm_csv)
df = pd.read_csv(fm_csv); df.head()

Unnamed: 0,artist,genres,seconds,inferred_offset,source,title,bpm,midi,time_signature,instruments,inferred_key,quarter_length
0,Gloria Estefan,dance,,,freemidi,I'm Not Givin' You Up,,midi_sources/freemidi/genre-dance-eletric/Glor...,,,,
1,Radiohead,dance,85.263158,2.0,freemidi,Fitter Happier,76.0,midi_sources/freemidi/genre-dance-eletric/Radi...,4/4,"Piano,Piano",G minor,108.0
2,Tune Up,dance,28.732394,-5.0,freemidi,Bounce,142.0,midi_sources/freemidi/genre-dance-eletric/Tune...,,,F major,68.0
3,Daft Punk,dance,40.0,-4.0,freemidi,The Grid,102.0,midi_sources/freemidi/genre-dance-eletric/Daft...,4/4,"StringInstrument,StringInstrument,StringInstru...",C# minor,68.0
4,Bjork,dance,,3.0,freemidi,Glora,65.0,midi_sources/freemidi/genre-dance-eletric/Bjor...,4/4,"Flute,Flute,Flute,Flute",A major,613/6


### Gather Cprato

In [34]:
cp_path, cp_json, cp_csv = create_paths('cprato')
list(cp_path.glob('*'))[:5]

[PosixPath('data/midi/v4/midi_sources/cprato/Basto - Again And Again (midi By Carlo Prato) (www.cprato.com).mid'),
 PosixPath('data/midi/v4/midi_sources/cprato/The Weeknd ft. Lana Del Rey - Stargirl Interlude  (midi by Carlo Prato) (www.cprato.com).mid'),
 PosixPath('data/midi/v4/midi_sources/cprato/Two Steps From Hell - Magic of Love  (midi by Carlo Prato) (www.cprato.com).mid'),
 PosixPath('data/midi/v4/midi_sources/cprato/Bermuda Loverz - My Girl (Ladidada) (Rimini Rockaz Radio Edit) (Midi By Carlo Prato) (www.cprato.com).mid'),
 PosixPath('data/midi/v4/midi_sources/cprato/Cascada - Everytime We Touch (Midi By Carlo Prato) (www.cprato.com).mid')]

In [35]:
def parse_cprato_songs(fp, genres=None, source=None):
    name = fp.with_suffix('').name.split(' - ')
    artist = name[0]
    title = name[-1].replace('(midi by Carlo Prato) (www.cprato.com)', '')
    return {
        'artist': artist.strip(),
        'title': title.strip(),
        'midi': relative_path(fp),
        'genres': genres,
        'source': source
    }

In [36]:
cp_meta = partial(parse_cprato_songs, genres='EDM,inferred', source='cprato')
file_list = get_files(cp_path, extensions=['.mid'], recurse=True)
cp_md = parse_midi_dir(file_list, cp_json, 
                       base_path=version_path, meta_func=cp_meta)

In [37]:
cp = load_json(cp_json)
arr2csv(cp.values(), cp_csv)
df = pd.read_csv(cp_csv); df.head()

Unnamed: 0,artist,genres,seconds,inferred_offset,source,title,bpm,midi,time_signature,instruments,inferred_key,quarter_length
0,Nora En Pure,"EDM,inferred",51.147541,4,cprato,Morning Dew (Original Mix),122.0,midi_sources/cprato/Nora En Pure - Morning Dew...,4/4,"Piano,Piano,Piano,Piano,Piano,Piano",F minor,104.0
1,deadmau5,"EDM,inferred",35.478261,-1,cprato,Cat Thruster,115.0,midi_sources/cprato/deadmau5 - Cat Thruster (...,4/4,"Fretless Bass,Fretless Bass,Fretless Bass,Pian...",B- minor,68.0
2,Basshunter,"EDM,inferred",29.142857,-5,cprato,So Near So Close (Midi By Carlo Prato) (www.cp...,140.0,midi_sources/cprato/Basshunter - So Near So Cl...,,,F major,68.0
3,The Hitmen,"EDM,inferred",29.142857,-5,cprato,Bass Up,140.0,midi_sources/cprato/The Hitmen - Bass Up (mid...,,,F major,68.0
4,Cascada,"EDM,inferred",30.857143,4,cprato,Everytime We Touch (Midi By Carlo Prato) (www....,140.0,midi_sources/cprato/Cascada - Everytime We Tou...,,,A- major,72.0


### Gather MidiWorld

In [38]:
mw_path, mw_json, mw_csv = create_paths('midiworld')

In [39]:
def parse_midiworld_songs(fp):
    name = fp.with_suffix('').name.replace('_', ' ').split(' - ')
    artist = name[0]
    title = name[-1]
    return {
        'artist': artist.strip(),
        'title': title.strip(),
        'midi': relative_path(fp),
        'genres': 'pop,inferred',
        'source': 'midiworld'
    }

In [40]:
file_list = get_files(mw_path/'named_midi', extensions=['.mid'], recurse=True)
mw_md = parse_midi_dir(file_list, mw_json, base_path=version_path, meta_func=parse_midiworld_songs)

In [41]:
mw = load_json(mw_json)
arr2csv(mw.values(), mw_csv)
df = pd.read_csv(mw_csv); df.head()

Unnamed: 0,artist,genres,seconds,inferred_offset,source,title,bpm,midi,time_signature,instruments,inferred_key,quarter_length
0,Kona,"pop,inferred",1.193182,-2.0,midiworld,Drumloop,176.0,midi_sources/midiworld/named_midi/Kona_-_Druml...,4/4,Piano,B minor,3.5
1,Cameron Lee Simpson,"pop,inferred",2.526316,-5.0,midiworld,Cue,95.0,midi_sources/midiworld/named_midi/Cameron_Lee_...,4/4,Piano,D minor,4.0
2,TV Themes,"pop,inferred",19.25,0.0,midiworld,Looney Tunes,160.0,midi_sources/midiworld/named_midi/TV_Themes_-_...,4/4,"Marimba,Acoustic Bass,Trombone,StringInstrumen...",C major,154/3
3,TV Themes,"pop,inferred",41.929134,3.0,midiworld,Millenium,127.0,midi_sources/midiworld/named_midi/TV_Themes_-_...,4/4,"Timpani,Taiko,Violin,Contrabass,Voice",A major,88.75
4,Kaito,"pop,inferred",67.916667,-4.0,midiworld,Cantarella,144.0,midi_sources/midiworld/named_midi/Kaito_-_Cant...,2/4,Piano,C# minor,163.0


### Gather Wikifonia

In [42]:
wf_path, wf_json, wf_csv = create_paths('wikifonia')

In [43]:
def parse_wikifonia_songs(fp):
    name = fp.with_suffix('').name.replace('_', ' ').split(' - ')
    artist = name[0]
    title = name[-1]
    return {
        'artist': artist.strip(),
        'title': title.strip(),
        'mxl': relative_path(fp),
        'genres': 'pop,inferred',
        'source': 'wikifonia'
    }

In [44]:
# Warning: if you get a deadlock: 
# PosixPath('data/midi/midi_sources/midiworld/named_midi/NITRO_BRO_-_IT_WONT_DIE.mid') is broken
file_list = get_files(wf_path, extensions=['.mxl'], recurse=True)
wf_md = parse_midi_dir(file_list, wf_json, base_path=version_path, meta_func=parse_wikifonia_songs)

In [45]:
wf = load_json(wf_json)
arr2csv(wf.values(), wf_csv)
df = pd.read_csv(wf_csv); df.head()

Unnamed: 0,artist,genres,seconds,inferred_offset,source,title,bpm,midi,time_signature,mxl,instruments,inferred_key,quarter_length
0,"Arthur Siegel, June Carroll","pop,inferred",,0.0,wikifonia,Love Is A Simple Thing,,"midi_sources/from_mxl/wikifonia/Arthur Siegel,...",4/4,"midi_sources/wikifonia/Arthur Siegel, June Car...",Voice,C major,76.0
1,"Nacio Herb Brown, Arthur Freed","pop,inferred",,-3.0,wikifonia,Broadway Melody,,midi_sources/from_mxl/wikifonia/Nacio Herb Bro...,2/2,"midi_sources/wikifonia/Nacio Herb Brown, Arthu...",,C minor,132.0
2,"Hans Leo Hassler, From the Latin","pop,inferred",,0.0,wikifonia,O Sacred Head Now Wounded,,midi_sources/from_mxl/wikifonia/Hans Leo Hassl...,4/4,"midi_sources/wikifonia/Hans Leo Hassler, From ...",,A minor,64.0
3,Amanda McBroom,"pop,inferred",,0.0,wikifonia,The Rose,,midi_sources/from_mxl/wikifonia/Amanda McBroom...,4/4,midi_sources/wikifonia/Amanda McBroom - The Ro...,,C major,148.0
4,Jimmy McHugh,"pop,inferred",,0.0,wikifonia,Sunny Side Of The Street,,midi_sources/from_mxl/wikifonia/Jimmy McHugh -...,4/4,midi_sources/wikifonia/Jimmy McHugh - Sunny Si...,Choir Aahs,C major,129.0


### Yamaha - piano

In [46]:
ec_path, ec_json, ec_csv = create_paths('ecomp')
ec_song = ec_path/'song_list.json'
list(ec_path.glob('*'))[:5]

[PosixPath('data/midi/v4/midi_sources/ecomp/song_list.json'),
 PosixPath('data/midi/v4/midi_sources/ecomp/2017'),
 PosixPath('data/midi/v4/midi_sources/ecomp/2008'),
 PosixPath('data/midi/v4/midi_sources/ecomp/2006'),
 PosixPath('data/midi/v4/midi_sources/ecomp/2004')]

In [47]:
ec_songs = load_json(ec_path/'song_list.json')

In [48]:
def parse_ecomp_songs(fp):
    song_info = ec_songs[fp.stem]
    return {
        'artist': song_info['artist'],
        'title': song_info['title'],
        'mxl': relative_path(fp),
        'genres': 'classical',
        'source': 'ecomp'
    }

In [49]:
file_list = get_files(ec_path, extensions=['.mxl'], recurse=True)

In [50]:
sorted_files = sorted([fp.stat().st_size/1000 for fp in file_list])[-200:]

In [51]:
file_list = [fp for fp in file_list if fp.stat().st_size/1000 < 400] # over 200mb takes crazy long to analyze
ec_md = parse_midi_dir(file_list, ec_json, base_path=version_path, meta_func=parse_ecomp_songs)

In [52]:
ec = load_json(ec_json)
arr2csv(ec.values(), ec_csv)
df = pd.read_csv(ec_csv); df.head()

Unnamed: 0,artist,genres,seconds,inferred_offset,source,title,bpm,midi,time_signature,mxl,instruments,inferred_key,quarter_length
0,,classical,123.130435,3,ecomp,II. Allegro con moto,115.0,midi_sources/from_mxl/ecomp/2017/SunY06.mid,4/4,midi_sources/ecomp/2017/SunY06.mxl,"Piano,Piano",A major,236.0
1,,classical,128.4,-5,ecomp,VI. Allegro (Fuga),200.0,midi_sources/from_mxl/ecomp/2017/TuanS10.mid,4/4,midi_sources/ecomp/2017/TuanS10.mxl,"Piano,Piano",F major,428.0
2,,classical,212.264151,0,ecomp,II. Intermezzo in A Minor,106.0,midi_sources/from_mxl/ecomp/2017/WangH09.mid,3/4,midi_sources/ecomp/2017/WangH09.mxl,"Piano,Piano",A minor,375.0
3,Johann Sebastian Bach,classical,152.432432,-4,ecomp,"Prelude and Fugue in E Major, WTC I, BWV 854",74.0,midi_sources/from_mxl/ecomp/2017/MiyashitaM01.mid,4/4,midi_sources/ecomp/2017/MiyashitaM01.mxl,"Piano,Piano",E major,188.0
4,,classical,246.571918,-5,ecomp,I. Ruhig bewegt,73.0,midi_sources/from_mxl/ecomp/2017/KabuliL05.mid,4/4,midi_sources/ecomp/2017/KabuliL05.mxl,"Piano,Piano",D minor,71999/240


### Reformatting code

In [54]:
# def reformat_json(out_path):
#     f2m = load_json(out_path)
#     m = {relative_path(k):v for k,v in f2m.items()}
#     save_json(m, out_path)
#     return m

In [56]:
# def reformat_key(k):
#     return k.replace('data/midi/midi_sources_fromxml_v3','midi_sources/from_mxl')

In [57]:
# def reformat_data(d):
#     if 'midi' in d: d['midi'] = reformat_key(d['midi'])
#     if 'mxl' in d: d['mxl'] = d['mxl'].replace('data/midi/midi_sources_v3', 'midi_sources')
#     return d

In [59]:
# f2m = load_json(clc_json)
# m = {reformat_data(v)['mxl']:reformat_data(v) for k,v in f2m.items() if 'mxl' in reformat_data(v)}
# save_json(m, clc_json)

### Classic Piano

In [65]:
clc_path, clc_json, clc_csv = create_paths('classic_piano')
list(clc_path.glob('*'))[:5]

[PosixPath('data/midi/v4/midi_sources/classic_piano/liz_rhap15_format0.mxl'),
 PosixPath('data/midi/v4/midi_sources/classic_piano/ty_september_format0.mxl'),
 PosixPath('data/midi/v4/midi_sources/classic_piano/clementi_opus36_2_2_format0.mid'),
 PosixPath('data/midi/v4/midi_sources/classic_piano/mz_333_2_format0.mid'),
 PosixPath('data/midi/v4/midi_sources/classic_piano/haydn_7_1_format0.mid')]

In [66]:
def parse_classic_songs(fp):
    name = fp.with_suffix('').name.split('_')
    artist = name[0]
    title = ' '.join(name[1:])
    return {
        'artist': artist,
        'title': title,
        'mxl': relative_path(fp),
        'genres': 'classical',
        'source': 'classical_piano'
    }

In [67]:
file_list = get_files(clc_path, extensions=['.mxl'], recurse=True)

In [68]:
file_list = [fp for fp in file_list if fp.stat().st_size/1000 < 350] # over 200mb takes crazy long to analyze

In [69]:
clc_md = parse_midi_dir(file_list, clc_json, base_path=version_path, meta_func=parse_classic_songs)

In [70]:
clc = load_json(clc_json)
arr2csv(clc.values(), clc_csv)
df = pd.read_csv(clc_csv); df.head()

Unnamed: 0,artist,genres,seconds,inferred_offset,source,title,bpm,midi,time_signature,mxl,instruments,inferred_key,quarter_length
0,haydn,classical,57.682776,5,classical_piano,8 2 format0,100.0,midi_sources/from_mxl/classic_piano/haydn_8_2_...,3/4,midi_sources/classic_piano/haydn_8_2_format0.mxl,"Piano,Piano",G major,96.0
1,chpn-p18,classical,41.887744,4,classical_piano,format0,126.0,midi_sources/from_mxl/classic_piano/chpn-p18_f...,4/4,midi_sources/classic_piano/chpn-p18_format0.mxl,"Piano,Piano",F minor,85.0
2,chpn-p14,classical,28.414025,6,classical_piano,format0,156.0,midi_sources/from_mxl/classic_piano/chpn-p14_f...,4/4,midi_sources/classic_piano/chpn-p14_format0.mxl,"Piano,Piano",E- minor,76.0
3,scn15,classical,128.220469,-5,classical_piano,7 format0,100.0,midi_sources/from_mxl/classic_piano/scn15_7_fo...,4/4,midi_sources/classic_piano/scn15_7_format0.mxl,"Piano,Piano",F major,132.0
4,chpn-p23,classical,51.438271,-5,classical_piano,format0,150.0,midi_sources/from_mxl/classic_piano/chpn-p23_f...,4/4,midi_sources/classic_piano/chpn-p23_format0.mxl,"Piano,Piano",F major,92.0


### Creating CSV

In [73]:
all_csvs = [create_paths(s)[-1] for s in sources]
all_dfs = [pd.read_csv(csv) for csv in all_csvs]

In [74]:
merged_df = pd.concat(all_dfs, sort=False)
merged_df = merged_df.reset_index(drop=True); merged_df

Unnamed: 0,genres,ht_offset,source,song_url,parts,seconds,inferred_offset,title,time_signature,ht_key,...,section,instruments,ht_mode,midi_title,ht_bpm,bpm,midi,inferred_key,quarter_length,mxl
0,,0.0,hooktheory,https://www.hooktheory.com/theorytab/view/wayn...,"intro,chorus",25.411765,0.0,yu-gi-oh-theme-song,3/4,C,...,intro,Piano,1.0,yu-gi-oh,85.0,85.0,midi_sources/hooktheory/pianoroll/w/wayne-shar...,C major,36,
1,Holiday,-1.0,hooktheory,https://www.hooktheory.com/theorytab/view/wham...,"intro,verse,chorus",17.777778,-1.0,last-christmas,4/4,Db,...,chorus,"Piano,Piano",1.0,Last Christmas Chorus,108.0,108.0,midi_sources/hooktheory/pianoroll/w/wham/last-...,B- minor,32,
2,,6.0,hooktheory,https://www.hooktheory.com/theorytab/view/wolf...,verse,15.118110,1.0,love-and-war,4/4,G#,...,verse,"Piano,Piano",2.0,wolfgang gartner love and war,127.0,127.0,midi_sources/hooktheory/pianoroll/w/wolfgang-g...,G# minor,32,
3,Electronic,3.0,hooktheory,https://www.hooktheory.com/theorytab/view/wood...,"intro,bridge",12.972973,-2.0,rainbow-factory,4/4,F#,...,intro,"Piano,Piano",6.0,Rainbow Factory,148.0,148.0,midi_sources/hooktheory/pianoroll/w/woodentoas...,D major,32,
4,,8.0,hooktheory,https://www.hooktheory.com/theorytab/view/wolf...,pre-chorus,15.000000,1.0,space-junk,4/4,F#,...,pre-chorus,"Piano,Piano",2.0,Space Junk,128.0,128.0,midi_sources/hooktheory/pianoroll/w/wolfgang-g...,B major,32,
5,,1.0,hooktheory,https://www.hooktheory.com/theorytab/view/weeb...,"intro,verse",13.714286,6.0,donkeys,4/4,B,...,intro,Piano,1.0,Donkeys Intro,140.0,140.0,midi_sources/hooktheory/pianoroll/w/weebl/donk...,F# major,32,
6,,-5.0,hooktheory,https://www.hooktheory.com/theorytab/view/wolf...,verse,15.000000,-5.0,illmerica,4/4,D,...,verse,"Piano,Piano",6.0,illmerica,128.0,128.0,midi_sources/hooktheory/pianoroll/w/wolfgang-g...,D minor,32,
7,,-4.0,hooktheory,https://www.hooktheory.com/theorytab/view/will...,chorus,28.965517,-4.0,blue-eyes-cryin,4/4,E,...,chorus,"Piano,Piano",,Blue Eyes Cryin,58.0,58.0,midi_sources/hooktheory/pianoroll/w/willie-nel...,E major,28,
8,,-2.0,hooktheory,https://www.hooktheory.com/theorytab/view/will...,verse,42.352941,-2.0,you-were-always-on-my-mind,4/4,D,...,verse,"Piano,Piano",1.0,You Were Always On My Mind,68.0,68.0,midi_sources/hooktheory/pianoroll/w/willie-nel...,D major,48,
9,,1.0,hooktheory,https://www.hooktheory.com/theorytab/view/weeb...,"intro,verse",13.714286,1.0,donkeys,4/4,B,...,verse,"Piano,Piano",1.0,Donkeys,140.0,140.0,midi_sources/hooktheory/pianoroll/w/weebl/donk...,B major,32,


In [75]:
[df.shape for df in all_dfs], merged_df.shape

([(19876, 21),
  (5784, 12),
  (4712, 12),
  (2538, 13),
  (314, 12),
  (328, 13),
  (6391, 13)],
 (39943, 22))

In [76]:
merged_df.to_csv(all_csv, index=False)