### Create CSV from midi sources

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import glob
import os
from tqdm import tqdm
from IPython.display import Image, Audio
import traceback

In [3]:
# parallel
from functools import partial
from pathlib import Path

In [4]:
from data_sources import *
from midi_data import *

In [5]:
path = Path('data/midi')
orig_path = path/'midi_sources_v3'
metapath = path/'metadata_v3'
all_csv = metapath/'midi_sources_v3.csv'
metapath.mkdir(parents=True, exist_ok=True)

In [6]:
def create_paths(dirname):
    "Standardize midi_source paths"
    dir_path = orig_path/dirname
    json_path = metapath/f'{dirname}_metadata.json'
    csv_path = metapath/f'{dirname}_metadata.csv'
    return dir_path, json_path, csv_path

In [7]:
sources = ['hooktheory', 'freemidi', 'midiworld', 'ecomp', 'cprato', 'classic_piano', 'wikifonia']

In [8]:
directories = [x for x in path.iterdir() if x.is_dir()]; directories

[PosixPath('data/midi/metadata'),
 PosixPath('data/midi/midi_sources'),
 PosixPath('data/midi/midi_transform_v1'),
 PosixPath('data/midi/midi_sources_fromxml_v3'),
 PosixPath('data/midi/midi_transcribe_v2_shortcont'),
 PosixPath('data/midi/midi_numpy_v2'),
 PosixPath('data/midi/metadata_v3'),
 PosixPath('data/midi/midi_transcribe_v2_longcont'),
 PosixPath('data/midi/midi_sources_v3'),
 PosixPath('data/midi/midi_transcribe_v1'),
 PosixPath('data/midi/midi_npz_v2'),
 PosixPath('data/midi/midi_transcribe_v1_simple'),
 PosixPath('data/midi/midi_transform_v2'),
 PosixPath('data/midi/midi_transform_v3'),
 PosixPath('data/midi/midi_transcribe_v2_shortdur'),
 PosixPath('data/midi/metadata_v3_old'),
 PosixPath('data/midi/midi_transcribe_v2_longdur'),
 PosixPath('data/midi/midi_transcribe_v2_simple')]

### Remove corrupted file - this causes deadlock with music21 processing

In [9]:
corrupted_files = [
    'midiworld/named_midi/NITRO_BRO_-_IT_WONT_DIE.mid',
    'ecomp/2004/MORET02.mid',
    'ecomp/2006/Mordvinov9.MID',
    'ecomp/2006/Na06.MID',
    'ecomp/2008/Cui01.MID',
    'ecomp/2008/Cui02.MID',
    'ecomp/2008/Cui03.MID',
    'ecomp/2008/Cui04.MID',
    'ecomp/2008/Cui05.MID',
    'ecomp/2008/Cui06.MID',
    'ecomp/2008/Cui07.MID',
    'ecomp/2008/Cui08.MID',
    'ecomp/2008/Tan01.MID',
    'ecomp/2008/Tan02.MID',
    'ecomp/2008/Tan03.MID',
    'ecomp/2018/KaszoS14.MID'
]
for f in corrupted_files:
    fp = path/f
    if fp.exists(): fp.unlink()

### Hooktheory

In [10]:
ht_path, ht_json, ht_csv = create_paths('hooktheory')
ht_song_list = metapath/'hooktheory_key2info.json'

In [11]:
def song_key(s): return '_'.join(s.parts[-3:-1])

In [12]:
ht_song_list

PosixPath('data/midi/metadata_v3/hooktheory_key2info.json')

In [13]:
ht_key2info = load_json(ht_song_list)

if ht_key2info is None:
    song_info = list((ht_path/'xml').glob('*/*/*/*.json'))
    ht_key2info = {song_key(s):json.load(open(s, 'r')) for s in song_info}
    save_json(ht_key2info, ht_song_list)
    len(song_info)

In [14]:
song_json = list((ht_path/'event').glob('*/*/*/*_key.json')); len(song_json) # using json instead of midi for metadata

19876

In [15]:
def get_ht_midifile(json_file):
    return str(json_file.with_suffix('.mid')).replace('event', 'pianoroll').replace('symbol_', '')

In [16]:
def get_hooktheory_attr(fp):
    song_info = ht_key2info[song_key(fp)]
    song_json = json.load(open(fp, 'r'))
    metadata = song_json['metadata']
    artist = fp.parts[-3]
    title = fp.parts[-2]
    section = fp.name.split('_')[0]
    midi_path = get_ht_midifile(fp)
    
    ht_key = metadata['key']
    ht_mode = metadata['mode']
    if ht_mode is None: ht_mode = 'major'
    ht_offset = keyc_offset(ht_key, ht_mode)
    
    # convert stream here
    metadata = {
        'artist': artist,
        'title': title,
        'midi': midi_path,
        'section': section,
        'parts': song_info['section'],
        'song_url': song_info['song_url'],
        'genres': song_info['genres'],
        'midi_title': metadata['title'],
        'source': 'hooktheory',
        'ht_bpm': metadata['BPM'],
        'ht_mode': metadata['mode'],
        'ht_key': metadata['key'],
        'ht_offset': ht_offset,
        'ht_time_signature': metadata['beats_in_measure']
    }
    
    return {
        'file_path': midi_path, # midi path not json path
        'metadata': metadata
    }

In [17]:
# sanity check
# hook_out = get_hooktheory_attr(song_json[1000]); hook_out

In [18]:
ht_metadata = parse_midi_dir(song_json, ht_json, meta_func=get_hooktheory_attr, key_func=get_ht_midifile)

Midi Exeption: data/midi/midi_sources_v3/hooktheory/pianoroll/w/weezer/beverly-hills/intro-and-verse_key.mid list index out of range


Midi Exeption: data/midi/midi_sources_v3/hooktheory/pianoroll/w/wolfgang-amadeus-mozart/symphony-no-25-in-g-minor/intro-and-verse_key.mid list index out of range
Midi Exeption: data/midi/midi_sources_v3/hooktheory/pianoroll/y/ylvis/the-fox---what-does-the-fox-say/verse-and-pre-chorus_key.mid list index out of range
Midi Exeption: data/midi/midi_sources_v3/hooktheory/pianoroll/j/jay-z/so-ambitious-feat-pharrel-williams/intro_key.mid list index out of range
Midi Exeption: data/midi/midi_sources_v3/hooktheory/pianoroll/j/johnny-cash/folsom-prison-blues/verse_key.mid list index out of range
Midi Exeption: data/midi/midi_sources_v3/hooktheory/pianoroll/j/jack-johnson/do-you-remember/intro-and-verse_key.mid list index out of range
Midi Exeption: data/midi/midi_sources_v3/hooktheory/pianoroll/j/justin-moore/if-heaven-wasnt-so-far-awat/intro-and-verse_key.mid list index out of range
Midi Exeption: data/midi/midi_sources_v3/hooktheory/pianoroll/j/janis-ian/at-spianorolleen/verse_key.mid Cannot 

Midi Exeption: data/midi/midi_sources_v3/hooktheory/pianoroll/t/tame-impala/pianorollually/verse_key.mid Cannot find file in data/midi/midi_sources_v3/hooktheory/pianoroll/t/tame-impala/pianorollually/verse_key.mid
Midi Exeption: data/midi/midi_sources_v3/hooktheory/pianoroll/t/tame-impala/pianorollually/intro_key.mid Cannot find file in data/midi/midi_sources_v3/hooktheory/pianoroll/t/tame-impala/pianorollually/intro_key.mid
Midi Exeption: data/midi/midi_sources_v3/hooktheory/pianoroll/t/tame-impala/pianorollually/pre-chorus-and-chorus_key.mid Cannot find file in data/midi/midi_sources_v3/hooktheory/pianoroll/t/tame-impala/pianorollually/pre-chorus-and-chorus_key.mid
Midi Exeption: data/midi/midi_sources_v3/hooktheory/pianoroll/t/the-beatles/drive-my-car/verse_key.mid list index out of range
Midi Exeption: data/midi/midi_sources_v3/hooktheory/pianoroll/t/the-beatles/the-end/solo_key.mid list index out of range
Midi Exeption: data/midi/midi_sources_v3/hooktheory/pianoroll/k/katy-pary/f

#### Save song_list

In [19]:
ht_metadata = load_json(ht_json)
arr2csv(ht_metadata.values(), ht_csv)
df = pd.read_csv(ht_csv); df.head()

Unnamed: 0,source,artist,title,ht_key,parts,ht_offset,ht_bpm,midi,genres,time_signature,...,inferred_offset,section,seconds,midi_title,song_url,ht_mode,instruments,bpm,ht_time_signature,inferred_key
0,hooktheory,weezer,beverly-hills,C,intro-and-verse,0,128,data/midi/midi_sources_v3/hooktheory/pianoroll...,,,...,,intro-and-verse,,My New Song,https://www.hooktheory.com/theorytab/view/weez...,,,,4,
1,hooktheory,willie-nelson,on-the-road-again,E,"verse,bridge",-4,112,data/midi/midi_sources_v3/hooktheory/pianoroll...,,4/4,...,-4.0,verse,15.0,On The Road Again,https://www.hooktheory.com/theorytab/view/will...,1.0,Piano,112.0,4,E major
2,hooktheory,weebl,donkeys,B,"intro,verse",1,140,data/midi/midi_sources_v3/hooktheory/pianoroll...,,4/4,...,6.0,intro,13.714286,Donkeys Intro,https://www.hooktheory.com/theorytab/view/weeb...,1.0,Piano,140.0,4,F# major
3,hooktheory,wayne-sharpe,yu-gi-oh-theme-song,C,"intro,chorus",0,128,data/midi/midi_sources_v3/hooktheory/pianoroll...,,4/4,...,0.0,chorus,15.0,yu-gi-oh3,https://www.hooktheory.com/theorytab/view/wayn...,1.0,"Piano,Piano",128.0,4,C major
4,hooktheory,wham,last-christmas,Db,"intro,verse,chorus",-1,108,data/midi/midi_sources_v3/hooktheory/pianoroll...,Holiday,4/4,...,-1.0,chorus,17.777778,Last Christmas Chorus,https://www.hooktheory.com/theorytab/view/wham...,1.0,"Piano,Piano",108.0,4,B- minor


In [20]:
df.shape

(19876, 21)

## FreeMidi

In [21]:
fm_path, _, fm_csv = create_paths('freemidi')
fm_dance_path = metapath/f'freemidi_dance_metadata.json'
fm_pop_path = metapath/f'freemidi_pop_metadata.json'
list(fm_path.glob('*'))

[PosixPath('data/midi/midi_sources_v3/freemidi/._genre-rock'),
 PosixPath('data/midi/midi_sources_v3/freemidi/._.DS_Store'),
 PosixPath('data/midi/midi_sources_v3/freemidi/genre-disco'),
 PosixPath('data/midi/midi_sources_v3/freemidi/._genre-pop'),
 PosixPath('data/midi/midi_sources_v3/freemidi/genre-pop'),
 PosixPath('data/midi/midi_sources_v3/freemidi/._genre-hip-hop-rap'),
 PosixPath('data/midi/midi_sources_v3/freemidi/genre-dance-eletric'),
 PosixPath('data/midi/midi_sources_v3/freemidi/genre-punk'),
 PosixPath('data/midi/midi_sources_v3/freemidi/genre-hip-hop-rap'),
 PosixPath('data/midi/midi_sources_v3/freemidi/.DS_Store'),
 PosixPath('data/midi/midi_sources_v3/freemidi/genre-rock'),
 PosixPath('data/midi/midi_sources_v3/freemidi/._genre-dance-eletric'),
 PosixPath('data/midi/midi_sources_v3/freemidi/._genre-disco'),
 PosixPath('data/midi/midi_sources_v3/freemidi/._genre-punk')]

In [22]:
def parse_freemidi_songs(fp, genres=None, source=None):
    name = fp.with_suffix('').name.split(' - ')
    artist = name[0]
    title = name[-1]
    metadata = {
        'artist': artist.strip(),
        'title': title.strip(),
        'midi': str(fp),
        'genres': genres,
        'source': source
    }
    return {
        'file_path': fp,
        'metadata': metadata
    }

In [23]:
d_parse_func = partial(parse_freemidi_songs, genres='dance', source='freemidi')
dir_path = fm_path/'genre-dance-eletric'
file_list = get_files(dir_path, extensions=['.mid'], recurse=True)
fm_dance_list = parse_midi_dir(file_list, fm_dance_path, meta_func=d_parse_func)

Midi Exeption: data/midi/midi_sources_v3/freemidi/genre-dance-eletric/Gloria Estefan - I'm Not Givin' You Up.mid badly formated midi bytes, got: b'RIFFB\x8c\x00\x00RMIDdata~\x8b\x00\x00'
Midi Exeption: data/midi/midi_sources_v3/freemidi/genre-dance-eletric/Atomic Kitten - Whole Again.mid badly formated midi bytes, got: b'RIFF\x08K\x00\x00RMIDdata{J\x00\x00'
Midi Exeption: data/midi/midi_sources_v3/freemidi/genre-dance-eletric/Apollo 440 - Lost In Space.mid index out of range
Midi Exeption: data/midi/midi_sources_v3/freemidi/genre-dance-eletric/Tatu - Ya Tvoy Vrag (I'm Your Enemy).mid badly formated midi bytes, got: b'RIFF,\xa3\x00\x00RMIDdata\xc6\xa2\x00\x00'
Midi Exeption: data/midi/midi_sources_v3/freemidi/genre-dance-eletric/Gloria Estefan - Get On Your Feet.mid badly formated midi bytes, got: b'RIFF\x92\xa8\x01\x00RMIDdata\xd8\xa7\x01\x00'
Midi Exeption: data/midi/midi_sources_v3/freemidi/genre-dance-eletric/Gloria Estefan - Mi Tierra.mid badly formated midi bytes, got: b'RIFFz\r\x

In [24]:
p_parse_func = partial(parse_freemidi_songs, genres='pop', source='freemidi')
dir_path = fm_path/'genre-pop'
file_list = get_files(dir_path, extensions=['.mid'], recurse=True)
fm_pop_list = parse_midi_dir(file_list, fm_pop_path, meta_func=p_parse_func)

Midi Exeption: data/midi/midi_sources_v3/freemidi/genre-pop/Cyndi Lauper - Whats Going On.mid badly formated midi bytes, got: b'0&\xb2u\x8ef\xcf\x11\xa6\xd9\x00\xaa\x00b\xcel?\x13\x00\x00'
Midi Exeption: data/midi/midi_sources_v3/freemidi/genre-pop/Cyndi Lauper - Who Let In The Rain.mid badly formated midi bytes, got: b'0&\xb2u\x8ef\xcf\x11\xa6\xd9\x00\xaa\x00b\xcel?\x13\x00\x00'
Midi Exeption: data/midi/midi_sources_v3/freemidi/genre-pop/5th Dimension - One Less Bell To Answere.mid badly formed midi string: missing leading MTrk
Midi Exeption: data/midi/midi_sources_v3/freemidi/genre-pop/Cyndi Lauper - The World Is Stone.mid badly formated midi bytes, got: b'0&\xb2u\x8ef\xcf\x11\xa6\xd9\x00\xaa\x00b\xcel?\x13\x00\x00'
Midi Exeption: data/midi/midi_sources_v3/freemidi/genre-pop/Janet Jackson - Escapade.mid badly formated midi bytes, got: b'error with file'
Midi Exeption: data/midi/midi_sources_v3/freemidi/genre-pop/Cyndi Lauper - Change Of Heart.mid badly formated midi bytes, got: b'0&\

Midi Exeption: data/midi/midi_sources_v3/freemidi/genre-pop/Westlife - Fool Again.mid badly formed midi string: missing leading MTrk
Midi Exeption: data/midi/midi_sources_v3/freemidi/genre-pop/Modern Talking - Megamix 2000.mid badly formated midi bytes, got: b'RIFFp\x91\x02\x00RMIDdata\xd1\x90\x02\x00'
Midi Exeption: data/midi/midi_sources_v3/freemidi/genre-pop/Donna Summer - Bad Girls.mid badly formed midi string: missing leading MTrk
Midi Exeption: data/midi/midi_sources_v3/freemidi/genre-pop/Britney Spears - I Wanna Go.mid index out of range
Midi Exeption: data/midi/midi_sources_v3/freemidi/genre-pop/Gloria Estefan - Abriendo Puertas.mid badly formated midi bytes, got: b'error with file'
Midi Exeption: data/midi/midi_sources_v3/freemidi/genre-pop/Four Seasons - I've Got You Under My Skin.mid badly formed midi string: missing leading MTrk
Midi Exeption: data/midi/midi_sources_v3/freemidi/genre-pop/Wings - Band On The Run.mid badly formed midi string: missing leading MTrk
Midi Exeptio

In [25]:
fmd = load_json(fm_dance_path)
fmp = load_json(fm_pop_path)
fm_all = list(fmd.values())+list(fmp.values())
arr2csv(fm_all, fm_csv)
df = pd.read_csv(fm_csv); df.head()

Unnamed: 0,source,quarter_length,inferred_offset,instruments,artist,midi,title,bpm,genres,seconds,time_signature,inferred_key
0,freemidi,108.0,2.0,"Piano,Piano",Radiohead,data/midi/midi_sources_v3/freemidi/genre-dance...,Fitter Happier,76.0,dance,85.263158,4/4,G minor
1,freemidi,68.0,-5.0,,Tune Up,data/midi/midi_sources_v3/freemidi/genre-dance...,Bounce,142.0,dance,28.732394,,F major
2,freemidi,68.0,-4.0,"StringInstrument,StringInstrument,StringInstru...",Daft Punk,data/midi/midi_sources_v3/freemidi/genre-dance...,The Grid,102.0,dance,40.0,4/4,C# minor
3,freemidi,613/6,3.0,"Flute,Flute,Flute,Flute",Bjork,data/midi/midi_sources_v3/freemidi/genre-dance...,Glora,65.0,dance,,4/4,A major
4,freemidi,192.0,3.0,"Clarinet,Clarinet,Guitar,Guitar,Guitar,Guitar,...",Enigma,data/midi/midi_sources_v3/freemidi/genre-dance...,Wanted,300.0,dance,38.4,4/4,A major


### Gather Cprato

In [26]:
cp_path, cp_json, cp_csv = create_paths('cprato')
list(cp_path.glob('*'))[:5]

[PosixPath('data/midi/midi_sources_v3/cprato/Basto - Again And Again (midi By Carlo Prato) (www.cprato.com).mid'),
 PosixPath('data/midi/midi_sources_v3/cprato/The Weeknd ft. Lana Del Rey - Stargirl Interlude  (midi by Carlo Prato) (www.cprato.com).mid'),
 PosixPath('data/midi/midi_sources_v3/cprato/Two Steps From Hell - Magic of Love  (midi by Carlo Prato) (www.cprato.com).mid'),
 PosixPath('data/midi/midi_sources_v3/cprato/._Sandstorm - Darude  (midi by Carlo Prato) (www.cprato.com).mid'),
 PosixPath('data/midi/midi_sources_v3/cprato/._Karma Fields - Skyline (Original Mix) (midi by Carlo Prato) (www.cprato.com).mid')]

In [27]:
def parse_cprato_songs(fp, genres=None, source=None):
    name = fp.with_suffix('').name.split(' - ')
    artist = name[0]
    title = name[-1].replace('(midi by Carlo Prato) (www.cprato.com)', '')
    metadata = {
        'artist': artist.strip(),
        'title': title.strip(),
        'midi': str(fp),
        'genres': genres,
        'source': source
    }
    return {
        'file_path': fp,
        'metadata': metadata
    }

In [28]:
cp_meta = partial(parse_cprato_songs, genres='EDM,inferred', source='cprato')
file_list = get_files(cp_path, extensions=['.mid'], recurse=True)
cp_md = parse_midi_dir(file_list, cp_json, cp_meta)

In [29]:
cp = load_json(cp_json)
arr2csv(cp.values(), cp_csv)
df = pd.read_csv(cp_csv); df.head()

Unnamed: 0,source,quarter_length,inferred_offset,instruments,artist,midi,title,bpm,genres,seconds,time_signature,inferred_key
0,cprato,100.0,-1,"Flute,Flute,Flute,Piano,Piano,Piano,Fretless B...",Oh Wonder,data/midi/midi_sources_v3/cprato/Oh Wonder - A...,All We Do,131.0,"EDM,inferred",45.801527,4/4,C# major
1,cprato,72.0,4,,Cascada,data/midi/midi_sources_v3/cprato/Cascada - Eve...,Everytime We Touch (Midi By Carlo Prato) (www....,140.0,"EDM,inferred",30.857143,,A- major
2,cprato,68.0,-4,"Piano,Piano,Piano,Piano,Piano,Piano",DuMonde,data/midi/midi_sources_v3/cprato/DuMonde - See...,See The Light,140.0,"EDM,inferred",29.142857,4/4,C# minor
3,cprato,36.0,-5,"Piano,Piano,Piano,Piano,Piano,Piano",Avicii,data/midi/midi_sources_v3/cprato/Avicii - Lone...,Lonely Together,103.0,"EDM,inferred",20.970874,4/4,F major
4,cprato,68.0,4,,Alex Gaudino,data/midi/midi_sources_v3/cprato/Alex Gaudino ...,Calabria (Drunken Monkey Remix) (Midi By Carlo...,143.0,"EDM,inferred",28.531469,,A- major


### Gather MidiWorld

In [30]:
mw_path, mw_json, mw_csv = create_paths('midiworld')

In [31]:
def parse_midiworld_songs(fp):
    name = fp.with_suffix('').name.replace('_', ' ').split(' - ')
    artist = name[0]
    title = name[-1]
    metadata = {
        'artist': artist.strip(),
        'title': title.strip(),
        'midi': str(fp),
        'genres': 'pop,inferred',
        'source': 'midiworld'
    }
    return {
        'file_path': fp,
        'metadata': metadata
    }

In [40]:
file_list = get_files(mw_path/'named_midi', extensions=['.mid'], recurse=True)
file_list = [fp for fp in file_list if fp.stat().st_size/1000 < 150] # over 200mb takes crazy long to analyze

In [None]:
# sorted([fp.stat().st_size/1000 for fp in file_list])[-200:]

In [41]:
mw_md = parse_midi_dir(file_list, mw_json, parse_midiworld_songs)

Midi Exeption: data/midi/midi_sources_v3/midiworld/named_midi/Nirvana_-_Turnaround.mid badly formed midi string: missing leading MTrk
Midi Exeption: data/midi/midi_sources_v3/midiworld/named_midi/Nirvana_-_Swap_Meet.mid badly formed midi string: missing leading MTrk
Midi Exeption: data/midi/midi_sources_v3/midiworld/named_midi/Nirvana_-_All_Apologies.mid badly formed midi string: missing leading MTrk
Midi Exeption: data/midi/midi_sources_v3/midiworld/named_midi/Nirvana_-_Smells_Like_Teen_Spirit.mid badly formed midi string: missing leading MTrk
Midi Exeption: data/midi/midi_sources_v3/midiworld/named_midi/John_Paul_Young_-_Love_is_in_the_Air.mid badly formed midi string: missing leading MTrk
Midi Exeption: data/midi/midi_sources_v3/midiworld/named_midi/The_Offspring_-_No_Hero.mid badly formated midi bytes, got: b'RIFF\x04N\x01\x00RMIDdata\xf7M\x01\x00'
Midi Exeption: data/midi/midi_sources_v3/midiworld/named_midi/Nirvana_-_Son_Of_A_Gun.mid badly formed midi string: missing leading MTrk

In [42]:
mw = load_json(mw_json)
arr2csv(mw.values(), mw_csv)
df = pd.read_csv(mw_csv); df.head()

Unnamed: 0,source,quarter_length,inferred_offset,instruments,artist,midi,title,bpm,genres,seconds,time_signature,inferred_key
0,midiworld,3.5,-2.0,Piano,Kona,data/midi/midi_sources_v3/midiworld/named_midi...,Drumloop,176.0,"pop,inferred",1.193182,4/4,B minor
1,midiworld,88.75,3.0,"Timpani,Taiko,Violin,Contrabass,Voice",TV Themes,data/midi/midi_sources_v3/midiworld/named_midi...,Millenium,127.0,"pop,inferred",41.929134,4/4,A major
2,midiworld,154/3,0.0,"Marimba,Acoustic Bass,Trombone,StringInstrumen...",TV Themes,data/midi/midi_sources_v3/midiworld/named_midi...,Looney Tunes,160.0,"pop,inferred",19.25,4/4,C major
3,midiworld,163.0,-4.0,Piano,Kaito,data/midi/midi_sources_v3/midiworld/named_midi...,Cantarella,144.0,"pop,inferred",67.916667,2/4,C# minor
4,midiworld,191.75,5.0,"Ocarina,Tenor Saxophone,Ocarina,Tenor Saxophone",Nine Inch Nails,data/midi/midi_sources_v3/midiworld/named_midi...,Nothing,90.0,"pop,inferred",127.833333,4/4,G major


### Gather Wikifonia

In [16]:
wf_path, wf_json, wf_csv = create_paths('wikifonia')

In [17]:
def parse_wikifonia_songs(fp):
    name = fp.with_suffix('').name.replace('_', ' ').split(' - ')
    artist = name[0]
    title = name[-1]
    metadata = {
        'artist': artist.strip(),
        'title': title.strip(),
        'mxl': str(fp),
        'genres': 'pop,inferred',
        'source': 'wikifonia'
    }
    return {
        'file_path': fp,
        'metadata': metadata
    }

In [None]:
# Warning: if you get a deadlock: 
# PosixPath('data/midi/midi_sources/midiworld/named_midi/NITRO_BRO_-_IT_WONT_DIE.mid') is broken
file_list = get_files(wf_path, extensions=['.mxl'], recurse=True)
wf_md = parse_midi_dir(file_list, wf_json, parse_wikifonia_songs)

In [19]:
wf = load_json(wf_json)
arr2csv(wf.values(), wf_csv)
df = pd.read_csv(wf_csv); df.head()

Unnamed: 0,inferred_key,midi,genres,title,artist,seconds,quarter_length,instruments,mxl,inferred_offset,time_signature,source,bpm
0,F major,data/midi/midi_sources_fromxml_v3/wikifonia/Ra...,"pop,inferred",The Corner Grocery Store,Raffi,,73.0,Flute,data/midi/midi_sources_v3/wikifonia/Raffi - Th...,-5.0,4/4,wikifonia,
1,B- major,data/midi/midi_sources_fromxml_v3/wikifonia/Je...,"pop,inferred",Spanish Harlem,"Jerry Leiber, Phil Spector",,124.0,,data/midi/midi_sources_v3/wikifonia/Jerry Leib...,2.0,4/4,wikifonia,
2,C major,data/midi/midi_sources_fromxml_v3/wikifonia/Hu...,"pop,inferred",?n az ?jjel nem aludtam egy ?r?t,Hungarian folk song,,48.0,Grand Piano,data/midi/midi_sources_v3/wikifonia/Hungarian ...,0.0,4/4,wikifonia,
3,F minor,data/midi/midi_sources_fromxml_v3/wikifonia/Jo...,"pop,inferred",Mamacita,Joe Henderson,,64.0,,data/midi/midi_sources_v3/wikifonia/Joe Hender...,4.0,4/4,wikifonia,
4,F major,data/midi/midi_sources_fromxml_v3/wikifonia/Un...,"pop,inferred",k GA NAAR VOLENDAM,Unknown,,228.0,Hammond Organ,data/midi/midi_sources_v3/wikifonia/Unknown - ...,-5.0,4/4,wikifonia,


### Yamaha - piano

In [9]:
ec_path, ec_json, ec_csv = create_paths('ecomp')
ec_song = ec_path/'song_list.json'
list(ec_path.glob('*'))[:5]

[PosixPath('data/midi/midi_sources_v3/ecomp/song_list.json'),
 PosixPath('data/midi/midi_sources_v3/ecomp/2017'),
 PosixPath('data/midi/midi_sources_v3/ecomp/2008'),
 PosixPath('data/midi/midi_sources_v3/ecomp/._2002'),
 PosixPath('data/midi/midi_sources_v3/ecomp/._2017')]

In [10]:
ec_songs = load_json(ec_path/'song_list.json')

In [11]:
def parse_ecomp_songs(fp):
    song_info = ec_songs[fp.name]
    fp = fp.with_suffix('.mxl')
    metadata = {
        'artist': song_info['artist'],
        'title': song_info['title'],
        'mxl': str(fp),
        'genres': 'classical',
        'source': 'ecomp'
    }
    return {
        'file_path': fp,
        'metadata': metadata
    }

In [12]:
file_list = get_files(ec_path, extensions=['.mid'], recurse=True)

In [13]:
sorted_files = sorted([fp.with_suffix('.mxl').stat().st_size/1000 for fp in file_list])[-200:]

In [14]:
file_list = [fp for fp in file_list if fp.with_suffix('.mxl').stat().st_size/1000 < 400] # over 200mb takes crazy long to analyze
ec_md = parse_midi_dir(file_list, ec_json, parse_ecomp_songs)

Midi Exeption: data/midi/midi_sources_v3/ecomp/2017/SunY03.mxl got a negative delta time
Midi Exeption: data/midi/midi_sources_v3/ecomp/2006/Tysman02.mxl got a negative delta time
Midi Exeption: data/midi/midi_sources_v3/ecomp/2004/BLINOV01.mxl got a negative delta time
Midi Exeption: data/midi/midi_sources_v3/ecomp/2009/Na03.mxl got a negative delta time
Midi Exeption: data/midi/midi_sources_v3/ecomp/2002/sun01.mxl got a negative delta time


In [15]:
ec = load_json(ec_json)
arr2csv(ec.values(), ec_csv)
df = pd.read_csv(ec_csv); df.head()

Unnamed: 0,inferred_key,midi,genres,title,artist,seconds,quarter_length,instruments,mxl,inferred_offset,time_signature,source,bpm
0,A major,data/midi/midi_sources_fromxml_v3/ecomp/2017/S...,classical,II. Allegro con moto,,123.130435,236.0,"Piano,Piano",data/midi/midi_sources_v3/ecomp/2017/SunY06.mxl,3,4/4,ecomp,115.0
1,F major,data/midi/midi_sources_fromxml_v3/ecomp/2017/T...,classical,VI. Allegro (Fuga),,128.4,428.0,"Piano,Piano",data/midi/midi_sources_v3/ecomp/2017/TuanS10.mxl,-5,4/4,ecomp,200.0
2,A minor,data/midi/midi_sources_fromxml_v3/ecomp/2017/W...,classical,II. Intermezzo in A Minor,,212.264151,375.0,"Piano,Piano",data/midi/midi_sources_v3/ecomp/2017/WangH09.mxl,0,3/4,ecomp,106.0
3,E major,data/midi/midi_sources_fromxml_v3/ecomp/2017/M...,classical,"Prelude and Fugue in E Major, WTC I, BWV 854",Johann Sebastian Bach,152.432432,188.0,"Piano,Piano",data/midi/midi_sources_v3/ecomp/2017/Miyashita...,-4,4/4,ecomp,74.0
4,D minor,data/midi/midi_sources_fromxml_v3/ecomp/2017/K...,classical,I. Ruhig bewegt,,246.571918,71999/240,"Piano,Piano",data/midi/midi_sources_v3/ecomp/2017/KabuliL05...,-5,4/4,ecomp,73.0


### Classic Piano

In [20]:
clc_path, clc_json, clc_csv = create_paths('classic_piano')
list(clc_path.glob('*'))[:5]

[PosixPath('data/midi/midi_sources_v3/classic_piano/liz_rhap15_format0.mxl'),
 PosixPath('data/midi/midi_sources_v3/classic_piano/ty_september_format0.mxl'),
 PosixPath('data/midi/midi_sources_v3/classic_piano/clementi_opus36_2_2_format0.mid'),
 PosixPath('data/midi/midi_sources_v3/classic_piano/mz_333_2_format0.mid'),
 PosixPath('data/midi/midi_sources_v3/classic_piano/haydn_7_1_format0.mid')]

In [21]:
def parse_classic_songs(fp):
    name = fp.with_suffix('').name.split('_')
    artist = name[0]
    title = ' '.join(name[1:])
    metadata = {
        'artist': artist,
        'title': title,
        'mxl': str(fp),
        'genres': 'classical',
        'source': 'classical_piano'
    }
    return {
        'file_path': fp,
        'metadata': metadata
    }

In [22]:
file_list = get_files(clc_path, extensions=['.mxl'], recurse=True)

In [23]:
file_list = [fp for fp in file_list if fp.stat().st_size/1000 < 300] # over 200mb takes crazy long to analyze

In [24]:
clc_md = parse_midi_dir(file_list, clc_json, parse_classic_songs)

In [25]:
clc = load_json(clc_json)
arr2csv(clc.values(), clc_csv)
df = pd.read_csv(clc_csv); df.head()

Unnamed: 0,inferred_key,midi,genres,title,artist,seconds,quarter_length,instruments,mxl,inferred_offset,time_signature,source,bpm
0,G major,data/midi/midi_sources_fromxml_v3/classic_pian...,classical,8 2 format0,haydn,57.682776,96.0,"Piano,Piano",data/midi/midi_sources_v3/classic_piano/haydn_...,5,3/4,classical_piano,100.0
1,F minor,data/midi/midi_sources_fromxml_v3/classic_pian...,classical,format0,chpn-p18,41.887744,85.0,"Piano,Piano",data/midi/midi_sources_v3/classic_piano/chpn-p...,4,4/4,classical_piano,126.0
2,E- minor,data/midi/midi_sources_fromxml_v3/classic_pian...,classical,format0,chpn-p14,28.414025,76.0,"Piano,Piano",data/midi/midi_sources_v3/classic_piano/chpn-p...,6,4/4,classical_piano,156.0
3,F major,data/midi/midi_sources_fromxml_v3/classic_pian...,classical,7 format0,scn15,128.220469,132.0,"Piano,Piano",data/midi/midi_sources_v3/classic_piano/scn15_...,-5,4/4,classical_piano,100.0
4,F major,data/midi/midi_sources_fromxml_v3/classic_pian...,classical,format0,chpn-p23,51.438271,92.0,"Piano,Piano",data/midi/midi_sources_v3/classic_piano/chpn-p...,-5,4/4,classical_piano,150.0


### Creating CSV

In [43]:
all_csvs = [create_paths(s)[-1] for s in sources]
all_dfs = [pd.read_csv(csv) for csv in all_csvs]

In [44]:
merged_df = pd.concat(all_dfs, sort=False)
merged_df = merged_df.reset_index(drop=True); merged_df

Unnamed: 0,source,artist,title,ht_key,parts,ht_offset,ht_bpm,midi,genres,time_signature,...,section,seconds,midi_title,song_url,ht_mode,instruments,bpm,ht_time_signature,inferred_key,mxl
0,hooktheory,weezer,beverly-hills,C,intro-and-verse,0.0,128.0,data/midi/midi_sources_v3/hooktheory/pianoroll...,,,...,intro-and-verse,,My New Song,https://www.hooktheory.com/theorytab/view/weez...,,,,4.0,,
1,hooktheory,willie-nelson,on-the-road-again,E,"verse,bridge",-4.0,112.0,data/midi/midi_sources_v3/hooktheory/pianoroll...,,4/4,...,verse,15.000000,On The Road Again,https://www.hooktheory.com/theorytab/view/will...,1.0,Piano,112.0,4.0,E major,
2,hooktheory,weebl,donkeys,B,"intro,verse",1.0,140.0,data/midi/midi_sources_v3/hooktheory/pianoroll...,,4/4,...,intro,13.714286,Donkeys Intro,https://www.hooktheory.com/theorytab/view/weeb...,1.0,Piano,140.0,4.0,F# major,
3,hooktheory,wayne-sharpe,yu-gi-oh-theme-song,C,"intro,chorus",0.0,128.0,data/midi/midi_sources_v3/hooktheory/pianoroll...,,4/4,...,chorus,15.000000,yu-gi-oh3,https://www.hooktheory.com/theorytab/view/wayn...,1.0,"Piano,Piano",128.0,4.0,C major,
4,hooktheory,wham,last-christmas,Db,"intro,verse,chorus",-1.0,108.0,data/midi/midi_sources_v3/hooktheory/pianoroll...,Holiday,4/4,...,chorus,17.777778,Last Christmas Chorus,https://www.hooktheory.com/theorytab/view/wham...,1.0,"Piano,Piano",108.0,4.0,B- minor,
5,hooktheory,what-a-day,kiefer,D,chorus,-5.0,96.0,data/midi/midi_sources_v3/hooktheory/pianoroll...,Jazz,4/4,...,chorus,10.000000,kiefer,https://www.hooktheory.com/theorytab/view/what...,6.0,"Piano,Piano",96.0,4.0,A minor,
6,hooktheory,willie-nelson,blue-eyes-cryin,E,chorus,-4.0,58.0,data/midi/midi_sources_v3/hooktheory/pianoroll...,,4/4,...,chorus,28.965517,Blue Eyes Cryin,https://www.hooktheory.com/theorytab/view/will...,,"Piano,Piano",58.0,4.0,E major,
7,hooktheory,waka-flocka-flame,no-hands,C#,chorus,-4.0,124.0,data/midi/midi_sources_v3/hooktheory/pianoroll...,Hip-Hop/Rap,4/4,...,chorus,15.483871,No Hands,https://www.hooktheory.com/theorytab/view/waka...,6.0,"Piano,Piano",124.0,4.0,C# minor,
8,hooktheory,weezer,falling-for-you,Eb,"intro,verse,chorus,solo",-3.0,108.0,data/midi/midi_sources_v3/hooktheory/pianoroll...,,4/4,...,intro,11.111111,falling for you intro,https://www.hooktheory.com/theorytab/view/weez...,1.0,"Piano,Piano",108.0,4.0,E- major,
9,hooktheory,wolfgang-gartner,space-junk,F#,pre-chorus,8.0,128.0,data/midi/midi_sources_v3/hooktheory/pianoroll...,,4/4,...,pre-chorus,15.000000,Space Junk,https://www.hooktheory.com/theorytab/view/wolf...,2.0,"Piano,Piano",128.0,4.0,B major,


In [45]:
[df.shape for df in all_dfs], merged_df.shape

([(19876, 21),
  (5784, 12),
  (4698, 12),
  (2538, 13),
  (314, 12),
  (328, 13),
  (6391, 13)],
 (39929, 22))

In [46]:
merged_df.to_csv(all_csv, index=False)