### Create CSV from midi sources

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import glob
import os
from tqdm import tqdm
from IPython.display import Image, Audio
import traceback

In [3]:
# parallel
from functools import partial
from pathlib import Path

In [4]:
from data_sources import *
from midi_data import *

In [5]:
path = Path('data/midi')
orig_path = path/'midi_sources_v3'
metapath = path/'metadata_v3'
all_csv = metapath/'midi_sources_v3.csv'
metapath.mkdir(parents=True, exist_ok=True)

In [6]:
def create_paths(dirname):
    "Standardize midi_source paths"
    dir_path = orig_path/dirname
    json_path = metapath/f'{dirname}_metadata.json'
    csv_path = metapath/f'{dirname}_metadata.csv'
    return dir_path, json_path, csv_path

In [7]:
sources = ['hooktheory', 'freemidi', 'midiworld', 'ecomp', 'cprato', 'classic_piano', 'wikifonia']

In [8]:
directories = [x for x in path.iterdir() if x.is_dir()]; directories

[PosixPath('data/midi/metadata'),
 PosixPath('data/midi/midi_sources'),
 PosixPath('data/midi/midi_transform_v1'),
 PosixPath('data/midi/midi_sources_fromxml_v3'),
 PosixPath('data/midi/midi_transcribe_v2_shortcont'),
 PosixPath('data/midi/midi_numpy_v2'),
 PosixPath('data/midi/metadata_v3'),
 PosixPath('data/midi/midi_transcribe_v2_longcont'),
 PosixPath('data/midi/midi_sources_v3'),
 PosixPath('data/midi/midi_transcribe_v1'),
 PosixPath('data/midi/midi_npz_v2'),
 PosixPath('data/midi/midi_transcribe_v1_simple'),
 PosixPath('data/midi/midi_transform_v2'),
 PosixPath('data/midi/midi_transcribe_v2_shortdur'),
 PosixPath('data/midi/metadata_v3_old'),
 PosixPath('data/midi/midi_transcribe_v2_longdur'),
 PosixPath('data/midi/midi_transcribe_v2_simple')]

### Remove corrupted file - this causes deadlock with music21 processing

In [9]:
corrf = Path('data/midi/midi_sources/midiworld/named_midi/NITRO_BRO_-_IT_WONT_DIE.mid')
corrf = Path('data/midi/midi_sources/midiworld/named_midi/NITRO_BRO_-_IT_WONT_DIE.mid')

corrupted_files = [
    'midiworld/named_midi/NITRO_BRO_-_IT_WONT_DIE.mid',
    'ecomp/2004/MORET02.mid',
    'ecomp/2006/Mordvinov9.MID',
    'ecomp/2006/Na06.MID',
    'ecomp/2008/Cui01.MID',
    'ecomp/2008/Cui02.MID',
    'ecomp/2008/Cui03.MID',
    'ecomp/2008/Cui04.MID',
    'ecomp/2008/Cui05.MID',
    'ecomp/2008/Cui06.MID',
    'ecomp/2008/Cui07.MID',
    'ecomp/2008/Cui08.MID',
    'ecomp/2008/Tan01.MID',
    'ecomp/2008/Tan02.MID',
    'ecomp/2008/Tan03.MID',
    'ecomp/2018/KaszoS14.MID'
]
for f in corrupted_files:
    fp = path/f
    if fp.exists(): fp.unlink()

### Hooktheory

In [10]:
ht_path, ht_json, ht_csv = create_paths('hooktheory')
ht_song_list = metapath/'hooktheory_key2info.json'

In [11]:
def song_key(s): return '_'.join(s.parts[-3:-1])

In [12]:
ht_song_list

PosixPath('data/midi/metadata_v3/hooktheory_key2info.json')

In [13]:
ht_key2info = load_json(ht_song_list)

if ht_key2info is None:
    song_info = list((ht_path/'xml').glob('*/*/*/*.json'))
    ht_key2info = {song_key(s):json.load(open(s, 'r')) for s in song_info}
    save_json(ht_key2info, ht_song_list)
    len(song_info)

In [14]:
song_json = list((ht_path/'event').glob('*/*/*/*_key.json')); len(song_json) # using json instead of midi for metadata

19876

In [15]:
def get_ht_midifile(json_file):
    return str(json_file.with_suffix('.mid')).replace('event', 'pianoroll').replace('symbol_', '')

In [16]:
def get_hooktheory_attr(fp):
    song_info = ht_key2info[song_key(fp)]
    song_json = json.load(open(fp, 'r'))
    metadata = song_json['metadata']
    artist = fp.parts[-3]
    title = fp.parts[-2]
    section = fp.name.split('_')[0]
    midi_path = get_ht_midifile(fp)
    
    ht_key = metadata['key']
    ht_mode = metadata['mode']
    if ht_mode is None: ht_mode = 'major'
    ht_offset = keyc_offset(ht_key, ht_mode)
    
    # convert stream here
    metadata = {
        'artist': artist,
        'title': title,
        'midi': midi_path,
        'section': section,
        'parts': song_info['section'],
        'song_url': song_info['song_url'],
        'genres': song_info['genres'],
        'midi_title': metadata['title'],
        'source': 'hooktheory',
        'ht_bpm': metadata['BPM'],
        'ht_mode': metadata['mode'],
        'ht_key': metadata['key'],
        'ht_offset': ht_offset,
        'ht_time_signature': metadata['beats_in_measure']
    }
    
    return {
        'file_path': midi_path, # midi path not json path
        'metadata': metadata
    }

In [17]:
# sanity check
# hook_out = get_hooktheory_attr(song_json[1000]); hook_out

In [None]:
ht_metadata = parse_midi_dir(song_json, ht_json, meta_func=get_hooktheory_attr, key_func=get_ht_midifile)

#### Save song_list

In [22]:
ht_metadata = load_json(ht_json)
arr2csv(ht_metadata.values(), ht_csv)
df = pd.read_csv(ht_csv); df.head()

Unnamed: 0,ht_bpm,inferred_offset,quarter_length,section,artist,bpm,song_url,seconds,midi_title,time_signature,...,midi,source,ht_key,parts,instruments,ht_offset,ht_mode,ht_time_signature,inferred_key,title
0,128,0.0,32.0,chorus,wayne-sharpe,128.0,https://www.hooktheory.com/theorytab/view/wayn...,15.0,yu-gi-oh3,4/4,...,data/midi/midi_sources_v3/hooktheory/pianoroll...,hooktheory,C,"intro,chorus","Piano,Piano",0,1.0,4,C major,yu-gi-oh-theme-song
1,85,0.0,36.0,intro,wayne-sharpe,85.0,https://www.hooktheory.com/theorytab/view/wayn...,25.411765,yu-gi-oh,3/4,...,data/midi/midi_sources_v3/hooktheory/pianoroll...,hooktheory,C,"intro,chorus",Piano,0,1.0,3,C major,yu-gi-oh-theme-song
2,96,0.0,16.0,chorus,what-a-day,96.0,https://www.hooktheory.com/theorytab/view/what...,10.0,kiefer,4/4,...,data/midi/midi_sources_v3/hooktheory/pianoroll...,hooktheory,D,chorus,"Piano,Piano",-5,6.0,4,A minor,kiefer
3,152,-5.0,64.0,chorus,whiteflame,152.0,https://www.hooktheory.com/theorytab/view/whit...,25.263158,Senbonzakura,4/4,...,data/midi/midi_sources_v3/hooktheory/pianoroll...,hooktheory,D,"verse,pre-chorus,chorus","Piano,Piano",-5,6.0,4,D minor,senbonzakura
4,108,-1.0,32.0,chorus,wham,108.0,https://www.hooktheory.com/theorytab/view/wham...,17.777778,Last Christmas Chorus,4/4,...,data/midi/midi_sources_v3/hooktheory/pianoroll...,hooktheory,Db,"intro,verse,chorus","Piano,Piano",-1,1.0,4,B- minor,last-christmas


In [23]:
df.shape

(19876, 21)

## FreeMidi

In [15]:
fm_path, _, fm_csv = create_paths('freemidi')
fm_dance_path = metapath/f'freemidi_dance_metadata.json'
fm_pop_path = metapath/f'freemidi_pop_metadata.json'
list(fm_path.glob('*'))

[PosixPath('data/midi/midi_sources/freemidi/genre-disco'),
 PosixPath('data/midi/midi_sources/freemidi/genre-pop'),
 PosixPath('data/midi/midi_sources/freemidi/genre-dance-eletric'),
 PosixPath('data/midi/midi_sources/freemidi/genre-punk'),
 PosixPath('data/midi/midi_sources/freemidi/genre-hip-hop-rap'),
 PosixPath('data/midi/midi_sources/freemidi/genre-rock')]

In [19]:
def parse_freemidi_songs(fp, genres=None, source=None):
    name = fp.with_suffix('').name.split(' - ')
    artist = name[0]
    title = name[-1]
    metadata = {
        'artist': artist.strip(),
        'title': title.strip(),
        'midi': str(fp),
        'genres': genres,
        'source': source
    }
    return {
        'file_path': fp,
        'metadata': metadata
    }

In [None]:
d_parse_func = partial(parse_freemidi_songs, genres='dance', source='freemidi')
dir_path = fm_path/'genre-dance-eletric'
file_list = get_files(dir_path, extensions=['.mid'], recurse=True)
fm_dance_list = parse_midi_dir(file_list, fm_dance_path, meta_func=d_parse_func)

In [None]:
p_parse_func = partial(parse_freemidi_songs, genres='pop', source='freemidi')
dir_path = fm_path/'genre-pop'
file_list = get_files(dir_path, extensions=['.mid'], recurse=True)
fm_pop_list = parse_midi_dir(file_list, fm_pop_path, meta_func=p_parse_func)

In [22]:
fmd = load_json(fm_dance_path)
fmp = load_json(fm_pop_path)
fm_all = list(fmd.values())+list(fmp.values())
arr2csv(fm_all, fm_csv)
df = pd.read_csv(fm_csv); df.head()

Unnamed: 0,seconds,inferred_offset,midi,artist,instruments,inferred_key,genres,bpm,time_signature,source,title
0,28.732394,-5.0,data/midi/midi_sources/freemidi/genre-dance-el...,Tune Up,,F major,dance,142.0,,freemidi,Bounce
1,85.263158,2.0,data/midi/midi_sources/freemidi/genre-dance-el...,Radiohead,"Piano,Piano",G minor,dance,76.0,4/4,freemidi,Fitter Happier
2,40.0,-4.0,data/midi/midi_sources/freemidi/genre-dance-el...,Daft Punk,"StringInstrument,StringInstrument,StringInstru...",C# minor,dance,102.0,4/4,freemidi,The Grid
3,,,data/midi/midi_sources/freemidi/genre-dance-el...,Bjork,,,dance,,,freemidi,Glora
4,38.4,3.0,data/midi/midi_sources/freemidi/genre-dance-el...,Enigma,"Clarinet,Clarinet,Guitar,Guitar,Guitar,Guitar,...",A major,dance,300.0,4/4,freemidi,Wanted


### Gather Cprato

In [16]:
cp_path, cp_json, cp_csv = create_paths('cprato')
list(cp_path.glob('*'))[:5]

[PosixPath('data/midi/midi_sources/cprato/Basto - Again And Again (midi By Carlo Prato) (www.cprato.com).mid'),
 PosixPath('data/midi/midi_sources/cprato/The Weeknd ft. Lana Del Rey - Stargirl Interlude  (midi by Carlo Prato) (www.cprato.com).mid'),
 PosixPath('data/midi/midi_sources/cprato/Two Steps From Hell - Magic of Love  (midi by Carlo Prato) (www.cprato.com).mid'),
 PosixPath('data/midi/midi_sources/cprato/Bermuda Loverz - My Girl (Ladidada) (Rimini Rockaz Radio Edit) (Midi By Carlo Prato) (www.cprato.com).mid'),
 PosixPath('data/midi/midi_sources/cprato/Cascada - Everytime We Touch (Midi By Carlo Prato) (www.cprato.com).mid')]

In [24]:
def parse_cprato_songs(fp, genres=None, source=None):
    name = fp.with_suffix('').name.split(' - ')
    artist = name[0]
    title = name[-1].replace('(midi by Carlo Prato) (www.cprato.com)', '')
    metadata = {
        'artist': artist.strip(),
        'title': title.strip(),
        'midi': str(fp),
        'genres': genres,
        'source': source
    }
    return {
        'file_path': fp,
        'metadata': metadata
    }

In [25]:
cp_meta = partial(parse_cprato_songs, genres='EDM,inferred', source='cprato')
file_list = get_files(cp_path, extensions=['.mid'], recurse=True)
cp_md = parse_midi_dir(file_list, cp_json, cp_meta)

In [26]:
cp = load_json(cp_json)
arr2csv(cp.values(), cp_csv)
df = pd.read_csv(cp_csv); df.head()

Unnamed: 0,seconds,inferred_offset,midi,artist,instruments,inferred_key,genres,bpm,time_signature,source,title
0,28.531469,4,data/midi/midi_sources/cprato/Alex Gaudino - C...,Alex Gaudino,,A- major,"EDM,inferred",143.0,,cprato,Calabria (Drunken Monkey Remix) (Midi By Carlo...
1,45.801527,-1,data/midi/midi_sources/cprato/Oh Wonder - All ...,Oh Wonder,"Flute,Flute,Flute,Piano,Piano,Piano,Fretless B...",C# major,"EDM,inferred",131.0,4/4,cprato,All We Do
2,30.447761,0,data/midi/midi_sources/cprato/Martin Garrix & ...,Martin Garrix & Bebe Rexha,,A minor,"EDM,inferred",134.0,4/4,cprato,In The Name Of Love
3,27.2,1,data/midi/midi_sources/cprato/Andy SVGE - Evol...,Andy SVGE,"Piano,Piano,Piano,Piano,Piano,Piano",B major,"EDM,inferred",150.0,4/4,cprato,Evolving
4,29.142857,-4,data/midi/midi_sources/cprato/DuMonde - See Th...,DuMonde,"Piano,Piano,Piano,Piano,Piano,Piano",C# minor,"EDM,inferred",140.0,4/4,cprato,See The Light


### Gather MidiWorld

In [19]:
mw_path, mw_json, mw_csv = create_paths('midiworld')

In [20]:
def parse_midiworld_songs(fp):
    name = fp.with_suffix('').name.replace('_', ' ').split(' - ')
    artist = name[0]
    title = name[-1]
    metadata = {
        'artist': artist.strip(),
        'title': title.strip(),
        'midi': str(fp),
        'genres': 'pop,inferred',
        'source': 'midiworld'
    }
    return {
        'file_path': fp,
        'metadata': metadata
    }

In [None]:
# Warning: if you get a deadlock: 
# PosixPath('data/midi/midi_sources/midiworld/named_midi/NITRO_BRO_-_IT_WONT_DIE.mid') is broken
file_list = get_files(mw_path/'named_midi', extensions=['.mid'], recurse=True)[3000:]
mw_md = parse_midi_dir(file_list, mw_json, parse_midiworld_songs)

Midi Exeption: data/midi/midi_sources_v3/midiworld/named_midi/Gabi_Fratucello_-_.mid badly formated midi bytes, got: b'ID3\x04\x00\x00\x00\x00\x01\x00TXXX\x00\x00\x00\x12\x00\x00'
Midi Exeption: data/midi/midi_sources_v3/midiworld/named_midi/The_Offspring_-_No_Hero.mid badly formated midi bytes, got: b'RIFF\x04N\x01\x00RMIDdata\xf7M\x01\x00'
Midi Exeption: data/midi/midi_sources_v3/midiworld/named_midi/Nirvana_-_Son_Of_A_Gun.mid badly formed midi string: missing leading MTrk
Midi Exeption: data/midi/midi_sources_v3/midiworld/named_midi/Nirvana_-_Dumb.mid index out of range
Midi Exeption: data/midi/midi_sources_v3/midiworld/named_midi/Nirvana_-_Mr._Moustache.mid badly formed midi string: missing leading MTrk
Midi Exeption: data/midi/midi_sources_v3/midiworld/named_midi/Nirvana_-_Lounge_Act.mid badly formed midi string: missing leading MTrk
Midi Exeption: data/midi/midi_sources_v3/midiworld/named_midi/Third_Eye_Blind_-_Semi_Charmed_Life.mid badly formated midi bytes, got: b'RIFFV\x0c\x01

In [45]:
mw = load_json(mw_json)
arr2csv(mw.values(), mw_csv)
df = pd.read_csv(mw_csv); df.head()

Unnamed: 0,bpm,title,midi,inferred_offset,genres,artist,seconds,time_signature,inferred_key,source,instruments
0,176.0,Drumloop,data/midi/midi_sources/midiworld/named_midi/Ko...,-2.0,"pop,inferred",Kona,1.193182,4/4,B minor,midiworld,Piano
1,127.0,Millenium,data/midi/midi_sources/midiworld/named_midi/TV...,3.0,"pop,inferred",TV Themes,41.929134,4/4,A major,midiworld,"Timpani,Taiko,Violin,Contrabass,Voice"
2,160.0,Looney Tunes,data/midi/midi_sources/midiworld/named_midi/TV...,0.0,"pop,inferred",TV Themes,19.25,4/4,C major,midiworld,"Marimba,Acoustic Bass,Trombone,StringInstrumen..."
3,108.0,Diddy Kong,data/midi/midi_sources/midiworld/named_midi/Vi...,-2.0,"pop,inferred",Video Game Themes,73.333333,4/4,B minor,midiworld,"Fretless Bass,Harmonica"
4,105.0,,data/midi/midi_sources/midiworld/named_midi/Ji...,-5.0,"pop,inferred",Jigsaw,104.806742,4/4,D minor,midiworld,"StringInstrument,Horn,Trombone,StringInstrumen..."


### Gather Wikifonia

In [10]:
wf_path, wf_json, wf_csv = create_paths('wikifonia')

In [11]:
def parse_wikifonia_songs(fp):
    name = fp.with_suffix('').name.replace('_', ' ').split(' - ')
    artist = name[0]
    title = name[-1]
    metadata = {
        'artist': artist.strip(),
        'title': title.strip(),
        'mxl': str(fp),
        'genres': 'pop,inferred',
        'source': 'wikifonia'
    }
    return {
        'file_path': fp,
        'metadata': metadata
    }

In [12]:
# Warning: if you get a deadlock: 
# PosixPath('data/midi/midi_sources/midiworld/named_midi/NITRO_BRO_-_IT_WONT_DIE.mid') is broken
file_list = get_files(wf_path, extensions=['.mxl'], recurse=True)
wf_md = parse_midi_dir(file_list, wf_json, parse_wikifonia_songs)

Midi Exeption: data/midi/midi_sources_v3/wikifonia/Django Reinhardt - Douce Ambiance.mxl local variable 'lyricLanguage' referenced before assignment
Midi Exeption: data/midi/midi_sources_v3/wikifonia/Charlie Parker - Scrapple From The Apple.mxl Degree not in specified chord: 9




Midi Exeption: data/midi/midi_sources_v3/wikifonia/Lucas Secon, Wayne Hector - I Hate This Part.mxl chr() arg not in range(0x110000)
Midi Exeption: data/midi/midi_sources_v3/wikifonia/Bob Dorough, Dave Frishberg - Listen Here.mxl chr() arg not in range(0x110000)
Midi Exeption: data/midi/midi_sources_v3/wikifonia/Green, Karn, Lombardo - Coquette.mxl chr() arg not in range(0x110000)
Midi Exeption: data/midi/midi_sources_v3/wikifonia/George Gershwin, Ira Gershwin - I Got Rhythm.mxl chr() arg not in range(0x110000)
Midi Exeption: data/midi/midi_sources_v3/wikifonia/Alfred James Ellis - The Chicken.mxl chr() arg not in range(0x110000)
Midi Exeption: data/midi/midi_sources_v3/wikifonia/Bach ? - Four Part Choralforwikifoniachannel04quant.mxl local variable 'lyricLanguage' referenced before assignment
Midi Exeption: data/midi/midi_sources_v3/wikifonia/blah3 - blah.mxl failed to get likely keys for Stream component
Midi Exeption: data/midi/midi_sources_v3/wikifonia/Monty Python - Always look on

Midi Exeption: data/midi/midi_sources_v3/wikifonia/Theme - Star Trek The Next Generation theme.mxl float division by zero
Midi Exeption: data/midi/midi_sources_v3/wikifonia/May Brahe, Helen Taylor - Bless This House.mxl chr() arg not in range(0x110000)
Midi Exeption: data/midi/midi_sources_v3/wikifonia/Al Stillman, Ervin Drake, Irvin Graham, Jimmy Shirl - I Believe.mxl chr() arg not in range(0x110000)
Midi Exeption: data/midi/midi_sources_v3/wikifonia/Frank Foster, Ella Fitzgerald - Shiny Stockings.mxl chr() arg not in range(0x110000)
Midi Exeption: data/midi/midi_sources_v3/wikifonia/George Gershwin, Ira Gershwin - How Long Has This Been Going On.mxl chr() arg not in range(0x110000)
Midi Exeption: data/midi/midi_sources_v3/wikifonia/James Van Heusen, Sammy Cahn - All The Way.mxl chr() arg not in range(0x110000)
Midi Exeption: data/midi/midi_sources_v3/wikifonia/Baden Powell - A Felicidade.mxl chr() arg not in range(0x110000)
Midi Exeption: data/midi/midi_sources_v3/wikifonia/Tom Petty



Midi Exeption: data/midi/midi_sources_v3/wikifonia/Joe Harnell - The Lonely Man.mxl Degree not in specified chord: 11
Midi Exeption: data/midi/midi_sources_v3/wikifonia/Chico  Batera - Bonan?a.mxl chr() arg not in range(0x110000)
Midi Exeption: data/midi/midi_sources_v3/wikifonia/L. van Beethoven - Himno a la Alegria.mxl local variable 'lyricLanguage' referenced before assignment
Midi Exeption: data/midi/midi_sources_v3/wikifonia/Michel PLATRE - BRANLE DES PIERRES.mxl chr() arg not in range(0x110000)
Midi Exeption: data/midi/midi_sources_v3/wikifonia/ARR. by Peter Sweeney - WONDERFUL TONIGHT.mxl local variable 'lyricLanguage' referenced before assignment
Midi Exeption: data/midi/midi_sources_v3/wikifonia/Harry Warren, Al Dubin - September in the Rain.mxl chr() arg not in range(0x110000)
Midi Exeption: data/midi/midi_sources_v3/wikifonia/James Van Heusen, Johnny Burke - Here's That Rainy Day.mxl chr() arg not in range(0x110000)
Midi Exeption: data/midi/midi_sources_v3/wikifonia/Arthur J

Midi Exeption: data/midi/midi_sources_v3/wikifonia/Michel PLATRE - LES VERNES.mxl chr() arg not in range(0x110000)
Midi Exeption: data/midi/midi_sources_v3/wikifonia/Per Warming, Jens Rosenda - Du kom med alt det der var dig.mxl local variable 'lyricLanguage' referenced before assignment
Midi Exeption: data/midi/midi_sources_v3/wikifonia/Antonio Carlos Jobim, Ray Gilbert (English Lyrics) - Dindi.mxl Degree not in specified chord: 9
Midi Exeption: data/midi/midi_sources_v3/wikifonia/Ron Linnebach - Blue Waltz.mxl chr() arg not in range(0x110000)
Midi Exeption: data/midi/midi_sources_v3/wikifonia/S.Sgt Barry Sadler - The Ballad of the Green Berets.mxl float division by zero
Midi Exeption: data/midi/midi_sources_v3/wikifonia/Bud Powell - Bouncin' with Bud.mxl Degree not in specified chord: 13
Midi Exeption: data/midi/midi_sources_v3/wikifonia/Green Day - 21 guns.mxl chr() arg not in range(0x110000)
Midi Exeption: data/midi/midi_sources_v3/wikifonia/Traditional - Down By The Riverside.mxl 

In [13]:
wf = load_json(wf_json)
arr2csv(wf.values(), wf_csv)
df = pd.read_csv(wf_csv); df.head()

Unnamed: 0,midi,title,seconds,time_signature,instruments,source,mxl,genres,bpm,quarter_length,artist,inferred_key,inferred_offset
0,,Douce Ambiance,,,,wikifonia,data/midi/midi_sources_v3/wikifonia/Django Rei...,"pop,inferred",,,Django Reinhardt,,
1,,Scrapple From The Apple,,,,wikifonia,data/midi/midi_sources_v3/wikifonia/Charlie Pa...,"pop,inferred",,,Charlie Parker,,
2,data/midi/midi_sources_fromxml_v3/wikifonia/Hu...,"Hej, Varg?n? k?poszt?t f?z",,3/4,Voice,wikifonia,data/midi/midi_sources_v3/wikifonia/Hungarian ...,"pop,inferred",,24.0,Hungarian folk song,D minor,-5.0
3,data/midi/midi_sources_fromxml_v3/wikifonia/Hu...,?n az ?jjel nem aludtam egy ?r?t,,4/4,Grand Piano,wikifonia,data/midi/midi_sources_v3/wikifonia/Hungarian ...,"pop,inferred",,48.0,Hungarian folk song,C major,0.0
4,data/midi/midi_sources_fromxml_v3/wikifonia/Jo...,Mamacita,,4/4,,wikifonia,data/midi/midi_sources_v3/wikifonia/Joe Hender...,"pop,inferred",,64.0,Joe Henderson,F minor,4.0


### Yamaha - piano

In [9]:
ec_path, ec_json, ec_csv = create_paths('ecomp')
ec_song = ec_path/'song_list.json'
list(ec_path.glob('*'))[:5]

[PosixPath('data/midi/midi_sources_v3/ecomp/song_list.json'),
 PosixPath('data/midi/midi_sources_v3/ecomp/2017'),
 PosixPath('data/midi/midi_sources_v3/ecomp/2008'),
 PosixPath('data/midi/midi_sources_v3/ecomp/._2002'),
 PosixPath('data/midi/midi_sources_v3/ecomp/._2017')]

In [10]:
ec_songs = load_json(ec_path/'song_list.json')

In [11]:
def parse_ecomp_songs(fp):
    song_info = ec_songs[fp.name]
    fp = fp.with_suffix('.mxl')
    metadata = {
        'artist': song_info['artist'],
        'title': song_info['title'],
        'mxl': str(fp),
        'genres': 'classical',
        'source': 'ecomp'
    }
    return {
        'file_path': fp,
        'metadata': metadata
    }

In [12]:
file_list = get_files(ec_path, extensions=['.mid'], recurse=True)

In [13]:
sorted_files = sorted([fp.with_suffix('.mxl').stat().st_size/1000 for fp in file_list])[-200:]

In [14]:
file_list = [fp for fp in file_list if fp.with_suffix('.mxl').stat().st_size/1000 < 400] # over 200mb takes crazy long to analyze
ec_md = parse_midi_dir(file_list, ec_json, parse_ecomp_songs)

Midi Exeption: data/midi/midi_sources_v3/ecomp/2017/SunY03.mxl got a negative delta time
Midi Exeption: data/midi/midi_sources_v3/ecomp/2006/Tysman02.mxl got a negative delta time
Midi Exeption: data/midi/midi_sources_v3/ecomp/2004/BLINOV01.mxl got a negative delta time
Midi Exeption: data/midi/midi_sources_v3/ecomp/2009/Na03.mxl got a negative delta time
Midi Exeption: data/midi/midi_sources_v3/ecomp/2002/sun01.mxl got a negative delta time


In [15]:
ec = load_json(ec_json)
arr2csv(ec.values(), ec_csv)
df = pd.read_csv(ec_csv); df.head()

Unnamed: 0,source,title,mxl,seconds,time_signature,instruments,quarter_length,midi,genres,artist,inferred_key,bpm,inferred_offset
0,ecomp,II. Allegro con moto,data/midi/midi_sources_v3/ecomp/2017/SunY06.mxl,123.130435,4/4,"Piano,Piano",236.0,data/midi/midi_sources_fromxml_v3/ecomp/2017/S...,classical,,A major,115.0,3
1,ecomp,IV. Intermezzo in E Major,data/midi/midi_sources_v3/ecomp/2017/WangH11.mxl,257.727273,3/4,"Piano,Piano",378.0,data/midi/midi_sources_fromxml_v3/ecomp/2017/W...,classical,,C# minor,88.0,-4
2,ecomp,VI. Allegro (Fuga),data/midi/midi_sources_v3/ecomp/2017/TuanS10.mxl,128.4,4/4,"Piano,Piano",428.0,data/midi/midi_sources_fromxml_v3/ecomp/2017/T...,classical,,F major,200.0,-5
3,ecomp,II. Intermezzo in A Minor,data/midi/midi_sources_v3/ecomp/2017/WangH09.mxl,212.264151,3/4,"Piano,Piano",375.0,data/midi/midi_sources_fromxml_v3/ecomp/2017/W...,classical,,A minor,106.0,0
4,ecomp,II. Sehr lebhaft,data/midi/midi_sources_v3/ecomp/2017/KabuliL06...,177.6,4/4,"Piano,Piano",444.0,data/midi/midi_sources_fromxml_v3/ecomp/2017/K...,classical,,B- major,150.0,2


### Classic Piano

In [14]:
clc_path, clc_json, clc_csv = create_paths('classic_piano')
list(clc_path.glob('*'))[:5]

[PosixPath('data/midi/midi_sources_v3/classic_piano/liz_rhap15_format0.mxl'),
 PosixPath('data/midi/midi_sources_v3/classic_piano/ty_september_format0.mxl'),
 PosixPath('data/midi/midi_sources_v3/classic_piano/clementi_opus36_2_2_format0.mid'),
 PosixPath('data/midi/midi_sources_v3/classic_piano/mz_333_2_format0.mid'),
 PosixPath('data/midi/midi_sources_v3/classic_piano/haydn_7_1_format0.mid')]

In [15]:
def parse_classic_songs(fp):
    name = fp.with_suffix('').name.split('_')
    artist = name[0]
    title = ' '.join(name[1:])
    metadata = {
        'artist': artist,
        'title': title,
        'mxl': str(fp),
        'genres': 'classical',
        'source': 'classical_piano'
    }
    return {
        'file_path': fp,
        'metadata': metadata
    }

In [16]:
file_list = get_files(clc_path, extensions=['.mxl'], recurse=True)

In [17]:
file_list = [fp for fp in file_list if fp.stat().st_size/1000 < 300] # over 200mb takes crazy long to analyze

In [18]:
clc_md = parse_midi_dir(file_list, clc_json, parse_classic_songs)

In [19]:
clc = load_json(clc_json)
arr2csv(clc.values(), clc_csv)
df = pd.read_csv(clc_csv); df.head()

Unnamed: 0,midi,title,seconds,time_signature,instruments,source,mxl,genres,bpm,quarter_length,artist,inferred_key,inferred_offset
0,data/midi/midi_sources_fromxml_v3/classic_pian...,format0,29.635549,6/8,"Piano,Piano",classical_piano,data/midi/midi_sources_v3/classic_piano/chpn-p...,classical,111.0,81.0,chpn-p11,B major,1
1,data/midi/midi_sources_fromxml_v3/classic_pian...,3 format0,31.903386,2/4,"Piano,Piano",classical_piano,data/midi/midi_sources_v3/classic_piano/scn15_...,classical,115.0,64.0,scn15,E minor,5
2,data/midi/midi_sources_fromxml_v3/classic_pian...,2 format0,245.746646,6/8,"Piano,Piano",classical_piano,data/midi/midi_sources_v3/classic_piano/waldst...,classical,20.0,84.0,waldstein,C major,0
3,data/midi/midi_sources_fromxml_v3/classic_pian...,halling format0,50.626664,2/4,"Piano,Piano",classical_piano,data/midi/midi_sources_v3/classic_piano/grieg_...,classical,100.0,74.0,grieg,G minor,2
4,data/midi/midi_sources_fromxml_v3/classic_pian...,format0,88.579181,4/4,"Piano,Piano",classical_piano,data/midi/midi_sources_v3/classic_piano/chpn-p...,classical,35.0,48.0,chpn-p9,E minor,5


### Creating CSV

In [9]:
all_csvs = [create_paths(s)[-1] for s in sources]
all_dfs = [pd.read_csv(csv) for csv in all_csvs]

In [10]:
merged_df = pd.concat(all_dfs, sort=False)
merged_df = merged_df.reset_index(drop=True); merged_df

Unnamed: 0,source,title,midi_title,inferred_offset,inferred_key,ht_offset,ht_mode,artist,seconds,bpm,section,ht_key,instruments,time_signature,ht_time_signature,ht_bpm,song_url,midi,parts,genres
0,hooktheory,yu-gi-oh-theme-song,yu-gi-oh3,0.0,C major,0.0,1.0,wayne-sharpe,15.000000,128.00,chorus,C,"Piano,Piano",4/4,4.0,128.0,https://www.hooktheory.com/theorytab/view/wayn...,data/midi/midi_sources/hooktheory/pianoroll/w/...,"intro,chorus",
1,hooktheory,beverly-hills,My New Song,,,0.0,,weezer,,,intro-and-verse,C,,,4.0,128.0,https://www.hooktheory.com/theorytab/view/weez...,data/midi/midi_sources/hooktheory/pianoroll/w/...,intro-and-verse,
2,hooktheory,falling-for-you,falling for you intro,-3.0,E- major,-3.0,1.0,weezer,11.111111,108.00,intro,Eb,"Piano,Piano",4/4,4.0,108.0,https://www.hooktheory.com/theorytab/view/weez...,data/midi/midi_sources/hooktheory/pianoroll/w/...,"intro,verse,chorus,solo",
3,hooktheory,buddy-holly,Buddy Holly,4.0,A- major,4.0,1.0,weezer,43.388430,121.00,solo,Ab,"Piano,Piano",4/4,4.0,121.0,https://www.hooktheory.com/theorytab/view/weez...,data/midi/midi_sources/hooktheory/pianoroll/w/...,"verse,pre-chorus,chorus,bridge,solo","Pop,Rock"
4,hooktheory,dreams-of-grandeur,dreams of grandeur,-2.0,B minor,3.0,1.0,wavves,21.333333,180.00,verse,A,"Piano,Piano",4/4,4.0,180.0,https://www.hooktheory.com/theorytab/view/wavv...,data/midi/midi_sources/hooktheory/pianoroll/w/...,verse,
5,hooktheory,la-girlz,LA Girlz,4.0,A- major,4.0,1.0,weezer,12.203390,118.00,chorus,Ab,"Piano,Piano",3/4,3.0,118.0,https://www.hooktheory.com/theorytab/view/weez...,data/midi/midi_sources/hooktheory/pianoroll/w/...,"verse,chorus",
6,hooktheory,the-freaking-fcc,Freakin FCC Bridge,0.0,C major,0.0,1.0,walter-murphy,17.142857,126.00,bridge,C,"Piano,Piano",4/4,4.0,126.0,https://www.hooktheory.com/theorytab/view/walt...,data/midi/midi_sources/hooktheory/pianoroll/w/...,"intro-and-verse,bridge",Soundtrack
7,hooktheory,west-wing-suite,snuffy,5.0,G major,5.0,1.0,wg-snuffy-walden,39.069767,86.00,instrumental,G,"Piano,Piano",4/4,4.0,86.0,https://www.hooktheory.com/theorytab/view/wg-s...,data/midi/midi_sources/hooktheory/pianoroll/w/...,instrumental,
8,hooktheory,family-guy-theme-song,Family Guy - Theme Song,-5.0,F major,-5.0,1.0,walter-murphy,32.578125,128.00,verse,F,"Piano,Piano",4/4,4.0,128.0,https://www.hooktheory.com/theorytab/view/walt...,data/midi/midi_sources/hooktheory/pianoroll/w/...,verse,
9,hooktheory,last-christmas,Last Christmas Verse,-1.0,B- minor,-1.0,1.0,wham,35.555556,108.00,verse,Db,"Piano,Piano",4/4,4.0,108.0,https://www.hooktheory.com/theorytab/view/wham...,data/midi/midi_sources/hooktheory/pianoroll/w/...,"intro,verse,chorus",Holiday


In [11]:
[df.shape for df in all_dfs], merged_df.shape

([(19876, 20), (5797, 11), (4715, 11), (2715, 11), (314, 11), (329, 11)],
 (33746, 20))

In [12]:
merged_df.to_csv(all_csv, index=False)