### Create CSV from midi sources

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import glob
import os
from tqdm import tqdm
from IPython.display import Image, Audio
import traceback

In [3]:
import pandas as pd
from fastai.data_block import get_files

In [4]:
# parallel
from functools import partial
from pathlib import Path

In [5]:
import sys
sys.path.insert(0, '../../')

In [6]:
from src.data_sources import *
from src.midi_data import *

In [7]:
version = 'v15'
data_path = Path('data/midi')
version_path = data_path/version
orig_path = version_path/'midi_sources'
metapath = version_path/'metadata'
combined_csv = metapath/'combined.csv'
all_csv = metapath/'midi_sources.csv'
metapath.mkdir(parents=True, exist_ok=True)

In [8]:
def create_paths(dirname):
    "Standardize midi_source paths"
    dir_path = orig_path/dirname
    csv_path = metapath/f'{dirname}_metadata.csv'
    return dir_path, csv_path

In [9]:
sources = ['hooktheory', 'hooktheory_c', 'freemidi', 'midiworld', 'ecomp', 'cprato', 'classic_piano', 'classical_archives', 'musescore', 'wikifonia', 'lmd_clean', '130k_reddit']

In [10]:
version_path.relative_to(data_path)

PosixPath('v15')

In [11]:
def relative_path(filepath):
    return str(Path(filepath).relative_to(version_path))

### Remove corrupted file - this causes deadlock with music21 processing

In [12]:
corrupted_files = [
    'midiworld/named_midi/NITRO_BRO_-_IT_WONT_DIE.mid',
    'ecomp/2004/MORET02.mid',
    'ecomp/2006/Mordvinov9.MID',
    'ecomp/2006/Na06.MID',
    'ecomp/2008/Cui01.MID',
    'ecomp/2008/Cui02.MID',
    'ecomp/2008/Cui03.MID',
    'ecomp/2008/Cui04.MID',
    'ecomp/2008/Cui05.MID',
    'ecomp/2008/Cui06.MID',
    'ecomp/2008/Cui07.MID',
    'ecomp/2008/Cui08.MID',
    'ecomp/2008/Tan01.MID',
    'ecomp/2008/Tan02.MID',
    'ecomp/2008/Tan03.MID',
    'ecomp/2018/KaszoS14.MID',
    'midiworld/named_midi/Rob_Zombie_-_Demonoid_Phenomenon.mid',
    'midiworld/named_midi/Rob_Zombie_-_Demonoid_Phenomenon.mxl',
]
for f in corrupted_files:
    fp = orig_path/f
    if fp.exists(): fp.unlink()

In [13]:
import hashlib

In [14]:

def arr2csv(arr, out_file):
    "Convert metadata array to csv"
    all_keys = {k for d in arr for k in d.keys()}
    arr = [format_values(x) for x in arr]
    with open(out_file, 'w') as f:
        dict_writer = csv.DictWriter(f, list(all_keys))
        dict_writer.writeheader()
        dict_writer.writerows(arr)

In [15]:

def directory2csv(files, meta_func, csv_path):
    "Iterate through midi_source dir and map file to metadata"
    
    def get_meta(fp):
        # over 350mb takes crazy long to analyze
        size = fp.stat().st_size/1000
        if fp.suffix == 'mid' and size > 350: 
#             print('Removing mid over 350mb', fp, dixr)
            return None
        if fp.suffix == 'mxl' and size > 420: 
#             print('Removing mxl over 420mb', fp, size)
            return None
#         try:
        m = meta_func(fp)
        if m: m['md5'] = hashlib.md5(open(fp,'rb').read()).hexdigest()
        return m
#         except Exception as e:
#             print('Error:', fp, e)
#         return None
    
    mlist = [get_meta(fp) for fp in files]
    mlist = [x for x in mlist if x is not None]
    arr2csv(mlist, csv_path)
    return mlist

### Hooktheory

In [26]:
ht_cat = 'hooktheory'
ht_path, ht_csv = create_paths(ht_cat)
ht_midi_list = list((ht_path/'pianoroll').glob('*/*/*/*_key_original.mid')); 
len(ht_midi_list)

20076

In [17]:
ht_cat = 'hooktheory_c'
ht_path, ht_csv = create_paths(ht_cat)
ht_path = ht_path.with_name('hooktheory')
ht_midi_list = list((ht_path/'pianoroll').glob('*/*/*/*_key_cmajor.mid'));
len(ht_midi_list)

20076

In [27]:
ht_song_list = metapath/'hooktheory_key2info.json'

In [28]:
def song_key(s): return '_'.join(s.parts[-3:-1])

In [29]:
ht_key2info = load_json(ht_song_list)

if ht_key2info is None:
    song_info = list((ht_path/'xml').glob('*/*/*/*.json'))
    ht_key2info = {song_key(s):json.load(open(s, 'r')) for s in song_info}
    save_json(ht_key2info, ht_song_list)
len(ht_key2info)

12008

In [30]:
# ht_midi_list = list((ht_path/'pianoroll').glob('*/*/*/*_key_cmajor.mid')); 

In [31]:
def get_ht_jsonfile(midi_file): # using json instead of midi for metadata
    return str(midi_file.with_suffix('.json')).replace('pianoroll', 'event').replace('_key', '_symbol_key')

In [32]:
def get_hooktheory_attr(fp):
    song_info = ht_key2info[song_key(fp)]
    song_json = json.load(open(get_ht_jsonfile(fp), 'r'))
    metadata = song_json['metadata']
    artist = fp.parts[-3]
    title = fp.parts[-2]
    section = fp.name.split('_')[0]
    
    ht_key = metadata['key']
    ht_mode = metadata['mode']
    if ht_mode is None: ht_mode = 'major'
    ht_offset = keyc_offset(ht_key, ht_mode)
    
    # convert stream here
    return {
        'artist': artist,
        'title': title,
        'midi': relative_path(fp),
        'section': section,
        'parts': song_info['section'],
        'song_url': song_info['song_url'],
        'genres': song_info['genres'],
        'midi_title': metadata['title'],
        'source': ht_cat,
        'ht_bpm': metadata['BPM'],
        'ht_mode': metadata['mode'],
        'ht_key': metadata['key'],
        'ht_offset': ht_offset,
        'ht_time_signature': metadata['beats_in_measure']
    }

In [33]:
# sanity check
# hook_out = get_hooktheory_attr(song_json[1000]); hook_out

In [34]:
ht_metadata = directory2csv(ht_midi_list, 
                            meta_func=get_hooktheory_attr, 
                            csv_path=ht_csv)
df = pd.read_csv(ht_csv); df.head()

Unnamed: 0,ht_time_signature,ht_offset,midi,section,parts,ht_bpm,ht_mode,midi_title,artist,song_url,genres,source,ht_key,md5,title
0,4,0,midi_sources/hooktheory/pianoroll/w/wayne-shar...,chorus,"intro,chorus",128,1.0,yu-gi-oh3,wayne-sharpe,https://www.hooktheory.com/theorytab/view/wayn...,,hooktheory,C,bf1f29e5ff84e3e93e37fb873bfb590e,yu-gi-oh-theme-song
1,3,0,midi_sources/hooktheory/pianoroll/w/wayne-shar...,intro,"intro,chorus",85,1.0,yu-gi-oh,wayne-sharpe,https://www.hooktheory.com/theorytab/view/wayn...,,hooktheory,C,055f80ad67f64edb14a85ca8fbfe8c29,yu-gi-oh-theme-song
2,4,-5,midi_sources/hooktheory/pianoroll/w/what-a-day...,chorus,chorus,96,6.0,kiefer,what-a-day,https://www.hooktheory.com/theorytab/view/what...,Jazz,hooktheory,D,197f96f5d181f6ce1e2c5ab04ac1ff87,kiefer
3,4,-5,midi_sources/hooktheory/pianoroll/w/whiteflame...,pre-chorus,"verse,pre-chorus,chorus",152,6.0,senbonzakura - pre-Pre-Chorus,whiteflame,https://www.hooktheory.com/theorytab/view/whit...,"J-Pop,Pop",hooktheory,D,9e7ce13a35f1314423a9a6d5a5287a4a,senbonzakura
4,4,-5,midi_sources/hooktheory/pianoroll/w/whiteflame...,verse,"verse,pre-chorus,chorus",152,6.0,Senbonzakura,whiteflame,https://www.hooktheory.com/theorytab/view/whit...,"J-Pop,Pop",hooktheory,D,d5aaf79d0989222f1362f9f46c540a27,senbonzakura


## FreeMidi

In [35]:
fm_path, fm_csv = create_paths('freemidi')
fm_dance_path = metapath/f'freemidi_dance_metadata.json'
fm_pop_path = metapath/f'freemidi_pop_metadata.json'
list(fm_path.glob('*'))

[PosixPath('data/midi/v15/midi_sources/freemidi/genre-disco'),
 PosixPath('data/midi/v15/midi_sources/freemidi/genre-pop'),
 PosixPath('data/midi/v15/midi_sources/freemidi/genre-dance-eletric'),
 PosixPath('data/midi/v15/midi_sources/freemidi/genre-punk'),
 PosixPath('data/midi/v15/midi_sources/freemidi/genre-hip-hop-rap'),
 PosixPath('data/midi/v15/midi_sources/freemidi/genre-rock')]

In [36]:
def parse_freemidi_songs(fp, genres=None, source=None):
    name = fp.with_suffix('').name.split(' - ')
    artist = name[0]
    title = name[-1]
    return {
        'artist': artist.strip(),
        'title': title.strip(),
        'midi': relative_path(fp),
        'genres': genres,
        'source': source
    }

In [37]:
d_parse_func = partial(parse_freemidi_songs, genres='dance', source='freemidi')
dir_path = fm_path/'genre-dance-eletric'
file_list = get_files(dir_path, extensions=['.mid'], recurse=True)
fm_dance_list = directory2csv(file_list, meta_func=d_parse_func, csv_path=fm_dance_path)

In [38]:
p_parse_func = partial(parse_freemidi_songs, genres='pop', source='freemidi')
dir_path = fm_path/'genre-pop'
file_list = get_files(dir_path, extensions=['.mid'], recurse=True)
fm_pop_list = directory2csv(file_list, meta_func=p_parse_func, csv_path=fm_pop_path)

In [39]:
fm_all = fm_dance_list + fm_pop_list
arr2csv(fm_all, fm_csv)
df = pd.read_csv(fm_csv); df.head()

Unnamed: 0,midi,artist,genres,source,md5,title
0,midi_sources/freemidi/genre-dance-eletric/Veng...,Vengaboys,dance,freemidi,eb504f29b1a10567814f198e7e049d15,Up And Down
1,midi_sources/freemidi/genre-dance-eletric/ATB ...,ATB,dance,freemidi,7c461c21684baee9946019c0ed7ce102,Dont stop
2,midi_sources/freemidi/genre-dance-eletric/Mado...,Madonna,dance,freemidi,ac1e447bff339c29bccbaee3deb13b24,Dress You Up
3,midi_sources/freemidi/genre-dance-eletric/Aqua...,Aqua,dance,freemidi,d0306034dbbb4bbc31a95e3232e5fb73,Dr Jones
4,midi_sources/freemidi/genre-dance-eletric/Tune...,Tune Up,dance,freemidi,996662d57a8e3236b36285c54093697e,Bounce


### Gather Lakh Midi Dataset

In [40]:
lmd_path, lmd_csv = create_paths('lmd_clean')

In [41]:
def parse_lmd_songs(fp):
    artist = fp.parts[-2]
    title = fp.parts[-1]
    return {
        'artist': artist.strip(),
        'title': title.strip(),
        'midi': relative_path(fp),
        'genres': 'pop,inferred',
        'source': 'lmd'
    }

In [42]:
file_list = get_files(lmd_path, extensions=['.mid'], recurse=True)
lmd_md = directory2csv(file_list, meta_func=parse_lmd_songs, csv_path=lmd_csv)
df = pd.read_csv(lmd_csv); df.head()

Unnamed: 0,midi,artist,genres,source,md5,title
0,midi_sources/lmd_clean/Peter Maffay/Du.mid,Peter Maffay,"pop,inferred",lmd,6d2ac0d68f5976b161afca8ce061d376,Du.mid
1,midi_sources/lmd_clean/Peter Maffay/Josie.mid,Peter Maffay,"pop,inferred",lmd,6ccac8947814b6faa132cb5bec7a3bdf,Josie.mid
2,midi_sources/lmd_clean/Anne Murray/Snowbird.mid,Anne Murray,"pop,inferred",lmd,f5069f36a7e56475d7f706ed2d2f8517,Snowbird.mid
3,midi_sources/lmd_clean/Anne Murray/You Needed ...,Anne Murray,"pop,inferred",lmd,48419c2acdc476094487157582829781,You Needed Me.mid
4,midi_sources/lmd_clean/The Tremeloes/Silence I...,The Tremeloes,"pop,inferred",lmd,3befa396df58762e746c4288fa851f03,Silence Is Golden.mid


### Gather 130k Reddit

In [43]:
reddit_path, reddit_csv = create_paths('130k_reddit')

In [44]:
def parse_reddit_songs(fp):
    name = fp.with_suffix('').name.replace('_', ' ').split(' - ')
    if len(name) == 1:
        artist = fp.parts[-1]
        title = name[0]
    else:
        artist = name[0]
        title = name[-1]
    return {
        'artist': artist.strip(),
        'title': title.strip(),
        'midi': relative_path(fp),
        'genres': 'anything,inferred',
        'source': 'reddit'
    }

In [45]:
file_list = get_files(reddit_path, extensions=['.mid'], recurse=True)
file_list = [fp for fp in file_list if fp.stat().st_size/1000 < 400] # over 200mb takes crazy long to analyze
reddit_md = directory2csv(file_list, meta_func=parse_reddit_songs, csv_path=reddit_csv)
df = pd.read_csv(reddit_csv); df.head()

Unnamed: 0,midi,artist,genres,source,md5,title
0,midi_sources/130k_reddit/Jazz_www.thejazzpage....,phasedance.mid,"anything,inferred",reddit,c175323dbdff4b676588609081bf5606,phasedance
1,midi_sources/130k_reddit/Jazz_www.thejazzpage....,IGotRhythm.MID,"anything,inferred",reddit,912b07a01ae9b81bc0d86118e3972a47,IGotRhythm
2,midi_sources/130k_reddit/Jazz_www.thejazzpage....,Cheek_To_Cheek.mid,"anything,inferred",reddit,53136c05b1dd56a9f11367f8cdda5c2e,Cheek To Cheek
3,midi_sources/130k_reddit/Jazz_www.thejazzpage....,16goingon17.mid,"anything,inferred",reddit,31ddfcdb86c20e4e67cbaa3363c88309,16goingon17
4,midi_sources/130k_reddit/Jazz_www.thejazzpage....,poinciana.mid,"anything,inferred",reddit,d15dd01250feb42f3b17251c56e6721e,poinciana


### Gather Cprato

In [46]:
cp_path, cp_csv = create_paths('cprato')
# list(cp_path.glob('*'))[:5]

In [47]:
def parse_cprato_songs(fp, genres=None, source=None):
    name = fp.with_suffix('').name.split(' - ')
    artist = name[0]
    title = name[-1].replace('(midi by Carlo Prato) (www.cprato.com)', '')
    return {
        'artist': artist.strip(),
        'title': title.strip(),
        'midi': relative_path(fp),
        'genres': genres,
        'source': source
    }

In [48]:
cp_meta = partial(parse_cprato_songs, genres='EDM,inferred', source='cprato')
file_list = get_files(cp_path, extensions=['.mid'], recurse=True)
cp_md = directory2csv(file_list, meta_func=cp_meta, csv_path=cp_csv)
df = pd.read_csv(cp_csv); df.head()

Unnamed: 0,midi,artist,genres,source,md5,title
0,midi_sources/cprato/Basto - Again And Again (m...,Basto,"EDM,inferred",cprato,44ea7e9b46e04ba6f4836f00b3cc50a3,Again And Again (midi By Carlo Prato) (www.cpr...
1,midi_sources/cprato/The Weeknd ft. Lana Del Re...,The Weeknd ft. Lana Del Rey,"EDM,inferred",cprato,d67ead892ee2c92cfbb5306bd47c9a0f,Stargirl Interlude
2,midi_sources/cprato/Two Steps From Hell - Magi...,Two Steps From Hell,"EDM,inferred",cprato,222db08d4744ab9a53ca0d9c6c6e5113,Magic of Love
3,midi_sources/cprato/Bermuda Loverz - My Girl (...,Bermuda Loverz,"EDM,inferred",cprato,2befd21ebd0f0c779f7fb436ed828ba1,My Girl (Ladidada) (Rimini Rockaz Radio Edit) ...
4,midi_sources/cprato/Cascada - Everytime We Tou...,Cascada,"EDM,inferred",cprato,b53bfa6f4ab72df165e44263d50a4cbd,Everytime We Touch (Midi By Carlo Prato) (www....


### Gather MidiWorld

In [49]:
mw_path, mw_csv = create_paths('midiworld')

In [50]:
def parse_midiworld_songs(fp):
    name = fp.with_suffix('').name.replace('_', ' ').split(' - ')
    artist = name[0]
    title = name[-1]
    return {
        'artist': artist.strip(),
        'title': title.strip(),
        'midi': relative_path(fp),
        'genres': 'pop,inferred',
        'source': 'midiworld'
    }

In [51]:
file_list = get_files(mw_path/'named_midi', extensions=['.mid'], recurse=True)
mw_md = directory2csv(file_list, meta_func=parse_midiworld_songs, csv_path=mw_csv)
df = pd.read_csv(mw_csv); df.head()

Unnamed: 0,midi,artist,genres,source,md5,title
0,midi_sources/midiworld/named_midi/The_Carpente...,The Carpenters,"pop,inferred",midiworld,6d6e23b4f0e44537f8b5309ffeaa1880,Rainy Days and Mondays
1,midi_sources/midiworld/named_midi/Joan_Jett_-_...,Joan Jett,"pop,inferred",midiworld,19efd3ac590d3aede49d2e9e62209115,I Hate Myself for Loving You
2,midi_sources/midiworld/named_midi/George_Harri...,George Harrison,"pop,inferred",midiworld,bea4eba9aa4e8154ab01108b2b808e3c,When We Was Fab
3,midi_sources/midiworld/named_midi/Video_Game_T...,Video Game Themes,"pop,inferred",midiworld,dfbd9c523e1846767746285281d5e971,Diddy Kong
4,midi_sources/midiworld/named_midi/The_Corrs_-_...,The Corrs,"pop,inferred",midiworld,2445fa5424432de2a40ece46cbbc853c,Someday


### Gather Wikifonia

In [52]:
wf_path, wf_csv = create_paths('wikifonia')

In [53]:
def parse_wikifonia_songs(fp):
    name = fp.with_suffix('').name.replace('_', ' ').split(' - ')
    artist = name[0]
    title = name[-1]
    return {
        'artist': artist.strip(),
        'title': title.strip(),
        'mxl': relative_path(fp),
        'genres': 'pop,inferred',
        'source': 'wikifonia'
    }

In [54]:
# Warning: if you get a deadlock: 
# PosixPath('data/midi/midi_sources/midiworld/named_midi/NITRO_BRO_-_IT_WONT_DIE.mid') is broken
file_list = get_files(wf_path, extensions=['.mxl'], recurse=True)
wf_md = directory2csv(file_list, meta_func=parse_wikifonia_songs, csv_path=wf_csv)
df = pd.read_csv(wf_csv); df.head()

Unnamed: 0,artist,genres,source,md5,mxl,title
0,Tommy Dorsey,"pop,inferred",wikifonia,f3418afa104cd03604c8831123e086e4,midi_sources/wikifonia/Tommy Dorsey - Swingin'...,Swingin' on Nothin'
1,Amanda McBroom,"pop,inferred",wikifonia,a01bde25baf5c5a91b9f6235bf019890,midi_sources/wikifonia/Amanda McBroom - The Ro...,The Rose
2,"Hans Leo Hassler, From the Latin","pop,inferred",wikifonia,4beb6aafed89a87a8171da906d8b5ff8,"midi_sources/wikifonia/Hans Leo Hassler, From ...",O Sacred Head Now Wounded
3,"Arthur Siegel, June Carroll","pop,inferred",wikifonia,a739908a6505fcbe8c7d9bf7b1e47492,"midi_sources/wikifonia/Arthur Siegel, June Car...",Love Is A Simple Thing
4,Unknow,"pop,inferred",wikifonia,cb2beddd4a86439dcf2d80b5fc6b4e4f,midi_sources/wikifonia/Unknow - KINDERLIEDJES ...,KINDERLIEDJES MEDLEY


### Gather Musescore

In [55]:
ms_path, ms_csv = create_paths('musescore')
ms_songs = load_json(ms_path/'song_map.json')

In [56]:
def get_number(num_str):
    num_str = num_str.replace(',','').split(' ')[0]
    return int(num_str)

In [57]:
def parse_musescore_songs(fp):
    score_id = fp.with_suffix('').name
    if score_id not in ms_songs: return None
    
    meta = ms_songs[score_id]
    parts = get_number(meta['parts'])
    views = get_number(meta['views'])
    if parts > 2 or views < 150: return None
    
    return {
        'artist': meta['author'].strip(),
        'title': meta['title'].strip(),
        'mxl': relative_path(fp),
        'genres': 'classical,pop,inferred',
        'source': 'musescore'
    }

In [58]:
# Warning: if you get a deadlock: 
# PosixPath('data/midi/midi_sources/midiworld/named_midi/NITRO_BRO_-_IT_WONT_DIE.mid') is broken
file_list = get_files(ms_path, extensions=['.mxl'], recurse=True)
ms_list = directory2csv(file_list, meta_func=parse_musescore_songs, csv_path=ms_csv)
df = pd.read_csv(ms_csv); df.head()

Unnamed: 0,artist,genres,source,md5,mxl,title
0,000@xn--80akgejic5ahko1h.xn--p1ai,"classical,pop,inferred",musescore,36fb6c38927e5e025f7f8427770cf8fa,midi_sources/musescore/data/2985741.mxl,Упражнение 2
1,sam027,"classical,pop,inferred",musescore,b666753133bc72910840a97bddb1fbf9,midi_sources/musescore/data/1425126.mxl,Place de la République - Coeur de pirate
2,CrazyClique,"classical,pop,inferred",musescore,83de066be1fa555a986376170c467f99,midi_sources/musescore/data/5370824.mxl,Beyond The Trees - Original Composition
3,Mjmatthews51,"classical,pop,inferred",musescore,a85e407d2427234ee6bf5aa7ae5ce5e9,midi_sources/musescore/data/1195001.mxl,Sister Sadie
4,Spencer Vanderkley,"classical,pop,inferred",musescore,d1f72939369c106948993bba9b4fc0a9,midi_sources/musescore/data/4621586.mxl,My Top 20 Film Soundtracks Medley


### Yamaha - piano

In [59]:
ec_path, ec_csv = create_paths('ecomp')
ec_songs = load_json(ec_path/'song_list.json')
# list(ec_path.glob('*'))[:5]

In [60]:
def parse_ecomp_songs(fp):
    song_info = ec_songs[fp.stem]
    return {
        'artist': song_info['artist'],
        'title': song_info['title'],
        'mxl': relative_path(fp),
        'genres': 'classical',
        'source': 'ecomp'
    }

In [61]:
file_list = get_files(ec_path, extensions=['.mxl'], recurse=True)

In [62]:
ec_md = directory2csv(file_list, meta_func=parse_ecomp_songs, csv_path=ec_csv)
df = pd.read_csv(ec_csv); df.head()

Unnamed: 0,artist,genres,source,md5,mxl,title
0,Johann Sebastian Bach,classical,ecomp,39099f086fdc79c2828c13274fcd0f25,midi_sources/ecomp/2017/SirajA01.mxl,"Prelude and Fugue in E-flat Major, WTC II, ..."
1,Moritz Moszkowski,classical,ecomp,fe22a58bcff66e8b724b71c10b5465b6,midi_sources/ecomp/2017/LiC05.mxl,"Chanson Boheme de l'Opera ""Carmen"" by Georg..."
2,Nikolai Kapustin,classical,ecomp,8b38131646b8d71dc10269d3e2d608d0,midi_sources/ecomp/2017/WangY05.mxl,Concert Etude Op. 40 No. 3
3,,classical,ecomp,2637be3aec226a2fb74e86bb7a1fde81,midi_sources/ecomp/2017/SunY05.mxl,I. Con moto agitato. Andante. Con moto agitato
4,Giuseppe Scarlatti,classical,ecomp,c154b4269f8c8cecdb5b2972e3e2d831,midi_sources/ecomp/2017/ZhangE06.mxl,"Sonata in G Major, K. 455"


### Classic Piano

In [63]:
clc_path, clc_csv = create_paths('classic_piano')
# list(clc_path.glob('*'))[:5]

In [64]:
def parse_classic_songs(fp):
    name = fp.with_suffix('').name.split('_')
    artist = name[0]
    title = ' '.join(name[1:])
    return {
        'artist': artist,
        'title': title,
        'mxl': relative_path(fp),
        'genres': 'classical',
        'source': 'classical_piano'
    }

In [65]:
file_list = get_files(clc_path, extensions=['.mxl'], recurse=True)

In [66]:
clc_md = directory2csv(file_list, meta_func=parse_classic_songs, csv_path=clc_csv)
df = pd.read_csv(clc_csv); df.head()

Unnamed: 0,artist,genres,source,md5,mxl,title
0,liz,classical,classical_piano,93d730fce9a3e2fabe1a0ea3e2cf80b6,midi_sources/classic_piano/liz_rhap15_format0.mxl,rhap15 format0
1,ty,classical,classical_piano,d45bf8f2b4b0242879305a8423a3cb99,midi_sources/classic_piano/ty_september_format...,september format0
2,schumm-3,classical,classical_piano,2238d47b4162b55bafee79b16af46a37,midi_sources/classic_piano/schumm-3_format0.mxl,format0
3,chpn,classical,classical_piano,3d5c30ba63b886de6555a5d4911cf55f,midi_sources/classic_piano/chpn_op33_4_format0...,op33 4 format0
4,grieg,classical,classical_piano,51758e4fb8b37d0389148d37652b1d58,midi_sources/classic_piano/grieg_spring_format...,spring format0


### Classical Music Archives

In [67]:
cma_path, cma_csv = create_paths('classical_archives')
# list(cma_path.glob('*'))[:5]

In [68]:
def parse_cma_songs(fp):
    name = fp.with_suffix('').name
    return {
        'artist': name,
        'title': name,
        'mxl': relative_path(fp),
        'genres': 'classical',
        'source': 'classical_archives'
    }

In [69]:
file_list = get_files(cma_path, extensions=['.mxl'], recurse=True); len(file_list)

14671

In [70]:
cma_md = directory2csv(file_list, meta_func=parse_cma_songs, csv_path=cma_csv)
df = pd.read_csv(cma_csv); df.head()

Unnamed: 0,artist,genres,source,md5,mxl,title
0,jsrjeuxd,classical,classical_archives,d14444f06de8a7ad6bec95c98afa566c,midi_sources/classical_archives/021/jsrjeuxd.mxl,jsrjeuxd
1,men26,classical,classical_archives,42c8738df5fa98fa5b715d058890e376,midi_sources/classical_archives/021/men26.mxl,men26
2,szecheny,classical,classical_archives,0af7005d072bc22ea2f188569adcfa9d,midi_sources/classical_archives/021/szecheny.mxl,szecheny
3,acocored,classical,classical_archives,e8842eb5b97285673075043aaab09ca8,midi_sources/classical_archives/021/acocored.mxl,acocored
4,op73_2_3,classical,classical_archives,be5fea22e3d87257123c98090d9f4c6c,midi_sources/classical_archives/021/op73_2_3.mxl,op73_2_3


### Creating CSV

In [71]:
combined_csvs = [create_paths(s)[-1] for s in sources if s != 'hooktheory_c']
dfs = [pd.read_csv(csv) for csv in combined_csvs]

In [72]:
combined_df = pd.concat(dfs, sort=False)
combined_df = combined_df.reset_index(drop=True); combined_df.head()

Unnamed: 0,ht_time_signature,ht_offset,midi,section,parts,ht_bpm,ht_mode,midi_title,artist,song_url,genres,source,ht_key,md5,title,mxl
0,4.0,0.0,midi_sources/hooktheory/pianoroll/w/wayne-shar...,chorus,"intro,chorus",128.0,1.0,yu-gi-oh3,wayne-sharpe,https://www.hooktheory.com/theorytab/view/wayn...,,hooktheory,C,bf1f29e5ff84e3e93e37fb873bfb590e,yu-gi-oh-theme-song,
1,3.0,0.0,midi_sources/hooktheory/pianoroll/w/wayne-shar...,intro,"intro,chorus",85.0,1.0,yu-gi-oh,wayne-sharpe,https://www.hooktheory.com/theorytab/view/wayn...,,hooktheory,C,055f80ad67f64edb14a85ca8fbfe8c29,yu-gi-oh-theme-song,
2,4.0,-5.0,midi_sources/hooktheory/pianoroll/w/what-a-day...,chorus,chorus,96.0,6.0,kiefer,what-a-day,https://www.hooktheory.com/theorytab/view/what...,Jazz,hooktheory,D,197f96f5d181f6ce1e2c5ab04ac1ff87,kiefer,
3,4.0,-5.0,midi_sources/hooktheory/pianoroll/w/whiteflame...,pre-chorus,"verse,pre-chorus,chorus",152.0,6.0,senbonzakura - pre-Pre-Chorus,whiteflame,https://www.hooktheory.com/theorytab/view/whit...,"J-Pop,Pop",hooktheory,D,9e7ce13a35f1314423a9a6d5a5287a4a,senbonzakura,
4,4.0,-5.0,midi_sources/hooktheory/pianoroll/w/whiteflame...,verse,"verse,pre-chorus,chorus",152.0,6.0,Senbonzakura,whiteflame,https://www.hooktheory.com/theorytab/view/whit...,"J-Pop,Pop",hooktheory,D,d5aaf79d0989222f1362f9f46c540a27,senbonzakura,


In [73]:
deduped_df = combined_df.drop_duplicates(subset=['md5'], keep='first') # 

Midiworld - 90% duplicates with rest  
Freemidi - 50% duplicates with rest  
LMD - 70% duplicates with rest

In [74]:
# from collections import Counter
# # No dedups
# print(Counter(merged_df.source.values))
# # replacing reddit
# print(Counter(merged_df.drop_duplicates(subset=['md5'], keep='first').source.values))
# # reddit replace else
# print(Counter(merged_df.drop_duplicates(subset=['md5'], keep='last').source.values))
# # Midiworld - 90% duplicates with rest, 
# # Freemidi - 50% duplicates with rest,
# # LMD - 70% duplicates with rest

In [75]:
from collections import Counter
Counter(deduped_df.source.values)

Counter({'hooktheory': 19882,
         'freemidi': 5168,
         'midiworld': 4109,
         'ecomp': 2735,
         'cprato': 312,
         'classical_piano': 329,
         'classical_archives': 14671,
         'musescore': 11502,
         'wikifonia': 6391,
         'lmd': 13568,
         'reddit': 98683})

In [76]:
# combined df does not contain hooktheory_c files
[df.shape for df in dfs], combined_df.shape, deduped_df.shape

([(20076, 15),
  (5784, 6),
  (4711, 6),
  (2735, 6),
  (314, 6),
  (329, 6),
  (14671, 6),
  (11504, 6),
  (6391, 6),
  (17243, 6),
  (128419, 6)],
 (212177, 16),
 (177350, 16))

In [82]:
hooktheory_c_csv = pd.read_csv(create_paths('hooktheory_c')[-1])
out_df = pd.concat([deduped_df, hooktheory_c_csv], sort=False); out_df.shape

(197426, 16)

In [78]:
out_df.to_csv(combined_csv, index=False)

### Convert MXL to Midi

Makes it easier for us to process in part 2

In [83]:
from src.data_sources import process_all

In [84]:
df = pd.read_csv(combined_csv); df.head()
all_records = df.to_dict(orient='records'); len(all_records)

197426

In [88]:
def mxl2midi_func(metadata):
    result = metadata.copy()
    if not isinstance(result.get('mxl'), str): return result

    input_path = version_path/metadata['mxl']
    out_file = Path(str(metadata['mxl']).replace('midi_sources/', 'midi_sources/from_mxl/')).with_suffix('.mid')
    output_path = version_path/out_file
    
    if not output_path.exists():
        try:
            output_path.parent.mkdir(parents=True, exist_ok=True)
            stream = file2stream(input_path)
            stream.write('midi', fp=output_path)
            print('Encoded:', output_path)
        except Exception:
#             print(traceback.format_exc())
            return result
        
    result['midi'] = out_file
    return result

In [89]:
def timeout_func(data, seconds):
    print("Timeout:", seconds, data.get('mxl'))

In [90]:
processed = process_all(mxl2midi_func, all_records, timeout=600, timeout_func=timeout_func)

Encoded: data/midi/v15/midi_sources/from_mxl/ecomp/2006/Schneider03.mid
Encoded: data/midi/v15/midi_sources/from_mxl/ecomp/2017/LiuY04.mid
Encoded: data/midi/v15/midi_sources/from_mxl/ecomp/2008/Abdelmola03.mid
Encoded: data/midi/v15/midi_sources/from_mxl/ecomp/2017/WangY04.mid
Encoded: data/midi/v15/midi_sources/from_mxl/ecomp/2006/Zusko06.mid
Encoded: data/midi/v15/midi_sources/from_mxl/ecomp/2017/ZhangE08.mid
Encoded: data/midi/v15/midi_sources/from_mxl/ecomp/2017/YuP04.mid
Encoded: data/midi/v15/midi_sources/from_mxl/ecomp/2006/Bach03.mid
Encoded: data/midi/v15/midi_sources/from_mxl/ecomp/2006/Namirovsky05.mid
Encoded: data/midi/v15/midi_sources/from_mxl/ecomp/2006/Lee_E03.mid
Encoded: data/midi/v15/midi_sources/from_mxl/ecomp/2008/Broberg03.mid
Encoded: data/midi/v15/midi_sources/from_mxl/ecomp/2006/Yanagitani03.mid
Encoded: data/midi/v15/midi_sources/from_mxl/ecomp/2006/Avdeeva05.mid
Encoded: data/midi/v15/midi_sources/from_mxl/ecomp/2017/LiuC04.mid
Encoded: data/midi/v15/midi_so

Timeout: 600 midi_sources/ecomp/2006/Dulu07.mxl
Timeout: 600 midi_sources/ecomp/2006/Shybayeva07.mxl
Timeout: 600 midi_sources/ecomp/2006/Shamray10.mxl
Encoded: data/midi/v15/midi_sources/from_mxl/ecomp/2004/KARYAG05.mid
Timeout: 600 midi_sources/ecomp/2006/Arciglione05.mxl
Timeout: 600 midi_sources/ecomp/2006/Huangci08.mxl
Timeout: 600 midi_sources/ecomp/2006/Yanagitani08.mxl
Encoded: data/midi/v15/midi_sources/from_mxl/ecomp/2009/Tak07.mid
Encoded: data/midi/v15/midi_sources/from_mxl/ecomp/2004/BENABD05.mid
Timeout: 600 midi_sources/ecomp/2006/DeTurck11.mxl
Encoded: data/midi/v15/midi_sources/from_mxl/ecomp/2004/ADIG03.mid
Timeout: 600 midi_sources/ecomp/2004/EVSTIO04.mxl
Timeout: 600 midi_sources/ecomp/2004/USHIKI06.mxl
Timeout: 600 midi_sources/ecomp/2004/POTAMO03.mxl
Encoded: data/midi/v15/midi_sources/from_mxl/ecomp/2004/LEE_K05.mid
Encoded: data/midi/v15/midi_sources/from_mxl/ecomp/2004/KOLESS03.mid
Timeout: 600 midi_sources/ecomp/2004/KRASNI15.mxl
Timeout: 600 midi_sources/ecom

Timeout: 600 midi_sources/ecomp/2014/MaximovI02.mxl
Encoded: data/midi/v15/midi_sources/from_mxl/ecomp/2014/ParkS11.mid
Encoded: data/midi/v15/midi_sources/from_mxl/ecomp/2014/HuangSW07.mid
Encoded: data/midi/v15/midi_sources/from_mxl/ecomp/2015/MunA09.mid
Timeout: 600 midi_sources/ecomp/2014/WangY11.mxl
Timeout: 600 midi_sources/ecomp/2014/KharselM12.mxl
Timeout: 600 midi_sources/ecomp/2014/LeungM11.mxl
Encoded: data/midi/v15/midi_sources/from_mxl/ecomp/2014/LiuY11.mid
Encoded: data/midi/v15/midi_sources/from_mxl/ecomp/2014/ChernovA20.mid
Encoded: data/midi/v15/midi_sources/from_mxl/ecomp/2015/ChenW04.mid
Timeout: 600 midi_sources/ecomp/2014/PrjevalskayaM23.mxl
Encoded: data/midi/v15/midi_sources/from_mxl/ecomp/2014/KimHJ06.mid
Encoded: data/midi/v15/midi_sources/from_mxl/ecomp/2014/KimHJ01.mid
Timeout: 600 midi_sources/ecomp/2014/DupreeF14.mxl
Timeout: 600 midi_sources/ecomp/2014/ShevchenkoO04.mxl
Timeout: 600 midi_sources/ecomp/2014/GonzalezJ09.mxl
Timeout: 600 midi_sources/ecomp/20

Timeout: 600 midi_sources/ecomp/2002/dvorkine03.mxl
Encoded: data/midi/v15/midi_sources/from_mxl/ecomp/2018/KotysV01.mid
Encoded: data/midi/v15/midi_sources/from_mxl/ecomp/2018/LeeS01.mid
Timeout: 600 midi_sources/ecomp/2002/huang01.mxl
Timeout: 600 midi_sources/ecomp/2002/yamaguchi01.mxl
Timeout: 600 midi_sources/ecomp/2002/hireche03.mxl
Encoded: data/midi/v15/midi_sources/from_mxl/ecomp/2018/ChowK06.mid
Timeout: 600 midi_sources/ecomp/2002/hireche04.mxl
Timeout: 600 midi_sources/ecomp/2002/soloman02.mxl
Encoded: data/midi/v15/midi_sources/from_mxl/ecomp/2018/MaslovV01.mid
Encoded: data/midi/v15/midi_sources/from_mxl/classical_archives/021/gp_bal4.mid
Timeout: 600 midi_sources/ecomp/2002/sun04.mxl
Encoded: data/midi/v15/midi_sources/from_mxl/ecomp/2018/HouY05.mid
Timeout: 600 midi_sources/ecomp/2002/dossin02.mxl
Encoded: data/midi/v15/midi_sources/from_mxl/ecomp/2018/MustakimovT17.mid
Encoded: data/midi/v15/midi_sources/from_mxl/ecomp/2002/mamriev03.mid
Encoded: data/midi/v15/midi_sou

Encoded: data/midi/v15/midi_sources/from_mxl/classical_archives/3/caprixo.mid
Encoded: data/midi/v15/midi_sources/from_mxl/classical_archives/0/anspgpol.mid
Encoded: data/midi/v15/midi_sources/from_mxl/classical_archives/011/lvbsym92.mid
Encoded: data/midi/v15/midi_sources/from_mxl/classical_archives/023/bach_in.mid
Encoded: data/midi/v15/midi_sources/from_mxl/classical_archives/021/rachson.mid
Encoded: data/midi/v15/midi_sources/from_mxl/classical_archives/3/brhms73a.mid
Encoded: data/midi/v15/midi_sources/from_mxl/classical_archives/014/brck8c.mid
Encoded: data/midi/v15/midi_sources/from_mxl/classical_archives/7/mthm26b.mid
Timeout: 600 midi_sources/classical_archives/021/rubsonat.mxl
Timeout: 600 midi_sources/classical_archives/021/aida_20.mxl
Encoded: data/midi/v15/midi_sources/from_mxl/classical_archives/6/schsin3.mid




Encoded: data/midi/v15/midi_sources/from_mxl/classical_archives/5/mazeppa.mid
Encoded: data/midi/v15/midi_sources/from_mxl/classical_archives/011/chabespa.mid
Encoded: data/midi/v15/midi_sources/from_mxl/classical_archives/7/norma.mid
Encoded: data/midi/v15/midi_sources/from_mxl/classical_archives/5/polliszt.mid
Encoded: data/midi/v15/midi_sources/from_mxl/classical_archives/031/tmntbare.mid
Encoded: data/midi/v15/midi_sources/from_mxl/classical_archives/3/frsymph1.mid
Encoded: data/midi/v15/midi_sources/from_mxl/classical_archives/023/bach_inv.mid
Encoded: data/midi/v15/midi_sources/from_mxl/classical_archives/4/sch1-1.mid
Encoded: data/midi/v15/midi_sources/from_mxl/classical_archives/024/beetho-1.mid
Encoded: data/midi/v15/midi_sources/from_mxl/classical_archives/023/faustact.mid
Encoded: data/midi/v15/midi_sources/from_mxl/classical_archives/3/beet9mv4.mid
Encoded: data/midi/v15/midi_sources/from_mxl/classical_archives/1/b9-1-maa.mid
Encoded: data/midi/v15/midi_sources/from_mxl/cla



Encoded: data/midi/v15/midi_sources/from_mxl/musescore/data/5297185.mid
Timeout: 600 midi_sources/classical_archives/4/rach3-23.mxl
Timeout: 600 midi_sources/classical_archives/4/carnaval.mxl




Encoded: data/midi/v15/midi_sources/from_mxl/classical_archives/3/besym6-3.mid




Encoded: data/midi/v15/midi_sources/from_mxl/classical_archives/1/blanik.mid
Encoded: data/midi/v15/midi_sources/from_mxl/classical_archives/1/ch-et-10.mid
Timeout: 600 midi_sources/classical_archives/023/tris1act.mxl
Encoded: data/midi/v15/midi_sources/from_mxl/classical_archives/013/alassio3.mid
Encoded: data/midi/v15/midi_sources/from_mxl/classical_archives/9/be-pv20r.mid
Encoded: data/midi/v15/midi_sources/from_mxl/classical_archives/014/jsrap4gm.mid
Encoded: data/midi/v15/midi_sources/from_mxl/classical_archives/2/tch-4-1.mid
Encoded: data/midi/v15/midi_sources/from_mxl/classical_archives/2/duksrcap.mid
Encoded: data/midi/v15/midi_sources/from_mxl/classical_archives/3/bsfanctr.mid
Encoded: data/midi/v15/midi_sources/from_mxl/classical_archives/9/beop53n1.mid
Encoded: data/midi/v15/midi_sources/from_mxl/classical_archives/1/gmb5m4.mid
Encoded: data/midi/v15/midi_sources/from_mxl/classical_archives/9/kv334.mid
Timeout: 600 midi_sources/classical_archives/023/act_5.mxl
Encoded: data/

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceed

In [37]:
# converted = [(p['midi'], p['source']) for p in processed if isinstance(p.get('mxl'), str)]

# converted

In [91]:
from src.data_sources import arr2csv
arr2csv(processed, all_csv)
df = pd.read_csv(all_csv); df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,ht_time_signature,ht_offset,midi,section,parts,ht_bpm,title,midi_title,artist,song_url,genres,source,ht_key,md5,mxl,ht_mode
0,4.0,0.0,midi_sources/hooktheory/pianoroll/w/wayne-shar...,chorus,"intro,chorus",128.0,yu-gi-oh-theme-song,yu-gi-oh3,wayne-sharpe,https://www.hooktheory.com/theorytab/view/wayn...,,hooktheory,C,bf1f29e5ff84e3e93e37fb873bfb590e,,1.0
1,3.0,0.0,midi_sources/hooktheory/pianoroll/w/wayne-shar...,intro,"intro,chorus",85.0,yu-gi-oh-theme-song,yu-gi-oh,wayne-sharpe,https://www.hooktheory.com/theorytab/view/wayn...,,hooktheory,C,055f80ad67f64edb14a85ca8fbfe8c29,,1.0
2,4.0,-5.0,midi_sources/hooktheory/pianoroll/w/what-a-day...,chorus,chorus,96.0,kiefer,kiefer,what-a-day,https://www.hooktheory.com/theorytab/view/what...,Jazz,hooktheory,D,197f96f5d181f6ce1e2c5ab04ac1ff87,,6.0
3,4.0,-5.0,midi_sources/hooktheory/pianoroll/w/whiteflame...,pre-chorus,"verse,pre-chorus,chorus",152.0,senbonzakura,senbonzakura - pre-Pre-Chorus,whiteflame,https://www.hooktheory.com/theorytab/view/whit...,"J-Pop,Pop",hooktheory,D,9e7ce13a35f1314423a9a6d5a5287a4a,,6.0
4,4.0,-5.0,midi_sources/hooktheory/pianoroll/w/whiteflame...,verse,"verse,pre-chorus,chorus",152.0,senbonzakura,Senbonzakura,whiteflame,https://www.hooktheory.com/theorytab/view/whit...,"J-Pop,Pop",hooktheory,D,d5aaf79d0989222f1362f9f46c540a27,,6.0


In [92]:
df.shape

(197182, 16)