### Create CSV from midi sources

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import glob
import os
from tqdm import tqdm
from IPython.display import Image, Audio
import traceback

In [3]:
# parallel
from functools import partial
from pathlib import Path

In [4]:
import sys
sys.path.insert(0, '../../')

In [5]:
from src.data_sources import *
from src.midi_data import *

In [6]:
version = 'v10'
data_path = Path('data/midi')
version_path = data_path/version
orig_path = version_path/'midi_sources'
metapath = version_path/'metadata'
all_csv = metapath/'midi_sources.csv'
metapath.mkdir(parents=True, exist_ok=True)

In [7]:
def create_paths(dirname):
    "Standardize midi_source paths"
    dir_path = orig_path/dirname
    csv_path = metapath/f'{dirname}_metadata.csv'
    return dir_path, csv_path

In [8]:
sources = ['hooktheory', 'hooktheory_c', 'freemidi', 'midiworld', 'ecomp', 'cprato', 'classic_piano', 'musescore', 'wikifonia', 'lmd_clean', '130k_reddit']

In [9]:
version_path.relative_to(data_path)

PosixPath('v10')

In [10]:
def relative_path(filepath):
    return str(Path(filepath).relative_to(version_path))

### Remove corrupted file - this causes deadlock with music21 processing

In [11]:
corrupted_files = [
    'midiworld/named_midi/NITRO_BRO_-_IT_WONT_DIE.mid',
    'ecomp/2004/MORET02.mid',
    'ecomp/2006/Mordvinov9.MID',
    'ecomp/2006/Na06.MID',
    'ecomp/2008/Cui01.MID',
    'ecomp/2008/Cui02.MID',
    'ecomp/2008/Cui03.MID',
    'ecomp/2008/Cui04.MID',
    'ecomp/2008/Cui05.MID',
    'ecomp/2008/Cui06.MID',
    'ecomp/2008/Cui07.MID',
    'ecomp/2008/Cui08.MID',
    'ecomp/2008/Tan01.MID',
    'ecomp/2008/Tan02.MID',
    'ecomp/2008/Tan03.MID',
    'ecomp/2018/KaszoS14.MID',
    'midiworld/named_midi/Rob_Zombie_-_Demonoid_Phenomenon.mid',
    'midiworld/named_midi/Rob_Zombie_-_Demonoid_Phenomenon.mxl',
]
for f in corrupted_files:
    fp = orig_path/f
    if fp.exists(): fp.unlink()

In [12]:
import hashlib

In [13]:

def arr2csv(arr, out_file):
    "Convert metadata array to csv"
    all_keys = {k for d in arr for k in d.keys()}
    arr = [format_values(x) for x in arr]
    with open(out_file, 'w') as f:
        dict_writer = csv.DictWriter(f, list(all_keys))
        dict_writer.writeheader()
        dict_writer.writerows(arr)

In [14]:

def directory2csv(files, meta_func, csv_path):
    "Iterate through midi_source dir and map file to metadata"
    
    def get_meta(fp):
        # over 350mb takes crazy long to analyze
        size = fp.stat().st_size/1000
        if fp.suffix == 'mid' and size > 350: 
#             print('Removing mid over 350mb', fp, dixr)
            return None
        if fp.suffix == 'mxl' and size > 420: 
#             print('Removing mxl over 420mb', fp, size)
            return None
#         try:
        m = meta_func(fp)
        if m: m['md5'] = hashlib.md5(open(fp,'rb').read()).hexdigest()
        return m
#         except Exception as e:
#             print('Error:', fp, e)
#         return None
    
    mlist = [get_meta(fp) for fp in files]
    mlist = [x for x in mlist if x is not None]
    arr2csv(mlist, csv_path)
    return mlist

### Hooktheory

In [15]:
ht_cat = 'hooktheory'
ht_path, ht_csv = create_paths(ht_cat)
ht_midi_list = list((ht_path/'pianoroll').glob('*/*/*/*_key_original.mid')); 
len(ht_midi_list)

20076

In [25]:
ht_cat = 'hooktheory_c'
ht_path, ht_csv = create_paths(ht_cat)
ht_path = ht_path.with_name('hooktheory')
ht_midi_list = list((ht_path/'pianoroll').glob('*/*/*/*_key_cmajor.mid'));
len(ht_midi_list)

20076

In [26]:
ht_song_list = metapath/'hooktheory_key2info.json'

In [27]:
def song_key(s): return '_'.join(s.parts[-3:-1])

In [28]:
ht_key2info = load_json(ht_song_list)

if ht_key2info is None:
    song_info = list((ht_path/'xml').glob('*/*/*/*.json'))
    ht_key2info = {song_key(s):json.load(open(s, 'r')) for s in song_info}
    save_json(ht_key2info, ht_song_list)
len(ht_key2info)

12008

In [29]:
# ht_midi_list = list((ht_path/'pianoroll').glob('*/*/*/*_key_cmajor.mid')); 

In [30]:
def get_ht_jsonfile(midi_file): # using json instead of midi for metadata
    return str(midi_file.with_suffix('.json')).replace('pianoroll', 'event').replace('_key', '_symbol_key')

In [31]:
def get_hooktheory_attr(fp):
    song_info = ht_key2info[song_key(fp)]
    song_json = json.load(open(get_ht_jsonfile(fp), 'r'))
    metadata = song_json['metadata']
    artist = fp.parts[-3]
    title = fp.parts[-2]
    section = fp.name.split('_')[0]
    
    ht_key = metadata['key']
    ht_mode = metadata['mode']
    if ht_mode is None: ht_mode = 'major'
    ht_offset = keyc_offset(ht_key, ht_mode)
    
    # convert stream here
    return {
        'artist': artist,
        'title': title,
        'midi': relative_path(fp),
        'section': section,
        'parts': song_info['section'],
        'song_url': song_info['song_url'],
        'genres': song_info['genres'],
        'midi_title': metadata['title'],
        'source': ht_cat,
        'ht_bpm': metadata['BPM'],
        'ht_mode': metadata['mode'],
        'ht_key': metadata['key'],
        'ht_offset': ht_offset,
        'ht_time_signature': metadata['beats_in_measure']
    }

In [32]:
# sanity check
# hook_out = get_hooktheory_attr(song_json[1000]); hook_out

In [33]:
ht_metadata = directory2csv(ht_midi_list, 
                            meta_func=get_hooktheory_attr, 
                            csv_path=ht_csv)
df = pd.read_csv(ht_csv); df.head()

Unnamed: 0,section,genres,md5,artist,parts,ht_time_signature,ht_bpm,midi,ht_mode,title,ht_offset,song_url,midi_title,ht_key,source
0,chorus,,bf1f29e5ff84e3e93e37fb873bfb590e,wayne-sharpe,"intro,chorus",4,128,midi_sources/hooktheory/pianoroll/w/wayne-shar...,1,yu-gi-oh-theme-song,0,https://www.hooktheory.com/theorytab/view/wayn...,yu-gi-oh3,C,hooktheory_c
1,intro,,055f80ad67f64edb14a85ca8fbfe8c29,wayne-sharpe,"intro,chorus",3,85,midi_sources/hooktheory/pianoroll/w/wayne-shar...,1,yu-gi-oh-theme-song,0,https://www.hooktheory.com/theorytab/view/wayn...,yu-gi-oh,C,hooktheory_c
2,chorus,Jazz,e7f70964a2538187833d4c43677e17c0,what-a-day,chorus,4,96,midi_sources/hooktheory/pianoroll/w/what-a-day...,1,kiefer,0,https://www.hooktheory.com/theorytab/view/what...,kiefer,C,hooktheory_c
3,chorus,"J-Pop,Pop",2e37814fb0f75420b22303feff538a9f,whiteflame,"verse,pre-chorus,chorus",4,152,midi_sources/hooktheory/pianoroll/w/whiteflame...,1,senbonzakura,0,https://www.hooktheory.com/theorytab/view/whit...,Senbonzakura,C,hooktheory_c
4,verse,"J-Pop,Pop",b95497f37fa462cf885b9afeb635ef0e,whiteflame,"verse,pre-chorus,chorus",4,152,midi_sources/hooktheory/pianoroll/w/whiteflame...,1,senbonzakura,0,https://www.hooktheory.com/theorytab/view/whit...,Senbonzakura,C,hooktheory_c


#### Save song_list

## FreeMidi

In [23]:
fm_path, fm_csv = create_paths('freemidi')
fm_dance_path = metapath/f'freemidi_dance_metadata.json'
fm_pop_path = metapath/f'freemidi_pop_metadata.json'
list(fm_path.glob('*'))

[PosixPath('data/midi/v10/midi_sources/freemidi/genre-disco'),
 PosixPath('data/midi/v10/midi_sources/freemidi/genre-pop'),
 PosixPath('data/midi/v10/midi_sources/freemidi/genre-dance-eletric'),
 PosixPath('data/midi/v10/midi_sources/freemidi/genre-punk'),
 PosixPath('data/midi/v10/midi_sources/freemidi/genre-hip-hop-rap'),
 PosixPath('data/midi/v10/midi_sources/freemidi/genre-rock')]

In [24]:
def parse_freemidi_songs(fp, genres=None, source=None):
    name = fp.with_suffix('').name.split(' - ')
    artist = name[0]
    title = name[-1]
    return {
        'artist': artist.strip(),
        'title': title.strip(),
        'midi': relative_path(fp),
        'genres': genres,
        'source': source
    }

In [25]:
d_parse_func = partial(parse_freemidi_songs, genres='dance', source='freemidi')
dir_path = fm_path/'genre-dance-eletric'
file_list = get_files(dir_path, extensions=['.mid'], recurse=True)
fm_dance_list = directory2csv(file_list, meta_func=d_parse_func, csv_path=fm_dance_path)

In [26]:
p_parse_func = partial(parse_freemidi_songs, genres='pop', source='freemidi')
dir_path = fm_path/'genre-pop'
file_list = get_files(dir_path, extensions=['.mid'], recurse=True)
fm_pop_list = directory2csv(file_list, meta_func=p_parse_func, csv_path=fm_pop_path)

In [27]:
fm_all = fm_dance_list + fm_pop_list
arr2csv(fm_all, fm_csv)
df = pd.read_csv(fm_csv); df.head()

Unnamed: 0,midi,genres,artist,md5,source,title
0,midi_sources/freemidi/genre-dance-eletric/Veng...,dance,Vengaboys,eb504f29b1a10567814f198e7e049d15,freemidi,Up And Down
1,midi_sources/freemidi/genre-dance-eletric/ATB ...,dance,ATB,7c461c21684baee9946019c0ed7ce102,freemidi,Dont stop
2,midi_sources/freemidi/genre-dance-eletric/Mado...,dance,Madonna,ac1e447bff339c29bccbaee3deb13b24,freemidi,Dress You Up
3,midi_sources/freemidi/genre-dance-eletric/Aqua...,dance,Aqua,d0306034dbbb4bbc31a95e3232e5fb73,freemidi,Dr Jones
4,midi_sources/freemidi/genre-dance-eletric/Tune...,dance,Tune Up,996662d57a8e3236b36285c54093697e,freemidi,Bounce


### Gather Lakh Midi Dataset

In [28]:
lmd_path, lmd_csv = create_paths('lmd_clean')

In [29]:
def parse_lmd_songs(fp):
    artist = fp.parts[-2]
    title = fp.parts[-1]
    return {
        'artist': artist.strip(),
        'title': title.strip(),
        'midi': relative_path(fp),
        'genres': 'pop,inferred',
        'source': 'lmd'
    }

In [30]:
file_list = get_files(lmd_path, extensions=['.mid'], recurse=True)
lmd_md = directory2csv(file_list, meta_func=parse_lmd_songs, csv_path=lmd_csv)
df = pd.read_csv(lmd_csv); df.head()

Unnamed: 0,midi,genres,artist,md5,source,title
0,midi_sources/lmd_clean/Peter Maffay/Du.mid,"pop,inferred",Peter Maffay,6d2ac0d68f5976b161afca8ce061d376,lmd,Du.mid
1,midi_sources/lmd_clean/Peter Maffay/Josie.mid,"pop,inferred",Peter Maffay,6ccac8947814b6faa132cb5bec7a3bdf,lmd,Josie.mid
2,midi_sources/lmd_clean/Anne Murray/Snowbird.mid,"pop,inferred",Anne Murray,f5069f36a7e56475d7f706ed2d2f8517,lmd,Snowbird.mid
3,midi_sources/lmd_clean/Anne Murray/You Needed ...,"pop,inferred",Anne Murray,48419c2acdc476094487157582829781,lmd,You Needed Me.mid
4,midi_sources/lmd_clean/The Tremeloes/Silence I...,"pop,inferred",The Tremeloes,3befa396df58762e746c4288fa851f03,lmd,Silence Is Golden.mid


### Gather 130k Reddit

In [31]:
reddit_path, reddit_csv = create_paths('130k_reddit')

In [32]:
def parse_reddit_songs(fp):
    name = fp.with_suffix('').name.replace('_', ' ').split(' - ')
    if len(name) == 1:
        artist = fp.parts[-1]
        title = name[0]
    else:
        artist = name[0]
        title = name[-1]
    return {
        'artist': artist.strip(),
        'title': title.strip(),
        'midi': relative_path(fp),
        'genres': 'anything,inferred',
        'source': 'reddit'
    }

In [33]:
file_list = get_files(reddit_path, extensions=['.mid'], recurse=True)
file_list = [fp for fp in file_list if fp.stat().st_size/1000 < 400] # over 200mb takes crazy long to analyze
reddit_md = directory2csv(file_list, meta_func=parse_reddit_songs, csv_path=reddit_csv)
df = pd.read_csv(reddit_csv); df.head()

Unnamed: 0,midi,genres,artist,md5,source,title
0,midi_sources/130k_reddit/Jazz_www.thejazzpage....,"anything,inferred",phasedance.mid,c175323dbdff4b676588609081bf5606,reddit,phasedance
1,midi_sources/130k_reddit/Jazz_www.thejazzpage....,"anything,inferred",IGotRhythm.MID,912b07a01ae9b81bc0d86118e3972a47,reddit,IGotRhythm
2,midi_sources/130k_reddit/Jazz_www.thejazzpage....,"anything,inferred",Cheek_To_Cheek.mid,53136c05b1dd56a9f11367f8cdda5c2e,reddit,Cheek To Cheek
3,midi_sources/130k_reddit/Jazz_www.thejazzpage....,"anything,inferred",16goingon17.mid,31ddfcdb86c20e4e67cbaa3363c88309,reddit,16goingon17
4,midi_sources/130k_reddit/Jazz_www.thejazzpage....,"anything,inferred",poinciana.mid,d15dd01250feb42f3b17251c56e6721e,reddit,poinciana


### Gather Cprato

In [34]:
cp_path, cp_csv = create_paths('cprato')
# list(cp_path.glob('*'))[:5]

In [35]:
def parse_cprato_songs(fp, genres=None, source=None):
    name = fp.with_suffix('').name.split(' - ')
    artist = name[0]
    title = name[-1].replace('(midi by Carlo Prato) (www.cprato.com)', '')
    return {
        'artist': artist.strip(),
        'title': title.strip(),
        'midi': relative_path(fp),
        'genres': genres,
        'source': source
    }

In [36]:
cp_meta = partial(parse_cprato_songs, genres='EDM,inferred', source='cprato')
file_list = get_files(cp_path, extensions=['.mid'], recurse=True)
cp_md = directory2csv(file_list, meta_func=cp_meta, csv_path=cp_csv)
df = pd.read_csv(cp_csv); df.head()

Unnamed: 0,midi,genres,artist,md5,source,title
0,midi_sources/cprato/Basto - Again And Again (m...,"EDM,inferred",Basto,44ea7e9b46e04ba6f4836f00b3cc50a3,cprato,Again And Again (midi By Carlo Prato) (www.cpr...
1,midi_sources/cprato/The Weeknd ft. Lana Del Re...,"EDM,inferred",The Weeknd ft. Lana Del Rey,d67ead892ee2c92cfbb5306bd47c9a0f,cprato,Stargirl Interlude
2,midi_sources/cprato/Two Steps From Hell - Magi...,"EDM,inferred",Two Steps From Hell,222db08d4744ab9a53ca0d9c6c6e5113,cprato,Magic of Love
3,midi_sources/cprato/Bermuda Loverz - My Girl (...,"EDM,inferred",Bermuda Loverz,2befd21ebd0f0c779f7fb436ed828ba1,cprato,My Girl (Ladidada) (Rimini Rockaz Radio Edit) ...
4,midi_sources/cprato/Cascada - Everytime We Tou...,"EDM,inferred",Cascada,b53bfa6f4ab72df165e44263d50a4cbd,cprato,Everytime We Touch (Midi By Carlo Prato) (www....


### Gather MidiWorld

In [37]:
mw_path, mw_csv = create_paths('midiworld')

In [38]:
def parse_midiworld_songs(fp):
    name = fp.with_suffix('').name.replace('_', ' ').split(' - ')
    artist = name[0]
    title = name[-1]
    return {
        'artist': artist.strip(),
        'title': title.strip(),
        'midi': relative_path(fp),
        'genres': 'pop,inferred',
        'source': 'midiworld'
    }

In [39]:
file_list = get_files(mw_path/'named_midi', extensions=['.mid'], recurse=True)
mw_md = directory2csv(file_list, meta_func=parse_midiworld_songs, csv_path=mw_csv)
df = pd.read_csv(mw_csv); df.head()

Unnamed: 0,midi,genres,artist,md5,source,title
0,midi_sources/midiworld/named_midi/The_Carpente...,"pop,inferred",The Carpenters,6d6e23b4f0e44537f8b5309ffeaa1880,midiworld,Rainy Days and Mondays
1,midi_sources/midiworld/named_midi/Joan_Jett_-_...,"pop,inferred",Joan Jett,19efd3ac590d3aede49d2e9e62209115,midiworld,I Hate Myself for Loving You
2,midi_sources/midiworld/named_midi/George_Harri...,"pop,inferred",George Harrison,bea4eba9aa4e8154ab01108b2b808e3c,midiworld,When We Was Fab
3,midi_sources/midiworld/named_midi/Video_Game_T...,"pop,inferred",Video Game Themes,dfbd9c523e1846767746285281d5e971,midiworld,Diddy Kong
4,midi_sources/midiworld/named_midi/The_Corrs_-_...,"pop,inferred",The Corrs,2445fa5424432de2a40ece46cbbc853c,midiworld,Someday


### Gather Wikifonia

In [40]:
wf_path, wf_csv = create_paths('wikifonia')

In [41]:
def parse_wikifonia_songs(fp):
    name = fp.with_suffix('').name.replace('_', ' ').split(' - ')
    artist = name[0]
    title = name[-1]
    return {
        'artist': artist.strip(),
        'title': title.strip(),
        'mxl': relative_path(fp),
        'genres': 'pop,inferred',
        'source': 'wikifonia'
    }

In [42]:
# Warning: if you get a deadlock: 
# PosixPath('data/midi/midi_sources/midiworld/named_midi/NITRO_BRO_-_IT_WONT_DIE.mid') is broken
file_list = get_files(wf_path, extensions=['.mxl'], recurse=True)
wf_md = directory2csv(file_list, meta_func=parse_wikifonia_songs, csv_path=wf_csv)
df = pd.read_csv(wf_csv); df.head()

Unnamed: 0,mxl,genres,md5,artist,source,title
0,midi_sources/wikifonia/Tommy Dorsey - Swingin'...,"pop,inferred",f3418afa104cd03604c8831123e086e4,Tommy Dorsey,wikifonia,Swingin' on Nothin'
1,midi_sources/wikifonia/Amanda McBroom - The Ro...,"pop,inferred",a01bde25baf5c5a91b9f6235bf019890,Amanda McBroom,wikifonia,The Rose
2,"midi_sources/wikifonia/Hans Leo Hassler, From ...","pop,inferred",4beb6aafed89a87a8171da906d8b5ff8,"Hans Leo Hassler, From the Latin",wikifonia,O Sacred Head Now Wounded
3,"midi_sources/wikifonia/Arthur Siegel, June Car...","pop,inferred",a739908a6505fcbe8c7d9bf7b1e47492,"Arthur Siegel, June Carroll",wikifonia,Love Is A Simple Thing
4,midi_sources/wikifonia/Unknow - KINDERLIEDJES ...,"pop,inferred",cb2beddd4a86439dcf2d80b5fc6b4e4f,Unknow,wikifonia,KINDERLIEDJES MEDLEY


### Gather Musescore

In [13]:
ms_path, ms_csv = create_paths('musescore')
ms_songs = load_json(ms_path/'song_map.json')

In [14]:
def get_number(num_str):
    num_str = num_str.replace(',','').split(' ')[0]
    return int(num_str)

In [15]:
def parse_musescore_songs(fp):
    score_id = fp.with_suffix('').name
    if score_id not in ms_songs: return None
    
    meta = ms_songs[score_id]
    parts = get_number(meta['parts'])
    views = get_number(meta['views'])
    if parts > 2 or views < 90: return None
    
    return {
        'artist': meta['author'].strip(),
        'title': meta['title'].strip(),
        'mxl': relative_path(fp),
        'genres': 'classical,pop,inferred',
        'source': 'musescore'
    }

In [21]:
# Warning: if you get a deadlock: 
# PosixPath('data/midi/midi_sources/midiworld/named_midi/NITRO_BRO_-_IT_WONT_DIE.mid') is broken
file_list = get_files(ms_path, extensions=['.mxl'], recurse=True)
ms_list = directory2csv(file_list, meta_func=parse_musescore_songs, csv_path=ms_csv)
df = pd.read_csv(ms_csv); df.head()

Unnamed: 0,mxl,genres,artist,md5,source,title
0,midi_sources/musescore/data/2985741.mxl,"classical,pop,inferred",000@xn--80akgejic5ahko1h.xn--p1ai,36fb6c38927e5e025f7f8427770cf8fa,musescore,Упражнение 2
1,midi_sources/musescore/data/1425126.mxl,"classical,pop,inferred",sam027,b666753133bc72910840a97bddb1fbf9,musescore,Place de la République - Coeur de pirate
2,midi_sources/musescore/data/5370824.mxl,"classical,pop,inferred",CrazyClique,83de066be1fa555a986376170c467f99,musescore,Beyond The Trees - Original Composition
3,midi_sources/musescore/data/1195001.mxl,"classical,pop,inferred",Mjmatthews51,a85e407d2427234ee6bf5aa7ae5ce5e9,musescore,Sister Sadie
4,midi_sources/musescore/data/4621586.mxl,"classical,pop,inferred",Spencer Vanderkley,d1f72939369c106948993bba9b4fc0a9,musescore,My Top 20 Film Soundtracks Medley


### Yamaha - piano

In [48]:
ec_path, ec_csv = create_paths('ecomp')
ec_songs = load_json(ec_path/'song_list.json')
# list(ec_path.glob('*'))[:5]

In [49]:
def parse_ecomp_songs(fp):
    song_info = ec_songs[fp.stem]
    return {
        'artist': song_info['artist'],
        'title': song_info['title'],
        'mxl': relative_path(fp),
        'genres': 'classical',
        'source': 'ecomp'
    }

In [50]:
file_list = get_files(ec_path, extensions=['.mxl'], recurse=True)

In [51]:
ec_md = directory2csv(file_list, meta_func=parse_ecomp_songs, csv_path=ec_csv)
df = pd.read_csv(ec_csv); df.head()

Unnamed: 0,mxl,genres,md5,artist,source,title
0,midi_sources/ecomp/2017/SirajA01.mxl,classical,39099f086fdc79c2828c13274fcd0f25,Johann Sebastian Bach,ecomp,"Prelude and Fugue in E-flat Major, WTC II, ..."
1,midi_sources/ecomp/2017/LiC05.mxl,classical,fe22a58bcff66e8b724b71c10b5465b6,Moritz Moszkowski,ecomp,"Chanson Boheme de l'Opera ""Carmen"" by Georg..."
2,midi_sources/ecomp/2017/WangY05.mxl,classical,8b38131646b8d71dc10269d3e2d608d0,Nikolai Kapustin,ecomp,Concert Etude Op. 40 No. 3
3,midi_sources/ecomp/2017/SunY05.mxl,classical,2637be3aec226a2fb74e86bb7a1fde81,,ecomp,I. Con moto agitato. Andante. Con moto agitato
4,midi_sources/ecomp/2017/ZhangE06.mxl,classical,c154b4269f8c8cecdb5b2972e3e2d831,Giuseppe Scarlatti,ecomp,"Sonata in G Major, K. 455"


### Classic Piano

In [52]:
clc_path, clc_csv = create_paths('classic_piano')
# list(clc_path.glob('*'))[:5]

In [53]:
def parse_classic_songs(fp):
    name = fp.with_suffix('').name.split('_')
    artist = name[0]
    title = ' '.join(name[1:])
    return {
        'artist': artist,
        'title': title,
        'mxl': relative_path(fp),
        'genres': 'classical',
        'source': 'classical_piano'
    }

In [54]:
file_list = get_files(clc_path, extensions=['.mxl'], recurse=True)

In [55]:
clc_md = directory2csv(file_list, meta_func=parse_classic_songs, csv_path=clc_csv)
df = pd.read_csv(clc_csv); df.head()

Unnamed: 0,mxl,genres,md5,artist,source,title
0,midi_sources/classic_piano/liz_rhap15_format0.mxl,classical,93d730fce9a3e2fabe1a0ea3e2cf80b6,liz,classical_piano,rhap15 format0
1,midi_sources/classic_piano/ty_september_format...,classical,d45bf8f2b4b0242879305a8423a3cb99,ty,classical_piano,september format0
2,midi_sources/classic_piano/schumm-3_format0.mxl,classical,2238d47b4162b55bafee79b16af46a37,schumm-3,classical_piano,format0
3,midi_sources/classic_piano/chpn_op33_4_format0...,classical,3d5c30ba63b886de6555a5d4911cf55f,chpn,classical_piano,op33 4 format0
4,midi_sources/classic_piano/grieg_spring_format...,classical,51758e4fb8b37d0389148d37652b1d58,grieg,classical_piano,spring format0


### Creating CSV

In [35]:
all_csvs = [create_paths(s)[-1] for s in sources]
all_dfs = [pd.read_csv(csv) for csv in all_csvs]

In [36]:
merged_df = pd.concat(all_dfs, sort=False)
merged_df = merged_df.reset_index(drop=True); merged_df.head()

Unnamed: 0,song_url,ht_bpm,ht_time_signature,midi,genres,artist,md5,section,ht_key,source,parts,midi_title,title,ht_offset,ht_mode,mxl
0,https://www.hooktheory.com/theorytab/view/wayn...,128.0,4.0,midi_sources/hooktheory/pianoroll/w/wayne-shar...,,wayne-sharpe,bf1f29e5ff84e3e93e37fb873bfb590e,chorus,C,hooktheory,"intro,chorus",yu-gi-oh3,yu-gi-oh-theme-song,0.0,1.0,
1,https://www.hooktheory.com/theorytab/view/wayn...,85.0,3.0,midi_sources/hooktheory/pianoroll/w/wayne-shar...,,wayne-sharpe,055f80ad67f64edb14a85ca8fbfe8c29,intro,C,hooktheory,"intro,chorus",yu-gi-oh,yu-gi-oh-theme-song,0.0,1.0,
2,https://www.hooktheory.com/theorytab/view/what...,96.0,4.0,midi_sources/hooktheory/pianoroll/w/what-a-day...,Jazz,what-a-day,197f96f5d181f6ce1e2c5ab04ac1ff87,chorus,D,hooktheory,chorus,kiefer,kiefer,-5.0,6.0,
3,https://www.hooktheory.com/theorytab/view/whit...,152.0,4.0,midi_sources/hooktheory/pianoroll/w/whiteflame...,"J-Pop,Pop",whiteflame,9e7ce13a35f1314423a9a6d5a5287a4a,pre-chorus,D,hooktheory,"verse,pre-chorus,chorus",senbonzakura - pre-Pre-Chorus,senbonzakura,-5.0,6.0,
4,https://www.hooktheory.com/theorytab/view/whit...,152.0,4.0,midi_sources/hooktheory/pianoroll/w/whiteflame...,"J-Pop,Pop",whiteflame,d5aaf79d0989222f1362f9f46c540a27,verse,D,hooktheory,"verse,pre-chorus,chorus",Senbonzakura,senbonzakura,-5.0,6.0,


In [37]:
deduped = merged_df.drop_duplicates(subset=['md5'], keep='first') # 

Midiworld - 90% duplicates with rest  
Freemidi - 50% duplicates with rest  
LMD - 70% duplicates with rest

In [65]:
# from collections import Counter
# # No dedups
# print(Counter(merged_df.source.values))
# # replacing reddit
# print(Counter(merged_df.drop_duplicates(subset=['md5'], keep='first').source.values))
# # reddit replace else
# print(Counter(merged_df.drop_duplicates(subset=['md5'], keep='last').source.values))
# # Midiworld - 90% duplicates with rest, 
# # Freemidi - 50% duplicates with rest,
# # LMD - 70% duplicates with rest

In [38]:
from collections import Counter
Counter(deduped.source.values)

Counter({'hooktheory': 19882,
         'hooktheory_c': 18286,
         'freemidi': 5168,
         'midiworld': 4109,
         'ecomp': 2735,
         'cprato': 312,
         'classical_piano': 329,
         'musescore': 12253,
         'wikifonia': 6391,
         'lmd': 13568,
         'reddit': 98683})

In [39]:
[df.shape for df in all_dfs], merged_df.shape, deduped.shape

([(20076, 15),
  (20076, 15),
  (5784, 6),
  (4711, 6),
  (2735, 6),
  (314, 6),
  (329, 6),
  (12256, 6),
  (6391, 6),
  (17243, 6),
  (128419, 6)],
 (218334, 16),
 (181716, 16))

In [40]:
deduped.to_csv(all_csv, index=False)