In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import sys
sys.path.insert(0, '../../')
from src.encode_data import *
from src.midi_data import *
from src.data_sources import process_all, arr2csv
from src.midi_transform import *
from src.fastai_data import *
from src.unilm import S2SFileProcessor, S2SPreloader

In [3]:
import traceback
import time

## Standardize and reformat raw midi files before encoding to text
- Transform key to C major
- Remove unused instruments
- Combine multiple tracks with the same instrument into a single part
- Melody, Piano, String

### Load midi data

In [4]:
version = 'v16'
data_path = Path('data/midi')
version_path = data_path/version

In [5]:
import pandas as pd

In [6]:
# out_dir = 'midi_encode'
# duet_only = False
out_dir = 's2s_encode'
duet_only = True

In [7]:
source_dir = 'midi_sources'
source_csv = version_path/'metadata'/f'{source_dir}.csv'
out_csv = version_path/out_dir/f'{out_dir}.csv'
out_csv.parent.mkdir(parents=True, exist_ok=True)
source_csv, out_csv

(PosixPath('data/midi/v16/metadata/midi_sources.csv'),
 PosixPath('data/midi/v16/s2s_encode/s2s_encode.csv'))

In [8]:
# num_comps = 2 # note, duration
cutoff = 5 # max instruments
min_variation = 3 # minimum number of different midi notes played
# max_dur = 128

### Encoding midi to numpy

In [9]:
df = pd.read_csv(source_csv); df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,ht_time_signature,ht_offset,midi,section,parts,ht_bpm,title,midi_title,artist,song_url,genres,source,ht_key,md5,mxl,ht_mode
0,4.0,0.0,midi_sources/hooktheory/pianoroll/w/wayne-shar...,chorus,"intro,chorus",128.0,yu-gi-oh-theme-song,yu-gi-oh3,wayne-sharpe,https://www.hooktheory.com/theorytab/view/wayn...,,hooktheory,C,bf1f29e5ff84e3e93e37fb873bfb590e,,1.0
1,3.0,0.0,midi_sources/hooktheory/pianoroll/w/wayne-shar...,intro,"intro,chorus",85.0,yu-gi-oh-theme-song,yu-gi-oh,wayne-sharpe,https://www.hooktheory.com/theorytab/view/wayn...,,hooktheory,C,055f80ad67f64edb14a85ca8fbfe8c29,,1.0
2,4.0,-5.0,midi_sources/hooktheory/pianoroll/w/what-a-day...,chorus,chorus,96.0,kiefer,kiefer,what-a-day,https://www.hooktheory.com/theorytab/view/what...,Jazz,hooktheory,D,197f96f5d181f6ce1e2c5ab04ac1ff87,,6.0
3,4.0,-5.0,midi_sources/hooktheory/pianoroll/w/whiteflame...,pre-chorus,"verse,pre-chorus,chorus",152.0,senbonzakura,senbonzakura - pre-Pre-Chorus,whiteflame,https://www.hooktheory.com/theorytab/view/whit...,"J-Pop,Pop",hooktheory,D,9e7ce13a35f1314423a9a6d5a5287a4a,,6.0
4,4.0,-5.0,midi_sources/hooktheory/pianoroll/w/whiteflame...,verse,"verse,pre-chorus,chorus",152.0,senbonzakura,Senbonzakura,whiteflame,https://www.hooktheory.com/theorytab/view/whit...,"J-Pop,Pop",hooktheory,D,d5aaf79d0989222f1362f9f46c540a27,,6.0


In [10]:
all_records = df.to_dict(orient='records'); len(all_records)

197182

In [11]:
def part_enc(chordarr, part, input_path):
    partarr = chordarr[:,part:part+1,:]
    # Part 3. Chord array to numpy
    npenc = chordarr2npenc(partarr)
    
    if npenc[-1, -1] > MAX_NOTE_DUR: 
        print('Part is shorter than song. Trimming end:', input_path)
        npenc[-1, -1] = MAX_NOTE_DUR
    if not is_valid_npenc(npenc, min_notes=8, input_path=input_path): return None
    
    return npenc

In [12]:
def process_metadata(metadata):
    result = metadata.copy()
    
    # Part 1. Compress tracks/instruments
    if not isinstance(metadata.get('midi'), str): return None
    
    input_path = version_path/metadata['midi']
    extension = input_path.suffix.lower()
    if not input_path.exists(): 
        print('Input path does not exist:', input_path, metadata)
        return result
    
    # Get outfile and check if it exists
    out_file = Path(str(input_path).replace(f'/{source_dir}/', f'/{out_dir}/'))
    out_file = out_file.with_suffix('.npy')
    out_file.parent.mkdir(parents=True, exist_ok=True)
    if out_file.exists(): 
        result['numpy'] = str(out_file.relative_to(version_path))
        return result
    
    npenc = transform_midi(input_path)
    if npenc is None: return result
    np.save(out_file, npenc)
    result['numpy'] = str(out_file.relative_to(version_path))
    return result

In [13]:
def transform_midi(midi_file):
    input_path = midi_file
    
    try: 
        if num_piano_tracks(input_path) not in [1, 2]: return None
        input_file = compress_midi_file(input_path, min_variation=min_variation, cutoff=cutoff) # remove non note tracks and standardize instruments
        if not input_file: return None
    except Exception as e:
        if 'badly form' in str(e): return None # ignore badly formatted midi errors
        if 'out of range' in str(e): return None # ignore badly formatted midi errors
        print('Error parsing midi', input_path, e)
        return None
        
    # Part 2. Compress rests and long notes
    stream = file2stream(input_file) # 1.
    try:
        chordarr = stream2chordarr(stream) # 2. max_dur = quarter_len * sample_freq (4). 128 = 8 bars
    except Exception as e:
        print('Could not encode to chordarr:', input_path, e)
#         print(traceback.format_exc())
        return None
    
    chord_trim = trim_chordarr_rests(chordarr)
    chord_short = shorten_chordarr_rests(chord_trim)
    delta_trim = chord_trim.shape[0] - chord_short.shape[0]
#     if delta_trim > 300: 
#         print(f'Removed {delta_trim} rests from {input_path}. Skipping song')
#         return None
    chordarr = chord_short
    
    _,num_parts,_ = chordarr.shape
    if num_parts != 2: return None
    
    parts = [part_enc(chordarr, i, input_path) for i in range(num_parts)]
    for p in parts: 
        if p is None: return None
    
    return np.array(parts)

In [14]:
# transform_midi(piano_file)
midi_mxl_file = version_path/'midi_sources/from_mxl/musescore/data/49143.mid'
input_file = midi_mxl_file
stream = file2stream(input_file) # 1.
chordarr = stream2chordarr(stream)

In [15]:
chordarr.shape

(3061, 2, 128)

In [16]:
transform_midi(midi_mxl_file).shape

(2,)

In [17]:
def try_process_metadata(metadata):
    try:
        return process_metadata(metadata)
    except Exception:
#         print(traceback.format_exc())
        return None

In [18]:
# # sanity check
import random
for r in random.sample(all_records, 10):
    process_metadata(r)

In [19]:
def timeout_func(data, seconds):
    print("Timeout:", seconds, data.get('midi'))

In [20]:
processed = process_all(try_process_metadata, all_records, timeout=300, timeout_func=timeout_func)

Part is shorter than song. Trimming end: data/midi/v16/midi_sources/hooktheory/pianoroll/w/willie-nelson/you-were-always-on-my-mind/verse_key_original.mid
Part is shorter than song. Trimming end: data/midi/v16/midi_sources/hooktheory/pianoroll/w/war/low-rider/intro-and-verse_key_original.mid
Part is shorter than song. Trimming end: data/midi/v16/midi_sources/hooktheory/pianoroll/w/wowaka/rollin-girl/verse_key_original.mid


Part is shorter than song. Trimming end: data/midi/v16/midi_sources/hooktheory/pianoroll/y/yo-la-tengo/ohm/verse_key_original.mid
Part is shorter than song. Trimming end: data/midi/v16/midi_sources/hooktheory/pianoroll/y/yevgueni/welkenraedt/intro-and-verse_key_original.mid
Part is shorter than song. Trimming end: data/midi/v16/midi_sources/hooktheory/pianoroll/j/jose-gonzales/step-out/intro_key_original.mid
Part is shorter than song. Trimming end: data/midi/v16/midi_sources/hooktheory/pianoroll/j/judee-sill/crayon-angels/verse_key_original.mid
Part is shorter than song. Trimming end: data/midi/v16/midi_sources/hooktheory/pianoroll/j/jelly-roll-morton/milenburg-joys/instrumental_key_original.mid
Part is shorter than song. Trimming end: data/midi/v16/midi_sources/hooktheory/pianoroll/j/jon-foreman/white-as-snow/intro_key_original.mid
Part is shorter than song. Trimming end: data/midi/v16/midi_sources/hooktheory/pianoroll/j/jon-foreman/inheritance/pre-chorus-and-chorus_key_original.mid
P

Part is shorter than song. Trimming end: data/midi/v16/midi_sources/hooktheory/pianoroll/n/nintendo/kirby-air-ride---machine-passage/verse_key_original.mid
Part is shorter than song. Trimming end: data/midi/v16/midi_sources/hooktheory/pianoroll/i/iq/leap-of-faith/intro-and-verse_key_original.mid
Part is shorter than song. Trimming end: data/midi/v16/midi_sources/hooktheory/pianoroll/h/homestuck/unite-synchronization/chorus_key_original.mid
Part is shorter than song. Trimming end: data/midi/v16/midi_sources/hooktheory/pianoroll/p/pet-shop-boys/you-were-always-on-my-mind/verse_key_original.mid
Part is shorter than song. Trimming end: data/midi/v16/midi_sources/hooktheory/pianoroll/p/paul-mccartney/sing-the-changes/intro_key_original.mid
Part is shorter than song. Trimming end: data/midi/v16/midi_sources/hooktheory/pianoroll/p/pendulum/salt-in-the-wounds/verse_key_original.mid
Part is shorter than song. Trimming end: data/midi/v16/midi_sources/hooktheory/pianoroll/p/pendulum/salt-in-the-w

npenc exceeds max 481 duration: 714 data/midi/v16/midi_sources/freemidi/genre-pop/R.E.M. - Ignoreland.mid
npenc exceeds max 481 duration: 840 data/midi/v16/midi_sources/freemidi/genre-pop/Sade - Siempre Hay Esperanza.mid
npenc exceeds max 481 duration: 816 data/midi/v16/midi_sources/freemidi/genre-pop/Sade - Siempre Hay Esperanza.mid
npenc exceeds max 481 duration: 972 data/midi/v16/midi_sources/freemidi/genre-pop/Ace of Base - Beautiful Life.mid
npenc exceeds max 481 duration: 987 data/midi/v16/midi_sources/freemidi/genre-pop/98 Degrees - The Hardest Thing.mid
npenc exceeds max 481 duration: 2496 data/midi/v16/midi_sources/freemidi/genre-pop/98 Degrees - The Hardest Thing.mid
npenc exceeds max 481 duration: 1248 data/midi/v16/midi_sources/freemidi/genre-pop/R.E.M. - Orange Crush.mid
Error parsing midi data/midi/v16/midi_sources/freemidi/genre-pop/PSY - gangnam style KILLYURSLEF.mid ord() expected string of length 1, but int found
Part is shorter than song. Trimming end: data/midi/v16/

npenc exceeds max 481 duration: 492 data/midi/v16/midi_sources/from_mxl/ecomp/2006/Bach01.mid
npenc exceeds max 481 duration: 900 data/midi/v16/midi_sources/from_mxl/ecomp/2006/Mordvinov02.mid
npenc exceeds max 481 duration: 726 data/midi/v16/midi_sources/from_mxl/ecomp/2008/Lisiecki15.mid
npenc exceeds max 481 duration: 492 data/midi/v16/midi_sources/from_mxl/ecomp/2009/Albright02.mid
npenc exceeds max 481 duration: 585 data/midi/v16/midi_sources/from_mxl/ecomp/2004/ZHOU03.mid
npenc exceeds max 481 duration: 744 data/midi/v16/midi_sources/from_mxl/ecomp/2015/LiYZ09.mid
npenc exceeds max 481 duration: 528 data/midi/v16/midi_sources/from_mxl/ecomp/2009/Gasanov10.mid
npenc exceeds max 481 duration: 597 data/midi/v16/midi_sources/from_mxl/ecomp/2006/Izzard03.mid
npenc exceeds max 481 duration: 531 data/midi/v16/midi_sources/from_mxl/ecomp/2009/Gasanov10.mid
npenc exceeds max 481 duration: 528 data/midi/v16/midi_sources/from_mxl/ecomp/2009/Staupe03.mid
npenc exceeds max 481 duration: 716 d

npenc exceeds max 481 duration: 894 data/midi/v16/midi_sources/from_mxl/ecomp/2006/Yarden10.mid
npenc exceeds max 481 duration: 528 data/midi/v16/midi_sources/from_mxl/ecomp/2009/Gintov01.mid
npenc exceeds max 481 duration: 702 data/midi/v16/midi_sources/from_mxl/ecomp/2009/Wilshire03.mid
npenc exceeds max 481 duration: 894 data/midi/v16/midi_sources/from_mxl/ecomp/2004/YOO09.mid
npenc exceeds max 481 duration: 708 data/midi/v16/midi_sources/from_mxl/ecomp/2006/Shybayeva04.mid
npenc exceeds max 481 duration: 516 data/midi/v16/midi_sources/from_mxl/ecomp/2011/Zhou05.mid
npenc exceeds max 481 duration: 3894 data/midi/v16/midi_sources/from_mxl/ecomp/2015/LuoJ11.mid
npenc exceeds max 481 duration: 1152 data/midi/v16/midi_sources/from_mxl/classical_archives/021/bl109_08.mid
npenc exceeds max 481 duration: 771 data/midi/v16/midi_sources/from_mxl/classical_archives/021/bl109_08.mid
npenc exceeds max 481 duration: 711 data/midi/v16/midi_sources/from_mxl/ecomp/2009/Uiasiuk02.mid
npenc exceeds m

Part is shorter than song. Trimming end: data/midi/v16/midi_sources/from_mxl/classical_archives/012/htsminue.mid
npenc exceeds max 481 duration: 2196 data/midi/v16/midi_sources/from_mxl/classical_archives/012/htsminue.mid
npenc exceeds max 481 duration: 1896 data/midi/v16/midi_sources/from_mxl/classical_archives/012/htsminue.mid
npenc exceeds max 481 duration: 516 data/midi/v16/midi_sources/from_mxl/classical_archives/0/bcpe554a.mid
npenc exceeds max 481 duration: 780 data/midi/v16/midi_sources/from_mxl/ecomp/2004/YOO07.mid
npenc exceeds max 481 duration: 584 data/midi/v16/midi_sources/from_mxl/ecomp/2014/LiuY05.mid
npenc exceeds max 481 duration: 834 data/midi/v16/midi_sources/from_mxl/classical_archives/0/mussson1.mid
npenc exceeds max 481 duration: 576 data/midi/v16/midi_sources/from_mxl/classical_archives/0/hmoonl3.mid
npenc exceeds max 481 duration: 632 data/midi/v16/midi_sources/from_mxl/ecomp/2009/Sekino04.mid
npenc exceeds max 481 duration: 552 data/midi/v16/midi_sources/from_m

Part is shorter than song. Trimming end: data/midi/v16/midi_sources/from_mxl/musescore/data/5226027.mid
npenc exceeds max 481 duration: 798 data/midi/v16/midi_sources/from_mxl/musescore/data/2300731.mid
npenc exceeds max 481 duration: 576 data/midi/v16/midi_sources/from_mxl/musescore/data/679436.mid
npenc exceeds max 481 duration: 516 data/midi/v16/midi_sources/from_mxl/musescore/data/1066661.mid
Part is shorter than song. Trimming end: data/midi/v16/midi_sources/from_mxl/musescore/data/583016.mid
npenc exceeds max 481 duration: 960 data/midi/v16/midi_sources/from_mxl/musescore/data/4677121.mid
npenc exceeds max 481 duration: 978 data/midi/v16/midi_sources/from_mxl/musescore/data/3003356.mid
npenc exceeds max 481 duration: 576 data/midi/v16/midi_sources/from_mxl/musescore/data/2058256.mid
npenc exceeds max 481 duration: 588 data/midi/v16/midi_sources/from_mxl/classical_archives/2/chaccone.mid
npenc exceeds max 481 duration: 588 data/midi/v16/midi_sources/from_mxl/musescore/data/55286.m

npenc exceeds max 481 duration: 576 data/midi/v16/midi_sources/from_mxl/musescore/data/5148398.mid
Part is shorter than song. Trimming end: data/midi/v16/midi_sources/from_mxl/musescore/data/3449871.mid
npenc exceeds max 481 duration: 756 data/midi/v16/midi_sources/from_mxl/musescore/data/114435.mid
Part is shorter than song. Trimming end: data/midi/v16/midi_sources/from_mxl/musescore/data/1809796.mid
Part is shorter than song. Trimming end: data/midi/v16/midi_sources/from_mxl/musescore/data/3924956.mid
npenc exceeds max 481 duration: 750 data/midi/v16/midi_sources/from_mxl/musescore/data/3924956.mid
Part is shorter than song. Trimming end: data/midi/v16/midi_sources/from_mxl/musescore/data/4597716.mid
Part is shorter than song. Trimming end: data/midi/v16/midi_sources/from_mxl/musescore/data/4282981.mid
Part is shorter than song. Trimming end: data/midi/v16/midi_sources/from_mxl/musescore/data/2672486.mid
npenc exceeds max 481 duration: 1266 data/midi/v16/midi_sources/from_mxl/musesco

npenc exceeds max 481 duration: 1728 data/midi/v16/midi_sources/from_mxl/musescore/data/842546.mid
npenc exceeds max 481 duration: 840 data/midi/v16/midi_sources/from_mxl/musescore/data/4288921.mid
npenc exceeds max 481 duration: 843 data/midi/v16/midi_sources/from_mxl/musescore/data/1208786.mid
npenc exceeds max 481 duration: 492 data/midi/v16/midi_sources/from_mxl/musescore/data/4635631.mid
Part is shorter than song. Trimming end: data/midi/v16/midi_sources/from_mxl/musescore/data/2520566.mid
Part is shorter than song. Trimming end: data/midi/v16/midi_sources/from_mxl/musescore/data/64668.mid
npenc exceeds max 481 duration: 1107 data/midi/v16/midi_sources/from_mxl/musescore/data/64668.mid
npenc exceeds max 481 duration: 594 data/midi/v16/midi_sources/from_mxl/musescore/data/175266.mid
Part is shorter than song. Trimming end: data/midi/v16/midi_sources/from_mxl/musescore/data/74716.mid
npenc exceeds max 481 duration: 576 data/midi/v16/midi_sources/from_mxl/musescore/data/1407036.mid
n

npenc exceeds max 481 duration: 588 data/midi/v16/midi_sources/from_mxl/musescore/data/5250847.mid
Part is shorter than song. Trimming end: data/midi/v16/midi_sources/from_mxl/musescore/data/1463281.mid
npenc exceeds max 481 duration: 576 data/midi/v16/midi_sources/from_mxl/musescore/data/4988830.mid
npenc exceeds max 481 duration: 820 data/midi/v16/midi_sources/from_mxl/musescore/data/972271.mid
npenc exceeds max 481 duration: 504 data/midi/v16/midi_sources/from_mxl/musescore/data/53049.mid
npenc exceeds max 481 duration: 612 data/midi/v16/midi_sources/from_mxl/musescore/data/403246.mid
npenc exceeds max 481 duration: 504 data/midi/v16/midi_sources/from_mxl/musescore/data/5286490.mid
npenc exceeds max 481 duration: 672 data/midi/v16/midi_sources/from_mxl/musescore/data/3437061.mid
Part is shorter than song. Trimming end: data/midi/v16/midi_sources/from_mxl/musescore/data/657236.mid
npenc exceeds max 481 duration: 1392 data/midi/v16/midi_sources/from_mxl/musescore/data/657236.mid
npenc

npenc exceeds max 481 duration: 1959 data/midi/v16/midi_sources/lmd_clean/Oscar Peterson/When Your Lover Has Gone.mid
Error parsing midi data/midi/v16/midi_sources/lmd_clean/Jackson Michael/Thriller.3.mid ord() expected string of length 1, but int found
Error parsing midi data/midi/v16/midi_sources/lmd_clean/Jackson Michael/The Girl Is Mine.3.mid ord() expected string of length 1, but int found
Error parsing midi data/midi/v16/midi_sources/lmd_clean/Jackson Michael/Don't Stop 'Til You Get Enough.mid ord() expected string of length 1, but int found
npenc exceeds max 481 duration: 948 data/midi/v16/midi_sources/lmd_clean/Simon & Garfunkel/Scarborough Fair.1.mid
npenc exceeds max 481 duration: 1116 data/midi/v16/midi_sources/lmd_clean/Jean Michel Jarre/Calypso (Part 1).2.mid
Error parsing midi data/midi/v16/midi_sources/lmd_clean/Los Del Rio/Macarena.1.mid ord() expected string of length 1, but int found
npenc exceeds max 481 duration: 654 data/midi/v16/midi_sources/lmd_clean/Black Sabbat

npenc exceeds max 481 duration: 510 data/midi/v16/midi_sources/130k_reddit/P/P/Paganini-Variation-Nr-1.mid
Part is shorter than song. Trimming end: data/midi/v16/midi_sources/130k_reddit/P/P/Paganini-Variation-Nr-1.mid
npenc exceeds max 481 duration: 654 data/midi/v16/midi_sources/130k_reddit/P/P/Paganini-Variation-Nr-1.mid
Part is shorter than song. Trimming end: data/midi/v16/midi_sources/130k_reddit/P/P/puertorico_potpourri.mid
npenc exceeds max 481 duration: 960 data/midi/v16/midi_sources/130k_reddit/P/P/puertorico_potpourri.mid
Part is shorter than song. Trimming end: data/midi/v16/midi_sources/130k_reddit/P/P/panon.mid
Error parsing midi data/midi/v16/midi_sources/130k_reddit/P/P/prgybess.mid ord() expected string of length 1, but int found
Part is shorter than song. Trimming end: data/midi/v16/midi_sources/130k_reddit/P/P/Princess Witches - Fallen Tears.mid
npenc exceeds max 481 duration: 1200 data/midi/v16/midi_sources/130k_reddit/P/P/Princess Witches - Fallen Tears.mid
npenc e

npenc exceeds max 481 duration: 786 data/midi/v16/midi_sources/130k_reddit/Guitar_midkar.com_MIDIRip/MidiStudio/Hits/WhiterShade.mid
npenc exceeds max 481 duration: 2304 data/midi/v16/midi_sources/130k_reddit/Guitar_midkar.com_MIDIRip/MidiStudio/JPT/WhyDidIChooseYou.mid
npenc exceeds max 481 duration: 984 data/midi/v16/midi_sources/130k_reddit/Guitar_midkar.com_MIDIRip/MidiStudio/MWebb/The_rose.mid
npenc exceeds max 481 duration: 1443 data/midi/v16/midi_sources/130k_reddit/Guitar_midkar.com_MIDIRip/MidiStudio/JPT/YouMustBelieveInSpring.mid
npenc exceeds max 481 duration: 1212 data/midi/v16/midi_sources/130k_reddit/Guitar_midkar.com_MIDIRip/MidiStudio/GConfrey/GoodNightMy.mid
Part is shorter than song. Trimming end: data/midi/v16/midi_sources/130k_reddit/Guitar_midkar.com_MIDIRip/MidiStudio/GConfrey/TheFolksWhoLiveOnTheHill.mid
npenc exceeds max 481 duration: 2451 data/midi/v16/midi_sources/130k_reddit/Guitar_midkar.com_MIDIRip/MidiStudio/GConfrey/TheFolksWhoLiveOnTheHill.mid
Part is sh

Error parsing midi data/midi/v16/midi_sources/130k_reddit/F/F/for_you_blue.mid ord() expected string of length 1, but int found
npenc exceeds max 481 duration: 774 data/midi/v16/midi_sources/130k_reddit/F/F/foolishgamesp.mid
Part is shorter than song. Trimming end: data/midi/v16/midi_sources/130k_reddit/F/F/foolishgamesp.mid
npenc exceeds max 481 duration: 1926 data/midi/v16/midi_sources/130k_reddit/F/F/foolishgamesp.mid
npenc exceeds max 481 duration: 576 data/midi/v16/midi_sources/130k_reddit/F/F/Final Fantasy Crystal Chronicles - Shella.mid
Part is shorter than song. Trimming end: data/midi/v16/midi_sources/130k_reddit/F/F/for_art_and_love.mid
npenc exceeds max 481 duration: 579 data/midi/v16/midi_sources/130k_reddit/F/F/for_art_and_love.mid
Part is shorter than song. Trimming end: data/midi/v16/midi_sources/130k_reddit/F/F/Fred_Baker_-_Requiem__OCEANIC_20090531122532.mid
npenc exceeds max 481 duration: 768 data/midi/v16/midi_sources/130k_reddit/F/F/Fred_Baker_-_Requiem__OCEANIC_200

Error parsing midi data/midi/v16/midi_sources/130k_reddit/Classical_Guitar_classicalguitarmidi.com_MIDIRip/Pernambuco_Brasileirinho.mid ord() expected string of length 1, but int found
npenc exceeds max 481 duration: 522 data/midi/v16/midi_sources/130k_reddit/Classical Archives - The Greats (MIDI)/Classical Piano Midis/Beethoven/Piano Sonatina in G Op79.mid
npenc exceeds max 481 duration: 564 data/midi/v16/midi_sources/130k_reddit/Classical Archives - The Greats (MIDI)/Classical Piano Midis/Chaminade/Concertino for Flute and Piano.mid
npenc exceeds max 481 duration: 895 data/midi/v16/midi_sources/130k_reddit/Classical Archives - The Greats (MIDI)/Classical Piano Midis/Hummel/Piano Sonata Opus.94.mid
npenc exceeds max 481 duration: 2703 data/midi/v16/midi_sources/130k_reddit/Classical Archives - The Greats (MIDI)/Mendelsonn/Variations serieuses op54.mid
Error parsing midi data/midi/v16/midi_sources/130k_reddit/C/C/cantina13.mid ord() expected string of length 1, but int found
npenc exce

Part is shorter than song. Trimming end: data/midi/v16/midi_sources/130k_reddit/A/A/Armin van buuren - Full Focus (Ummet Ozcan Remix).mid
npenc exceeds max 481 duration: 714 data/midi/v16/midi_sources/130k_reddit/A/A/Armin van buuren - Full Focus (Ummet Ozcan Remix).mid
npenc exceeds max 481 duration: 868 data/midi/v16/midi_sources/130k_reddit/A/A/autumn_leaves2.mid
npenc exceeds max 481 duration: 2196 data/midi/v16/midi_sources/130k_reddit/A/A/alb_se5.mid
npenc exceeds max 481 duration: 672 data/midi/v16/midi_sources/130k_reddit/A/A/anything_goes.mid
npenc exceeds max 481 duration: 1452 data/midi/v16/midi_sources/130k_reddit/A/A/Another - Another Main Theme.mid
npenc exceeds max 481 duration: 864 data/midi/v16/midi_sources/130k_reddit/A/A/allforthebest.mid
npenc exceeds max 481 duration: 1656 data/midi/v16/midi_sources/130k_reddit/A/A/assaggia.mid
npenc exceeds max 481 duration: 1032 data/midi/v16/midi_sources/130k_reddit/A/A/assaggia.mid
Part is shorter than song. Trimming end: data/

Error parsing midi data/midi/v16/midi_sources/130k_reddit/D/D/DION.Move any mountain.mid ord() expected string of length 1, but int found
Part is shorter than song. Trimming end: data/midi/v16/midi_sources/130k_reddit/D/D/danube04.mid
npenc exceeds max 481 duration: 1188 data/midi/v16/midi_sources/130k_reddit/D/D/danube04.mid
npenc exceeds max 481 duration: 600 data/midi/v16/midi_sources/130k_reddit/D/D/danube04.mid
npenc exceeds max 481 duration: 1095 data/midi/v16/midi_sources/130k_reddit/D/D/d-lamb4.mid
npenc exceeds max 481 duration: 1923 data/midi/v16/midi_sources/130k_reddit/7/7steps2.mid
npenc exceeds max 481 duration: 748 data/midi/v16/midi_sources/130k_reddit/Ragtime_rtpress.com_MIDIRip/rdeer/GesuB_sk.mid
npenc exceeds max 481 duration: 1368 data/midi/v16/midi_sources/130k_reddit/Ragtime_rtpress.com_MIDIRip/rdeer/XmasC_sk.mid
Part is shorter than song. Trimming end: data/midi/v16/midi_sources/130k_reddit/Y/Y/YAZOO.Only you.mid
npenc exceeds max 481 duration: 498 data/midi/v16/

npenc exceeds max 481 duration: 606 data/midi/v16/midi_sources/130k_reddit/H/H/han_som_reiste.mid
npenc exceeds max 481 duration: 768 data/midi/v16/midi_sources/130k_reddit/H/H/Htech_01.mid
Part is shorter than song. Trimming end: data/midi/v16/midi_sources/130k_reddit/H/H/Headhunterz_-_Just_Say_My_Name__kisgabber_20101114214748.mid
npenc exceeds max 481 duration: 768 data/midi/v16/midi_sources/130k_reddit/H/H/Headhunterz_-_Just_Say_My_Name__kisgabber_20101114214748.mid
npenc exceeds max 481 duration: 1536 data/midi/v16/midi_sources/130k_reddit/H/H/Have-You-Met-Miss-Jones.mid
npenc exceeds max 481 duration: 552 data/midi/v16/midi_sources/130k_reddit/H/H/himno02.mid
npenc exceeds max 481 duration: 1392 data/midi/v16/midi_sources/130k_reddit/I/I/itcameupon2.mid
npenc exceeds max 481 duration: 648 data/midi/v16/midi_sources/130k_reddit/H/H/hmcello1.mid
Could not encode to chordarr: data/midi/v16/midi_sources/130k_reddit/I/I/i_ran.mid 
Timeout: 300 midi_sources/130k_reddit/D/D/dou_01.mid
n

npenc exceeds max 481 duration: 1292 data/midi/v16/midi_sources/130k_reddit/W/W/whereislove.mid
Part is shorter than song. Trimming end: data/midi/v16/midi_sources/130k_reddit/W/W/whereislove.mid
npenc exceeds max 481 duration: 720 data/midi/v16/midi_sources/130k_reddit/1/16SOMEWH.MID
Part is shorter than song. Trimming end: data/midi/v16/midi_sources/130k_reddit/1/16SOMEWH.MID
npenc exceeds max 481 duration: 1188 data/midi/v16/midi_sources/130k_reddit/1/16SOMEWH.MID
npenc exceeds max 481 duration: 2556 data/midi/v16/midi_sources/130k_reddit/W/W/Wind-Beneath-My-Wings.mid
npenc exceeds max 481 duration: 816 data/midi/v16/midi_sources/130k_reddit/T/T/the_ways.mid
npenc exceeds max 481 duration: 537 data/midi/v16/midi_sources/130k_reddit/T/T/Tiesto - Adagio For Strings.mid
Part is shorter than song. Trimming end: data/midi/v16/midi_sources/130k_reddit/T/T/Teequee_-_Crazy__darXide_20070613202958.mid
npenc exceeds max 481 duration: 768 data/midi/v16/midi_sources/130k_reddit/T/T/Teequee_-_Cr

Part is shorter than song. Trimming end: data/midi/v16/midi_sources/hooktheory/pianoroll/j/jon-foreman/white-as-snow/intro_key_cmajor.mid
Part is shorter than song. Trimming end: data/midi/v16/midi_sources/hooktheory/pianoroll/j/jon-foreman/run-free/verse_key_cmajor.mid
Part is shorter than song. Trimming end: data/midi/v16/midi_sources/hooktheory/pianoroll/j/jon-foreman/inheritance/pre-chorus-and-chorus_key_cmajor.mid
Part is shorter than song. Trimming end: data/midi/v16/midi_sources/hooktheory/pianoroll/j/jon-foreman/june-and-johnny/intro-and-verse_key_cmajor.mid
Part is shorter than song. Trimming end: data/midi/v16/midi_sources/hooktheory/pianoroll/g/gizmondo-studios/sticky-balls-theme-song/verse_key_cmajor.mid
Part is shorter than song. Trimming end: data/midi/v16/midi_sources/hooktheory/pianoroll/l/lady-gaga/mary-jane-holland/intro_key_cmajor.mid
Part is shorter than song. Trimming end: data/midi/v16/midi_sources/hooktheory/pianoroll/u/u2/i-still-havent-found-what-im-looking-for

Part is shorter than song. Trimming end: data/midi/v16/midi_sources/hooktheory/pianoroll/p/pendulum/salt-in-the-wounds/instrumental_key_cmajor.mid
Part is shorter than song. Trimming end: data/midi/v16/midi_sources/hooktheory/pianoroll/p/pendulum/salt-in-the-wounds/verse_key_cmajor.mid
Part is shorter than song. Trimming end: data/midi/v16/midi_sources/hooktheory/pianoroll/p/peter-paul-and-mary/i-dig-rock-and-roll-music/verse_key_cmajor.mid
Part is shorter than song. Trimming end: data/midi/v16/midi_sources/hooktheory/pianoroll/p/pink-floyd/in-the-flesh/intro_key_cmajor.mid
Part is shorter than song. Trimming end: data/midi/v16/midi_sources/hooktheory/pianoroll/p/pearl-jam/indifference/verse_key_cmajor.mid
Part is shorter than song. Trimming end: data/midi/v16/midi_sources/hooktheory/pianoroll/t/the-flaming-lips/psychiatric-exploration-of-the-fetus-with-needles/intro_key_cmajor.mid
Part is shorter than song. Trimming end: data/midi/v16/midi_sources/hooktheory/pianoroll/t/this-town-need

In [29]:
arr2csv(processed, out_csv); len(processed)

196520

In [30]:
df = pd.read_csv(out_csv); df.head()

Unnamed: 0,ht_mode,title,genres,md5,artist,midi,ht_key,song_url,numpy,mxl,source,midi_title,parts,ht_offset,ht_time_signature,section,ht_bpm
0,1.0,yu-gi-oh-theme-song,,bf1f29e5ff84e3e93e37fb873bfb590e,wayne-sharpe,midi_sources/hooktheory/pianoroll/w/wayne-shar...,C,https://www.hooktheory.com/theorytab/view/wayn...,,,hooktheory,yu-gi-oh3,"intro,chorus",0.0,4.0,chorus,128.0
1,1.0,yu-gi-oh-theme-song,,055f80ad67f64edb14a85ca8fbfe8c29,wayne-sharpe,midi_sources/hooktheory/pianoroll/w/wayne-shar...,C,https://www.hooktheory.com/theorytab/view/wayn...,,,hooktheory,yu-gi-oh,"intro,chorus",0.0,3.0,intro,85.0
2,6.0,kiefer,Jazz,197f96f5d181f6ce1e2c5ab04ac1ff87,what-a-day,midi_sources/hooktheory/pianoroll/w/what-a-day...,D,https://www.hooktheory.com/theorytab/view/what...,s2s_encode/hooktheory/pianoroll/w/what-a-day/k...,,hooktheory,kiefer,chorus,-5.0,4.0,chorus,96.0
3,6.0,senbonzakura,"J-Pop,Pop",9e7ce13a35f1314423a9a6d5a5287a4a,whiteflame,midi_sources/hooktheory/pianoroll/w/whiteflame...,D,https://www.hooktheory.com/theorytab/view/whit...,s2s_encode/hooktheory/pianoroll/w/whiteflame/s...,,hooktheory,senbonzakura - pre-Pre-Chorus,"verse,pre-chorus,chorus",-5.0,4.0,pre-chorus,152.0
4,6.0,senbonzakura,"J-Pop,Pop",d5aaf79d0989222f1362f9f46c540a27,whiteflame,midi_sources/hooktheory/pianoroll/w/whiteflame...,D,https://www.hooktheory.com/theorytab/view/whit...,s2s_encode/hooktheory/pianoroll/w/whiteflame/s...,,hooktheory,Senbonzakura,"verse,pre-chorus,chorus",-5.0,4.0,verse,152.0


In [31]:
len([f for f in df.numpy.values if isinstance(f, str)])

47630

In [32]:
from collections import Counter

In [33]:
df[df.numpy.notnull()]

Unnamed: 0,ht_mode,title,genres,md5,artist,midi,ht_key,song_url,numpy,mxl,source,midi_title,parts,ht_offset,ht_time_signature,section,ht_bpm
2,6.0,kiefer,Jazz,197f96f5d181f6ce1e2c5ab04ac1ff87,what-a-day,midi_sources/hooktheory/pianoroll/w/what-a-day...,D,https://www.hooktheory.com/theorytab/view/what...,s2s_encode/hooktheory/pianoroll/w/what-a-day/k...,,hooktheory,kiefer,chorus,-5.0,4.0,chorus,96.0
3,6.0,senbonzakura,"J-Pop,Pop",9e7ce13a35f1314423a9a6d5a5287a4a,whiteflame,midi_sources/hooktheory/pianoroll/w/whiteflame...,D,https://www.hooktheory.com/theorytab/view/whit...,s2s_encode/hooktheory/pianoroll/w/whiteflame/s...,,hooktheory,senbonzakura - pre-Pre-Chorus,"verse,pre-chorus,chorus",-5.0,4.0,pre-chorus,152.0
4,6.0,senbonzakura,"J-Pop,Pop",d5aaf79d0989222f1362f9f46c540a27,whiteflame,midi_sources/hooktheory/pianoroll/w/whiteflame...,D,https://www.hooktheory.com/theorytab/view/whit...,s2s_encode/hooktheory/pianoroll/w/whiteflame/s...,,hooktheory,Senbonzakura,"verse,pre-chorus,chorus",-5.0,4.0,verse,152.0
5,6.0,senbonzakura,"J-Pop,Pop",e0c189ee753b30c4758d85211f13c189,whiteflame,midi_sources/hooktheory/pianoroll/w/whiteflame...,D,https://www.hooktheory.com/theorytab/view/whit...,s2s_encode/hooktheory/pianoroll/w/whiteflame/s...,,hooktheory,Senbonzakura,"verse,pre-chorus,chorus",-5.0,4.0,chorus,152.0
6,1.0,last-christmas,Holiday,38e38402443506e326b76536e8e327a0,wham,midi_sources/hooktheory/pianoroll/w/wham/last-...,Db,https://www.hooktheory.com/theorytab/view/wham...,s2s_encode/hooktheory/pianoroll/w/wham/last-ch...,,hooktheory,Last Christmas Verse,"intro,verse,chorus",-1.0,4.0,verse,108.0
9,1.0,freedom,,60fa29cfec107df27b053cf9708823d5,wham,midi_sources/hooktheory/pianoroll/w/wham/freed...,C,https://www.hooktheory.com/theorytab/view/wham...,s2s_encode/hooktheory/pianoroll/w/wham/freedom...,,hooktheory,Freedom Chorus,chorus,0.0,4.0,chorus,128.0
11,1.0,west-wing-suite,,a856dff6c54398544c217104d047abe0,wg-snuffy-walden,midi_sources/hooktheory/pianoroll/w/wg-snuffy-...,G,https://www.hooktheory.com/theorytab/view/wg-s...,s2s_encode/hooktheory/pianoroll/w/wg-snuffy-wa...,,hooktheory,snuffy,instrumental,5.0,4.0,instrumental,86.0
16,1.0,family-guy---theme-song,"Jazz,Soundtrack",07fe0adbcf00ce06bfe166b66036d897,walter-murphy,midi_sources/hooktheory/pianoroll/w/walter-mur...,F,https://www.hooktheory.com/theorytab/view/walt...,s2s_encode/hooktheory/pianoroll/w/walter-murph...,,hooktheory,Family Guy - Theme Song,chorus,-5.0,4.0,chorus,128.0
17,1.0,american-dad---theme-song,Soundtrack,1d78ebcb686b399adf74964103b41f7a,walter-murphy,midi_sources/hooktheory/pianoroll/w/walter-mur...,Bb,https://www.hooktheory.com/theorytab/view/walt...,s2s_encode/hooktheory/pianoroll/w/walter-murph...,,hooktheory,American Dad - Theme Song,chorus,2.0,4.0,chorus,149.0
18,1.0,the-freaking-fcc,Soundtrack,d8b94e560404a9cde7fa3ec4e8effaef,walter-murphy,midi_sources/hooktheory/pianoroll/w/walter-mur...,C,https://www.hooktheory.com/theorytab/view/walt...,s2s_encode/hooktheory/pianoroll/w/walter-murph...,,hooktheory,The Freaking FCC,"intro-and-verse,bridge",0.0,4.0,intro-and-verse,126.0


In [34]:
Counter(df[df.numpy.notnull()].source.values)

Counter({'hooktheory': 15026,
         'freemidi': 50,
         'midiworld': 64,
         'ecomp': 2357,
         'cprato': 123,
         'classical_piano': 319,
         'classical_archives': 2393,
         'musescore': 6867,
         'wikifonia': 29,
         'lmd': 128,
         'reddit': 5133,
         'hooktheory_c': 15141})

In [27]:
len(df[df.numpy.notnull()].source.values)

47630

In [19]:
Counter(df[df.numpy.notnull()].source.values)

Counter({'hooktheory': 17813,
         'freemidi': 28,
         'midiworld': 44,
         'ecomp': 2226,
         'cprato': 96,
         'classical_piano': 318,
         'classical_archives': 2350,
         'musescore': 6674,
         'wikifonia': 33,
         'lmd': 78,
         'reddit': 3917,
         'hooktheory_c': 17943})

In [20]:
len(df[df.numpy.notnull()].source.values)

51520

In [21]:
Counter(df.source.values)

Counter({'hooktheory': 19830,
         'freemidi': 5164,
         'midiworld': 4097,
         'ecomp': 2242,
         'cprato': 305,
         'classical_piano': 320,
         'classical_archives': 14546,
         'musescore': 10526,
         'wikifonia': 6345,
         'lmd': 13555,
         'reddit': 98396,
         'hooktheory_c': 20024})

## Convert to hooktheory databunch

In [35]:
def get_files(csv):
    files = csv['numpy']
    flist = [Path(version_path/f) for f in files.values if isinstance(f, str)]
    flist = [f for f in flist if f.exists()]
    return flist

In [36]:
def create_databunch(files, cache_name, batch_size=32, load_cached=False):
    if load_cached and (out_path/f'{cache_name}/train_ids.npy').exists():
        data = MusicDataBunch.load(out_path, bs=batch_size, cache_name=cache_name)
    else:
        ps = [S2SFileProcessor()]
        data = (MusicItemList(items=files, path=out_path, processor=ps)
                .split_by_rand_pct(0.01, seed=6)
                .label_const(label_cls=LMLabelList))
#         data.x._bunch = MusicDataBunch
        data = data.databunch(bs=batch_size, preloader_cls=S2SPreloader)
        data.save(cache_name)
    return data

In [37]:
out_path = version_path/out_dir

In [38]:
csv = df

In [39]:
hook_csv = csv.loc[csv.source.isin(['hooktheory'])]
hook_files = get_files(hook_csv); len(hook_files)
hook_data = create_databunch(hook_files, cache_name='tmp/hook')

In [40]:
hook_csv = csv.loc[csv.source.isin(['hooktheory_c'])]
hook_files = get_files(hook_csv); len(hook_files)
hook_data = create_databunch(hook_files, cache_name='tmp/hook_c')

In [41]:
lq_csv = csv.loc[csv.source.isin(['reddit', 'classical_piano', 'ecomp', 'midiworld', 'freemidi', 'lmd', 'cprato', 'wikifonia', 'classical_archives'])]
lq_files = get_files(lq_csv); len(lq_files)
lq_data = create_databunch(lq_files, cache_name='tmp/lq')

In [42]:
hq_csv = csv.loc[csv.source.isin(['hooktheory', 'musescore'])]
hq_files = get_files(hq_csv); len(hq_files)
hq_data = create_databunch(hq_files, cache_name='tmp/hq')

In [43]:
len(hook_data.train_dl.dl.dataset)

14835

In [44]:
all_files = get_files(csv); len(all_files)
all_data = create_databunch(all_files, cache_name='tmp/all')

In [45]:
import random
sample_data = create_databunch(random.sample(all_files, 1000), cache_name='tmp/sample')

## Load data

In [47]:
single_tfm = partial(to_single_stream, vocab=vocab)
load_data =  MusicDataBunch.load(path=out_path, cache_name='tmp/hook_c', preloader_cls=S2SPreloader, train_tfms=[single_tfm])



Tried: 0,1,2,3,4...
  warn(warn_msg)


In [48]:
load_data =  MusicDataBunch.load(path=out_path, cache_name='tmp/hook_c', preloader_cls=S2SPreloader)

In [49]:
load_data.one_batch()

(tensor([[  6, 622,  88,  ..., 143,   8, 143],
         [  5, 622,  55,  ..., 161,  51, 161],
         [  5, 622,  65,  ..., 149,  61, 149],
         ...,
         [  5, 622,  66,  ..., 185,  75, 185],
         [  5, 622,  64,  ..., 173,  61, 173],
         [  5, 622,  63,  ..., 185,  60, 185]]),
 tensor([[  3,   5, 622,  ..., 167,  78, 155],
         [  3,   6, 622,  ..., 143,   8, 143],
         [  3,   6, 622,  ..., 140,   8, 140],
         ...,
         [  3,   6, 622,  ..., 143,  87, 143],
         [  3,   6, 622,  ..., 149,   8, 149],
         [  3,   6, 622,  ..., 149,  75, 197]]))

In [None]:
# ps = [S2SFileProcessor()]

# single_tfm = partial(to_single_stream, vocab=vocab)
# data = (MusicItemList(items=hook_files[:100], path=out_path, processor=ps, tfms=[single_tfm])
#         .split_by_rand_pct(0.01, seed=6)
#         .label_const(label_cls=LMLabelList))
# data.x._bunch = MusicDataBunch

In [78]:
# data.x.tfms = [single_tfm]

In [None]:
data = data.databunch(bs=4, preloader_cls=S2SPreloader, train_tfms=[single_tfm])

In [80]:
out = data.train_dl.dl.dataset[0]

In [None]:
data.one_batch()