In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import os

# Move three directory levels up so the relative 'data/midi' paths used below
# resolve. The starting directory is remembered on the first run, which makes
# this cell safe to re-run: it always lands in the same place instead of
# climbing another three levels each time.
if '_NOTEBOOK_START_DIR' not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(_NOTEBOOK_START_DIR, '../../../'))

In [15]:
from musicautobot.utils.file_processing import *
from musicautobot.utils.midifile import *
from musicautobot.numpy_encode import *

In [3]:
import traceback
import time

## Standardize and reformat raw MIDI files before encoding to text
- Transform key to C major
- Remove unused instruments
- Combine multiple tracks with the same instrument into a single part
- Melody, Piano, String

### Load MIDI data

In [4]:
# Dataset version tag; every input and output of this notebook lives under data/midi/<version>.
version = 'v20'
data_path = Path('data/midi')  # relative path, so it depends on the current working directory
version_path = data_path/version

In [5]:
import pandas as pd

In [6]:
# sf_path = f'sf{SAMPLE_FREQ}'
# out_dir = Path(f'{sf_path}/midi_encode')

# duet_only=True makes transform_midi() skip files whose num_piano_tracks()
# is not 1 or 2, and routes the results to 'piano_duet' instead of 'midi_encode'.
duet_only = True
out_dir = Path('piano_duet' if duet_only else 'midi_encode')

In [7]:
source_dir = 'midi_sources'  # also the path segment process_metadata() swaps for out_dir when naming outputs
source_csv = version_path/'metadata'/f'{source_dir}.csv'  # metadata table read as input
out_csv = version_path/out_dir/f'{out_dir.name}.csv'  # metadata table written after encoding
out_csv.parent.mkdir(parents=True, exist_ok=True)  # make sure the output folder exists
source_csv, out_csv

(PosixPath('data/midi/v20/metadata/midi_sources.csv'),
 PosixPath('data/midi/v20/piano_duet/piano_duet.csv'))

In [8]:
# Filtering thresholds forwarded to compress_midi_file() inside transform_midi().
# num_comps = 2 # note, duration  (disabled)
cutoff = 5 # max instruments
min_variation = 3 # minimum number of different midi notes played
# max_dur = 128  (disabled)

### Encoding MIDI to NumPy

In [9]:
# low_memory=False lets pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids the mixed-dtype warning this large
# metadata CSV otherwise triggers.
df = pd.read_csv(source_csv, low_memory=False)
df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,ht_time_signature,genres,midi,midi_title,parts,ht_bpm,mxl,section,source,artist,md5,song_url,ht_key,title,ht_mode
0,4.0,,midi_sources/hooktheory/pianoroll/w/wayne-shar...,yu-gi-oh3,"intro,chorus",128.0,,chorus,hooktheory,wayne-sharpe,bf1f29e5ff84e3e93e37fb873bfb590e,https://www.hooktheory.com/theorytab/view/wayn...,C,yu-gi-oh-theme-song,1.0
1,3.0,,midi_sources/hooktheory/pianoroll/w/wayne-shar...,yu-gi-oh,"intro,chorus",85.0,,intro,hooktheory,wayne-sharpe,055f80ad67f64edb14a85ca8fbfe8c29,https://www.hooktheory.com/theorytab/view/wayn...,C,yu-gi-oh-theme-song,1.0
2,4.0,Jazz,midi_sources/hooktheory/pianoroll/w/what-a-day...,kiefer,chorus,96.0,,chorus,hooktheory,what-a-day,197f96f5d181f6ce1e2c5ab04ac1ff87,https://www.hooktheory.com/theorytab/view/what...,D,kiefer,6.0
3,4.0,"J-Pop,Pop",midi_sources/hooktheory/pianoroll/w/whiteflame...,senbonzakura - pre-Pre-Chorus,"verse,pre-chorus,chorus",152.0,,pre-chorus,hooktheory,whiteflame,9e7ce13a35f1314423a9a6d5a5287a4a,https://www.hooktheory.com/theorytab/view/whit...,D,senbonzakura,6.0
4,4.0,"J-Pop,Pop",midi_sources/hooktheory/pianoroll/w/whiteflame...,Senbonzakura,"verse,pre-chorus,chorus",152.0,,verse,hooktheory,whiteflame,d5aaf79d0989222f1362f9f46c540a27,https://www.hooktheory.com/theorytab/view/whit...,D,senbonzakura,6.0


In [10]:
# One plain dict per metadata row; these are the work items for process_metadata().
all_records = df.to_dict(orient='records')
len(all_records)

197426

In [11]:
def process_metadata(metadata):
    """Encode the midi file referenced by one metadata record and save it as .npy.

    Args:
        metadata: dict record; its 'midi' entry is a path relative to
            `version_path`.

    Returns:
        None when the record has no string 'midi' entry. Otherwise a copy of
        `metadata`. The copy gains a 'numpy' entry (path of the encoded array
        relative to `version_path`) when an encoded file already exists or was
        written by this call; it is returned without 'numpy' when the midi
        file is missing or `transform_midi` returns None.
    """
    midi_rel = metadata.get('midi')
    if not isinstance(midi_rel, str): return None
    result = metadata.copy()

    input_path = version_path/midi_rel
    if not input_path.exists():
        print('Input path does not exist:', input_path, metadata)
        return result

    # Mirror the source tree under out_dir and swap the extension for .npy
    out_file = Path(str(input_path).replace(f'/{source_dir}/', f'/{out_dir}/'))
    out_file = out_file.with_suffix('.npy')
    out_file.parent.mkdir(parents=True, exist_ok=True)
    if out_file.exists():
        # Encoded on an earlier run: reuse it instead of re-encoding
        result['numpy'] = str(out_file.relative_to(version_path))
        return result

    npenc = transform_midi(input_path)
    if npenc is None: return result

    # Save through a temporary sibling and rename it into place. An existing
    # out_file is trusted by the check above, so a save that gets interrupted
    # half-way must never leave a truncated file under the final name.
    tmp_file = out_file.with_name(out_file.name + '.tmp')
    try:
        with open(tmp_file, 'wb') as f:
            np.save(f, npenc)
        tmp_file.replace(out_file)
    finally:
        if tmp_file.exists(): tmp_file.unlink()
    result['numpy'] = str(out_file.relative_to(version_path))
    return result

In [12]:
def transform_midi(midi_file):
    """Convert one midi file into its numpy encoding, or None if it is rejected.

    Args:
        midi_file: path of the midi file to encode.

    Returns:
        The value produced by `chordarr2npenc` for the cleaned-up file, or None
        when the file is filtered out, cannot be parsed or encoded, or fails
        `is_valid_npenc`.
    """
    input_path = midi_file

    # Part 1: Filter out midi tracks (drums, repetitive instruments, etc.)
    try:
        if duet_only and num_piano_tracks(input_path) not in [1, 2]: return None
        # remove non note tracks and standardize instruments
        input_file = compress_midi_file(input_path, min_variation=min_variation, cutoff=cutoff)
        if input_file is None: return None
    except Exception as e:
        # Badly formatted midi files are skipped without any output
        if any(msg in str(e) for msg in ('badly form', 'out of range')): return None
        print('Error parsing midi', input_path, e)
        return None

    # Part 2. Compress rests and long notes
    stream = file2stream(input_file)
    try:
        # original note: max_dur = quarter_len * sample_freq (4). 128 = 8 bars
        chordarr = stream2chordarr(stream)
    except Exception as e:
        print('Could not encode to chordarr:', input_path, e)
        print(traceback.format_exc())
        return None

    # Part 3. Compress song rests - Don't want songs with really long pauses
    # (this happens because we filter out midi tracks).
    chordarr = shorten_chordarr_rests(trim_chordarr_rests(chordarr))

    # Part 4. Chord array to numpy
    npenc = chordarr2npenc(chordarr)
    if not is_valid_npenc(npenc, input_path=input_path):
        return None

    return npenc

In [13]:
def try_process_metadata(metadata):
    """Run `process_metadata`, turning any exception into a None result.

    One bad record must not abort a whole batch, so the failure is reported on
    a single line (full tracebacks would flood the output) and the record is
    skipped.

    Returns:
        Whatever `process_metadata` returns, or None if it raised.
    """
    try:
        return process_metadata(metadata)
    except Exception as e:
        midi = metadata.get('midi') if isinstance(metadata, dict) else metadata
        print('Error processing', midi, repr(e))
        return None

In [16]:
# Sanity check: push a small sample through process_metadata() directly (not
# the exception-swallowing wrapper) so real errors surface here.
# A dedicated seeded generator keeps the sample identical between runs, and
# the size is capped so a short record list does not raise ValueError.
import random
sample_rng = random.Random(0)
for r in sample_rng.sample(all_records, min(10, len(all_records))):
    process_metadata(r)

In [17]:
def timeout_func(data, seconds):
    """Print a one-line notice for a record that timed out.

    Args:
        data: metadata record; only its 'midi' entry is shown.
        seconds: the time value reported alongside the record.
    """
    midi_path = data.get('midi')
    print("Timeout:", seconds, midi_path)

In [None]:
# Time limit handed to process_all; presumably seconds, matching the `seconds`
# argument that timeout_func prints — TODO confirm against process_all.
timeout = 500
# Run the exception-swallowing wrapper over every metadata record; timeout_func
# is passed as the callback for records that hit the limit.
processed = process_all(try_process_metadata, all_records, timeout=timeout, timeout_func=timeout_func)

Sequence too short: 28 data/midi/v20/midi_sources/hooktheory/pianoroll/w/willie-nelson/on-the-road-again/verse_key_original.mid
Sequence too short: 19 data/midi/v20/midi_sources/hooktheory/pianoroll/w/willie-nelson/on-the-road-again/bridge_key_original.mid
Sequence too short: 15 data/midi/v20/midi_sources/hooktheory/pianoroll/w/weebl/donkeys/intro_key_original.mid
Sequence too short: 31 data/midi/v20/midi_sources/hooktheory/pianoroll/w/wavves/way-too-much/intro_key_original.mid
Sequence too short: 24 data/midi/v20/midi_sources/hooktheory/pianoroll/w/wolfgang-gartner/undertaker/intro_key_original.mid
Sequence too short: 16 data/midi/v20/midi_sources/hooktheory/pianoroll/w/womack-and-womack/teardrops/intro-and-verse_key_original.mid
Sequence too short: 18 data/midi/v20/midi_sources/hooktheory/pianoroll/w/wolfgang-amadeus-mozart/confutatis-from-requiem/solo-3_key_original.mid
Sequence too short: 31 data/midi/v20/midi_sources/hooktheory/pianoroll/w/working-for-a-nuclear-free-city/je-suis-l

Sequence too short: 15 data/midi/v20/midi_sources/hooktheory/pianoroll/j/julia-holter/feel-you/pre-chorus_key_original.mid
Sequence too short: 29 data/midi/v20/midi_sources/hooktheory/pianoroll/j/julia-holter/betsy-on-the-roof/verse_key_original.mid
Sequence too short: 20 data/midi/v20/midi_sources/hooktheory/pianoroll/j/jeff-liu/steven-and-the-crystal-gems/outro_key_original.mid
Sequence too short: 23 data/midi/v20/midi_sources/hooktheory/pianoroll/j/jerry-lee-lewis/whole-lotta-shakin-goin-on/verse_key_original.mid
Sequence too short: 23 data/midi/v20/midi_sources/hooktheory/pianoroll/j/jimi-hendrix/angel/intro_key_original.mid
Sequence too short: 11 data/midi/v20/midi_sources/hooktheory/pianoroll/j/jacksfilms/give-this-song-a-title/outro_key_original.mid
Sequence too short: 23 data/midi/v20/midi_sources/hooktheory/pianoroll/j/julia-holter/betsy-on-the-roof/bridge_key_original.mid
Sequence too short: 31 data/midi/v20/midi_sources/hooktheory/pianoroll/j/jacksfilms/valentines-day-is-a-l

Sequence too short: 23 data/midi/v20/midi_sources/hooktheory/pianoroll/l/lunice/fancy-fortie---rustie-remix/intro_key_original.mid
Sequence too short: 18 data/midi/v20/midi_sources/hooktheory/pianoroll/l/lido-and-santell/ashley/intro_key_original.mid
Sequence too short: 29 data/midi/v20/midi_sources/hooktheory/pianoroll/l/lou-christie/lightnin-strikes/chorus_key_original.mid
Sequence too short: 15 data/midi/v20/midi_sources/hooktheory/pianoroll/u/unknown-mortal-orchestra/so-good-at-being-in-trouble/pre-chorus_key_original.mid
Sequence too short: 19 data/midi/v20/midi_sources/hooktheory/pianoroll/u/unless/atoms-for-peace/intro_key_original.mid
Sequence too short: 23 data/midi/v20/midi_sources/hooktheory/pianoroll/u/u2/original-of-the-species/intro_key_original.mid
Sequence too short: 27 data/midi/v20/midi_sources/hooktheory/pianoroll/u/u2/love-comes-to-town/chorus_key_original.mid
Sequence too short: 31 data/midi/v20/midi_sources/hooktheory/pianoroll/u/uesaka-sumire/nanatsu-no-umi-yori-

Sequence too short: 19 data/midi/v20/midi_sources/hooktheory/pianoroll/b/bubbles-and-the-shitrockers/liquor-and-whores/solo_key_original.mid
Sequence too short: 11 data/midi/v20/midi_sources/hooktheory/pianoroll/b/bubbles-and-the-shitrockers/liquor-and-whores/intro_key_original.mid
Sequence too short: 19 data/midi/v20/midi_sources/hooktheory/pianoroll/b/bubbles-and-the-shitrockers/liquor-and-whores/verse_key_original.mid
Sequence too short: 11 data/midi/v20/midi_sources/hooktheory/pianoroll/b/bubbles-and-the-shitrockers/liquor-and-whores/chorus_key_original.mid
Sequence too short: 26 data/midi/v20/midi_sources/hooktheory/pianoroll/b/beach-house/heart-of-chambers/pre-chorus_key_original.mid
Sequence too short: 31 data/midi/v20/midi_sources/hooktheory/pianoroll/b/beach-house/equal-mind/chorus_key_original.mid
Sequence too short: 16 data/midi/v20/midi_sources/hooktheory/pianoroll/b/beach-house/master-of-none/chorus_key_original.mid
Sequence too short: 27 data/midi/v20/midi_sources/hookthe

Sequence too short: 27 data/midi/v20/midi_sources/hooktheory/pianoroll/d/deadmau5/i-remember/intro_key_original.mid
Sequence too short: 20 data/midi/v20/midi_sources/hooktheory/pianoroll/d/disney/something-there---beauty-and-the-beast/intro_key_original.mid
Sequence too short: 25 data/midi/v20/midi_sources/hooktheory/pianoroll/d/derrick-harriot/let-me-down-easy/verse_key_original.mid
Sequence too short: 23 data/midi/v20/midi_sources/hooktheory/pianoroll/d/disasterpeace/fez---pressure/intro_key_original.mid
Sequence too short: 23 data/midi/v20/midi_sources/hooktheory/pianoroll/x/xxxtentacion/orlando/intro_key_original.mid
Sequence too short: 31 data/midi/v20/midi_sources/hooktheory/pianoroll/x/xxxtentacion/revenge/intro_key_original.mid
Sequence too short: 28 data/midi/v20/midi_sources/hooktheory/pianoroll/s/stan-getz---joao-gilberto/the-girl-from-ipanema/verse_key_original.mid
Sequence too short: 29 data/midi/v20/midi_sources/hooktheory/pianoroll/s/stupeflip/crou-anthem/chorus_key_orig

Sequence too short: 31 data/midi/v20/midi_sources/hooktheory/pianoroll/c/canaan-smith/love-you-like-that/chorus_key_original.mid
Sequence too short: 23 data/midi/v20/midi_sources/hooktheory/pianoroll/c/carolina-liar/show-me-what-im-looking-for/intro_key_original.mid
Sequence too short: 19 data/midi/v20/midi_sources/hooktheory/pianoroll/c/chuck-berry/johnny-b-goode/chorus_key_original.mid
Sequence too short: 21 data/midi/v20/midi_sources/hooktheory/pianoroll/c/connan-mockasin/im-the-man-that-will-find-you-alternate/intro-and-verse_key_original.mid
Sequence too short: 21 data/midi/v20/midi_sources/hooktheory/pianoroll/c/connan-mockasin/im-the-man-that-will-find-you/intro-and-verse_key_original.mid
Sequence too short: 25 data/midi/v20/midi_sources/hooktheory/pianoroll/c/caro-emerald/a-night-like-this/verse_key_original.mid
Sequence too short: 19 data/midi/v20/midi_sources/hooktheory/pianoroll/c/charlotte-gainsbourg/hey-joe-/verse_key_original.mid
Sequence too short: 28 data/midi/v20/midi_

Sequence too short: 18 data/midi/v20/midi_sources/hooktheory/pianoroll/n/nintendo/rainbow-road---mario-kart-8/bridge_key_original.mid
Sequence too short: 23 data/midi/v20/midi_sources/hooktheory/pianoroll/n/neru/the-disease-called-love/intro_key_original.mid
Sequence too short: 29 data/midi/v20/midi_sources/hooktheory/pianoroll/n/nintendo/dark-matter/chorus_key_original.mid
Sequence too short: 31 data/midi/v20/midi_sources/hooktheory/pianoroll/i/iggy-pop/the-passenger/intro_key_original.mid
Sequence too short: 30 data/midi/v20/midi_sources/hooktheory/pianoroll/i/inon-zur/fallout-4-main-theme/intro_key_original.mid
Sequence too short: 19 data/midi/v20/midi_sources/hooktheory/pianoroll/i/inon-zur/fallout-4-intro/instrumental_key_original.mid
Sequence too short: 24 data/midi/v20/midi_sources/hooktheory/pianoroll/i/inverted-silence/fusion/bridge_key_original.mid
Sequence too short: 7 data/midi/v20/midi_sources/hooktheory/pianoroll/i/iamamiwhoami/fountain/pre-chorus_key_original.mid
Sequenc

Sequence too short: 19 data/midi/v20/midi_sources/hooktheory/pianoroll/t/tom-helsen/please/chorus-lead-out_key_original.mid
Sequence too short: 15 data/midi/v20/midi_sources/hooktheory/pianoroll/t/thq/spongebob-bfbb---chum-bucket-lab/intro_key_original.mid
Sequence too short: 17 data/midi/v20/midi_sources/hooktheory/pianoroll/t/taylor-swift/style/intro_key_original.mid
Sequence too short: 31 data/midi/v20/midi_sources/hooktheory/pianoroll/t/taylor-swift/delicate/verse_key_original.mid
Sequence too short: 15 data/midi/v20/midi_sources/hooktheory/pianoroll/t/taylor-swift/enchanted/intro_key_original.mid
Sequence too short: 17 data/midi/v20/midi_sources/hooktheory/pianoroll/t/tristam/devotion/intro_key_original.mid
Sequence too short: 30 data/midi/v20/midi_sources/hooktheory/pianoroll/t/tristam/once-again/pre-chorus_key_original.mid
Sequence too short: 31 data/midi/v20/midi_sources/hooktheory/pianoroll/t/the-xx/open-eyes/verse_key_original.mid
Sequence too short: 31 data/midi/v20/midi_sou

Sequence too short: 27 data/midi/v20/midi_sources/hooktheory/pianoroll/k/knife-party/destroy-them-with-lazers/bridge_key_original.mid
Sequence too short: 29 data/midi/v20/midi_sources/hooktheory/pianoroll/k/king-gizzard-and-the-lizard-wizard/gamma-knife/chorus-lead-out_key_original.mid
Sequence too short: 29 data/midi/v20/midi_sources/hooktheory/pianoroll/k/king-gizzard-and-the-lizard-wizard/evil-death-roll/pre-chorus_key_original.mid
Sequence too short: 15 data/midi/v20/midi_sources/hooktheory/pianoroll/k/king-gizzard-and-the-lizard-wizard/robot-stop/chorus-lead-out_key_original.mid
Sequence too short: 15 data/midi/v20/midi_sources/hooktheory/pianoroll/k/kelly-clarkson/already-gone/pre-chorus_key_original.mid
Sequence too short: 31 data/midi/v20/midi_sources/hooktheory/pianoroll/k/kenny-chesney/the-road-and-the-radio/verse_key_original.mid
Sequence too short: 15 data/midi/v20/midi_sources/hooktheory/pianoroll/k/kasbo/kaleidoscope/verse_key_original.mid
Sequence too short: 17 data/midi

Sequence too short: 15 data/midi/v20/midi_sources/hooktheory/pianoroll/m/muzzy/junction-seven/verse_key_original.mid
Sequence too short: 23 data/midi/v20/midi_sources/hooktheory/pianoroll/m/martin-solveig/do-it-right/intro_key_original.mid
Sequence too short: 15 data/midi/v20/midi_sources/hooktheory/pianoroll/m/mrsimon/constant/intro_key_original.mid
Sequence too short: 16 data/midi/v20/midi_sources/hooktheory/pianoroll/m/mark-isham/sense-of-touch/verse_key_original.mid
Sequence too short: 20 data/midi/v20/midi_sources/hooktheory/pianoroll/m/mark-isham/sense-of-touch/bridge_key_original.mid
Sequence too short: 30 data/midi/v20/midi_sources/hooktheory/pianoroll/m/mark-isham/sense-of-touch/outro_key_original.mid
Sequence too short: 16 data/midi/v20/midi_sources/hooktheory/pianoroll/m/matrix-soundtrack/clubbed-to-death/verse_key_original.mid
Sequence too short: 31 data/midi/v20/midi_sources/hooktheory/pianoroll/m/metronomy/she-wants/pre-chorus_key_original.mid
Sequence too short: 11 data/

Sequence too short: 5 data/midi/v20/midi_sources/midiworld/named_midi/Cameron_Lee_Simpson_-_Cue_.mid
Error parsing midi data/midi/v20/midi_sources/midiworld/named_midi/Nirvana_-_Drain_You.mid cannot handle ticks per frame: 77
Sequence too short: 31 data/midi/v20/midi_sources/midiworld/named_midi/Animenz_-_This_game_op_by_Animenz_in_Eb.mid
Sequence too short: 31 data/midi/v20/midi_sources/midiworld/named_midi/Animenz_-_This_game.mid
Sequence too short: 20 data/midi/v20/midi_sources/midiworld/named_midi/dragon_quest_4_-_Level_Up.mid
Sequence too short: 31 data/midi/v20/midi_sources/midiworld/named_midi/Animenz_-_This_game_By_Animenz.mid
Sequence too short: 26 data/midi/v20/midi_sources/midiworld/named_midi/dhdizitr_-_dhdizitr.mid


In [None]:
# Persist the processed records to out_csv, then show how many there are.
arr2csv(processed, out_csv)
len(processed)

In [None]:
# Re-read the file that was just written; note that `df` now refers to the
# processed metadata rather than the source table.
df = pd.read_csv(out_csv)
df.head()

In [19]:
# Number of rows that carry a midi path (non-string entries are skipped).
sum(isinstance(s, str) for s in df.midi.values)

196531

In [20]:
# Number of rows that carry an encoded numpy path (non-string entries are skipped).
sum(isinstance(s, str) for s in df.numpy.values)

164774

In [38]:
# Same count of encoded rows as the previous cell, kept from a separate run.
sum(isinstance(s, str) for s in df.numpy.values)

112169

In [39]:
# Rows with a midi path, tallied per source.
has_midi = df.midi.notnull()
Counter(df.loc[has_midi, 'source'].values)

Counter({'hooktheory': 19882,
         'freemidi': 5168,
         'midiworld': 4108,
         'ecomp': 2508,
         'cprato': 312,
         'classical_piano': 329,
         'classical_archives': 14642,
         'musescore': 10933,
         'wikifonia': 6346,
         'lmd': 13565,
         'reddit': 98674,
         'hooktheory_c': 20076})

In [22]:
# Rows with an encoded numpy path, tallied per source.
has_numpy = df.numpy.notnull()
Counter(df.loc[has_numpy, 'source'].values)

Counter({'hooktheory': 19404,
         'freemidi': 3974,
         'midiworld': 3460,
         'ecomp': 2508,
         'cprato': 308,
         'classical_piano': 329,
         'classical_archives': 14627,
         'musescore': 10891,
         'wikifonia': 6195,
         'lmd': 9175,
         'reddit': 74341,
         'hooktheory_c': 19562})