# Process MIDI for GloVe
Loads a collection of MIDI files from disk, filters out all non-monophonic tracks, and writes the notes of each remaining track to `../data/notes.txt` as space-separated MIDI pitch numbers, with every track prefixed by a `TRACK_START` token. `notes.txt` can then be used by GloVe to create MIDI note embeddings.

## Imports

In [1]:
import os, time
import numpy as np
import pretty_midi

## Utils

In [2]:
def parse_midi(path):
    """Load the MIDI file at ``path`` with pretty_midi.

    Parsing is best-effort: if pretty_midi raises while reading the file or
    while removing invalid notes, the error is swallowed so that one bad file
    does not abort a long batch run.

    Args:
        path: Filesystem path of the MIDI file to read.

    Returns:
        The ``pretty_midi.PrettyMIDI`` object, or ``None`` when the file could
        not be parsed.

    Raises:
        OSError: If ``path`` cannot be opened (the ``open`` call is
            intentionally outside the ``try``).
    """
    midi = None
    # MIDI is a binary format, so the file must be opened in 'rb'. In text
    # mode Python 3 decodes the bytes (and Windows translates newlines), which
    # corrupts the stream and makes every file fail to parse.
    with open(path, 'rb') as f:
        try:
            midi = pretty_midi.PrettyMIDI(f)
            midi.remove_invalid_notes()
        except Exception:
            # Catch Exception rather than using a bare `except:` so that
            # KeyboardInterrupt / SystemExit can still stop the batch run.
            # `midi` keeps whatever value it had when the error occurred.
            pass
    return midi

def get_percent_monophonic(pm_instrument_roll):
    """Return the fraction of sounding time steps that contain exactly one note.

    The roll is transposed and summed along axis 1, so each column of the
    input is treated as one time step and each row as one pitch (presumably
    the layout returned by ``Instrument.get_piano_roll`` -- confirm).

    Args:
        pm_instrument_roll: 2-D numpy array; entries greater than zero count
            as an active note.

    Returns:
        ``single_note_steps / steps_with_any_note`` as a float, or ``0.0``
        when no time step holds exactly one note (including an empty roll).
    """
    # Number of simultaneously active pitches at every time step.
    active_per_step = (pm_instrument_roll.T > 0).sum(axis=1)
    sounding_steps = np.count_nonzero(active_per_step)
    solo_steps = np.count_nonzero(active_per_step == 1)

    # Covers both "only chords" and "no notes at all"; it also guards the
    # division below, because solo_steps > 0 implies sounding_steps > 0.
    if solo_steps <= 0:
        return 0.0
    return float(solo_steps) / float(sounding_steps)
    
def filter_monophonic(pm_instruments, percent_monophonic=0.99):
    """Keep only the instruments that are sufficiently monophonic.

    Args:
        pm_instruments: Iterable of objects exposing ``get_piano_roll()``.
        percent_monophonic: Minimum score (as computed by
            ``get_percent_monophonic``) an instrument needs to be kept.

    Returns:
        A new list with the qualifying instruments, in their original order.
    """
    kept = []
    for instrument in pm_instruments:
        score = get_percent_monophonic(instrument.get_piano_roll())
        if score >= percent_monophonic:
            kept.append(instrument)
    return kept

def get_note_string(midi):
    """Return the pitch sequences of all fully monophonic tracks in a MIDI file.

    Args:
        midi: Path of the MIDI file to process. (The parameter keeps its
            original name for backward compatibility; it is a path, not a
            parsed MIDI object.)

    Returns:
        One chunk per track that is 100% monophonic, concatenated. Each chunk
        is ``'TRACK_START '`` followed by the track's note pitches separated
        by single spaces and a trailing space. Returns ``''`` when the file
        cannot be parsed or contains no qualifying track.
    """
    # Use the argument that was passed in. The previous code read a global
    # named `f`, which only worked while the caller's loop variable happened
    # to have that exact name.
    parsed = parse_midi(midi)
    if parsed is None:
        return ''

    chunks = []
    for instrument in filter_monophonic(parsed.instruments, 1.0):
        pitches = ' '.join(str(n.pitch) for n in instrument.notes)
        chunks.append('TRACK_START ' + pitches + ' ')
    # Join once instead of growing a string with repeated `+=`.
    return ''.join(chunks)

Edit `midi_dir` to point to the directory of MIDI files you would like to learn your note embeddings from.

In [3]:
# Directory whose entries are listed with os.listdir and parsed as MIDI files.
midi_dir = '../data/lmd_mono_tracks_seperated'

Run it!

In [4]:
# Collect the entries of midi_dir as full paths. Sub-directories are skipped:
# they cannot be opened as files, and the resulting error would abort the
# whole (potentially very long) run.
files = [os.path.join(midi_dir, name) for name in os.listdir(midi_dir)]
files = [path for path in files if os.path.isfile(path)]

start_time = time.time()
with open('../data/notes.txt', 'w') as out_file:
    # Files that cannot be parsed contribute an empty string.
    for f in files:
        out_file.write(get_note_string(f))
# Report after the `with` block so the elapsed time includes flushing and
# closing notes.txt.
print('Finished in {} seconds'.format(time.time() - start_time))

Finished in 2202.00431204 seconds
