In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import sys
sys.path.insert(0, '../../')
from src.encode_data import *
from src.midi_data import *
from src.data_sources import process_all, arr2csv
from src.midi_transform import *
from src.fastai_data import *

In [3]:
import traceback
import time

## Standardize and reformat raw MIDI files before encoding to text
- Transform key to C major
- Remove unused instruments
- Combine multiple tracks with the same instrument into a single part
- Melody, Piano, String

### Load MIDI data

In [4]:
# Dataset version; names the sub-directory of data_path used by this run.
version = 'v16'
# Root folder of the MIDI data (a relative path, so it resolves against the working directory).
data_path = Path('data/midi')
# Base directory against which the metadata CSVs and MIDI paths below are resolved.
version_path = data_path/version

In [5]:
import pandas as pd

In [6]:
# Output configuration.
#   out_dir   - name of the output folder under version_path and stem of the output CSV.
#   duet_only - when True, transform_midi skips files unless num_piano_tracks reports 1 or 2.
# Alternative configuration (no duet filter), kept for reference:
# out_dir = 'midi_encode'
# duet_only = False
out_dir = 'piano_duet'
duet_only = True

In [7]:
# Name of the source folder; process_metadata swaps it for out_dir when building output paths.
source_dir = 'midi_sources'
# Input metadata CSV, read into a DataFrame further down.
source_csv = version_path/'metadata'/f'{source_dir}.csv'
# Output metadata CSV, stored inside the output folder.
out_csv = version_path/out_dir/f'{out_dir}.csv'
# Make sure the output folder exists before anything is written into it.
out_csv.parent.mkdir(parents=True, exist_ok=True)
# Show both paths as the cell result.
source_csv, out_csv

(PosixPath('data/midi/v16/metadata/midi_sources.csv'),
 PosixPath('data/midi/v16/piano_duet/piano_duet.csv'))

In [8]:
# Filtering thresholds; transform_midi forwards both active values to compress_midi_file.
# num_comps = 2 # note, duration
cutoff = 5 # max instruments
min_variation = 3 # minimum number of different midi notes played
# max_dur = 128

### Encoding MIDI to NumPy

In [9]:
# Load the source metadata table.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk. The saved output of this cell shows a truncated
# warning, presumably the mixed-type DtypeWarning that this option avoids.
df = pd.read_csv(source_csv, low_memory=False)
df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,ht_time_signature,ht_offset,midi,section,parts,ht_bpm,title,midi_title,artist,song_url,genres,source,ht_key,md5,mxl,ht_mode
0,4.0,0.0,midi_sources/hooktheory/pianoroll/w/wayne-shar...,chorus,"intro,chorus",128.0,yu-gi-oh-theme-song,yu-gi-oh3,wayne-sharpe,https://www.hooktheory.com/theorytab/view/wayn...,,hooktheory,C,bf1f29e5ff84e3e93e37fb873bfb590e,,1.0
1,3.0,0.0,midi_sources/hooktheory/pianoroll/w/wayne-shar...,intro,"intro,chorus",85.0,yu-gi-oh-theme-song,yu-gi-oh,wayne-sharpe,https://www.hooktheory.com/theorytab/view/wayn...,,hooktheory,C,055f80ad67f64edb14a85ca8fbfe8c29,,1.0
2,4.0,-5.0,midi_sources/hooktheory/pianoroll/w/what-a-day...,chorus,chorus,96.0,kiefer,kiefer,what-a-day,https://www.hooktheory.com/theorytab/view/what...,Jazz,hooktheory,D,197f96f5d181f6ce1e2c5ab04ac1ff87,,6.0
3,4.0,-5.0,midi_sources/hooktheory/pianoroll/w/whiteflame...,pre-chorus,"verse,pre-chorus,chorus",152.0,senbonzakura,senbonzakura - pre-Pre-Chorus,whiteflame,https://www.hooktheory.com/theorytab/view/whit...,"J-Pop,Pop",hooktheory,D,9e7ce13a35f1314423a9a6d5a5287a4a,,6.0
4,4.0,-5.0,midi_sources/hooktheory/pianoroll/w/whiteflame...,verse,"verse,pre-chorus,chorus",152.0,senbonzakura,Senbonzakura,whiteflame,https://www.hooktheory.com/theorytab/view/whit...,"J-Pop,Pop",hooktheory,D,d5aaf79d0989222f1362f9f46c540a27,,6.0


In [10]:
# One plain dict per row (column name -> value), so that each record can be
# passed to process_metadata on its own.
all_records = df.to_dict(orient='records')
len(all_records)

197182

In [11]:
def process_metadata(metadata):
    """Encode the MIDI file referenced by one metadata record into a ``.npy`` file.

    Parameters
    ----------
    metadata : dict
        One record of the source table. Only the ``'midi'`` key is read; it is
        joined onto ``version_path`` to locate the MIDI file.

    Returns
    -------
    dict or None
        ``None`` when ``metadata['midi']`` is missing or not a string (for
        example NaN coming from pandas). Otherwise a shallow copy of
        ``metadata``. The copy gains a ``'numpy'`` key (path of the encoded
        array relative to ``version_path``) only when an encoded file exists,
        either left by an earlier run or written by this call.

    Notes
    -----
    Depends on the notebook globals ``version_path``, ``source_dir`` and
    ``out_dir``, and delegates the actual encoding to ``transform_midi``.
    """
    midi_rel = metadata.get('midi')
    if not isinstance(midi_rel, str): return None

    result = metadata.copy()

    input_path = version_path/midi_rel
    if not input_path.exists():
        print('Input path does not exist:', input_path, metadata)
        return result

    # Mirror the source layout under the output folder and swap the extension.
    out_file = Path(str(input_path).replace(f'/{source_dir}/', f'/{out_dir}/'))
    out_file = out_file.with_suffix('.npy')

    # Already encoded: reuse the existing file instead of encoding again.
    if out_file.exists():
        result['numpy'] = str(out_file.relative_to(version_path))
        return result

    npenc = transform_midi(input_path)
    if npenc is None: return result

    # Create the target folder only once there is an array to store, so that
    # rejected files do not leave empty directories behind.
    out_file.parent.mkdir(parents=True, exist_ok=True)
    np.save(out_file, npenc)
    result['numpy'] = str(out_file.relative_to(version_path))
    return result

In [12]:
def transform_midi(midi_file):
    """Convert one MIDI file into its numpy encoding.

    Pipeline: filter/compress the tracks, convert the result to a chord array,
    trim and shorten the rests, then encode and validate.

    Parameters
    ----------
    midi_file : path-like
        Path of the MIDI file to encode.

    Returns
    -------
    The value produced by ``chordarr2npenc``, or ``None`` when the file is
    filtered out, cannot be parsed/encoded, or is rejected by ``is_valid_npenc``.

    Notes
    -----
    Reads the notebook globals ``duet_only``, ``min_variation`` and ``cutoff``.
    ``file2stream`` runs outside the ``try`` blocks, so an exception raised
    there propagates to the caller.
    """
    input_path = midi_file

    # Part 1: Filter out midi tracks (drums, repetitive instruments, etc.)
    try:
        if duet_only and num_piano_tracks(input_path) not in [1, 2]: return None
        # remove non note tracks and standardize instruments
        input_file = compress_midi_file(input_path, min_variation=min_variation, cutoff=cutoff)
        if input_file is None: return None
    except Exception as e:
        print('Error parsing midi', input_path, e)
        return None

    # Part 2: Compress rests and long notes
    stream = file2stream(input_file)
    try:
        # max_dur = quarter_len * sample_freq (4). 128 = 8 bars
        chordarr = stream2chordarr(stream)
    except Exception as e:
        print('Could not encode to chordarr:', input_path, e)
        print(traceback.format_exc())
        return None

    # Part 3: Compress song rests - don't want songs with really long pauses
    # (this happens because we filter out midi tracks).
    # A former, disabled check skipped songs from which more than 500 rest
    # steps were removed; no such limit is applied now.
    chordarr = shorten_chordarr_rests(trim_chordarr_rests(chordarr))

    # Part 4: Chord array to numpy
    npenc = chordarr2npenc(chordarr)
    if not is_valid_npenc(npenc, input_path=input_path):
        return None

    return npenc

In [13]:
def try_process_metadata(metadata):
    """Best-effort wrapper around ``process_metadata`` for bulk runs.

    Any exception raised while processing a record is reported on a single
    line and converted into ``None``, so one bad file cannot abort the batch.

    Returns whatever ``process_metadata`` returns, or ``None`` on failure.
    """
    try:
        return process_metadata(metadata)
    except Exception as e:
        # Name the failing record instead of dropping it silently; the full
        # traceback is omitted to keep the bulk-run output short.
        midi = metadata.get('midi') if isinstance(metadata, dict) else metadata
        print('Error processing metadata:', midi, repr(e))
        return None

In [16]:
# Sanity check: run process_metadata directly (not the try_ wrapper) on ten
# random records so that any exception surfaces here. Note that this already
# writes .npy files for the sampled records that encode successfully.
import random
for r in random.sample(all_records, 10):
    process_metadata(r)

In [None]:
def timeout_func(data, seconds):
    """Report a timed-out record; handed to ``process_all`` as ``timeout_func``.

    Prints ``seconds`` followed by the ``'midi'`` entry of ``data``.
    """
    midi_path = data.get('midi')
    print("Timeout:", seconds, midi_path)

In [None]:
# Per-record time limit handed to process_all (presumably in seconds; the
# value is echoed by timeout_func when a record runs too long).
timeout = 120
# Use the variable rather than repeating the literal, so changing `timeout`
# above actually takes effect.
processed = process_all(try_process_metadata, all_records, timeout=timeout, timeout_func=timeout_func)

Error parsing midi data/midi/v16/midi_sources/freemidi/genre-dance-eletric/Gloria Estefan - I'm Not Givin' You Up.mid badly formated midi bytes, got: b'RIFFB\x8c\x00\x00RMIDdata~\x8b\x00\x00'
Error parsing midi data/midi/v16/midi_sources/freemidi/genre-dance-eletric/Atomic Kitten - Whole Again.mid badly formated midi bytes, got: b'RIFF\x08K\x00\x00RMIDdata{J\x00\x00'
Error parsing midi data/midi/v16/midi_sources/freemidi/genre-dance-eletric/Tatu - Ya Tvoy Vrag (I'm Your Enemy).mid badly formated midi bytes, got: b'RIFF,\xa3\x00\x00RMIDdata\xc6\xa2\x00\x00'
Error parsing midi data/midi/v16/midi_sources/freemidi/genre-dance-eletric/Gloria Estefan - Get On Your Feet.mid badly formated midi bytes, got: b'RIFF\x92\xa8\x01\x00RMIDdata\xd8\xa7\x01\x00'
Error parsing midi data/midi/v16/midi_sources/freemidi/genre-dance-eletric/Apollo 440 - Lost In Space.mid index out of range
Error parsing midi data/midi/v16/midi_sources/freemidi/genre-dance-eletric/Gloria Estefan - Mi Tierra.mid badly formate

Error parsing midi data/midi/v16/midi_sources/freemidi/genre-pop/Westlife - Fool Again.mid badly formed midi string: missing leading MTrk
Error parsing midi data/midi/v16/midi_sources/freemidi/genre-pop/Four Seasons - I've Got You Under My Skin.mid badly formed midi string: missing leading MTrk
Error parsing midi data/midi/v16/midi_sources/freemidi/genre-pop/Wings - Band On The Run.mid badly formed midi string: missing leading MTrk
Error parsing midi data/midi/v16/midi_sources/freemidi/genre-pop/Usher - You Got It Bad.mid badly formated midi bytes, got: b'RIFF\x80\x85\x00\x00RMIDdata\xe0\x84\x00\x00'
Error parsing midi data/midi/v16/midi_sources/freemidi/genre-pop/Wilson Phillips - Release.mid badly formed midi string: missing leading MTrk
Error parsing midi data/midi/v16/midi_sources/freemidi/genre-pop/Evanescence - Haunted.mid badly formated midi bytes, got: b'RIFF\x0e[\x00\x00RMIDdata\x01[\x00\x00'
Error parsing midi data/midi/v16/midi_sources/freemidi/genre-pop/Box Car Racer - Cat 

Error parsing midi data/midi/v16/midi_sources/midiworld/named_midi/Nirvana_-_Lounge_Act.mid badly formed midi string: missing leading MTrk
Error parsing midi data/midi/v16/midi_sources/midiworld/named_midi/Third_Eye_Blind_-_Semi_Charmed_Life.mid badly formated midi bytes, got: b'RIFFV\x0c\x01\x00RMIDdataI\x0c\x01\x00'
Timeout: 120 midi_sources/freemidi/genre-dance-eletric/Enigma - Variations.mid
Error parsing midi data/midi/v16/midi_sources/midiworld/named_midi/Nirvana_-_Heart-Shaped_Box.mid badly formed midi string: missing leading MTrk
Error parsing midi data/midi/v16/midi_sources/midiworld/named_midi/Nirvana_-_On_A_Plain.mid badly formed midi string: missing leading MTrk
Error parsing midi data/midi/v16/midi_sources/midiworld/named_midi/Nirvana_-_Mexican_Seafood.mid badly formed midi string: missing leading MTrk
Error parsing midi data/midi/v16/midi_sources/midiworld/named_midi/Nirvana_-_Been_A_Son.mid badly formed midi string: missing leading MTrk
Error parsing midi data/midi/v16/m

Error parsing midi data/midi/v16/midi_sources/from_mxl/wikifonia/George Harrison - Here Comes the Sun.mid badly formated midi bytes, got: b''
Error parsing midi data/midi/v16/midi_sources/from_mxl/wikifonia/Michel PLATRE - LE CHIEN.mid badly formated midi bytes, got: b''
Error parsing midi data/midi/v16/midi_sources/from_mxl/wikifonia/Carole King - You've Got a Friend.mid badly formated midi bytes, got: b''
Error parsing midi data/midi/v16/midi_sources/from_mxl/wikifonia/Jimmy Davis, Jimmy Sherman, Roger Ram Ramirez - Lover Man (Oh Where Can You Be?).mid badly formated midi bytes, got: b''
Error parsing midi data/midi/v16/midi_sources/from_mxl/wikifonia/Andr? Pat?, Marc Provance - A La P?che Aux Moules.mid badly formated midi bytes, got: b''
Error parsing midi data/midi/v16/midi_sources/from_mxl/wikifonia/George Gershwin, Ira Gershwin - Someone To Watch Over Me.mid badly formated midi bytes, got: b''
Error parsing midi data/midi/v16/midi_sources/from_mxl/wikifonia/Boby Lapointe - Arago

Error parsing midi data/midi/v16/midi_sources/from_mxl/wikifonia/Avishai Cohen - Remembering.mid badly formated midi bytes, got: b''
Error parsing midi data/midi/v16/midi_sources/from_mxl/wikifonia/Duke Ellington - Come Sunday.mid badly formated midi bytes, got: b''
Error parsing midi data/midi/v16/midi_sources/from_mxl/wikifonia/Harold Arlen, Johnny Mercer - One For My Baby.mid badly formated midi bytes, got: b''
Error parsing midi data/midi/v16/midi_sources/from_mxl/wikifonia/Willie Dixon - I Just Want To Make Love To You.mid badly formated midi bytes, got: b''
Error parsing midi data/midi/v16/midi_sources/from_mxl/wikifonia/Charles Mingus - Smooch.mid badly formated midi bytes, got: b''
Error parsing midi data/midi/v16/midi_sources/from_mxl/wikifonia/Mark Zanter - Question Mark.mid badly formated midi bytes, got: b''
Error parsing midi data/midi/v16/midi_sources/from_mxl/wikifonia/Words and music by Carol Hall - Hard Candy Christmas.mid badly formated midi bytes, got: b''
Error pars

Error parsing midi data/midi/v16/midi_sources/from_mxl/wikifonia/ Ray Davies - Sunny Afternoon.mid badly formated midi bytes, got: b''
Error parsing midi data/midi/v16/midi_sources/from_mxl/wikifonia/Duke Ellington, Johnny Hodges - I'm Beginning To See The Light.mid badly formated midi bytes, got: b''
Error parsing midi data/midi/v16/midi_sources/from_mxl/wikifonia/Josiah Booth, James Thomson - The Star O' Rabbie Burns.mid badly formated midi bytes, got: b''
Error parsing midi data/midi/v16/midi_sources/from_mxl/wikifonia/Benny Golson - Stablemates.mid badly formated midi bytes, got: b''
Error parsing midi data/midi/v16/midi_sources/from_mxl/wikifonia/Traditional - When the Saints Go Marching In.mid badly formated midi bytes, got: b''
Error parsing midi data/midi/v16/midi_sources/from_mxl/wikifonia/Maxime Le Forestier - San Francisco.mid badly formated midi bytes, got: b''
Error parsing midi data/midi/v16/midi_sources/from_mxl/wikifonia/Johnny Green - Body and Soul .mid badly formated 

Error parsing midi data/midi/v16/midi_sources/lmd_clean/Jackson Michael/Smooth Criminal.mid badly formated midi bytes, got: b'RIFF|\x9a\x00\x00RMIDdata\xb5\x99\x00\x00'
Error parsing midi data/midi/v16/midi_sources/lmd_clean/Jackson Michael/I'll Be There.mid badly formed midi string: missing leading MTrk
Error parsing midi data/midi/v16/midi_sources/lmd_clean/Jackson Michael/The Girl Is Mine.mid ord() expected string of length 1, but int found
Error parsing midi data/midi/v16/midi_sources/lmd_clean/Jackson Michael/Thriller.3.mid ord() expected string of length 1, but int found
Error parsing midi data/midi/v16/midi_sources/lmd_clean/Jackson Michael/The Girl Is Mine.3.mid ord() expected string of length 1, but int found
Error parsing midi data/midi/v16/midi_sources/lmd_clean/Jackson Michael/Don't Stop 'Til You Get Enough.mid ord() expected string of length 1, but int found
Error parsing midi data/midi/v16/midi_sources/lmd_clean/Mariah Carey/Long Ago.mid badly formed midi string: missing 

Error parsing midi data/midi/v16/midi_sources/130k_reddit/E/E/executor.mid cannot handle midi file format: <built-in function format>
Error parsing midi data/midi/v16/midi_sources/130k_reddit/E/E/EndreH - Teardrop.mid badly formated midi bytes, got: b'Rar!\x1a\x07\x00\xcf\x90s\x00\x00\r\x00\x00\x00\x00\x00\x00\x00'
Error parsing midi data/midi/v16/midi_sources/130k_reddit/E/E/erevshos.mid badly formated midi bytes, got: b'vti_encoding:SR|utf8'
Error parsing midi data/midi/v16/midi_sources/130k_reddit/E/E/EndOfInnocence.mid badly formed midi string: missing leading MTrk
Error parsing midi data/midi/v16/midi_sources/130k_reddit/E/E/EarthBound - Summers.mid badly formated midi bytes, got: b'ENIGMA BINARY FILE\x00\x00'
Error parsing midi data/midi/v16/midi_sources/130k_reddit/E/E/eyes.mid badly formed midi string: missing leading MTrk
Error parsing midi data/midi/v16/midi_sources/130k_reddit/E/E/everytime03.mid badly formed midi string: missing leading MTrk
Error parsing midi data/midi/v16

In [17]:
# Write the processed records to the output CSV, then show how many there are.
arr2csv(processed, out_csv)
len(processed)

196514

In [18]:
# Reload the freshly written output table (this rebinds `df`, replacing the
# source table loaded earlier).
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk. The saved output of this cell shows a truncated
# warning, presumably the mixed-type DtypeWarning that this option avoids.
df = pd.read_csv(out_csv, low_memory=False)
df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,song_url,section,numpy,ht_mode,midi,title,ht_time_signature,mxl,ht_offset,ht_bpm,ht_key,md5,midi_title,artist,genres,parts,source
0,https://www.hooktheory.com/theorytab/view/wayn...,chorus,piano_duet/hooktheory/pianoroll/w/wayne-sharpe...,1.0,midi_sources/hooktheory/pianoroll/w/wayne-shar...,yu-gi-oh-theme-song,4.0,,0.0,128.0,C,bf1f29e5ff84e3e93e37fb873bfb590e,yu-gi-oh3,wayne-sharpe,,"intro,chorus",hooktheory
1,https://www.hooktheory.com/theorytab/view/wayn...,intro,piano_duet/hooktheory/pianoroll/w/wayne-sharpe...,1.0,midi_sources/hooktheory/pianoroll/w/wayne-shar...,yu-gi-oh-theme-song,3.0,,0.0,85.0,C,055f80ad67f64edb14a85ca8fbfe8c29,yu-gi-oh,wayne-sharpe,,"intro,chorus",hooktheory
2,https://www.hooktheory.com/theorytab/view/what...,chorus,piano_duet/hooktheory/pianoroll/w/what-a-day/k...,6.0,midi_sources/hooktheory/pianoroll/w/what-a-day...,kiefer,4.0,,-5.0,96.0,D,197f96f5d181f6ce1e2c5ab04ac1ff87,kiefer,what-a-day,Jazz,chorus,hooktheory
3,https://www.hooktheory.com/theorytab/view/whit...,pre-chorus,piano_duet/hooktheory/pianoroll/w/whiteflame/s...,6.0,midi_sources/hooktheory/pianoroll/w/whiteflame...,senbonzakura,4.0,,-5.0,152.0,D,9e7ce13a35f1314423a9a6d5a5287a4a,senbonzakura - pre-Pre-Chorus,whiteflame,"J-Pop,Pop","verse,pre-chorus,chorus",hooktheory
4,https://www.hooktheory.com/theorytab/view/whit...,verse,piano_duet/hooktheory/pianoroll/w/whiteflame/s...,6.0,midi_sources/hooktheory/pianoroll/w/whiteflame...,senbonzakura,4.0,,-5.0,152.0,D,d5aaf79d0989222f1362f9f46c540a27,Senbonzakura,whiteflame,"J-Pop,Pop","verse,pre-chorus,chorus",hooktheory


In [20]:
# Number of rows whose 'numpy' entry is a string, i.e. records that carry an
# encoded-file path.
sum(isinstance(path, str) for path in df['numpy'].values)

112947