In [1]:
# Re-import edited modules automatically before each cell runs, so changes to the
# project's source files are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
import sys
sys.path.insert(0, '../../')
from src.encode_data import *
from src.midi_data import *
from src.data_sources import process_all, arr2csv
from src.midi_transform import *
from src.fastai_data import *

In [3]:
import traceback
import time

## Standardize and reformat raw MIDI files before encoding to text
- Transform key to C major
- Remove unused instruments
- Combine multiple tracks with the same instrument into a single part
- Melody, Piano, String

### Load midi data

In [4]:
# Dataset version tag; everything this notebook reads and writes lives under data/midi/<version>.
version = 'v16'
data_path = Path('data') / 'midi'
version_path = data_path.joinpath(version)

In [5]:
import pandas as pd

In [6]:
# Name of the output sub-directory under version_path; also used as the output CSV's file name.
out_dir = 'midi_encode'
# When True, transform_midi rejects files whose num_piano_tracks(...) is not 1 or 2.
duet_only = False
# Alternative configuration (piano duets only):
# out_dir = 'piano_duet'
# duet_only = True

In [7]:
# Input metadata CSV and output CSV locations, both rooted at version_path.
source_dir = 'midi_sources'
source_csv = version_path.joinpath('metadata', f'{source_dir}.csv')
out_csv = version_path.joinpath(out_dir, f'{out_dir}.csv')
# Make sure the output folder exists before anything is written into it.
out_csv.parent.mkdir(parents=True, exist_ok=True)
(source_csv, out_csv)

(PosixPath('data/midi/v16/metadata/midi_sources.csv'),
 PosixPath('data/midi/v16/midi_encode/midi_encode.csv'))

In [8]:
# Filtering thresholds that transform_midi forwards to compress_midi_file.
# num_comps = 2 # note, duration
cutoff = 5 # max instruments
min_variation = 3 # minimum number of different midi notes played
# max_dur = 128

### Encoding MIDI to NumPy

In [9]:
# low_memory=False makes pandas infer each column's dtype from the whole file instead of
# chunk by chunk, so columns with mixed content get one consistent dtype and the
# mixed-type warning emitted by chunked parsing is avoided.
df = pd.read_csv(source_csv, low_memory=False)
df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,ht_time_signature,ht_offset,midi,section,parts,ht_bpm,title,midi_title,artist,song_url,genres,source,ht_key,md5,mxl,ht_mode
0,4.0,0.0,midi_sources/hooktheory/pianoroll/w/wayne-shar...,chorus,"intro,chorus",128.0,yu-gi-oh-theme-song,yu-gi-oh3,wayne-sharpe,https://www.hooktheory.com/theorytab/view/wayn...,,hooktheory,C,bf1f29e5ff84e3e93e37fb873bfb590e,,1.0
1,3.0,0.0,midi_sources/hooktheory/pianoroll/w/wayne-shar...,intro,"intro,chorus",85.0,yu-gi-oh-theme-song,yu-gi-oh,wayne-sharpe,https://www.hooktheory.com/theorytab/view/wayn...,,hooktheory,C,055f80ad67f64edb14a85ca8fbfe8c29,,1.0
2,4.0,-5.0,midi_sources/hooktheory/pianoroll/w/what-a-day...,chorus,chorus,96.0,kiefer,kiefer,what-a-day,https://www.hooktheory.com/theorytab/view/what...,Jazz,hooktheory,D,197f96f5d181f6ce1e2c5ab04ac1ff87,,6.0
3,4.0,-5.0,midi_sources/hooktheory/pianoroll/w/whiteflame...,pre-chorus,"verse,pre-chorus,chorus",152.0,senbonzakura,senbonzakura - pre-Pre-Chorus,whiteflame,https://www.hooktheory.com/theorytab/view/whit...,"J-Pop,Pop",hooktheory,D,9e7ce13a35f1314423a9a6d5a5287a4a,,6.0
4,4.0,-5.0,midi_sources/hooktheory/pianoroll/w/whiteflame...,verse,"verse,pre-chorus,chorus",152.0,senbonzakura,Senbonzakura,whiteflame,https://www.hooktheory.com/theorytab/view/whit...,"J-Pop,Pop",hooktheory,D,d5aaf79d0989222f1362f9f46c540a27,,6.0


In [10]:
# One plain dict per CSV row, so every row can be passed to process_metadata on its own.
all_records = df.to_dict('records')
len(all_records)

197182

In [11]:
def process_metadata(metadata):
    """Encode the midi file referenced by one metadata record and save it as a .npy file.

    Parameters
    ----------
    metadata : dict
        One record of the source CSV. Only its 'midi' entry is read; it is joined
        onto `version_path` to locate the input file.

    Returns
    -------
    dict or None
        None when 'midi' is missing or not a string. Otherwise a copy of
        `metadata`. The copy carries a 'numpy' entry (path of the encoded array,
        relative to `version_path`) when an encoded file already existed or was
        just written. It is returned without that entry when the input file is
        missing or `transform_midi` returns None.
    """
    midi_rel = metadata.get('midi')
    # Records without a string midi path cannot be located on disk.
    if not isinstance(midi_rel, str): return None

    result = metadata.copy()
    input_path = version_path/midi_rel
    if not input_path.exists():
        print('Input path does not exist:', input_path, metadata)
        return result

    # Mirror the source tree under out_dir, swapping the extension for .npy.
    out_file = Path(str(input_path).replace(f'/{source_dir}/', f'/{out_dir}/'))
    out_file = out_file.with_suffix('.npy')
    out_file.parent.mkdir(parents=True, exist_ok=True)

    # Only encode when no output from a previous run is present.
    if not out_file.exists():
        npenc = transform_midi(input_path)
        if npenc is None: return result
        np.save(out_file, npenc)

    result['numpy'] = str(out_file.relative_to(version_path))
    return result

In [12]:
def transform_midi(midi_file):
    """Convert one midi file to its numpy encoding, or return None if it is rejected.

    The file goes through `compress_midi_file`, `file2stream`, `stream2chordarr`,
    rest trimming/shortening and `chordarr2npenc`; the result is returned only if
    `is_valid_npenc` accepts it. Parsing and chord-array errors are reported with
    `print` and turned into a None return.
    """
    src_path = midi_file

    # Step 1: filter out midi tracks (drums, repetitive instruments, etc.)
    try:
        if duet_only:
            if num_piano_tracks(src_path) not in [1, 2]:
                return None
        # remove non note tracks and standardize instruments
        compressed = compress_midi_file(src_path, min_variation=min_variation, cutoff=cutoff)
        if compressed is None:
            return None
    except Exception as err:
        message = str(err)
        # badly formatted midi errors are ignored without reporting
        if 'badly form' in message or 'out of range' in message:
            return None
        print('Error parsing midi', src_path, err)
        return None

    # Step 2: compress rests and long notes
    midi_stream = file2stream(compressed)
    try:
        # max_dur = quarter_len * sample_freq (4). 128 = 8 bars
        chords = stream2chordarr(midi_stream)
    except Exception as err:
        print('Could not encode to chordarr:', src_path, err)
        print(traceback.format_exc())
        return None

    # Step 3: compress song rests - songs with really long pauses are unwanted
    # (they appear because midi tracks were filtered out above).
    trimmed = trim_chordarr_rests(chords)
    shortened = shorten_chordarr_rests(trimmed)
    # Number of rest steps removed; only consumed by the disabled check below.
    delta_trim = trimmed.shape[0] - shortened.shape[0]
#     if delta_trim > 500:
#         print(f'Removed {delta_trim} rests from {input_path}. Skipping song')
#         return None

    # Step 4: chord array to numpy, keeping only encodings that pass validation
    encoded = chordarr2npenc(shortened)
    return encoded if is_valid_npenc(encoded, input_path=src_path) else None

In [13]:
def try_process_metadata(metadata):
    """Best-effort wrapper around `process_metadata` for the bulk run.

    Returns whatever `process_metadata` returns, or None if it raises. Nothing is
    printed on failure, so a large run does not flood the notebook output.
    Instead the traceback is recorded at DEBUG level, which is silent by default
    and can be enabled through the `logging` module when failures need
    investigating.
    """
    import logging  # local import so this cell does not depend on another cell's imports
    try:
        return process_metadata(metadata)
    except Exception:
        # Arguments are formatted lazily, i.e. only when DEBUG logging is enabled.
        logging.getLogger(__name__).debug('process_metadata failed for %r', metadata, exc_info=True)
        return None

In [14]:
# Sanity check: run a few records through process_metadata directly, so that any
# exception surfaces here instead of being swallowed by try_process_metadata.
import random
# Fixed seed -> the same records are checked on every run; min() guards short record lists.
for r in random.Random(0).sample(all_records, min(10, len(all_records))):
    process_metadata(r)

In [15]:
def timeout_func(data, seconds):
    """Print a one-line notice for a record: the `seconds` value and the record's 'midi' entry.

    Handed to `process_all` as its `timeout_func` argument.
    """
    midi_path = data.get('midi')
    print("Timeout:", seconds, midi_path)

In [16]:
# Run every record through the exception-swallowing wrapper.
# NOTE(review): timeout is presumably a per-record limit in seconds - confirm in src.data_sources.process_all.
timeout = 500
processed = process_all(
    try_process_metadata,
    all_records,
    timeout=timeout,
    timeout_func=timeout_func,
)

npenc exceeds max 481 duration: 486 data/midi/v16/midi_sources/freemidi/genre-pop/Roxette - Hotblooded.mid
npenc exceeds max 481 duration: 486 data/midi/v16/midi_sources/freemidi/genre-pop/Peter Gabriel - The Family And The Fishing Net.mid
Error parsing midi data/midi/v16/midi_sources/freemidi/genre-pop/PSY - gangnam style KILLYURSLEF.mid ord() expected string of length 1, but int found
Error parsing midi data/midi/v16/midi_sources/midiworld/named_midi/Nirvana_-_Drain_You.mid cannot handle ticks per frame: 77
npenc exceeds max 481 duration: 486 data/midi/v16/midi_sources/midiworld/named_midi/Stravinsky_Igor_-_Stravinsky_-_rite_of_spring_2.mid
Error parsing midi data/midi/v16/midi_sources/lmd_clean/The Police/So Lonely.1.mid ord() expected string of length 1, but int found
npenc exceeds max 481 duration: 486 data/midi/v16/midi_sources/lmd_clean/Nine Inch Nails/A Warm Place.1.mid
Could not encode to chordarr: data/midi/v16/midi_sources/lmd_clean/Rob Zombie/Demonoid Phenomenon.mid 
Traceb

Error parsing midi data/midi/v16/midi_sources/130k_reddit/C/C/cloak.mid cannot handle midi file format: <built-in function format>
Error parsing midi data/midi/v16/midi_sources/130k_reddit/A/A/animals-house_of_the_rising_sun.mid ord() expected string of length 1, but int found
Error parsing midi data/midi/v16/midi_sources/130k_reddit/A/A/always04.mid ord() expected string of length 1, but int found
Error parsing midi data/midi/v16/midi_sources/130k_reddit/V/V/Voivod-'Nothingface'.mid ord() expected string of length 1, but int found
Error parsing midi data/midi/v16/midi_sources/130k_reddit/D/D/Dark_Forces.mid cannot handle midi file format: <built-in function format>
Error parsing midi data/midi/v16/midi_sources/130k_reddit/D/D/dfdemo.mid cannot handle midi file format: <built-in function format>
npenc exceeds max 481 duration: 484 data/midi/v16/midi_sources/130k_reddit/D/D/darude.mid
Error parsing midi data/midi/v16/midi_sources/130k_reddit/D/D/DION.Move any mountain.mid ord() expected

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [17]:
# Write the processed records to the output CSV, then display how many there are.
arr2csv(processed, out_csv)
len(processed)

196536

In [18]:
# Read the freshly written CSV back for inspection. low_memory=False makes pandas infer
# each column's dtype from the whole file rather than chunk by chunk, which avoids the
# mixed-type warning and gives every column one consistent dtype.
df = pd.read_csv(out_csv, low_memory=False)
df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,genres,source,section,ht_offset,midi,mxl,song_url,ht_time_signature,midi_title,title,artist,ht_key,ht_bpm,numpy,md5,ht_mode,parts
0,,hooktheory,chorus,0.0,midi_sources/hooktheory/pianoroll/w/wayne-shar...,,https://www.hooktheory.com/theorytab/view/wayn...,4.0,yu-gi-oh3,yu-gi-oh-theme-song,wayne-sharpe,C,128.0,midi_encode/hooktheory/pianoroll/w/wayne-sharp...,bf1f29e5ff84e3e93e37fb873bfb590e,1.0,"intro,chorus"
1,,hooktheory,intro,0.0,midi_sources/hooktheory/pianoroll/w/wayne-shar...,,https://www.hooktheory.com/theorytab/view/wayn...,3.0,yu-gi-oh,yu-gi-oh-theme-song,wayne-sharpe,C,85.0,midi_encode/hooktheory/pianoroll/w/wayne-sharp...,055f80ad67f64edb14a85ca8fbfe8c29,1.0,"intro,chorus"
2,Jazz,hooktheory,chorus,-5.0,midi_sources/hooktheory/pianoroll/w/what-a-day...,,https://www.hooktheory.com/theorytab/view/what...,4.0,kiefer,kiefer,what-a-day,D,96.0,midi_encode/hooktheory/pianoroll/w/what-a-day/...,197f96f5d181f6ce1e2c5ab04ac1ff87,6.0,chorus
3,"J-Pop,Pop",hooktheory,pre-chorus,-5.0,midi_sources/hooktheory/pianoroll/w/whiteflame...,,https://www.hooktheory.com/theorytab/view/whit...,4.0,senbonzakura - pre-Pre-Chorus,senbonzakura,whiteflame,D,152.0,midi_encode/hooktheory/pianoroll/w/whiteflame/...,9e7ce13a35f1314423a9a6d5a5287a4a,6.0,"verse,pre-chorus,chorus"
4,"J-Pop,Pop",hooktheory,verse,-5.0,midi_sources/hooktheory/pianoroll/w/whiteflame...,,https://www.hooktheory.com/theorytab/view/whit...,4.0,Senbonzakura,senbonzakura,whiteflame,D,152.0,midi_encode/hooktheory/pianoroll/w/whiteflame/...,d5aaf79d0989222f1362f9f46c540a27,6.0,"verse,pre-chorus,chorus"


In [19]:
# Count the rows whose 'midi' entry is a string.
sum(isinstance(path, str) for path in df['midi'].values)

196536

In [20]:
# Count the rows whose 'numpy' entry is a string, i.e. rows that have an encoded file path.
sum(isinstance(path, str) for path in df['numpy'].values)

164776

In [21]:
# Per-source tally of rows that have a 'midi' value.
Counter(df.loc[df['midi'].notna(), 'source'].values)

Counter({'hooktheory': 19882,
         'freemidi': 5167,
         'midiworld': 4109,
         'ecomp': 2508,
         'cprato': 312,
         'classical_piano': 329,
         'classical_archives': 14642,
         'musescore': 10935,
         'wikifonia': 6346,
         'lmd': 13564,
         'reddit': 98666,
         'hooktheory_c': 20076})

In [22]:
# Per-source tally of rows that have a 'numpy' value (an encoded file path).
Counter(df.loc[df['numpy'].notna(), 'source'].values)

Counter({'hooktheory': 19404,
         'freemidi': 3974,
         'midiworld': 3461,
         'ecomp': 2508,
         'cprato': 308,
         'classical_piano': 329,
         'classical_archives': 14627,
         'musescore': 10894,
         'wikifonia': 6195,
         'lmd': 9175,
         'reddit': 74339,
         'hooktheory_c': 19562})