In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import sys
sys.path.insert(0, '../../')
from src.encode_data import *
from src.midi_data import *
from src.data_sources import process_parallel, transform_csv_row
from src.midi_transform import *

In [3]:
from tqdm import tqdm

## Standardize and reformat raw midi files before encoding to text
- Transform key to C major
- Remove unused instruments
- Combine multiple tracks with the same instrument into a single part
- Melody, Piano, String

### Load midi data

In [4]:
# Dataset version; selects the sub-directory of data_path used by this notebook
version = 'v10'
data_path = Path('data/midi')
# Root directory for this version: the csv paths and per-file paths below are built from it
version_path = data_path/version

In [5]:
import pandas as pd

In [6]:
# Directory names under version_path: raw inputs and encoded outputs.
# transform_func swaps '/{source_dir}/' for '/{out_dir}/' to derive each output path.
source_dir = 'midi_sources'
out_dir = 'midi_encode'
# Input metadata csv (one row per source file) and the csv written at the end of the notebook
source_csv = version_path/'metadata'/f'{source_dir}.csv'
out_csv = version_path/out_dir/f'{out_dir}.csv'
# Create the output directory up front so the final to_csv cannot fail on a missing folder
out_csv.parent.mkdir(parents=True, exist_ok=True)
# Display both paths as the cell output
source_csv, out_csv

(PosixPath('data/midi/v10/metadata/midi_sources.csv'),
 PosixPath('data/midi/v10/midi_encode/midi_encode.csv'))

In [7]:
num_comps = 2 # components per encoded event: note, duration (passed to seq2npenc)
cutoff = 4 # max instruments (passed to compress_midi_file)

In [8]:
# low_memory=False parses the file in one pass so every column gets a single
# inferred dtype; the default chunked parsing can produce mixed-type columns
# (pandas reports this as a DtypeWarning).
df = pd.read_csv(source_csv, low_memory=False); df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,midi,section,parts,title,midi_title,ht_key,source,md5,genres,ht_time_signature,song_url,ht_bpm,ht_offset,artist,ht_mode,mxl
0,midi_sources/hooktheory/pianoroll/w/wayne-shar...,chorus,"intro,chorus",yu-gi-oh-theme-song,yu-gi-oh3,C,hooktheory,bf1f29e5ff84e3e93e37fb873bfb590e,,4.0,https://www.hooktheory.com/theorytab/view/wayn...,128.0,0.0,wayne-sharpe,1.0,
1,midi_sources/hooktheory/pianoroll/w/wayne-shar...,intro,"intro,chorus",yu-gi-oh-theme-song,yu-gi-oh,C,hooktheory,055f80ad67f64edb14a85ca8fbfe8c29,,3.0,https://www.hooktheory.com/theorytab/view/wayn...,85.0,0.0,wayne-sharpe,1.0,
2,midi_sources/hooktheory/pianoroll/w/what-a-day...,chorus,chorus,kiefer,kiefer,D,hooktheory,197f96f5d181f6ce1e2c5ab04ac1ff87,Jazz,4.0,https://www.hooktheory.com/theorytab/view/what...,96.0,-5.0,what-a-day,6.0,
3,midi_sources/hooktheory/pianoroll/w/whiteflame...,pre-chorus,"verse,pre-chorus,chorus",senbonzakura,senbonzakura - pre-Pre-Chorus,D,hooktheory,9e7ce13a35f1314423a9a6d5a5287a4a,"J-Pop,Pop",4.0,https://www.hooktheory.com/theorytab/view/whit...,152.0,-5.0,whiteflame,6.0,
4,midi_sources/hooktheory/pianoroll/w/whiteflame...,verse,"verse,pre-chorus,chorus",senbonzakura,Senbonzakura,D,hooktheory,d5aaf79d0989222f1362f9f46c540a27,"J-Pop,Pop",4.0,https://www.hooktheory.com/theorytab/view/whit...,152.0,-5.0,whiteflame,6.0,


In [9]:
# One dict per csv row (column name -> value); each dict is the `metadata` argument of transform_func
all_records = df.to_dict(orient='records')

### Via Data Sources

In [10]:

def process_all(func, arr, total=None, max_workers=None, timeout=None):
    """Apply `func` to every item of `arr` in a process pool and collect the results.

    Args:
        func: Callable taking a single item of `arr`. It is submitted to a
            `ProcessPoolExecutor`, so it and its arguments must be picklable.
        arr: Collection of inputs; must support `len()` when `total` is None.
        total: Number of items shown by the progress bar. Defaults to `len(arr)`.
        max_workers: Forwarded to `ProcessPoolExecutor`.
        timeout: Forwarded to `concurrent.futures.as_completed`.

    Returns:
        List of the results that are not None, in completion order
        (not necessarily the order of `arr`).

    Raises:
        Whatever `func` raised for an item: the exception is re-raised by
        `f.result()` and aborts the collection.
    """
    if total is None: total = len(arr)
    results = []
    with ProcessPoolExecutor(max_workers=max_workers) as ex:
        # Submit everything first so the pool stays busy while results are drained
        futures = [ex.submit(func, o) for o in arr]
        completed = concurrent.futures.as_completed(futures, timeout=timeout)
        for f in progress_bar(completed, total=total):
            res = f.result()
            # None marks an item that `func` chose to skip
            if res is not None:
                results.append(res)
    return results

### Need better midi formatting

In [11]:
# Exploratory: display the music21 MidiFile class; this cell has no effect on the pipeline
music21.midi.MidiFile

music21.midi.MidiFile

In [24]:
def transform_func(metadata):
    """Encode one source file (MIDI or MusicXML) to a numpy array saved under `out_dir`.

    Args:
        metadata: dict describing one csv row. The 'midi' key is used when it
            holds a string; otherwise the 'mxl' key is used. Both are paths
            relative to `version_path`.

    Returns:
        A copy of `metadata` with an added 'numpy' key (output path relative to
        `version_path`), or None when the row is skipped: no usable path, input
        file missing, `compress_midi_file` returned a falsy value, sequence
        shorter than 32, or duration/note values outside the accepted range.

    Raises:
        Exception: if the input file extension is not .mid, .mxl or .xml.
    """
    # Part 1. Compress tracks/instruments
    # Prefer the midi source and fall back to the MusicXML one.
    rel_path = metadata.get('midi')
    if not isinstance(rel_path, str):
        rel_path = metadata.get('mxl')
    if not isinstance(rel_path, str):
        # Empty csv cells load as NaN/None; joining that onto a Path raises
        # TypeError, so skip the row like the other failure paths do.
        print('No midi or mxl path in metadata:', metadata)
        return None

    result = metadata.copy()
    input_path = version_path/rel_path
    extension = input_path.suffix.lower()
    if not input_path.exists():
        print('Input path does not exist:', input_path, metadata)
        return None
    print(input_path)

    # Get outfile and check if it exists
    out_file = Path(str(input_path).replace(f'/{source_dir}/', f'/{out_dir}/'))
    out_file = out_file.with_suffix('.npy')
    out_file.parent.mkdir(parents=True, exist_ok=True)
    if out_file.exists():
        # Already encoded on a previous run: reuse the cached file
        result['numpy'] = str(out_file.relative_to(version_path))
        return result

    if extension == '.mid':
        input_file = compress_midi_file(input_path, cutoff=cutoff) # remove non note tracks and standardize instruments
        if not input_file: return None
    elif extension in ['.mxl', '.xml']:
        input_file = input_path
    else:
        raise Exception('Error finding extension:', input_path, extension)

    # Part 2. Compress rests and long notes
    stream = file2stream(input_file) # 1.
    chordarr = stream2chordarr(stream, max_dur=128) # 2. max_dur = quarter_len * sample_freq (4). 128 = 8 bars

    chord_short = compress_chordarr(chordarr)
    delta_trim = chordarr.shape[0] - chord_short.shape[0]
    if delta_trim > 100: print(f'Removed {delta_trim} rests from {input_path}')
    chordarr = chord_short

    # Part 3. Chord array to numpy
    seq = chordarr2seq(chordarr)
    if len(seq) < 32:
        print('Sequence too short:', len(seq), input_path)
        return None

    npenc = seq2npenc(seq, num_comps=num_comps)
    # Column 1 is checked against the 128 duration cap (offset by ENC_OFFSET)
    if (npenc[...,1] > 128+ENC_OFFSET).any():
        print('npenc exceeds max 128 duration:', input_path)
        return None

    # Column 0: values above ENC_OFFSET must lie within 12 of neither end of the 0-127 range
    if ((npenc[...,0] > ENC_OFFSET) & ((npenc[...,0] < 12+ENC_OFFSET) | (npenc[...,0] >= 127-12+ENC_OFFSET))).any():
        print('npenc out of note range 12 - 116:', input_path)
        return None

    np.save(out_file, npenc)

    result['numpy'] = str(out_file.relative_to(version_path))
    return result

In [25]:
import random

In [26]:
# sanity check
for r in random.sample(all_records, 500):
    transform_func(r)
        

data/midi/v10/midi_sources/130k_reddit/F/F/FINESTLIGHT.mid
data/midi/v10/midi_sources/130k_reddit/Classical_Guitar_classicalguitarmidi.com_MIDIRip/Coste_Lecon_No27.mid
data/midi/v10/midi_sources/130k_reddit/W/W/WITHHELP.MID
data/midi/v10/midi_sources/hooktheory/pianoroll/j/junichi-masuda/trainer-battle---pokemon-diamond-and-pearl/chorus_key_original.mid
data/midi/v10/midi_sources/130k_reddit/C/C/chpn_op7_2.mid
data/midi/v10/midi_sources/lmd_clean/The Doors/Hello, I Love You, Won't You Tell Me Your Name?.mid
data/midi/v10/midi_sources/lmd_clean/Dave Matthews Band/Satellite.mid
data/midi/v10/midi_sources/hooktheory/pianoroll/t/the-tokens/the-lion-sleeps-tonight/pre-chorus-and-chorus_key_original.mid
data/midi/v10/midi_sources/130k_reddit/Classical_mfiles.co.uk_MIDIRip/sans-day-carol-piano.mid
data/midi/v10/midi_sources/lmd_clean/The Everly Brothers/Crying in the Rain.mid
data/midi/v10/midi_sources/130k_reddit/S/S/Super Mario RPG - Battle Victory.mid
data/midi/v10/midi_sources/130k_reddit

MidiException: badly formated midi bytes, got: b'RIFF\xde[\x00\x00RMIDdata\xd2[\x00\x00'

In [23]:
%debug

> [0;32m/home/ubuntu/anaconda3/envs/midi/lib/python3.7/pathlib.py[0m(633)[0;36m_parse_args[0;34m()[0m
[0;32m    631 [0;31m                [0mparts[0m [0;34m+=[0m [0ma[0m[0;34m.[0m[0m_parts[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    632 [0;31m            [0;32melse[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 633 [0;31m                [0ma[0m [0;34m=[0m [0mos[0m[0;34m.[0m[0mfspath[0m[0;34m([0m[0ma[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    634 [0;31m                [0;32mif[0m [0misinstance[0m[0;34m([0m[0ma[0m[0;34m,[0m [0mstr[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    635 [0;31m                    [0;31m# Force-cast str subclasses to str (issue #21127)[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m
ipdb> up
> [0;32m/home/ubuntu/anaconda3/envs/midi/lib/python3.7/pathlib.py[0m(679)[0;36m_make_child[0;34m()[0m
[0;32m    677 [0;31m[0;34m[0m[0m
[0m[0;32m    678 [0;31m    [0;32mde

In [19]:
%debug

> [0;32m/home/ubuntu/anaconda3/envs/midi/lib/python3.7/site-packages/music21/converter/__init__.py[0m(1123)[0;36mparse[0;34m()[0m
[0;32m   1121 [0;31m        [0;32mreturn[0m [0mparseData[0m[0;34m([0m[0mvalue[0m[0;34m,[0m [0mnumber[0m[0;34m=[0m[0mnumber[0m[0;34m,[0m [0;34m**[0m[0mkeywords[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m   1122 [0;31m    [0;31m# a midi string, must come before os.path.exists test[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m-> 1123 [0;31m    [0;32melif[0m [0;32mnot[0m [0misinstance[0m[0;34m([0m[0mvalue[0m[0;34m,[0m [0mbytes[0m[0;34m)[0m [0;32mand[0m [0mvalueStr[0m[0;34m.[0m[0mstartswith[0m[0;34m([0m[0;34m'MThd'[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m   1124 [0;31m        [0;32mreturn[0m [0mparseData[0m[0;34m([0m[0mvalue[0m[0;34m,[0m [0mnumber[0m[0;34m=[0m[0mnumber[0m[0;34m,[0m [0mformat[0m[0;34m=[0m[0mm21Format[0m[0;34m,[0m [0;34m**

In [None]:
### AS TODO: Fix 'info channel is not channel 0' error

In [None]:
# NOTE(review): `parallel_func` and `result` are not defined in any visible cell of this
# notebook, so this cell fails on a fresh kernel — TODO confirm the intended function and input.
# The next cell reads `idx2out` via .keys()/.values(), so a dict-like return value is expected.
idx2out = process_parallel(parallel_func, result, total=df.shape[0])

In [16]:
# Single-column frame of transform outputs: the column is named after out_dir and the
# index is the keys of idx2out (presumably row labels of df — confirm against process_parallel).
tdf = pd.DataFrame(
    {out_dir: [*idx2out.values()]},
    index=[*idx2out.keys()],
)

In [17]:
# Index-aligned left join (DataFrame.join default): every row of df is kept and rows
# with no entry in tdf get NaN in the new column
merged_df = df.join(tdf)

In [18]:
# Compare row counts: merged_df should have as many rows as df and one extra column
tdf.shape, df.shape, merged_df.shape

((166644, 1), (185846, 22), (185846, 23))

In [20]:
# Write the merged metadata without the index column, then preview the first rows
merged_df.to_csv(out_csv, index=False); merged_df.head()

Unnamed: 0,inferred_offset,song_url,instruments,ht_mode,midi_title,title,seconds,midi,inferred_key,quarter_length,...,ht_bpm,artist,ht_key,ht_time_signature,bpm,section,parts,genres,mxl,midi_transform
0,0.0,https://www.hooktheory.com/theorytab/view/wayn...,Piano,1.0,yu-gi-oh,yu-gi-oh-theme-song,25.411765,midi_sources/hooktheory/pianoroll/w/wayne-shar...,C major,36.0,...,85.0,wayne-sharpe,C,3.0,85.0,intro,"intro,chorus",,,midi_transform/hooktheory/pianoroll/w/wayne-sh...
1,0.0,https://www.hooktheory.com/theorytab/view/wayn...,"Piano,Piano",1.0,yu-gi-oh3,yu-gi-oh-theme-song,15.0,midi_sources/hooktheory/pianoroll/w/wayne-shar...,C major,32.0,...,128.0,wayne-sharpe,C,4.0,128.0,chorus,"intro,chorus",,,midi_transform/hooktheory/pianoroll/w/wayne-sh...
2,5.0,https://www.hooktheory.com/theorytab/view/what...,"Piano,Piano",1.0,kiefer,kiefer,10.0,midi_sources/hooktheory/pianoroll/w/what-a-day...,E minor,16.0,...,96.0,what-a-day,C,4.0,96.0,chorus,chorus,Jazz,,midi_transform/hooktheory/pianoroll/w/what-a-d...
3,,https://www.hooktheory.com/theorytab/view/weez...,,1.0,My New Song,beverly-hills,,midi_sources/hooktheory/pianoroll/w/weezer/bev...,,,...,128.0,weezer,C,4.0,,intro-and-verse,intro-and-verse,,,
4,0.0,https://www.hooktheory.com/theorytab/view/weez...,"Piano,Piano",1.0,Weezer - Fall Together,fall-together-,10.322581,midi_sources/hooktheory/pianoroll/w/weezer/fal...,A minor,16.0,...,93.0,weezer,C,4.0,93.0,chorus,chorus,Rock,,midi_transform/hooktheory/pianoroll/w/weezer/f...
