In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import sys
sys.path.insert(0, '../../')
from src.encode_data import *
from src.midi_data import *
# from src.data_sources import process_parallel, transform_csv_row
from src.midi_transform import *
from concurrent.futures import ProcessPoolExecutor
from fastprogress.fastprogress import master_bar, progress_bar

In [3]:
from tqdm import tqdm

## Standardize and reformat raw midi files before encoding to text
- Transform key to C major
- Remove unused instruments
- Combine multiple tracks with the same instrument into a single part
- Melody, Piano, String

### Load midi data

In [4]:
version = 'v10'
data_path = Path('data/midi')
version_path = data_path/version

In [5]:
import pandas as pd

In [6]:
source_dir = 'midi_sources'
out_dir = 'midi_encode'
source_csv = version_path/'metadata'/f'{source_dir}.csv'
out_csv = version_path/out_dir/f'{out_dir}.csv'
out_csv.parent.mkdir(parents=True, exist_ok=True)
source_csv, out_csv

(PosixPath('data/midi/v10/metadata/midi_sources.csv'),
 PosixPath('data/midi/v10/midi_encode/midi_encode.csv'))

In [7]:
num_comps = 2 # note, duration
cutoff = 4 # max instruments

In [8]:
df = pd.read_csv(source_csv); df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,song_url,ht_bpm,ht_time_signature,midi,genres,artist,md5,section,ht_key,source,parts,midi_title,title,ht_offset,ht_mode,mxl
0,https://www.hooktheory.com/theorytab/view/wayn...,128.0,4.0,midi_sources/hooktheory/pianoroll/w/wayne-shar...,,wayne-sharpe,bf1f29e5ff84e3e93e37fb873bfb590e,chorus,C,hooktheory,"intro,chorus",yu-gi-oh3,yu-gi-oh-theme-song,0.0,1.0,
1,https://www.hooktheory.com/theorytab/view/wayn...,85.0,3.0,midi_sources/hooktheory/pianoroll/w/wayne-shar...,,wayne-sharpe,055f80ad67f64edb14a85ca8fbfe8c29,intro,C,hooktheory,"intro,chorus",yu-gi-oh,yu-gi-oh-theme-song,0.0,1.0,
2,https://www.hooktheory.com/theorytab/view/what...,96.0,4.0,midi_sources/hooktheory/pianoroll/w/what-a-day...,Jazz,what-a-day,197f96f5d181f6ce1e2c5ab04ac1ff87,chorus,D,hooktheory,chorus,kiefer,kiefer,-5.0,6.0,
3,https://www.hooktheory.com/theorytab/view/whit...,152.0,4.0,midi_sources/hooktheory/pianoroll/w/whiteflame...,"J-Pop,Pop",whiteflame,9e7ce13a35f1314423a9a6d5a5287a4a,pre-chorus,D,hooktheory,"verse,pre-chorus,chorus",senbonzakura - pre-Pre-Chorus,senbonzakura,-5.0,6.0,
4,https://www.hooktheory.com/theorytab/view/whit...,152.0,4.0,midi_sources/hooktheory/pianoroll/w/whiteflame...,"J-Pop,Pop",whiteflame,d5aaf79d0989222f1362f9f46c540a27,verse,D,hooktheory,"verse,pre-chorus,chorus",Senbonzakura,senbonzakura,-5.0,6.0,


In [9]:
ecomp_records = df.loc[df.source == 'ecomp'].to_dict(orient='records')

In [10]:
all_records = df.to_dict(orient='records')

### Via Data Sources

In [11]:
import concurrent

In [12]:

def process_all(func, arr, total=None, max_workers=None, timeout=None):
    "Process array in parallel"
    if total is None: total = len(arr)
    results = []
    with ProcessPoolExecutor(max_workers=max_workers) as ex:
        futures = [ex.submit(func,o) for i,o in enumerate(arr)]
        for f in tqdm(concurrent.futures.as_completed(futures, timeout=timeout), total=total):
            res = f.result(timeout=timeout)
            if res is not None:
                results.append(res)
    return results

### Need better midi formatting

In [13]:
def transform_func(metadata):
    result = metadata.copy()
    
    # Part 1. Compress tracks/instruments
    if isinstance(metadata.get('midi'), str):
        input_path = version_path/metadata['midi']
    else:
        input_path = version_path/metadata.get('mxl', None)
    extension = input_path.suffix.lower()
    if not input_path.exists(): 
        print('Input path does not exist:', input_path, metadata)
        return None
    
    # Get outfile and check if it exists
    out_file = Path(str(input_path).replace(f'/{source_dir}/', f'/{out_dir}/'))
    out_file = out_file.with_suffix('.npy')
    out_file.parent.mkdir(parents=True, exist_ok=True)
    if out_file.exists(): 
        result['numpy'] = str(out_file.relative_to(version_path))
        return result
    return None
#     print(input_path)
    if extension == '.mid':
        try: input_file = compress_midi_file(input_path, cutoff=cutoff) # remove non note tracks and standardize instruments
        except music21.midi.MidiException as e:
            print('Error parsing midi', input_path, e)
            return None
        if not input_file: return None
    elif extension in ['.mxl', '.xml']:
        input_file = input_path
    else:
        raise Exception('Error finding extension:', input_path, extension)
        
    # Part 2. Compress rests and long notes
    stream = file2stream(input_file) # 1.
#     chordarr = stream2chordarr(stream, max_dur=128, flat=True) # 2. max_dur = quarter_len * sample_freq (4). 128 = 8 bars
    chordarr = stream2chordarr(stream, max_dur=128, flat=(extension != '.mid')) # 2. max_dur = quarter_len * sample_freq (4). 128 = 8 bars
    
    chord_short = compress_chordarr(chordarr)
    delta_trim = chordarr.shape[0] - chord_short.shape[0]
    if delta_trim > 100: print(f'Removed {delta_trim} rests from {input_path}')
    chordarr = chord_short
    
    # Part 3. Chord array to numpy
    seq = chordarr2seq(chordarr)
    if len(seq) < 32:
        print('Sequence too short:', len(seq), input_path)
        return None
    
    npenc = seq2npenc(seq, num_comps=num_comps)
    if (npenc[...,1] > 128+ENC_OFFSET).any(): 
        print('npenc exceeds max 128 duration:', input_path)
        return None
    
    if ((npenc[...,0] > ENC_OFFSET) & ((npenc[...,0] < 12+ENC_OFFSET) | (npenc[...,0] >= 127-12+ENC_OFFSET))).any(): 
        print('npenc out of note range 12 - 116:', input_path)
        return None
    
    
    np.save(out_file, npenc)
    
    result['numpy'] = str(out_file.relative_to(version_path))
    return result

In [14]:
# # sanity check
# import random
# for r in random.sample(ecomp_records, 500):
#     transform_func(r)
        

In [15]:
### AS TOD: Fix 'info channel is not channel 0' error

In [16]:
# all_records = all_records[60000:]

In [17]:
processed = process_all(transform_func, all_records)

100%|██████████| 181716/181716 [00:54<00:00, 3319.88it/s]


In [18]:
from src.data_sources import arr2csv

In [19]:
arr2csv(processed, out_csv)

In [20]:
df = pd.read_csv(out_csv)

  interactivity=interactivity, compiler=compiler, result=result)


In [21]:
df.head()

Unnamed: 0,ht_bpm,md5,numpy,ht_time_signature,midi_title,source,artist,ht_offset,section,song_url,mxl,ht_key,parts,title,genres,midi,ht_mode
0,128.0,bf1f29e5ff84e3e93e37fb873bfb590e,midi_encode/hooktheory/pianoroll/w/wayne-sharp...,4.0,yu-gi-oh3,hooktheory,wayne-sharpe,0.0,chorus,https://www.hooktheory.com/theorytab/view/wayn...,,C,"intro,chorus",yu-gi-oh-theme-song,,midi_sources/hooktheory/pianoroll/w/wayne-shar...,1.0
1,85.0,055f80ad67f64edb14a85ca8fbfe8c29,midi_encode/hooktheory/pianoroll/w/wayne-sharp...,3.0,yu-gi-oh,hooktheory,wayne-sharpe,0.0,intro,https://www.hooktheory.com/theorytab/view/wayn...,,C,"intro,chorus",yu-gi-oh-theme-song,,midi_sources/hooktheory/pianoroll/w/wayne-shar...,1.0
2,96.0,197f96f5d181f6ce1e2c5ab04ac1ff87,midi_encode/hooktheory/pianoroll/w/what-a-day/...,4.0,kiefer,hooktheory,what-a-day,-5.0,chorus,https://www.hooktheory.com/theorytab/view/what...,,D,chorus,kiefer,Jazz,midi_sources/hooktheory/pianoroll/w/what-a-day...,6.0
3,152.0,9e7ce13a35f1314423a9a6d5a5287a4a,midi_encode/hooktheory/pianoroll/w/whiteflame/...,4.0,senbonzakura - pre-Pre-Chorus,hooktheory,whiteflame,-5.0,pre-chorus,https://www.hooktheory.com/theorytab/view/whit...,,D,"verse,pre-chorus,chorus",senbonzakura,"J-Pop,Pop",midi_sources/hooktheory/pianoroll/w/whiteflame...,6.0
4,152.0,d5aaf79d0989222f1362f9f46c540a27,midi_encode/hooktheory/pianoroll/w/whiteflame/...,4.0,Senbonzakura,hooktheory,whiteflame,-5.0,verse,https://www.hooktheory.com/theorytab/view/whit...,,D,"verse,pre-chorus,chorus",senbonzakura,"J-Pop,Pop",midi_sources/hooktheory/pianoroll/w/whiteflame...,6.0
