In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import sys
sys.path.insert(0, '../../')
from src.encode_data import *
from src.midi_data import *
from src.data_sources import process_all, arr2csv
from src.midi_transform import *
from src.fastai_data import *

In [3]:
import traceback
import time

## Standardize and reformat raw MIDI files before encoding to text
- Transform key to C major
- Remove unused instruments
- Combine multiple tracks with the same instrument into a single part
- Melody, Piano, String

### Load MIDI data

In [4]:
# Dataset version and the directories derived from it.
version = 'v18'
data_path = Path('data/midi')
version_path = data_path.joinpath(version)

In [5]:
import pandas as pd

In [6]:
# Directory names embed SAMPLE_FREQ; out_dir is the s2s_encode folder
# beneath that sample-frequency directory (relative to version_path).
sf_path = f'sf{SAMPLE_FREQ}'
out_dir = Path(f'{sf_path}/s2s_encode')

In [7]:
# The metadata CSV lives inside the output directory and is named after it.
out_csv_dir = version_path/out_dir
out_csv = out_csv_dir/f'{out_dir.name}.csv'
out_csv_dir.mkdir(parents=True, exist_ok=True)
out_csv

PosixPath('data/midi/v18/sf4/s2s_encode/s2s_encode.csv')

### Encoding MIDI to numpy

In [8]:
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids the mixed-dtype warning and leaves
# columns with consistent value types.
df = pd.read_csv(out_csv, low_memory=False)
df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,ht_mode,title,genres,source,section,midi,artist,parts,ht_bpm,midi_title,ht_time_signature,ht_key,ht_offset,md5,song_url,numpy,mxl
0,1.0,yu-gi-oh-theme-song,,hooktheory,chorus,midi_sources/hooktheory/pianoroll/w/wayne-shar...,wayne-sharpe,"intro,chorus",128.0,yu-gi-oh3,4.0,C,0.0,bf1f29e5ff84e3e93e37fb873bfb590e,https://www.hooktheory.com/theorytab/view/wayn...,sf4/s2s_encode/hooktheory/pianoroll/w/wayne-sh...,
1,1.0,yu-gi-oh-theme-song,,hooktheory,intro,midi_sources/hooktheory/pianoroll/w/wayne-shar...,wayne-sharpe,"intro,chorus",85.0,yu-gi-oh,3.0,C,0.0,055f80ad67f64edb14a85ca8fbfe8c29,https://www.hooktheory.com/theorytab/view/wayn...,,
2,6.0,kiefer,Jazz,hooktheory,chorus,midi_sources/hooktheory/pianoroll/w/what-a-day...,what-a-day,chorus,96.0,kiefer,4.0,D,-5.0,197f96f5d181f6ce1e2c5ab04ac1ff87,https://www.hooktheory.com/theorytab/view/what...,sf4/s2s_encode/hooktheory/pianoroll/w/what-a-d...,
3,6.0,senbonzakura,"J-Pop,Pop",hooktheory,pre-chorus,midi_sources/hooktheory/pianoroll/w/whiteflame...,whiteflame,"verse,pre-chorus,chorus",152.0,senbonzakura - pre-Pre-Chorus,4.0,D,-5.0,9e7ce13a35f1314423a9a6d5a5287a4a,https://www.hooktheory.com/theorytab/view/whit...,sf4/s2s_encode/hooktheory/pianoroll/w/whitefla...,
4,6.0,senbonzakura,"J-Pop,Pop",hooktheory,verse,midi_sources/hooktheory/pianoroll/w/whiteflame...,whiteflame,"verse,pre-chorus,chorus",152.0,Senbonzakura,4.0,D,-5.0,d5aaf79d0989222f1362f9f46c540a27,https://www.hooktheory.com/theorytab/view/whit...,sf4/s2s_encode/hooktheory/pianoroll/w/whitefla...,


In [9]:
# One dict per CSV row, keyed by column name.
all_records = df.to_dict('records')
len(all_records)

196499

In [10]:
import pdb

In [11]:
def enc_tfm(np_array):
    """Apply ``to_single_stream`` and then ``position_tfm`` to ``np_array``."""
    single_stream = to_single_stream(np_array)
    return position_tfm(single_stream)

In [12]:

def partenc2seq2seq(part_np, part_type, vocab=vocab):
    """Encode one part as a seq2seq stream.

    The part is converted with ``to_single_stream`` using a two-value start
    sequence (``vocab.stoi[part_type]`` followed by ``vocab.pad_idx``), padded
    with one trailing ``vocab.stoi[EOS]`` value, and finally passed through
    ``position_tfm``.
    """
    start_tokens = np.array([vocab.stoi[part_type], vocab.pad_idx])
    stream = to_single_stream(part_np, start_seq=start_tokens)
    # pad width (0, 1): nothing in front, a single EOS index at the end
    stream = np.pad(stream, (0, 1), mode='constant', constant_values=vocab.stoi[EOS])
    return position_tfm(stream)

In [13]:

def avg_pitch(t, sep_idx=VALTSEP):
    """Mean of column 0 over the rows of ``t`` whose column-0 value exceeds ``sep_idx``."""
    selected_rows = t[t[:, 0] > sep_idx]
    return selected_rows[:, 0].mean()

In [14]:
def process_metadata(metadata):
    """Re-encode one record's numpy file from the ``old`` tree into ``version_path``.

    Loads ``version_path/'old'/metadata['numpy']``, converts it, and saves the
    result to ``version_path/metadata['numpy']``. The record is skipped when
    its ``'numpy'`` entry is not a string, when the input file does not exist,
    or when the output file already exists.

    Args:
        metadata: record with ``.get`` and item access whose ``'numpy'`` entry
            is a path relative to ``version_path``.

    Returns:
        Always ``None``; the only effect is the file written to disk.
    """
    rel_path = metadata.get('numpy')
    # Records without an encoded file have a non-string (e.g. NaN) value here.
    if not isinstance(rel_path, str): return None

    input_path = version_path/'old'/rel_path
    output_path = version_path/rel_path

    if not input_path.exists(): return None
    # Existing outputs are left untouched, so re-running skips finished records.
    if output_path.exists(): return None
    output_path.parent.mkdir(parents=True, exist_ok=True)

    np_array = np.load(input_path)

    # NOTE(review): a leading dimension of 2 is treated as "two parts"; a
    # single-part array with exactly two rows would also take this branch — confirm.
    if np_array.shape[0] == 2:
        # s2s encoding
        # order by melody > chords
        first, second = np_array
        # Assuming melody has higher pitch
        if avg_pitch(first) > avg_pitch(second):
            melody, chords = first, second
        else:
            melody, chords = second, first
        m = partenc2seq2seq(melody, MSEQ)
        c = partenc2seq2seq(chords, CSEQ)
        # NOTE(review): if m and c differ in shape this builds a ragged array,
        # which recent NumPy versions reject without dtype=object — confirm.
        np.save(output_path, np.array((m, c)))
        return None

    # Single-part input: same transform as the enc_tfm helper.
    np.save(output_path, enc_tfm(np_array))
    return None

In [15]:
# for r in progress_bar(all_records, total=len(all_records)):
#     process_metadata(r)

In [16]:
# # sanity check
# Sanity check: run the conversion on 10 randomly sampled records.
# No seed is set in this cell, so the sample can differ between runs.
import random
for r in random.sample(all_records, k=10):
    process_metadata(r)

In [21]:
def timeout_func(data, seconds):
    """Print a one-line timeout report: the seconds value and the record's ``'midi'`` entry."""
    midi_path = data.get('midi')
    print("Timeout:", seconds, midi_path)

In [None]:
# Run the conversion over every record; timeout_func is handed to
# process_all as its timeout callback.
timeout = 500
processed = process_all(
    process_metadata,
    all_records,
    timeout=timeout,
    timeout_func=timeout_func,
)

In [22]:
# NOTE(review): this sample sits under sf4/midi_encode, not the s2s_encode
# out_dir configured above — confirm this is the intended file.
test_npy = 'sf4/midi_encode/hooktheory/pianoroll/w/wayne-sharpe/yu-gi-oh-theme-song/chorus_key_original.npy'
test = np.load(version_path/test_npy)

In [29]:
# Peek at the first 20 entries of column 0 of the loaded array.
test[:20,0]

array([  0,   1,   8, 138,  64, 138,   8, 138,  63, 138,   8, 138,  54, 138,   8, 138,  62, 138,   8, 138])

In [30]:
# Run to_double_stream on the same 20-entry slice of column 0 to inspect its output.
to_double_stream(test[:20, 0])

array([[-1,  1],
       [55,  1],
       [-1,  1],
       [54,  1],
       [-1,  1],
       [45,  1],
       [-1,  1],
       [53,  1],
       [-1,  1]])

In [31]:
# First 20 entries of column 1 of the loaded array.
test[:20, 1]

array([ 0,  0,  0,  0, -1, -1, -1, -1, -2, -2, -2, -2, -3, -3, -3, -3, -4, -4, -4, -4])

In [20]:
# Inspect the first record dict.
all_records[0]

{'song_url': 'https://www.hooktheory.com/theorytab/view/wayne-sharpe/yu-gi-oh-theme-song',
 'md5': 'bf1f29e5ff84e3e93e37fb873bfb590e',
 'mxl': nan,
 'genres': nan,
 'ht_offset': 0.0,
 'ht_key': 'C',
 'section': 'chorus',
 'source': 'hooktheory',
 'ht_bpm': 128.0,
 'artist': 'wayne-sharpe',
 'ht_mode': 1.0,
 'numpy': 'sf4/midi_encode/hooktheory/pianoroll/w/wayne-sharpe/yu-gi-oh-theme-song/chorus_key_original.npy',
 'midi_title': 'yu-gi-oh3',
 'title': 'yu-gi-oh-theme-song',
 'midi': 'midi_sources/hooktheory/pianoroll/w/wayne-sharpe/yu-gi-oh-theme-song/chorus_key_original.mid',
 'parts': 'intro,chorus',
 'ht_time_signature': 4.0}

In [None]:
# Write the results returned by process_all to out_csv, then show how many there are.
arr2csv(processed, out_csv)
len(processed)

In [None]:
# Reload the CSV. low_memory=False makes pandas infer each column's dtype
# from the whole file rather than chunk by chunk, avoiding mixed-dtype warnings.
df = pd.read_csv(out_csv, low_memory=False)
df.head()

In [19]:
# Number of rows whose 'midi' value is a string.
sum(isinstance(s, str) for s in df['midi'].values)

196531

In [20]:
# Number of rows whose 'numpy' value is a string.
sum(isinstance(s, str) for s in df['numpy'].values)

164774

In [38]:
# Same count of string-valued 'numpy' rows as the previous cell
# (the saved output comes from a different execution).
sum(isinstance(s, str) for s in df['numpy'].values)

112169

In [39]:
# Per-source counts of rows that have a non-null 'midi' value.
Counter(df.loc[df['midi'].notnull(), 'source'].values)

Counter({'hooktheory': 19882,
         'freemidi': 5168,
         'midiworld': 4108,
         'ecomp': 2508,
         'cprato': 312,
         'classical_piano': 329,
         'classical_archives': 14642,
         'musescore': 10933,
         'wikifonia': 6346,
         'lmd': 13565,
         'reddit': 98674,
         'hooktheory_c': 20076})

In [22]:
# Per-source counts of rows that have a non-null 'numpy' value.
Counter(df.loc[df['numpy'].notnull(), 'source'].values)

Counter({'hooktheory': 19404,
         'freemidi': 3974,
         'midiworld': 3460,
         'ecomp': 2508,
         'cprato': 308,
         'classical_piano': 329,
         'classical_archives': 14627,
         'musescore': 10891,
         'wikifonia': 6195,
         'lmd': 9175,
         'reddit': 74341,
         'hooktheory_c': 19562})