# Tokenize and save data

In [1]:
# Reload edited project modules automatically so changes under src/ are picked up live.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
import os

# Move the working directory up two levels so the `src.*` imports and the relative
# `data/...` paths used below resolve. The guard keeps the cell idempotent: an
# unconditional chdir('../../') climbs two more levels every time the cell is re-run.
# NOTE(review): assumes the target directory is the one containing `src/` — confirm.
if not os.path.isdir('src'):
    os.chdir('../../')

In [3]:
from src.numpy_encode import *
from src.utils.file_processing import process_all, process_file
from src.config import *
from src.music_transformer import *

In [4]:
# from fastai.text import *

## Preparing the data

In [5]:
# Dataset revision tag and the directory tree derived from it.
version = 'v19'
data_path = Path('data') / 'midi'
version_path = data_path / version

In [6]:
# Folder under version_path to process; 'midi_encode' is the alternative that was used here.
source_dir = 'piano_duet'
out_path = version_path / source_dir
# The index CSV carries the same name as the folder that holds it.
csv_path = out_path / f'{source_dir}.csv'
# List what this data version contains.
version_path.ls()

[PosixPath('data/midi/v19/metadata'),
 PosixPath('data/midi/v19/midi_sources'),
 PosixPath('data/midi/v19/midi_encode'),
 PosixPath('data/midi/v19/piano_duet'),
 PosixPath('data/midi/v19/s2s_encode')]

In [7]:
# low_memory=False has pandas infer each column's dtype from the whole file rather than
# chunk by chunk. The leftover output under this cell looks like the tail of pandas'
# mixed-type DtypeWarning, which the chunked default parse produces.
csv = pd.read_csv(csv_path, low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [11]:
def _strip_sf4(entry):
    """Remove every 'sf4/' substring from a string cell; pass other values through."""
    if isinstance(entry, str):
        return entry.replace('sf4/', '')
    return entry

# Rewrite the numpy paths in place, then persist the table back over the same CSV.
csv['numpy'] = csv['numpy'].apply(_strip_sf4)
csv.to_csv(csv_path, index=False)

In [12]:
# Keep only the rows that have a numpy path recorded.
csv = csv.dropna(subset=['numpy'])

In [13]:
def create_databunch(files, data_save_name, path=out_path):
    """Load the databunch cached at `path/data_save_name`, or build and cache it.

    Args:
        files: file paths handed to `MusicDataBunch.from_files` (this notebook
            passes the output of `get_files`). Ignored when the cache exists.
        data_save_name: cache file name relative to `path`, e.g. 'cached/all.pkl'.
        path: base directory for the databunch and its cache. Defaults to the
            value `out_path` had when this cell was executed.

    Returns:
        The databunch read back by `load_data` when the cache file exists,
        otherwise the newly built one (saved before it is returned).
    """
    save_file = path/data_save_name
    if save_file.exists():
        # Cache hit: nothing is rebuilt.
        return load_data(path, data_save_name)

    save_file.parent.mkdir(exist_ok=True, parents=True)
    # The former `vocab = MusicVocab.create()` local was never passed to anything,
    # so it has been removed rather than left as dead code.
    processors = [OpenNPFileProcessor(), MusicItemProcessor()]
    data = MusicDataBunch.from_files(files, path, processors=processors)
    data.save(data_save_name)
    return data

In [14]:
def get_files(csv):
    """Return the existing on-disk paths named in the `numpy` column of `csv`.

    Non-string cells (e.g. NaN) are skipped. Each string is joined onto
    `version_path`, and only paths that exist are returned, in column order.
    """
    found = []
    for entry in csv['numpy'].values:
        if not isinstance(entry, str):
            continue
        candidate = Path(version_path/entry)
        if candidate.exists():
            found.append(candidate)
    return found

In [15]:
# Display the directory that get_files joins each numpy path onto.
version_path

PosixPath('data/midi/v19')

### Create All Dataset

In [16]:
# Preview the first rows of the index table.
csv.head()

Unnamed: 0,section,ht_key,song_url,midi,artist,md5,numpy,ht_offset,title,midi_title,parts,genres,mxl,source,ht_mode,ht_time_signature,ht_bpm
0,chorus,C,https://www.hooktheory.com/theorytab/view/wayn...,midi_sources/hooktheory/pianoroll/w/wayne-shar...,wayne-sharpe,bf1f29e5ff84e3e93e37fb873bfb590e,piano_duet/hooktheory/pianoroll/w/wayne-sharpe...,0.0,yu-gi-oh-theme-song,yu-gi-oh3,"intro,chorus",,,hooktheory,1.0,4.0,128.0
1,intro,C,https://www.hooktheory.com/theorytab/view/wayn...,midi_sources/hooktheory/pianoroll/w/wayne-shar...,wayne-sharpe,055f80ad67f64edb14a85ca8fbfe8c29,piano_duet/hooktheory/pianoroll/w/wayne-sharpe...,0.0,yu-gi-oh-theme-song,yu-gi-oh,"intro,chorus",,,hooktheory,1.0,3.0,85.0
2,chorus,D,https://www.hooktheory.com/theorytab/view/what...,midi_sources/hooktheory/pianoroll/w/what-a-day...,what-a-day,197f96f5d181f6ce1e2c5ab04ac1ff87,piano_duet/hooktheory/pianoroll/w/what-a-day/k...,-5.0,kiefer,kiefer,chorus,Jazz,,hooktheory,6.0,4.0,96.0
3,pre-chorus,D,https://www.hooktheory.com/theorytab/view/whit...,midi_sources/hooktheory/pianoroll/w/whiteflame...,whiteflame,9e7ce13a35f1314423a9a6d5a5287a4a,piano_duet/hooktheory/pianoroll/w/whiteflame/s...,-5.0,senbonzakura,senbonzakura - pre-Pre-Chorus,"verse,pre-chorus,chorus","J-Pop,Pop",,hooktheory,6.0,4.0,152.0
4,verse,D,https://www.hooktheory.com/theorytab/view/whit...,midi_sources/hooktheory/pianoroll/w/whiteflame...,whiteflame,d5aaf79d0989222f1362f9f46c540a27,piano_duet/hooktheory/pianoroll/w/whiteflame/s...,-5.0,senbonzakura,Senbonzakura,"verse,pre-chorus,chorus","J-Pop,Pop",,hooktheory,6.0,4.0,152.0


In [17]:
# Collect every indexed numpy file that exists on disk, then show how many were found.
all_files = get_files(csv)
len(all_files)

112169

In [18]:
# Build the databunch over every file, or load it from cached/all.pkl under out_path if it was already saved.
all_data = create_databunch(all_files, data_save_name='cached/all.pkl')

In [19]:
# Pull a single batch as a sanity check.
# NOTE(review): in the saved output every visible row of the batch is identical — confirm that is expected.
all_data.one_batch()

(tensor([[ 17, 129, 147,  ..., 105,  73, 105],
         [ 17, 129, 147,  ..., 105,  73, 105],
         [ 17, 129, 147,  ..., 105,  73, 105],
         ...,
         [ 17, 129, 147,  ..., 105,  73, 105],
         [ 17, 129, 147,  ..., 105,  73, 105],
         [ 17, 129, 147,  ..., 105,  73, 105]]),
 tensor([[129, 147, 129,  ...,  73, 105, 154],
         [129, 147, 129,  ...,  73, 105, 154],
         [129, 147, 129,  ...,  73, 105, 154],
         ...,
         [129, 147, 129,  ...,  73, 105, 154],
         [129, 147, 129,  ...,  73, 105, 154],
         [129, 147, 129,  ...,  73, 105, 154]]))

### Create sample

In [20]:
import random

# Draw from a dedicated seeded generator so rebuilding the cache selects the same files,
# and cap the sample size so having fewer than 1000 files does not raise ValueError.
# When cached/sample.pkl already exists, create_databunch loads it and ignores this sample.
sample_files = random.Random(42).sample(all_files, min(1000, len(all_files)))
sample_data = create_databunch(sample_files, data_save_name='cached/sample.pkl')

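Note: the following datasets are meant to reuse the vocabulary of the full dataset (`all_vocab`). No `all_vocab` variable is defined in this notebook, and `create_databunch` takes no vocab argument, so verify that the vocab is actually shared.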

### Create Hooktheory Dataset

In [21]:
# Rows whose source is exactly 'hooktheory', then the files that exist for them.
hook_csv = csv[csv['source'].isin(['hooktheory'])]
hook_files = get_files(hook_csv)
len(hook_files)

19404

In [22]:
# Build the Hooktheory databunch, or load it from cached/hook.pkl when that file already exists.
hook_data = create_databunch(hook_files, 'cached/hook.pkl')

### Create Hooktheory C Dataset

In [23]:
# Rows whose source is exactly 'hooktheory_c', then the files that exist for them.
# NOTE: this rebinds hook_csv / hook_files, replacing the plain 'hooktheory' selection.
hook_csv = csv[csv['source'].isin(['hooktheory_c'])]
hook_files = get_files(hook_csv)
len(hook_files)

19562

In [24]:
# Build or load the databunch cached as cached/hook_c.pkl from the current hook_files.
# NOTE: this rebinds hook_data, so the databunch from cached/hook.pkl is no longer reachable by that name.
hook_data = create_databunch(hook_files, 'cached/hook_c.pkl')

### Create Low-Quality Dataset

In [34]:
# Select every row whose source is one of the collections grouped here as low quality.
lq_csv = csv[csv['source'].isin([
    'reddit', 'classical_piano', 'ecomp', 'midiworld', 'freemidi',
    'lmd', 'cprato', 'wikifonia', 'classical_archives',
])]
lq_files = get_files(lq_csv)
len(lq_files)

64006

In [35]:
# Build the low-quality databunch, or load it from cached/lq.pkl when that file already exists.
lq_data = create_databunch(lq_files, 'cached/lq.pkl')

### Create High-Quality Dataset (Hooktheory + MuseScore)

In [36]:
# The high-quality set is the 'hooktheory' and 'musescore' sources combined.
hq_csv = csv[csv['source'].isin(['hooktheory', 'musescore'])]
hq_files = get_files(hq_csv)
len(hq_files)

28601

In [37]:
# Build the high-quality databunch, or load it from cached/hq.pkl when that file already exists.
hq_data = create_databunch(hq_files, 'cached/hq.pkl')

### Testing

In [38]:
# Point the generic `data` name at the full dataset for the checks that follow.
data = all_data

In [34]:
# Inspect the training items.
# NOTE(review): the saved output reports "Path: data/midi/v18/sf4/midi_encode", which does not match
# version 'v19' / source_dir 'piano_duet' set earlier. The output looks stale — re-run to refresh it.
data.train_ds.x

MusicItemList (163127 items)
[[   0    0]
 [   1    0]
 [   8    0]
 [ 138    0]
 ...
 [  63 -112]
 [ 153 -112]
 [  60 -112]
 [ 153 -112]],[[   0    0]
 [   1    0]
 [  74    0]
 [ 138    0]
 ...
 [   8 -128]
 [ 141 -128]
 [  76 -132]
 [ 149 -132]],[[  0   0]
 [  1   0]
 [ 81   0]
 [138   0]
 ...
 [  8 -58]
 [139 -58]
 [ 73 -60]
 [139 -60]],[[   0    0]
 [   1    0]
 [  74    0]
 [ 141    0]
 ...
 [   8 -124]
 [ 139 -124]
 [  74 -126]
 [ 139 -126]],[[   0    0]
 [   1    0]
 [  71    0]
 [ 141    0]
 ...
 [   8 -120]
 [ 141 -120]
 [  71 -124]
 [ 141 -124]]
Path: data/midi/v18/sf4/midi_encode

In [None]:
# train_ids_file = out_path/'tmp/all/train_ids.npy'
# all_ids = np.load(train_ids_file)
# id_cat = np.concatenate(all_ids); id_cat.shape
# ax = tuple(range(len(id_cat.shape)-1))
# max_vocab = id_cat.max(axis=ax)
# max_vocab = (max_vocab+1).tolist(); max_vocab

In [None]:
# Pass training item 10 through npenc2stream and show the result.
# Presumably this decodes the numpy encoding into a displayable score — TODO confirm.
npenc2stream(data.train_ds.x[10]).show()