# Tokenize and save data

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import sys
sys.path.insert(0, '../..')
from src.encode_data import *
from src.fastai_data import *

In [3]:
import pandas as pd
import numpy as np

In [4]:
from fastai.text import *

In [5]:
from fastai.text import data

## Preparing the data

In [6]:
# Dataset version tag; selects which sub-directory of data_path this notebook works on.
version = 'v16'
# Relative path, so it is resolved against the notebook's current working directory.
data_path = Path('data/midi')
version_path = data_path/version

In [7]:
# Folder name inside version_path; also reused as the stem of the metadata CSV below.
source_dir = 'midi_encode'
# source_dir = 'piano_duet'
# create_databunch looks for cached databunches under out_path.
out_path = version_path/source_dir
csv_path = out_path/f'{source_dir}.csv'
# List the version directory. NOTE(review): `ls` is not a stdlib pathlib method —
# presumably patched onto Path by the fastai star import; confirm.
version_path.ls()

[PosixPath('data/midi/v16/metadata'),
 PosixPath('data/midi/v16/midi_sources'),
 PosixPath('data/midi/v16/midi_encode'),
 PosixPath('data/midi/v16/piano_duet'),
 PosixPath('data/midi/v16/midi_encode.tar.gz'),
 PosixPath('data/midi/v16/s2s_encode'),
 PosixPath('data/midi/v16/s2s_encode.tar.gz'),
 PosixPath('data/midi/v16/piano_duet.tar.gz')]

In [8]:
# Load the metadata and keep only rows that point at an encoded numpy file.
# low_memory=False parses the file in one pass so column dtypes are inferred consistently;
# NOTE(review): the warning tail in this cell's saved output looks like pandas' mixed-type
# DtypeWarning, which this option is documented to avoid — confirm on the next run.
# Loading and filtering in one expression keeps the cell idempotent on re-run.
csv = pd.read_csv(csv_path, low_memory=False).dropna(subset=['numpy'])

  interactivity=interactivity, compiler=compiler, result=result)


In [9]:
def create_databunch(files, cache_name, batch_size=32, load_cached=False,
                     valid_pct=0.01, seed=6):
    """Build a `MusicDataBunch` from encoded files, or reload a previously saved one.

    Args:
        files: paths of the encoded files used as items.
        cache_name: name passed to `save` / `MusicDataBunch.load`; the cache check looks
            for `out_path/<cache_name>/train_ids.npy`.
        batch_size: batch size handed to the databunch.
        load_cached: when True and the cache marker file exists, load the saved databunch
            instead of rebuilding it.
        valid_pct: fraction passed to `split_by_rand_pct` for the validation split.
        seed: seed passed to `split_by_rand_pct`.

    Returns:
        The loaded or freshly built databunch.

    Note:
        `valid_pct` and `seed` only take effect when the databunch is built; a cached
        databunch is returned exactly as it was saved.
    """
    cache_marker = out_path/f'{cache_name}/train_ids.npy'
    if load_cached and cache_marker.exists():
        return MusicDataBunch.load(out_path, bs=batch_size, cache_name=cache_name)

    labelled = (MusicItemList(items=files, path=out_path, processor=[OpenNPFileProcessor()])
                .split_by_rand_pct(valid_pct, seed=seed)
                .label_const(label_cls=LMLabelList))
    # Make `.databunch()` produce a MusicDataBunch rather than the item list's default class.
    labelled.x._bunch = MusicDataBunch
    bunch = labelled.databunch(bs=batch_size)
    # Always persist the fresh build so a later call with load_cached=True can reuse it.
    bunch.save(cache_name)
    return bunch

In [10]:
def get_files(csv, root=None):
    """Return the files listed in `csv['numpy']` that exist on disk.

    Args:
        csv: DataFrame with a 'numpy' column holding file paths relative to `root`.
        root: directory the paths are resolved against. Defaults to the notebook-level
            `version_path` when omitted, which is the original behaviour.

    Returns:
        A list of `Path` objects in row order. Non-string entries (e.g. NaN) and paths
        that do not exist are dropped.
    """
    if root is None:
        root = version_path
    root = Path(root)
    candidates = (root/name for name in csv['numpy'].values if isinstance(name, str))
    return [path for path in candidates if path.exists()]

In [11]:
# Sanity check: show the directory that get_files resolves CSV paths against.
version_path

PosixPath('data/midi/v16')

### Create All Dataset

In [12]:
# Preview the metadata; the 'source' and 'numpy' columns drive the subsets built below.
csv.head()

Unnamed: 0,genres,source,section,ht_offset,midi,mxl,song_url,ht_time_signature,midi_title,title,artist,ht_key,ht_bpm,numpy,md5,ht_mode,parts
0,,hooktheory,chorus,0.0,midi_sources/hooktheory/pianoroll/w/wayne-shar...,,https://www.hooktheory.com/theorytab/view/wayn...,4.0,yu-gi-oh3,yu-gi-oh-theme-song,wayne-sharpe,C,128.0,midi_encode/hooktheory/pianoroll/w/wayne-sharp...,bf1f29e5ff84e3e93e37fb873bfb590e,1.0,"intro,chorus"
1,,hooktheory,intro,0.0,midi_sources/hooktheory/pianoroll/w/wayne-shar...,,https://www.hooktheory.com/theorytab/view/wayn...,3.0,yu-gi-oh,yu-gi-oh-theme-song,wayne-sharpe,C,85.0,midi_encode/hooktheory/pianoroll/w/wayne-sharp...,055f80ad67f64edb14a85ca8fbfe8c29,1.0,"intro,chorus"
2,Jazz,hooktheory,chorus,-5.0,midi_sources/hooktheory/pianoroll/w/what-a-day...,,https://www.hooktheory.com/theorytab/view/what...,4.0,kiefer,kiefer,what-a-day,D,96.0,midi_encode/hooktheory/pianoroll/w/what-a-day/...,197f96f5d181f6ce1e2c5ab04ac1ff87,6.0,chorus
3,"J-Pop,Pop",hooktheory,pre-chorus,-5.0,midi_sources/hooktheory/pianoroll/w/whiteflame...,,https://www.hooktheory.com/theorytab/view/whit...,4.0,senbonzakura - pre-Pre-Chorus,senbonzakura,whiteflame,D,152.0,midi_encode/hooktheory/pianoroll/w/whiteflame/...,9e7ce13a35f1314423a9a6d5a5287a4a,6.0,"verse,pre-chorus,chorus"
4,"J-Pop,Pop",hooktheory,verse,-5.0,midi_sources/hooktheory/pianoroll/w/whiteflame...,,https://www.hooktheory.com/theorytab/view/whit...,4.0,Senbonzakura,senbonzakura,whiteflame,D,152.0,midi_encode/hooktheory/pianoroll/w/whiteflame/...,d5aaf79d0989222f1362f9f46c540a27,6.0,"verse,pre-chorus,chorus"


In [13]:
# Row/column count after dropping rows without a 'numpy' path.
csv.shape

(164776, 17)

In [14]:
all_files = get_files(csv)
# load_cached=True: reuse the saved databunch when tmp/all/train_ids.npy exists under out_path,
# otherwise build and save it.
all_data = create_databunch(all_files, cache_name='tmp/all', load_cached=True)

### Create sample

In [15]:
import random

# Draw the sample from a dedicated seeded RNG so the subset is the same after every
# Restart & Run All, without touching the global `random` state. The size is clamped
# because random.sample raises ValueError when asked for more items than are available.
sample_data = create_databunch(
    random.Random(6).sample(all_files, min(1000, len(all_files))),
    cache_name='tmp/sample',
)

Note: we are reusing all_vocab for the following datasets

### Create Hooktheory Dataset

In [16]:
# Hooktheory subset: filter the metadata by source, then keep the files that exist on disk.
hook_csv = csv[csv['source'].isin(['hooktheory'])]
hook_files = get_files(hook_csv)
len(hook_files)

19404

In [17]:
# Built from scratch and saved as 'tmp/hook' (load_cached is left at its False default).
hook_data = create_databunch(hook_files, cache_name='tmp/hook')

### Create Hooktheory C Dataset

In [18]:
# Hooktheory C subset.
# NOTE(review): this rebinds hook_csv / hook_files, so the plain Hooktheory selections made
# earlier under the same names are no longer reachable after this cell runs.
hook_csv = csv[csv['source'].isin(['hooktheory_c'])]
hook_files = get_files(hook_csv)
len(hook_files)

19562

In [19]:
# Built from scratch and saved as 'tmp/hook_c'.
# NOTE(review): rebinds hook_data, replacing the 'tmp/hook' databunch held under the same name.
hook_data = create_databunch(hook_files, cache_name='tmp/hook_c')

### Create Solo Piano

In [20]:
# Solo piano subset: MuseScore and Wikifonia sources.
solo_csv = csv[csv['source'].isin(['musescore', 'wikifonia'])]
solo_files = get_files(solo_csv)
# NOTE(review): this shows the number of CSV rows, not len(solo_files); most other sections
# display the count of files that exist on disk — confirm which one is intended.
len(solo_csv)

17089

In [21]:
# Built from scratch and saved as 'tmp/solo' (load_cached is left at its False default).
solo_data = create_databunch(solo_files, cache_name='tmp/solo')

### Create Pop Dataset

In [22]:
# Pop subset: MIDI collections scraped from the sources listed below.
pop_csv = csv[csv['source'].isin(['midiworld', 'freemidi', 'lmd', 'cprato'])]
pop_files = get_files(pop_csv)
# NOTE(review): this shows the number of CSV rows, not len(pop_files); most other sections
# display the count of files that exist on disk — confirm which one is intended.
len(pop_csv)

16918

In [23]:
# Built from scratch and saved as 'tmp/pop' (load_cached is left at its False default).
pop_data = create_databunch(pop_files, cache_name='tmp/pop')

### Create Classical Dataset

In [24]:
# Classical subset: filter the metadata by source, then keep the files that exist on disk.
clc_csv = csv[csv['source'].isin(['classical_piano', 'ecomp', 'classical_archives'])]
clc_files = get_files(clc_csv)
len(clc_files)

17464

In [25]:
# Built from scratch and saved as 'tmp/clc' (load_cached is left at its False default).
clc_data = create_databunch(clc_files, cache_name='tmp/clc')

In [26]:
# NOTE(review): repeats the count already displayed when clc_files was created.
len(clc_files)

17464

### Create Dumpster Dataset

In [27]:
# "Dumpster" subset: everything whose source is 'reddit'.
dmp_csv = csv[csv['source'].isin(['reddit'])]
dmp_files = get_files(dmp_csv)
len(dmp_files)

74339

In [28]:
# Built from scratch and saved as 'tmp/dmp' (load_cached is left at its False default).
dmp_data = create_databunch(dmp_files, cache_name='tmp/dmp')

### Create Low Quality Dataset

In [29]:
# Low-quality subset: every source except hooktheory, hooktheory_c and musescore.
lq_csv = csv[csv['source'].isin([
    'reddit',
    'classical_piano', 'ecomp',
    'midiworld', 'freemidi', 'lmd', 'cprato',
    'wikifonia',
    'classical_archives',
])]
lq_files = get_files(lq_csv)
len(lq_files)

114916

In [30]:
# Built from scratch and saved as 'tmp/lq' (load_cached is left at its False default).
lq_data = create_databunch(lq_files, cache_name='tmp/lq')

### Create High Quality Dataset

In [31]:
# High-quality subset: Hooktheory and MuseScore sources.
hq_csv = csv[csv['source'].isin(['hooktheory', 'musescore'])]
hq_files = get_files(hq_csv)
len(hq_files)

30298

In [32]:
# Built from scratch and saved as 'tmp/hq' (load_cached is left at its False default).
hq_data = create_databunch(hq_files, cache_name='tmp/hq')

### Testing

In [None]:
# NOTE(review): rebinding `data` shadows the `data` module imported from fastai.text near the
# top of the notebook; the cells below rely on this name.
data = all_data

In [None]:
# Inspect the training-set inputs of the selected databunch.
data.train_ds.x

In [None]:
# train_ids_file = out_path/'tmp/all/train_ids.npy'
# all_ids = np.load(train_ids_file)
# id_cat = np.concatenate(all_ids); id_cat.shape
# ax = tuple(range(len(id_cat.shape)-1))
# max_vocab = id_cat.max(axis=ax)
# max_vocab = (max_vocab+1).tolist(); max_vocab

In [None]:
# Spot-check the training input at index 10. NOTE(review): npenc2stream presumably comes from
# the src.* star imports and decodes the encoded array into a displayable stream — confirm.
npenc2stream(data.train_ds.x[10]).show()