# Tokenize and save data

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import os
os.chdir('../../')

In [3]:
from src.numpy_encode import *
from src.utils.file_processing import process_all, process_file
from src.config import *
from src.music_transformer import *

In [4]:
# from fastai.text import *

## Preparing the data

In [5]:
# Dataset version directory under data/midi that the rest of this notebook reads from and writes to.
version = 'v18'
data_path = Path('data/midi')  # relative path; the working directory was changed with os.chdir('../../') above
version_path = data_path/version

In [6]:
# Sub-directory named after SAMPLE_FREQ (the directory listing below shows 'sf4').
sf_path = f'sf{SAMPLE_FREQ}'
# source_dir = Path(f'{sf_path}/piano_duet')
source_dir = Path(f'{sf_path}/midi_encode')
# Root that create_databunch uses both as the item list path and for its cache check.
out_path = version_path/source_dir
# Index CSV named after the source directory, i.e. <out_path>/midi_encode.csv.
csv_path = out_path/f'{source_dir.name}.csv'
version_path.ls()

[PosixPath('data/midi/v18/metadata'),
 PosixPath('data/midi/v18/midi_sources'),
 PosixPath('data/midi/v18/sf4')]

In [7]:
# Load the index CSV and keep only the rows whose 'numpy' column is populated.
csv = pd.read_csv(csv_path).dropna(subset=['numpy'])

  interactivity=interactivity, compiler=compiler, result=result)


In [8]:

class NPStreamProcessor(PreProcessor):
    "`PreProcessor` that runs every item through `to_single_stream` and then `stack_position_tfm`."
    def process_one(self,item):
        "Return `item` converted by `to_single_stream` followed by `stack_position_tfm`."
        return stack_position_tfm(to_single_stream(item))

    def process(self, ds:Collection):
        "Rebind `ds.items` to a new list holding the processed version of each existing item."
        ds.items = list(map(self.process_one, ds.items))

In [9]:
def create_databunch(files, cache_name, batch_size=32, load_cached=False):
    """Build a `MusicDataBunch` from `files` and save it, or reload a previously cached one.

    files: items handed to `MusicItemList`.
    cache_name: name passed to `data.save` and `MusicDataBunch.load`; the cache check
        looks for `out_path/<cache_name>/train_ids.npy`.
    batch_size: forwarded as `bs`.
    load_cached: when true and the cached ids file exists, load instead of rebuilding.
    """
    cached_ids = out_path/f'{cache_name}/train_ids.npy'
    if load_cached and cached_ids.exists():
        # NOTE(review): the recorded run of this notebook shows
        # "type object 'MusicDataBunch' has no attribute 'load'" for a call that takes
        # this branch -- confirm which loader API MusicDataBunch actually provides.
        return MusicDataBunch.load(out_path, bs=batch_size, cache_name=cache_name)

    processors = [OpenNPFileProcessor()]
    item_list = MusicItemList(items=files, path=out_path, processor=processors)
    # Random split holding out 0.01 of the items with a fixed seed, then constant LM labels.
    labelled = item_list.split_by_rand_pct(0.01, seed=6).label_const(label_cls=LMLabelList)
    labelled.x._bunch = MusicDataBunch
    data = labelled.databunch(bs=batch_size)
    data.save(cache_name)
    return data

In [10]:
def get_files(csv):
    "Return the `version_path`-prefixed paths from `csv['numpy']` that exist on disk; non-string entries are skipped."
    # Lazily build candidate paths, dropping non-string cells (e.g. missing values).
    candidates = (version_path/rel for rel in csv['numpy'].values if isinstance(rel, str))
    return [p for p in candidates if p.exists()]

In [11]:
version_path

PosixPath('data/midi/v18')

### Create All Dataset

In [12]:
csv.head()

Unnamed: 0,song_url,md5,mxl,genres,ht_offset,ht_key,section,source,ht_bpm,artist,ht_mode,numpy,midi_title,title,midi,parts,ht_time_signature
0,https://www.hooktheory.com/theorytab/view/wayn...,bf1f29e5ff84e3e93e37fb873bfb590e,,,0.0,C,chorus,hooktheory,128.0,wayne-sharpe,1.0,sf4/midi_encode/hooktheory/pianoroll/w/wayne-s...,yu-gi-oh3,yu-gi-oh-theme-song,midi_sources/hooktheory/pianoroll/w/wayne-shar...,"intro,chorus",4.0
1,https://www.hooktheory.com/theorytab/view/wayn...,055f80ad67f64edb14a85ca8fbfe8c29,,,0.0,C,intro,hooktheory,85.0,wayne-sharpe,1.0,sf4/midi_encode/hooktheory/pianoroll/w/wayne-s...,yu-gi-oh,yu-gi-oh-theme-song,midi_sources/hooktheory/pianoroll/w/wayne-shar...,"intro,chorus",3.0
2,https://www.hooktheory.com/theorytab/view/what...,197f96f5d181f6ce1e2c5ab04ac1ff87,,Jazz,-5.0,D,chorus,hooktheory,96.0,what-a-day,6.0,sf4/midi_encode/hooktheory/pianoroll/w/what-a-...,kiefer,kiefer,midi_sources/hooktheory/pianoroll/w/what-a-day...,chorus,4.0
3,https://www.hooktheory.com/theorytab/view/whit...,9e7ce13a35f1314423a9a6d5a5287a4a,,"J-Pop,Pop",-5.0,D,pre-chorus,hooktheory,152.0,whiteflame,6.0,sf4/midi_encode/hooktheory/pianoroll/w/whitefl...,senbonzakura - pre-Pre-Chorus,senbonzakura,midi_sources/hooktheory/pianoroll/w/whiteflame...,"verse,pre-chorus,chorus",4.0
4,https://www.hooktheory.com/theorytab/view/whit...,d5aaf79d0989222f1362f9f46c540a27,,"J-Pop,Pop",-5.0,D,verse,hooktheory,152.0,whiteflame,6.0,sf4/midi_encode/hooktheory/pianoroll/w/whitefl...,Senbonzakura,senbonzakura,midi_sources/hooktheory/pianoroll/w/whiteflame...,"verse,pre-chorus,chorus",4.0


In [13]:
csv.shape

(164774, 17)

In [17]:
all_files = get_files(csv); len(all_files)

164774

In [18]:
# NOTE(review): the output recorded below this cell is an AttributeError
# ("type object 'MusicDataBunch' has no attribute 'load'"), presumably raised from the
# load_cached branch of create_databunch -- this cell does not survive a clean re-run as recorded.
all_data = create_databunch(all_files, cache_name='tmp/all', load_cached=True)

AttributeError: type object 'MusicDataBunch' has no attribute 'load'

### Create sample

In [None]:
import random

# Draw the sample from a seeded generator so that re-running the notebook rebuilds the same
# 'tmp/sample' databunch. The size is capped because random.sample raises ValueError when
# asked for more items than the population holds.
sample_rng = random.Random(6)  # same seed value create_databunch passes to its train/valid split
sample_files = sample_rng.sample(all_files, min(1000, len(all_files)))
sample_data = create_databunch(sample_files, cache_name='tmp/sample')

Note: we are reusing all_vocab for the following datasets

### Create Hooktheory Dataset

In [16]:
hook_csv = csv.loc[csv.source.isin(['hooktheory'])]
hook_files = get_files(hook_csv); len(hook_files)

The history saving thread hit an unexpected error (OperationalError('database is locked')).History will not be written to the database.


19404

In [17]:
hook_data = create_databunch(hook_files, cache_name='tmp/hook')

### Create Hooktheory C Dataset

In [18]:
# Rows whose source is 'hooktheory_c'.
# NOTE: this rebinds hook_csv / hook_files, which the "Create Hooktheory Dataset" cells above also assign.
hook_csv = csv.loc[csv.source.isin(['hooktheory_c'])]
hook_files = get_files(hook_csv); len(hook_files)

19562

In [19]:
hook_data = create_databunch(hook_files, cache_name='tmp/hook_c')

### Create Solo Piano

In [20]:
# Solo piano subset: rows whose source is 'musescore' or 'wikifonia'.
solo_csv = csv.loc[csv.source.isin(['musescore', 'wikifonia'])]
# Display the number of files found on disk (as the other dataset cells do), not the number of CSV rows.
solo_files = get_files(solo_csv); len(solo_files)

17086

In [21]:
solo_data = create_databunch(solo_files, cache_name='tmp/solo')

### Create Pop Dataset

In [22]:
# Pop subset: rows whose source is one of the listed MIDI collections.
pop_csv = csv.loc[csv.source.isin(['midiworld', 'freemidi', 'lmd', 'cprato'])]
# Display the number of files found on disk (as the other dataset cells do), not the number of CSV rows.
pop_files = get_files(pop_csv); len(pop_files)

16917

In [23]:
pop_data = create_databunch(pop_files, cache_name='tmp/pop')

### Create Classical Dataset

In [24]:
clc_csv = csv.loc[csv.source.isin(['classical_piano', 'ecomp', 'classical_archives'])]
clc_files = get_files(clc_csv); len(clc_files)

17464

In [25]:
clc_data = create_databunch(clc_files, cache_name='tmp/clc')

In [26]:
len(clc_files)

17464

## Create dumpster dataset

In [27]:
dmp_csv = csv.loc[csv.source.isin(['reddit'])]
dmp_files = get_files(dmp_csv); len(dmp_files)

74341

In [28]:
dmp_data = create_databunch(dmp_files, cache_name='tmp/dmp')

## Low quality

In [29]:
lq_csv = csv.loc[csv.source.isin(['reddit', 'classical_piano', 'ecomp', 'midiworld', 'freemidi', 'lmd', 'cprato', 'wikifonia', 'classical_archives'])]
lq_files = get_files(lq_csv); len(lq_files)

114917

In [30]:
lq_data = create_databunch(lq_files, cache_name='tmp/lq')

## All High quality

In [31]:
hq_csv = csv.loc[csv.source.isin(['hooktheory', 'musescore'])]
hq_files = get_files(hq_csv); len(hq_files)

30295

In [32]:
hq_data = create_databunch(hq_files, cache_name='tmp/hq')

### Testing

In [33]:
data = all_data

In [34]:
data.train_ds.x

MusicItemList (163127 items)
[[   0    0]
 [   1    0]
 [   8    0]
 [ 138    0]
 ...
 [  63 -112]
 [ 153 -112]
 [  60 -112]
 [ 153 -112]],[[   0    0]
 [   1    0]
 [  74    0]
 [ 138    0]
 ...
 [   8 -128]
 [ 141 -128]
 [  76 -132]
 [ 149 -132]],[[  0   0]
 [  1   0]
 [ 81   0]
 [138   0]
 ...
 [  8 -58]
 [139 -58]
 [ 73 -60]
 [139 -60]],[[   0    0]
 [   1    0]
 [  74    0]
 [ 141    0]
 ...
 [   8 -124]
 [ 139 -124]
 [  74 -126]
 [ 139 -126]],[[   0    0]
 [   1    0]
 [  71    0]
 [ 141    0]
 ...
 [   8 -120]
 [ 141 -120]
 [  71 -124]
 [ 141 -124]]
Path: data/midi/v18/sf4/midi_encode

In [None]:
# train_ids_file = out_path/'tmp/all/train_ids.npy'
# all_ids = np.load(train_ids_file)
# id_cat = np.concatenate(all_ids); id_cat.shape
# ax = tuple(range(len(id_cat.shape)-1))
# max_vocab = id_cat.max(axis=ax)
# max_vocab = (max_vocab+1).tolist(); max_vocab

In [None]:
npenc2stream(data.train_ds.x[10]).show()