# Tokenize and save data

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import sys
sys.path.insert(0, '../..')
from src.encode_data import *
from src.fastai_data import *

In [3]:
import pandas as pd
import numpy as np

In [4]:
from fastai.text import *

In [5]:
from fastai.text import data

## Preparing the data

In [6]:
# Dataset version tag; selects which sub-directory of data_path this notebook works in.
version = 'v15'
# Relative path, so it is resolved against the kernel's current working directory.
data_path = Path('data/midi')
version_path = data_path/version

In [7]:
# Name of the dataset directory under version_path to process (alternative kept below for reference).
# source_dir = 'midi_encode'
source_dir = 'piano_duet'
# Root directory for this dataset; create_databunch reads and writes its caches relative to it.
out_path = version_path/source_dir
# Metadata CSV that lives inside the dataset directory and is named after it.
csv_path = out_path/f'{source_dir}.csv'
# Display the contents of the version directory as a sanity check.
version_path.ls()

[PosixPath('data/midi/v15/metadata'),
 PosixPath('data/midi/v15/midi_sources'),
 PosixPath('data/midi/v15/midi_encode'),
 PosixPath('data/midi/v15/piano_duet')]

In [8]:
# low_memory=False makes pandas infer each column's dtype from the whole file rather than
# chunk by chunk, which avoids the mixed-type DtypeWarning this cell otherwise emits.
csv = pd.read_csv(csv_path, low_memory=False)
# Keep only rows that actually have a path in the 'numpy' column.
csv = csv.loc[csv['numpy'].notna()]

  interactivity=interactivity, compiler=compiler, result=result)


In [9]:
def create_databunch(files, cache_name, batch_size=32, load_cached=False):
    """Return a databunch for `files`, reusing a saved cache when permitted.

    Args:
        files: items handed to `MusicItemList` (the notebook passes lists of file paths).
        cache_name: cache location relative to `out_path`; used for both loading and saving.
        batch_size: batch size forwarded as `bs`.
        load_cached: when True and `<out_path>/<cache_name>/train_ids.npy` exists,
            load that cache instead of rebuilding.

    Returns:
        The databunch, either loaded from the cache or freshly built and then saved.
    """
    cache_hit = load_cached and (out_path/f'{cache_name}/train_ids.npy').exists()
    if cache_hit:
        return MusicDataBunch.load(out_path, bs=batch_size, cache_name=cache_name)

    # Rebuild from scratch: random 1% split with a fixed seed, constant LM labels.
    item_list = MusicItemList(items=files, path=out_path, processor=[OpenNPFileProcessor()])
    labelled = item_list.split_by_rand_pct(0.01, seed=6).label_const(label_cls=LMLabelList)
    # Presumably makes .databunch() construct a MusicDataBunch — confirm against fastai/src.
    labelled.x._bunch = MusicDataBunch
    bunch = labelled.databunch(bs=batch_size)
    bunch.save(cache_name)
    return bunch

In [10]:
def get_files(csv, base_path=None):
    """Resolve the 'numpy' column of `csv` to the files that exist on disk.

    Args:
        csv: DataFrame with a 'numpy' column of paths relative to the base directory.
        base_path: directory the relative paths are joined onto. Defaults to the
            notebook-level `version_path` when omitted.

    Returns:
        A list of `Path` objects, in row order, for the entries whose file exists.
        Rows with a missing 'numpy' value are skipped rather than raising.
    """
    root = version_path if base_path is None else Path(base_path)
    # dropna() guards against NaN entries, which cannot be joined onto a Path.
    candidates = (root/rel for rel in csv['numpy'].dropna().values)
    return [p for p in candidates if p.exists()]

In [11]:
version_path

PosixPath('data/midi/v15')

### Create All Dataset

In [12]:
csv.head()

Unnamed: 0,song_url,section,numpy,ht_mode,midi,title,ht_time_signature,mxl,ht_offset,ht_bpm,ht_key,md5,midi_title,artist,genres,parts,source
0,https://www.hooktheory.com/theorytab/view/wayn...,chorus,piano_duet/hooktheory/pianoroll/w/wayne-sharpe...,1.0,midi_sources/hooktheory/pianoroll/w/wayne-shar...,yu-gi-oh-theme-song,4.0,,0.0,128.0,C,bf1f29e5ff84e3e93e37fb873bfb590e,yu-gi-oh3,wayne-sharpe,,"intro,chorus",hooktheory
1,https://www.hooktheory.com/theorytab/view/wayn...,intro,piano_duet/hooktheory/pianoroll/w/wayne-sharpe...,1.0,midi_sources/hooktheory/pianoroll/w/wayne-shar...,yu-gi-oh-theme-song,3.0,,0.0,85.0,C,055f80ad67f64edb14a85ca8fbfe8c29,yu-gi-oh,wayne-sharpe,,"intro,chorus",hooktheory
2,https://www.hooktheory.com/theorytab/view/what...,chorus,piano_duet/hooktheory/pianoroll/w/what-a-day/k...,6.0,midi_sources/hooktheory/pianoroll/w/what-a-day...,kiefer,4.0,,-5.0,96.0,D,197f96f5d181f6ce1e2c5ab04ac1ff87,kiefer,what-a-day,Jazz,chorus,hooktheory
3,https://www.hooktheory.com/theorytab/view/whit...,pre-chorus,piano_duet/hooktheory/pianoroll/w/whiteflame/s...,6.0,midi_sources/hooktheory/pianoroll/w/whiteflame...,senbonzakura,4.0,,-5.0,152.0,D,9e7ce13a35f1314423a9a6d5a5287a4a,senbonzakura - pre-Pre-Chorus,whiteflame,"J-Pop,Pop","verse,pre-chorus,chorus",hooktheory
4,https://www.hooktheory.com/theorytab/view/whit...,verse,piano_duet/hooktheory/pianoroll/w/whiteflame/s...,6.0,midi_sources/hooktheory/pianoroll/w/whiteflame...,senbonzakura,4.0,,-5.0,152.0,D,d5aaf79d0989222f1362f9f46c540a27,Senbonzakura,whiteflame,"J-Pop,Pop","verse,pre-chorus,chorus",hooktheory


In [13]:
csv.shape

(112947, 17)

In [14]:
# Resolve every remaining CSV row to a file that exists on disk.
all_files = get_files(csv)
# load_cached=True: reuse the cache under out_path/'tmp/all' when its train_ids.npy exists,
# otherwise build the databunch from all_files and save it there.
all_data = create_databunch(all_files, cache_name='tmp/all', load_cached=True)

DLTFMS: None


### Create sample

In [15]:
import random
# Use a dedicated, seeded RNG so the sample (and the cache saved from it) is identical on
# every run; min() keeps random.sample from raising ValueError when fewer than 1000 files exist.
sample_files = random.Random(6).sample(all_files, min(1000, len(all_files)))
sample_data = create_databunch(sample_files, cache_name='tmp/sample')

DLTFMS: None


Note: we are reusing all_vocab for the following datasets

### Create Hooktheory Dataset

In [16]:
hook_csv = csv.loc[csv.source.isin(['hooktheory'])]
hook_files = get_files(hook_csv); len(hook_files)

19812

In [17]:
hook_data = create_databunch(hook_files, cache_name='tmp/hook')

DLTFMS: None


### Create Hooktheory C Dataset

In [None]:
hook_csv = csv.loc[csv.source.isin(['hooktheory_c'])]
hook_files = get_files(hook_csv); len(hook_files)

19973

In [None]:
hook_data = create_databunch(hook_files, cache_name='tmp/hook_c')

DLTFMS: None


### Create Solo Piano

In [None]:
solo_csv = csv.loc[csv.source.isin(['musescore', 'wikifonia'])]
# Show the number of files that exist on disk (not the number of CSV rows), matching the other dataset cells.
solo_files = get_files(solo_csv); len(solo_files)

13106

In [None]:
solo_data = create_databunch(solo_files, cache_name='tmp/solo')

DLTFMS: None


### Create Pop Dataset

In [None]:
pop_csv = csv.loc[csv.source.isin(['midiworld', 'freemidi', 'lmd', 'cprato'])]
# Show the number of files that exist on disk (not the number of CSV rows), matching the other dataset cells.
pop_files = get_files(pop_csv); len(pop_files)

8850

In [None]:
pop_data = create_databunch(pop_files, cache_name='tmp/pop')

DLTFMS: None


### Create Classical Dataset

In [None]:
clc_csv = csv.loc[csv.source.isin(['classical_piano', 'ecomp', 'classical_archives'])]
clc_files = get_files(clc_csv); len(clc_files)

7704

In [None]:
clc_data = create_databunch(clc_files, cache_name='tmp/clc')

DLTFMS: None


In [None]:
len(clc_files)

7704

### Create Dumpster Dataset

In [None]:
dmp_csv = csv.loc[csv.source.isin(['reddit'])]
dmp_files = get_files(dmp_csv); len(dmp_files)

43502

In [None]:
dmp_data = create_databunch(dmp_files, cache_name='tmp/dmp')

DLTFMS: None


### Create Low Quality Dataset

In [None]:
lq_csv = csv.loc[csv.source.isin(['reddit', 'classical_piano', 'ecomp', 'midiworld', 'freemidi', 'lmd', 'cprato', 'wikifonia', 'classical_archives'])]
lq_files = get_files(lq_csv); len(lq_files)

63967

In [None]:
lq_data = create_databunch(lq_files, cache_name='tmp/lq')

### Create All High Quality Dataset

In [None]:
hq_csv = csv.loc[csv.source.isin(['hooktheory', 'musescore'])]
hq_files = get_files(hq_csv); len(hq_files)

In [None]:
hq_data = create_databunch(hq_files, cache_name='tmp/hq')

### Testing

In [None]:
data = all_data

In [None]:
data.train_ds.x

In [None]:
# train_ids_file = out_path/'tmp/all/train_ids.npy'
# all_ids = np.load(train_ids_file)
# id_cat = np.concatenate(all_ids); id_cat.shape
# ax = tuple(range(len(id_cat.shape)-1))
# max_vocab = id_cat.max(axis=ax)
# max_vocab = (max_vocab+1).tolist(); max_vocab

In [None]:
npenc2stream(data.train_ds.x[10]).show()