# Tokenize and save data

In [1]:
# Reload the autoreload extension so edits to imported project modules are picked up.
%reload_ext autoreload
# Mode 2: re-import modules before each cell execution.
%autoreload 2
# Render matplotlib figures inline in the cell output.
%matplotlib inline

In [2]:
import pandas as pd
import numpy as np
import sys
sys.path.insert(0, '../..')
from src.fastai_data import *

In [3]:
from fastai.text import *

In [4]:
from fastai.text import data

## Preparing the data

In [5]:
# Dataset layout: <data_path>/<version>/...
version = 'v10'
data_path = Path('data/midi')
version_path = data_path.joinpath(version)

In [6]:
# Working directory for this notebook: <version_path>/midi_encode, plus the
# CSV of the same name that indexes it.
source_dir = 'midi_encode'
out_path = version_path / source_dir
csv_path = out_path / f'{source_dir}.csv'

# List the version directory to confirm the expected sub-folders are present.
version_path.ls()

[PosixPath('data/midi/v10/metadata'),
 PosixPath('data/midi/v10/midi_sources'),
 PosixPath('data/midi/v10/midi_encode')]

In [7]:
# Read the index CSV in one pass so pandas infers a single dtype per column.
# NOTE(review): the recorded output of this cell ends with a warning tail that
# looks like pandas' mixed-dtype DtypeWarning; low_memory=False avoids it.
csv = pd.read_csv(csv_path, low_memory=False)
# Keep only rows that have a value in the 'numpy' column.
csv = csv.loc[csv['numpy'].notna()]

  interactivity=interactivity, compiler=compiler, result=result)


In [8]:
def create_databunch(files, cache_name, batch_size=32, load_cached=False,
                     valid_pct=0.01, seed=6):
    """Build an `LMNPDataBunch` for `files`, or load it from an existing cache.

    Args:
        files: items handed to `ItemList` (this notebook passes the lists of
            paths returned by `get_files`).
        cache_name: cache directory name, passed to `LMNPDataBunch.load` and
            to `save`; the cache check looks under the notebook-level
            `out_path`.
        batch_size: forwarded as `bs` when loading or building the databunch.
        load_cached: when True and `<out_path>/<cache_name>/train_ids.npy`
            exists, load the cached databunch instead of rebuilding it.
        valid_pct: fraction of items passed to `split_by_rand_pct`
            (default 0.01, the value previously hard-coded).
        seed: seed passed to `split_by_rand_pct` (default 6, the value
            previously hard-coded).

    Returns:
        The loaded or freshly built databunch. A freshly built databunch is
        also saved under `cache_name` before being returned.
    """
    cache_marker = out_path/f'{cache_name}/train_ids.npy'
    if load_cached and cache_marker.exists():
        return LMNPDataBunch.load(out_path, bs=batch_size, cache_name=cache_name)

    processors = [OpenNPFileProcessor()]
    labelled = (ItemList(items=files, path=out_path, processor=processors)
                .split_by_rand_pct(valid_pct, seed=seed)
                .label_const(label_cls=LMLabelList))
    # Presumably `.databunch()` consults `x._bunch` to choose the databunch
    # class -- TODO confirm against the fastai data_block implementation.
    labelled.x._bunch = LMNPDataBunch
    bunch = labelled.databunch(bs=batch_size)
    bunch.save(cache_name)
    return bunch

In [9]:
def get_files(csv):
    """Return the paths listed in `csv['numpy']` that exist on disk.

    Entries are joined onto the notebook-level `version_path`. Missing (NaN)
    entries are dropped first, so an unfiltered frame does not raise when a
    float NaN is joined onto a path; paths that do not exist are skipped.

    Args:
        csv: DataFrame with a 'numpy' column of relative file paths.

    Returns:
        A list of path objects, in the order they appear in the frame.
    """
    relative_paths = csv['numpy'].dropna()
    candidates = (version_path/rel for rel in relative_paths)
    return [path for path in candidates if path.exists()]

In [10]:
# Display the resolved dataset version directory.
version_path

PosixPath('data/midi/v10')

### Create All Dataset

In [11]:
# Preview the first rows of the filtered index.
csv.head()

Unnamed: 0,mxl,section,md5,midi,song_url,source,ht_time_signature,numpy,title,midi_title,parts,ht_offset,ht_mode,ht_bpm,artist,ht_key,genres
0,,chorus,bf1f29e5ff84e3e93e37fb873bfb590e,midi_sources/hooktheory/pianoroll/w/wayne-shar...,https://www.hooktheory.com/theorytab/view/wayn...,hooktheory,4.0,midi_encode/hooktheory/pianoroll/w/wayne-sharp...,yu-gi-oh-theme-song,yu-gi-oh3,"intro,chorus",0.0,1.0,128.0,wayne-sharpe,C,
1,,pre-chorus,9e7ce13a35f1314423a9a6d5a5287a4a,midi_sources/hooktheory/pianoroll/w/whiteflame...,https://www.hooktheory.com/theorytab/view/whit...,hooktheory,4.0,midi_encode/hooktheory/pianoroll/w/whiteflame/...,senbonzakura,senbonzakura - pre-Pre-Chorus,"verse,pre-chorus,chorus",-5.0,6.0,152.0,whiteflame,D,"J-Pop,Pop"
2,,chorus,e76bfe452839d0d653a8eb920f117905,midi_sources/hooktheory/pianoroll/w/wavves/nin...,https://www.hooktheory.com/theorytab/view/wavv...,hooktheory,4.0,midi_encode/hooktheory/pianoroll/w/wavves/nine...,nine-is-god,Nine is God,"verse,chorus",-4.0,1.0,122.0,wavves,E,
3,,verse,d5aaf79d0989222f1362f9f46c540a27,midi_sources/hooktheory/pianoroll/w/whiteflame...,https://www.hooktheory.com/theorytab/view/whit...,hooktheory,4.0,midi_encode/hooktheory/pianoroll/w/whiteflame/...,senbonzakura,Senbonzakura,"verse,pre-chorus,chorus",-5.0,6.0,152.0,whiteflame,D,"J-Pop,Pop"
4,,chorus,e0c189ee753b30c4758d85211f13c189,midi_sources/hooktheory/pianoroll/w/whiteflame...,https://www.hooktheory.com/theorytab/view/whit...,hooktheory,4.0,midi_encode/hooktheory/pianoroll/w/whiteflame/...,senbonzakura,Senbonzakura,"verse,pre-chorus,chorus",-5.0,6.0,152.0,whiteflame,D,"J-Pop,Pop"


In [12]:
# (rows, columns) of the filtered index.
csv.shape

(156755, 17)

In [13]:
# Resolve every indexed file, then build the full databunch -- or load it from
# the 'tmp/all' cache when that cache is already on disk.
all_files = get_files(csv)
all_data = create_databunch(
    all_files,
    cache_name='tmp/all',
    load_cached=True,
)

### Create sample

In [14]:
import random

# Draw the sample from a dedicated, fixed-seed generator so the same files are
# picked on every run, and cap the size so random.sample does not raise
# ValueError when fewer than SAMPLE_SIZE files are available.
SAMPLE_SIZE = 1000
sample_files = random.Random(6).sample(all_files, min(SAMPLE_SIZE, len(all_files)))
sample_data = create_databunch(sample_files, cache_name='tmp/sample')

Note: the datasets below reuse the vocabulary of the full dataset (`all_vocab`).

### Save vocab sizes

In [15]:
# Compute the vocabulary sizes from the cached 'tmp/all' dataset and show them.
VOCAB_SZ = create_vocab_sizes(out_path / 'tmp/all')
VOCAB_SZ

[130, 132]

### Create Hooktheory Dataset

In [16]:
# Rows whose source is Hooktheory, and the matching files found on disk.
hook_sources = csv.source.isin(['hooktheory'])
hook_csv = csv.loc[hook_sources]
hook_files = get_files(hook_csv)
len(hook_files)

19820

In [17]:
# Build the Hooktheory databunch and save it under the 'tmp/hook' cache.
hook_data = create_databunch(hook_files, cache_name='tmp/hook')

### Create Hooktheory C Dataset

In [16]:
# Rows whose source is 'hooktheory_c', and the matching files found on disk.
# NOTE(review): this rebinds hook_csv / hook_files from the Hooktheory cell
# above, so those names refer to the 'hooktheory_c' subset from here on.
hook_c_sources = csv.source.isin(['hooktheory_c'])
hook_csv = csv.loc[hook_c_sources]
hook_files = get_files(hook_csv)
len(hook_files)

19820

In [17]:
# Use a dedicated cache directory and variable for the 'hooktheory_c' dataset:
# the Hooktheory cell above already saves its databunch under 'tmp/hook', and
# reusing that name here would silently overwrite it.
hook_c_data = create_databunch(hook_files, cache_name='tmp/hook_c')

### Create Solo Piano

In [18]:
# Solo-piano subset: MuseScore and Wikifonia sources.
solo_csv = csv.loc[csv.source.isin(['musescore', 'wikifonia'])]
solo_files = get_files(solo_csv)
# Report the number of files actually found on disk (not the CSV row count),
# matching what the other dataset cells display.
len(solo_files)

18134

In [19]:
# Build the solo-piano databunch and save it under the 'tmp/solo' cache.
solo_data = create_databunch(solo_files, cache_name='tmp/solo')

### Create Pop Dataset

In [20]:
# Pop subset: MIDI World, FreeMidi, LMD and cprato sources.
pop_csv = csv.loc[csv.source.isin(['midiworld', 'freemidi', 'lmd', 'cprato'])]
pop_files = get_files(pop_csv)
# Report the number of files actually found on disk (not the CSV row count),
# matching what the other dataset cells display.
len(pop_files)

21516

In [21]:
# Build the pop databunch and save it under the 'tmp/pop' cache.
pop_data = create_databunch(pop_files, cache_name='tmp/pop')

### Create Classical Dataset

In [22]:
# Classical subset: 'classical_piano' and 'ecomp' sources.
clc_sources = csv.source.isin(['classical_piano', 'ecomp'])
clc_csv = csv.loc[clc_sources]
clc_files = get_files(clc_csv)
len(clc_files)

3063

In [23]:
# Build the classical databunch and save it under the 'tmp/clc' cache.
clc_data = create_databunch(clc_files, cache_name='tmp/clc')

In [24]:
# Re-display the classical file count (same value as shown when it was built).
len(clc_files)

3063

### Create Dumpster Dataset

In [26]:
# "Dumpster" subset: everything whose source is 'reddit'.
dmp_sources = csv.source.isin(['reddit'])
dmp_csv = csv.loc[dmp_sources]
dmp_files = get_files(dmp_csv)
len(dmp_files)

94222

In [27]:
# Build the dumpster databunch and save it under the 'tmp/dmp' cache.
dmp_data = create_databunch(dmp_files, cache_name='tmp/dmp')

### Create Low Quality Dataset

In [28]:
# Low-quality subset: the union of the dumpster, classical and pop sources
# used in the cells above.
lq_csv = csv.loc[csv.source.isin([
    'reddit',
    'classical_piano', 'ecomp',
    'midiworld', 'freemidi', 'lmd', 'cprato',
])]
lq_files = get_files(lq_csv)
len(lq_files)

118801

In [29]:
# Build the low-quality databunch and save it under the 'tmp/lq' cache.
lq_data = create_databunch(lq_files, cache_name='tmp/lq')

### Create High Quality Dataset

In [30]:
# High-quality subset: Hooktheory plus the solo-piano sources.
hq_sources = csv.source.isin(['hooktheory', 'wikifonia', 'musescore'])
hq_csv = csv.loc[hq_sources]
hq_files = get_files(hq_csv)
len(hq_files)

37954

In [31]:
# Build the high-quality databunch and save it under the 'tmp/hq' cache.
hq_data = create_databunch(hq_files, cache_name='tmp/hq')

### Testing

In [38]:
# Pick the databunch to inspect in the cells below.
# NOTE(review): this rebinds `data`, which was imported as a module near the
# top of the notebook (`from fastai.text import data`).
data = pop_data

In [39]:
# Show the training items of the selected databunch.
data.train_ds.x

ItemList (9009 items)
[[ 2  0]
 [ 1  3]
 [61  6]
 [ 1  8]
 ...
 [64  4]
 [ 1  4]
 [93 11]
 [64  5]],[[ 2  0]
 [ 1  3]
 [47  5]
 [35  5]
 ...
 [70 21]
 [67 21]
 [51 21]
 [39 21]],[[ 2  0]
 [ 1  3]
 [73  4]
 [ 1  3]
 ...
 [ 1 34]
 [73 27]
 [ 1 34]
 [73 26]],[[ 2  0]
 [ 1  3]
 [70 10]
 [ 1 10]
 ...
 [73  7]
 [68  7]
 [65  7]
 [37  7]],[[ 2  0]
 [ 1  3]
 [75  5]
 [ 1  4]
 ...
 [77  5]
 [ 1  4]
 [74  5]
 [48  5]]
Path: data/midi/v10/midi_encode

In [12]:
# Recompute the vocabulary sizes directly from the cached training ids of the
# 'tmp/all' dataset: the maximum id along every axis except the last, plus one.
# NOTE(review): the recorded output of this cell differs from the VOCAB_SZ
# value displayed earlier in the notebook -- confirm which run is current.
train_ids_file = out_path / 'tmp' / 'all' / 'train_ids.npy'
all_ids = np.load(train_ids_file)
id_cat = np.concatenate(all_ids)
ax = tuple(range(id_cat.ndim - 1))
max_vocab = (id_cat.max(axis=ax) + 1).tolist()
max_vocab

[118, 132]

In [27]:
from encode_data import *

In [29]:
# NOTE(review): the recorded run of this cell raised
# AttributeError: 'numpy.ndarray' object has no attribute 'text', i.e. the
# item returned here is a numpy array. TODO: fix or remove this cell so the
# notebook survives Restart & Run All.
one_text = data.train_ds[0][0].text

AttributeError: 'numpy.ndarray' object has no attribute 'text'

In [None]:
# Depends on `one_text`, whose defining cell raised in the recorded run.
# `str2seq` presumably comes from the `encode_data` star import -- TODO confirm.
seq = str2seq(one_text); seq

In [None]:
# Shape of the numpy encoding of `seq` (cell has no recorded execution).
seq2numpy(seq).shape

In [None]:
# Depends on `one_text`, whose defining cell raised in the recorded run.
# `str2stream` presumably comes from the `encode_data` star import -- TODO confirm.
s = str2stream(one_text)

In [None]:
# Display `s` using its 'midi' view (cell has no recorded execution).
s.show('midi')

In [None]:
# Display `s` using its 'text' view (cell has no recorded execution).
s.show('text')