# Tokenize and save data

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from fastai.text import *

In [3]:
from fastai.text import data

In [4]:
import pandas as pd
from fastai_data import *

## Preparing the data

In [5]:
# Dataset version and root folder; the paths defined below are derived from these.
version = 'v7'
data_path = Path('data/midi')
# Root folder of this dataset version (data/midi/v7).
version_path = data_path/version

In [6]:
# Folder of the encoded files, relative to version_path. The same string is
# also used as the CSV column name holding each file's path relative to
# version_path.
source_dir = 'midi_encode/np/dur'
out_path = version_path/source_dir
# Index CSV describing the encoded files.
csv_path = out_path/'midi_encode.csv'
# Show what is available under this dataset version.
version_path.ls()

[PosixPath('data/midi/v7/metadata'),
 PosixPath('data/midi/v7/midi_sources'),
 PosixPath('data/midi/v7/midi_encode'),
 PosixPath('data/midi/v7/midi_npz'),
 PosixPath('data/midi/v7/midi_transform')]

In [7]:
# Load the file index. low_memory=False makes pandas infer each column's dtype
# from the whole file rather than chunk by chunk, which avoids mixed-type
# DtypeWarnings on heterogeneous columns.
csv_raw = pd.read_csv(csv_path, low_memory=False)
# Keep only the rows that have an entry in the `source_dir` column.
csv = csv_raw.loc[csv_raw[source_dir].notna()]

  interactivity=interactivity, compiler=compiler, result=result)


In [8]:
def create_databunch(files, cache_name, batch_size=32, load_cached=False,
                     valid_pct=0.01, seed=6):
    """Build an `LMDataBunch` from `files`, or load a cached one from `out_path`.

    Args:
        files: items handed to `ItemList` (callers pass lists of file paths).
        cache_name: cache folder name, relative to `out_path`, used both for
            loading and for saving.
        batch_size: batch size, passed as `bs` when loading or building.
        load_cached: if True and `<out_path>/<cache_name>/train_ids.npy`
            exists, load the cached databunch instead of rebuilding it.
        valid_pct: fraction of items randomly split off for validation.
            Ignored when a cached databunch is loaded.
        seed: seed for the random train/validation split. Ignored when a
            cached databunch is loaded.

    Returns:
        The loaded or newly built databunch. A newly built databunch is also
        saved under `cache_name` before being returned.
    """
    if load_cached and (out_path/f'{cache_name}/train_ids.npy').exists():
        return LMDataBunch.load(out_path, bs=batch_size, cache_name=cache_name)

    processors = [OpenNPFileProcessor()]
    label_lists = (ItemList(items=files, path=out_path, processor=processors)
                   .random_split_by_pct(valid_pct, seed=seed)
                   .label_const(label_cls=LMLabelList))
    # `_bunch` is set so that `.databunch()` builds an LMDataBunch
    # (fastai v1 convention -- confirm against the installed version).
    label_lists.x._bunch = LMDataBunch
    bunch = label_lists.databunch(bs=batch_size)
    bunch.save(cache_name)
    return bunch

In [9]:
def get_files(csv):
    """Return the paths listed in `csv` that are present on disk.

    Reads the `source_dir` column of `csv`, joins every entry onto
    `version_path`, and keeps (in the original order) only the paths for
    which `exists()` is true.
    """
    candidates = (Path(version_path/rel) for rel in csv[source_dir].values)
    return [path for path in candidates if path.exists()]

### Create All Dataset

In [10]:
# Every file listed in the CSV that exists on disk.
all_files = get_files(csv)
# load_cached=True: reuse the 'tmp/all' cache under out_path when it is already there.
all_data = create_databunch(all_files, cache_name='tmp/all', load_cached=True)

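Note: the intent is to reuse `all_vocab` for the following datasets. `all_vocab` is not defined in the cells above and `create_databunch` takes no vocab argument, so confirm whether this note still applies.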

### Create Hooktheory Dataset

In [11]:
# Rows whose `source` column is one of the sources used for the Hooktheory dataset.
hook_sources = ['hooktheory', 'cprato']
hook_mask = csv.source.isin(hook_sources)
hook_csv = csv.loc[hook_mask]
hook_files = get_files(hook_csv)

In [12]:
# load_cached is left at its default (False), so this is rebuilt and re-saved
# to 'tmp/hook' on every run.
hook_data = create_databunch(hook_files, cache_name='tmp/hook')

### Create Pop Dataset

In [13]:
# Rows whose `source` column is one of the sources used for the pop dataset.
pop_sources = ['midiworld', 'freemidi', 'wikifonia']
pop_mask = csv.source.isin(pop_sources)
pop_csv = csv.loc[pop_mask]
pop_files = get_files(pop_csv)

In [14]:
# load_cached is left at its default (False), so this is rebuilt and re-saved
# to 'tmp/pop' on every run.
pop_data = create_databunch(pop_files, cache_name='tmp/pop')

### Create Classical Dataset

In [15]:
# Rows whose `source` column is one of the sources used for the classical dataset.
clc_sources = ['classical_piano', 'ecomp']
clc_mask = csv.source.isin(clc_sources)
clc_csv = csv.loc[clc_mask]
clc_files = get_files(clc_csv)

In [16]:
# load_cached is left at its default (False), so this is rebuilt and re-saved
# to 'tmp/clc' on every run.
clc_data = create_databunch(clc_files, cache_name='tmp/clc')

In [17]:
# Number of classical files that were found on disk by get_files.
len(clc_files)

2851

### Testing

In [22]:
# Use the pop databunch for the checks below.
# NOTE(review): rebinding `data` hides the `data` module imported from
# fastai.text earlier in this notebook.
data = pop_data

In [28]:
# Inspect the training inputs of the selected databunch.
data.train_ds.x

ItemList (15644 items)
[[12 -1  0 -1]
 [ 1  4  4  0]
 [12 -1  0 -1]
 [ 1  4 -2  0]
 ...
 [10  4 -2  0]
 [12 -1  0 -1]
 [ 1  4 -2  0]
 [10  4 -2  0]],[[12 -1  0 -1]
 [ 8  3  1  0]
 [ 8  3 34  3]
 [ 8  3 34  6]
 ...
 [12 -1  0 -1]
 [ 8  3  1  0]
 [ 8  3 -2  3]
 [ 8  4 -2  3]],[[12 -1  0 -1]
 [ 2  5  2  0]
 [12 -1  0 -1]
 [ 2  5 -2  0]
 ...
 [11  4 -2  0]
 [12 -1  0 -1]
 [ 9  4 -2  1]
 [11  4 -2  0]],[[12 -1  0 -1]
 [ 7  2  4  0]
 [12 -1  0 -1]
 [ 7  2 -2  0]
 ...
 [ 1  3 -2  6]
 [ 8  3 -2  6]
 [ 1  4 -2  0]
 [ 1  4 -2  6]],[[12 -1  0 -1]
 [ 4  2  4  2]
 [ 4  2  4  8]
 [ 1  3  4  0]
 ...
 [ 3  3 -2  0]
 [ 7  3 -2 10]
 [11  3 -2 10]
 [ 4  4 -2 10]]
Path: data/midi/v7/midi_encode/np/dur

In [23]:
# NOTE(review): the saved run of this cell failed with
# AttributeError: 'numpy.ndarray' object has no attribute 'reconstruct'.
data.show_batch()

AttributeError: 'numpy.ndarray' object has no attribute 'reconstruct'

In [None]:
# NOTE(review): this cell was saved without an execution count, yet `ob` is
# used by the next cell -- it must be run first.
ob = data.one_batch()

In [23]:
# Turn the first element of the first batch item back into text and strip
# the 'xxbos ' markers.
first_item = ob[0][0]
txt_out = data.vocab.textify(first_item).replace('xxbos ', '')
txt_out

't-2 || nB3 t-2 nD4 t-2 nC5 t-2 || nD4 t-1 || nD4 t-2 || nD4 t-2 || nD4 t-2 || nE-4 t-1 || nE-4 t-2 || nE-4 t-2 || nE-4 t-2 || nD4 t-1 || nD4 t-2 || nD4 t-2 || nD4 t-2 || nE-4 t-1 || nE-4 t-2 || nD4 t-1 || nD4 t-2 || nE-4 t-1 || nE-4 t-2 || nD4 t-1 || nD4 t-2 || nB3'

In [20]:
from encode_data import *

In [21]:
# NOTE(review): assumes the training items expose `.text`; the item list shown
# above holds numpy arrays -- confirm this cell still runs.
one_text = data.train_ds[0][0].text

In [22]:
# Parse the text encoding into a sequence, then display it.
seq = str2seq(one_text)
seq

[[E-4t-1],
 [E-4t-2],
 [E-4t-2],
 [E-4t-2],
 [D4t-1],
 [D4t-2],
 [D4t-2],
 [D4t-2],
 [E-4t-1],
 [E-4t-2],
 [D4t-1],
 [D4t-2],
 [E-4t-1],
 [E-4t-2],
 [D5t-1],
 [D5t-2],
 [C5t-1],
 [C5t-2],
 [C5t-2],
 [C5t-2],
 [D4t-1],
 [D4t-2],
 [D4t-2],
 [D4t-2],
 [E-4t-1],
 [E-4t-2],
 [E-4t-2],
 [E-4t-2],
 [D4t-1],
 [D4t-2],
 [D4t-2],
 [D4t-2],
 [E-4t-1],
 [E-4t-2],
 [D4t-1],
 [D4t-2],
 [G3t-1],
 [G3t-2],
 [C5t-1],
 [C5t-2],
 [B3t-1, D4t-1, C5t-1],
 [B3t-2, D4t-2, C5t-2],
 [B3t-2, D4t-2, C5t-2],
 [B3t-2, D4t-2, C5t-2],
 [D4t-1],
 [D4t-2],
 [D4t-2],
 [D4t-2],
 [E-4t-1],
 [E-4t-2],
 [E-4t-2],
 [E-4t-2],
 [D4t-1],
 [D4t-2],
 [D4t-2],
 [D4t-2],
 [E-4t-1],
 [E-4t-2],
 [D4t-1],
 [D4t-2],
 [E-4t-1],
 [E-4t-2],
 [D4t-1],
 [D4t-2],
 [B3t-1, D4t-1],
 [B3t-2, D4t-2, C5t-1],
 [B3t-2, D4t-2, C5t-2],
 [B3t-2, D4t-2, C5t-2],
 [D4t-1],
 [D4t-2],
 [D4t-2],
 [D4t-2],
 [E-4t-1],
 [E-4t-2],
 [E-4t-2],
 [E-4t-2],
 [D4t-1],
 [D4t-2],
 [D4t-2],
 [D4t-2],
 [E-4t-1],
 [E-4t-2],
 [D4t-1],
 [D4t-2],
 [E-4t-1],
 [E-4t-2],
 [D4t

In [19]:
# Shape of the array produced from the parsed sequence.
seq2numpy(seq).shape

(129, 1, 127)

In [20]:
# Convert the text encoding into a stream object (the output of s.show('text')
# below shows music21 objects).
s = str2stream(one_text)

In [21]:
# Show the stream in its 'midi' format.
s.show('midi')

In [22]:
# Show the stream in its 'text' format (a listing of its elements by offset).
s.show('text')

{0.0} <music21.stream.Part 0x7f0626bf9b70>
    {0.0} <music21.instrument.Piano Piano>
    {0.0} <music21.meter.TimeSignature 4/4>
    {0.0} <music21.tempo.MetronomeMark animato Quarter=120>
    {0.0} <music21.key.KeySignature of no sharps or flats>
    {0.25} <music21.chord.Chord G3>
    {0.5} <music21.chord.Chord F#3>
    {0.75} <music21.chord.Chord A2>
    {1.0} <music21.chord.Chord F3>
    {1.25} <music21.chord.Chord E3>
    {1.5} <music21.chord.Chord A2>
    {1.75} <music21.chord.Chord F3>
    {2.0} <music21.chord.Chord E3>
    {2.25} <music21.chord.Chord A2>
    {3.0} <music21.chord.Chord G3>
    {3.25} <music21.chord.Chord F#3>
    {3.5} <music21.chord.Chord G3>
    {3.75} <music21.chord.Chord C#4>
    {4.0} <music21.chord.Chord E3 G3 B3>
    {8.0} <music21.chord.Chord E3 G3 A3 C4>
    {10.0} <music21.chord.Chord E3 G3 B3>
    {12.0} <music21.chord.Chord G2 B-2 D3>
    {18.0} <music21.chord.Chord G2 A2 C3 E3>
    {19.0} <music21.chord.Chord A2 B2 D3 F#3>
    {20.0} <music21.chord