# Tokenize and save data

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from fastai.text import *

In [3]:
from fastai.text import data

In [4]:
import pandas as pd
from fastai_data import *
from encode_data import npenc2seq, seq2npenc

## Preparing the data

In [5]:
# Dataset version; `version_path` resolves to data/midi/v7.
version = 'v7'
data_path = Path('data/midi')
version_path = data_path/version

In [6]:
# Directory (relative to the version folder) that contains `midi_encode.csv`.
# `source_dir` is also used below as a column name of that CSV.
source_dir = 'midi_encode/np/dur'
out_path = version_path/source_dir
csv_path = out_path/f'midi_encode.csv'
# List the contents of the version folder.
version_path.ls()

[PosixPath('data/midi/v7/metadata'),
 PosixPath('data/midi/v7/midi_sources'),
 PosixPath('data/midi/v7/midi_encode'),
 PosixPath('data/midi/v7/midi_encode.tar.gz'),
 PosixPath('data/midi/v7/midi_npz'),
 PosixPath('data/midi/v7/midi_transform')]

In [7]:
# Read the index CSV and keep only the rows that have a non-null value in the
# column named by `source_dir`.
csv = pd.read_csv(csv_path)
csv = csv.loc[csv[source_dir].notna()];

  interactivity=interactivity, compiler=compiler, result=result)


In [53]:
class LMItemList(ItemList):
    "`ItemList` that adds 1 to an item on access; `reconstruct` subtracts it again before decoding."

    def get(self, i)->Any:
        "Return `self.items[i] + 1`."
        # presumably the shift turns the -1 placeholders of the raw encoding into 0 — TODO confirm
        item = self.items[i]
        return item + 1

    def reconstruct(self, t:Tensor):
        "Undo the +1 applied by `get` and decode the result with `npenc2seq`."
        unshifted = t - 1
        return npenc2seq(unshifted)

    def __getitem__(self,idxs:int)->Any:
        "Return a single (shifted) item for an integer index, otherwise a new list over the selected items."
        idxs = try_int(idxs)
        if not isinstance(idxs, Integral):
            # Non-integer index: build a new list from the raw (unshifted) items.
            return self.new(self.items[idxs], xtra=index_row(self.xtra, idxs))
        return self.get(idxs)

In [54]:

        
## For npenc dataset
class OpenNPFileProcessor(PreProcessor):
    "`PreProcessor` that loads items given as `Path`s with `np.load` and passes other items through."
    def process_one(self,item):
        "Return `np.load(item)` when `item` is a `Path`; otherwise return `item` unchanged."
        if not isinstance(item, Path):
            return item
        return np.load(item)
    
class LMDataBunch(DataBunch):
    "`DataBunch` for language-model training whose datasets are wrapped in `LMNPPreloader`."
    @classmethod
    def create(cls, train_ds, valid_ds, test_ds=None, path:PathOrStr='.', no_check:bool=False, bs=64, val_bs:int=None,
               num_workers:int=0, device:torch.device=None, collate_fn:Callable=data_collate,
               dl_tfms:Optional[Collection[Callable]]=None, bptt:int=70, backwards:bool=False) -> DataBunch:
        "Wrap each dataset in an `LMNPPreloader` and build the `DataBunch` in `path`."
        # `num_workers` is accepted for signature compatibility but is not used below.
        datasets = cls._init_ds(train_ds, valid_ds, test_ds)
        val_bs = ifnone(val_bs, bs)
        # Only the first (training) dataset is shuffled and uses `bs`; the others use `val_bs`.
        datasets = [LMNPPreloader(ds, shuffle=(i==0), bs=(bs if i==0 else val_bs), bptt=bptt, backwards=backwards)
                    for i,ds in enumerate(datasets)]
        # NOTE(review): this reset makes every `DataLoader` below use `bs`, even when the
        # preloaders above were built with a different `val_bs`. Kept as in the original — confirm intent.
        val_bs = bs
        dls = [DataLoader(d, b, shuffle=False) for d,b in zip(datasets, (bs,val_bs,val_bs,val_bs)) if d is not None]
        return cls(*dls, path=path, device=device, dl_tfms=dl_tfms, collate_fn=collate_fn, no_check=no_check)

    @classmethod
    def from_ids(cls, path:PathOrStr, train_ids:Collection[Collection[int]], valid_ids:Collection[Collection[int]],
                 test_ids:Collection[Collection[int]]=None, train_lbls:Collection[Union[int,float]]=None,
                 valid_lbls:Collection[Union[int,float]]=None, classes:Collection[Any]=None,
                 processor:PreProcessor=None, **kwargs) -> DataBunch:
        "Create a `DataBunch` from already-encoded train/valid items. `kwargs` are passed to `databunch`."
        # NOTE(review): `test_ids`, `valid_lbls`, `classes` and `processor` are accepted but not used
        # here, so a test set loaded from the cache is dropped — confirm whether that is intended.
        src = ItemLists(path, LMItemList(train_ids, path=path, processor=[]),
                        LMItemList(valid_ids, path=path, processor=[]))
        src = src.label_const(label_cls=LMLabelList)
        if not is1d(train_lbls): src.train.y.one_hot,src.valid.y.one_hot = True,True
        return src.databunch(**kwargs)

    def save(self, cache_name:PathOrStr='tmp'):
        "Save the items and labels of this `DataBunch` as `.npy` files in `self.path/cache_name`."
        cache_path = self.path/cache_name
        os.makedirs(cache_path, exist_ok=True)
        np.save(cache_path/'train_ids.npy', self.train_ds.x.items)
        np.save(cache_path/'train_lbl.npy', self.train_ds.y.items)
        np.save(cache_path/'valid_ids.npy', self.valid_ds.x.items)
        np.save(cache_path/'valid_lbl.npy', self.valid_ds.y.items)
        if self.test_dl is not None: np.save(cache_path/'test_ids.npy', self.test_ds.x.items)
        if hasattr(self.train_ds, 'classes'): save_texts(cache_path/'classes.txt', self.train_ds.classes)

    @classmethod
    def load(cls, path:PathOrStr, cache_name:PathOrStr='tmp', processor:PreProcessor=None, **kwargs):
        "Load the arrays written by `save` from `path/cache_name`. `kwargs` are passed on to `from_ids`."
        cache_path = Path(path)/cache_name

        def _load(fname):
            # The saved item arrays can be object arrays (one array per item, differing lengths).
            # NumPy >= 1.16.3 refuses to load those unless `allow_pickle=True` is passed.
            # SECURITY: unpickling can execute arbitrary code — only load caches you created yourself.
            return np.load(cache_path/fname, allow_pickle=True)

        train_ids,train_lbls = _load('train_ids.npy'), _load('train_lbl.npy')
        valid_ids,valid_lbls = _load('valid_ids.npy'), _load('valid_lbl.npy')
        # The test items and class names are optional: `save` only writes them when they exist.
        test_ids = _load('test_ids.npy') if os.path.isfile(cache_path/'test_ids.npy') else None
        classes = loadtxt_str(cache_path/'classes.txt') if os.path.isfile(cache_path/'classes.txt') else None
        return cls.from_ids(path, train_ids, valid_ids, test_ids, train_lbls, valid_lbls, classes, processor, **kwargs)

In [8]:
# Load the databunch cached under 'tmp/clc' with a small batch size and bptt.
# NOTE(review): `LMNPDataBunch` is not defined in the visible cells (the class defined
# in this notebook is `LMDataBunch`) — presumably it comes from `fastai_data`; confirm.
path = Path('data/midi/v7/midi_encode/np/dur/')
data = LMNPDataBunch.load(path, bs=2, bptt=8, cache_name='tmp/clc')

In [10]:
out = data.train_ds.x[0]

In [11]:
seq = data.train_ds.x.reconstruct(out)

In [12]:
out

array([[14,  0,  0,  0],
       [13,  0, 13,  0],
       [10,  2,  5,  3],
       [ 1,  6,  3,  1],
       ...,
       [10,  4,  9,  1],
       [13,  0,  8,  0],
       [10,  2,  5,  1],
       [12,  2,  9,  1]])

In [13]:
seq

array([[13, -1, -1, -1],
       [12, -1, 12, -1],
       [ 9,  1,  4,  2],
       [ 0,  5,  2,  0],
       ...,
       [ 9,  3,  8,  0],
       [12, -1,  7, -1],
       [ 9,  1,  4,  0],
       [11,  1,  8,  0]])

In [176]:
from encode_data import NoteEnc

In [177]:
out = data.train_ds.x[10]

In [178]:
out.shape

(2384, 4)

In [179]:
out-1

array([[13, -1, -1, -1],
       [12, -1, 12, -1],
       [ 6,  3,  3,  2],
       [10,  3,  3,  2],
       ...,
       [12, -1, 11, -1],
       [ 6,  1, 24,  2],
       [ 6,  2, 24,  2],
       [ 6,  3, 24,  0]])

In [180]:
# n = seq[0][0]
# n.pitch.pitchClass, n.pitch.octave

In [181]:
def npenc2seq(npenc):
    "Group the 4-value rows of `npenc` into a list of timesteps, each a list of `NoteEnc`."
    timesteps, current = [], []
    for n, o, d, i in npenc:
        # Rows whose first value is `VALTBOS` are skipped entirely.
        if n == VALTBOS:
            continue
        if n != VALTSEP:
            # Ordinary row: pitch is `n + (o+1)*12`; `d` and `i` are passed to `NoteEnc` unchanged.
            current.append(NoteEnc(n + ((o + 1) * 12), d, i))
            continue
        # Separator row: close the timestep in progress (if it has notes), then add `d` empty timesteps.
        if current:
            timesteps.append(current)
        current = []
        timesteps.extend([] for _ in range(d))
    # Flush the notes collected after the last separator.
    if current:
        timesteps.append(current)
    return timesteps

In [182]:
seq = npenc2seq(out-1); seq[-10:]

[[], [], [], [], [], [], [], [], [], [F#1t24, F#2t24, F#3t24]]

In [183]:
len(seq)

1353

In [184]:
len(seq)

1353

In [185]:
(out-1)

array([[13, -1, -1, -1],
       [12, -1, 12, -1],
       [ 6,  3,  3,  2],
       [10,  3,  3,  2],
       ...,
       [12, -1, 11, -1],
       [ 6,  1, 24,  2],
       [ 6,  2, 24,  2],
       [ 6,  3, 24,  0]])

In [186]:
len(seq2npenc(seq))

2384

In [187]:
seq2npenc(seq)

array([[13, -1, -1, -1],
       [12, -1, 12, -1],
       [ 6,  3,  3,  2],
       [10,  3,  3,  2],
       ...,
       [12, -1, 11, -1],
       [ 6,  1, 24,  2],
       [ 6,  2, 24,  2],
       [ 6,  3, 24,  0]])

In [174]:
%debug

> [0;32m<ipython-input-155-c985789909ba>[0m(1)[0;36m<module>[0;34m()[0m
[0;32m----> 1 [0;31m[0mn[0m [0;34m=[0m [0mseq[0m[0;34m[[0m[0;36m0[0m[0;34m][0m[0;34m[[0m[0;36m0[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m      2 [0;31m[0mn[0m[0;34m.[0m[0mpitch[0m[0;34m.[0m[0mpitchClass[0m[0;34m,[0m [0mn[0m[0;34m.[0m[0mpitch[0m[0;34m.[0m[0moctave[0m[0;34m[0m[0;34m[0m[0m
[0m
ipdb> quit


### Create All Dataset

In [10]:
# Build the databunch from all rows of `csv`, cached under 'tmp/all'.
# NOTE(review): `get_files` and `create_databunch` are not defined in the visible cells —
# presumably from `fastai_data`; `load_cached=True` presumably reuses an existing cache. Confirm.
all_files = get_files(csv)
all_data = create_databunch(all_files, cache_name='tmp/all', load_cached=True)

Note: we are reusing all_vocab for the following datasets

### Create Hooktheory Dataset

In [11]:
# Keep only the rows whose `source` is 'hooktheory' or 'cprato'.
hook_csv = csv.loc[csv.source.isin(['hooktheory', 'cprato'])]
hook_files = get_files(hook_csv)

In [12]:
hook_data = create_databunch(hook_files, cache_name='tmp/hook')

### Create Pop Dataset

In [13]:
# Keep only the rows whose `source` is 'midiworld', 'freemidi' or 'wikifonia'.
pop_csv = csv.loc[csv.source.isin(['midiworld', 'freemidi', 'wikifonia'])]
pop_files = get_files(pop_csv)

In [14]:
pop_data = create_databunch(pop_files, cache_name='tmp/pop')

### Create Classical Dataset

In [15]:
# Keep only the rows whose `source` is 'classical_piano' or 'ecomp'.
clc_csv = csv.loc[csv.source.isin(['classical_piano', 'ecomp'])]
clc_files = get_files(clc_csv)

In [16]:
clc_data = create_databunch(clc_files, cache_name='tmp/clc')

In [17]:
len(clc_files)

2851

### Testing

In [22]:
data = pop_data

In [28]:
data.train_ds.x

ItemList (15644 items)
[[12 -1  0 -1]
 [ 1  4  4  0]
 [12 -1  0 -1]
 [ 1  4 -2  0]
 ...
 [10  4 -2  0]
 [12 -1  0 -1]
 [ 1  4 -2  0]
 [10  4 -2  0]],[[12 -1  0 -1]
 [ 8  3  1  0]
 [ 8  3 34  3]
 [ 8  3 34  6]
 ...
 [12 -1  0 -1]
 [ 8  3  1  0]
 [ 8  3 -2  3]
 [ 8  4 -2  3]],[[12 -1  0 -1]
 [ 2  5  2  0]
 [12 -1  0 -1]
 [ 2  5 -2  0]
 ...
 [11  4 -2  0]
 [12 -1  0 -1]
 [ 9  4 -2  1]
 [11  4 -2  0]],[[12 -1  0 -1]
 [ 7  2  4  0]
 [12 -1  0 -1]
 [ 7  2 -2  0]
 ...
 [ 1  3 -2  6]
 [ 8  3 -2  6]
 [ 1  4 -2  0]
 [ 1  4 -2  6]],[[12 -1  0 -1]
 [ 4  2  4  2]
 [ 4  2  4  8]
 [ 1  3  4  0]
 ...
 [ 3  3 -2  0]
 [ 7  3 -2 10]
 [11  3 -2 10]
 [ 4  4 -2 10]]
Path: data/midi/v7/midi_encode/np/dur

In [23]:
data.show_batch()

AttributeError: 'numpy.ndarray' object has no attribute 'reconstruct'

In [None]:
ob = data.one_batch()

In [23]:
txt_out = data.vocab.textify(ob[0][0]).replace('xxbos ', ''); txt_out

't-2 || nB3 t-2 nD4 t-2 nC5 t-2 || nD4 t-1 || nD4 t-2 || nD4 t-2 || nD4 t-2 || nE-4 t-1 || nE-4 t-2 || nE-4 t-2 || nE-4 t-2 || nD4 t-1 || nD4 t-2 || nD4 t-2 || nD4 t-2 || nE-4 t-1 || nE-4 t-2 || nD4 t-1 || nD4 t-2 || nE-4 t-1 || nE-4 t-2 || nD4 t-1 || nD4 t-2 || nB3'

In [20]:
from encode_data import *

In [21]:
one_text = data.train_ds[0][0].text

In [22]:
seq = str2seq(one_text); seq

[[E-4t-1],
 [E-4t-2],
 [E-4t-2],
 [E-4t-2],
 [D4t-1],
 [D4t-2],
 [D4t-2],
 [D4t-2],
 [E-4t-1],
 [E-4t-2],
 [D4t-1],
 [D4t-2],
 [E-4t-1],
 [E-4t-2],
 [D5t-1],
 [D5t-2],
 [C5t-1],
 [C5t-2],
 [C5t-2],
 [C5t-2],
 [D4t-1],
 [D4t-2],
 [D4t-2],
 [D4t-2],
 [E-4t-1],
 [E-4t-2],
 [E-4t-2],
 [E-4t-2],
 [D4t-1],
 [D4t-2],
 [D4t-2],
 [D4t-2],
 [E-4t-1],
 [E-4t-2],
 [D4t-1],
 [D4t-2],
 [G3t-1],
 [G3t-2],
 [C5t-1],
 [C5t-2],
 [B3t-1, D4t-1, C5t-1],
 [B3t-2, D4t-2, C5t-2],
 [B3t-2, D4t-2, C5t-2],
 [B3t-2, D4t-2, C5t-2],
 [D4t-1],
 [D4t-2],
 [D4t-2],
 [D4t-2],
 [E-4t-1],
 [E-4t-2],
 [E-4t-2],
 [E-4t-2],
 [D4t-1],
 [D4t-2],
 [D4t-2],
 [D4t-2],
 [E-4t-1],
 [E-4t-2],
 [D4t-1],
 [D4t-2],
 [E-4t-1],
 [E-4t-2],
 [D4t-1],
 [D4t-2],
 [B3t-1, D4t-1],
 [B3t-2, D4t-2, C5t-1],
 [B3t-2, D4t-2, C5t-2],
 [B3t-2, D4t-2, C5t-2],
 [D4t-1],
 [D4t-2],
 [D4t-2],
 [D4t-2],
 [E-4t-1],
 [E-4t-2],
 [E-4t-2],
 [E-4t-2],
 [D4t-1],
 [D4t-2],
 [D4t-2],
 [D4t-2],
 [E-4t-1],
 [E-4t-2],
 [D4t-1],
 [D4t-2],
 [E-4t-1],
 [E-4t-2],
 [D4t

In [19]:
seq2numpy(seq).shape

(129, 1, 127)

In [20]:
s = str2stream(one_text)

In [21]:
s.show('midi')

In [22]:
s.show('text')

{0.0} <music21.stream.Part 0x7f0626bf9b70>
    {0.0} <music21.instrument.Piano Piano>
    {0.0} <music21.meter.TimeSignature 4/4>
    {0.0} <music21.tempo.MetronomeMark animato Quarter=120>
    {0.0} <music21.key.KeySignature of no sharps or flats>
    {0.25} <music21.chord.Chord G3>
    {0.5} <music21.chord.Chord F#3>
    {0.75} <music21.chord.Chord A2>
    {1.0} <music21.chord.Chord F3>
    {1.25} <music21.chord.Chord E3>
    {1.5} <music21.chord.Chord A2>
    {1.75} <music21.chord.Chord F3>
    {2.0} <music21.chord.Chord E3>
    {2.25} <music21.chord.Chord A2>
    {3.0} <music21.chord.Chord G3>
    {3.25} <music21.chord.Chord F#3>
    {3.5} <music21.chord.Chord G3>
    {3.75} <music21.chord.Chord C#4>
    {4.0} <music21.chord.Chord E3 G3 B3>
    {8.0} <music21.chord.Chord E3 G3 A3 C4>
    {10.0} <music21.chord.Chord E3 G3 B3>
    {12.0} <music21.chord.Chord G2 B-2 D3>
    {18.0} <music21.chord.Chord G2 A2 C3 E3>
    {19.0} <music21.chord.Chord A2 B2 D3 F#3>
    {20.0} <music21.chord