# Tokenize and save data

Data borrowed from here: https://github.com/mcleavey/musical-neural-net/  
http://www.christinemcleavey.com/files/notewise_piano_solo.tar.gz  
http://www.christinemcleavey.com/files/jazz.tar.gz  
http://www.christinemcleavey.com/files/notewise_chamber.tar.gz  

This notebook combines the implementations from:
https://github.com/fastai/course-v3/blob/master/nbs/dl1/lesson3-imdb.ipynb  
https://github.com/mcleavey/musical-neural-net/blob/master/train.py

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from fastai.text import *

## Preparing the data

In [3]:
class MusicTokenizer():
    """Tokenizer that splits each text on single space characters.

    No lower-casing, special-token insertion or other normalisation is done
    here; a text is simply cut at every `" "`. `process_all` tokenizes a whole
    collection, spreading the work over `n_cpus` processes when more than one
    is available.
    """
    def __init__(self):
        super().__init__()
        # Worker count used by `process_all`; taken from `num_cpus()`.
        self.n_cpus = num_cpus()

    def process_text(self, t:str) -> List[str]:
        "Split the single text `t` at every space and return the pieces."
        # `split(" ")`, not `split()`: consecutive spaces yield empty-string
        # tokens and newlines/tabs are not treated as separators.
        return t.split(" ")

    def _process_all_1(self, texts:Collection[str]) -> List[List[str]]:
        "Tokenize every text in `texts` in the current process."
        return [self.process_text(t) for t in texts]

    def process_all(self, texts:Collection[str]) -> List[List[str]]:
        "Process a list of `texts`."
        if self.n_cpus <= 1: return self._process_all_1(texts)
        # Imported here so the class does not depend on the star-import
        # exposing `itertools`.
        from itertools import chain
        with ProcessPoolExecutor(self.n_cpus) as e:
            # Each worker returns a list of token lists for its partition.
            # Flatten one level in linear time; `sum(lists, [])` re-copies the
            # accumulated list on every addition and is quadratic.
            return list(chain.from_iterable(
                e.map(self._process_all_1, partition_by_cores(texts, self.n_cpus))))


In [9]:
# Root folder of the dataset used by the cells below; the second line shows
# up to five of its entries.
path = Path('../data/midi/transcribedc_v1/hooktheory/')
path.ls()[:5]

[PosixPath('../data/midi/transcribedc_v1/hooktheory/pianoroll')]

In [10]:
# Batch size, passed as `bs` when the databunch is loaded or built.
bs=32

In [11]:
# Build the language-model databunch, or reload it when an earlier run has
# already saved one (detected by the vocabulary file under `path/tmp`).
# To force a rebuild, delete that file or temporarily make the condition true.
if not (path/'tmp'/'itos.pkl').exists():
    # Processors given to the TextList: file opening, tokenization with
    # MusicTokenizer, and numericalization with max_vocab=500.
    p = [
        OpenFileProcessor(),
        TokenizeProcessor(tokenizer=MusicTokenizer(), chunksize=10),
        NumericalizeProcessor(vocab=None, max_vocab=500),
    ]

    data = (
        TextList.from_folder(path, recurse=True, processor=p)
        .random_split_by_pct(0.05, seed=6)  # fixed seed for the train/valid split
        .label_for_lm()
        .databunch(bs=bs)
    )
    data.save('tmp')
else:
    data = TextLMDataBunch.load(path, bs=bs)

vocab = data.train_ds.vocab
# Sizes of the training set, the validation set and the vocabulary.
len(data.train_ds), len(data.valid_ds), len(vocab.itos)

(10955, 576, 36)

In [12]:
# Take the first element of the first training item, then show the first 50
# characters of its text next to its `data` attribute.
t = data.train_ds[0][0]
t.text[:50], t.data

('xxbos || |s| nG o3 t1 i0 || |s| nF# o3 t1 i0 || |s',
 array([ 2, 12, 13, 20, ...,  9, 12, 25, 12]))

In [13]:
# Display a batch from the databunch as a table of text rows.
data.show_batch()

idx,text
0,xxbos nA o2 t1 i1 nC o3 t1 i1 nE o3 t1 i1 nC o5 t1 i0 || |s| nA o2 t2 i1 nC o3 t2 i1 nE o3 t2 i1 nC o5 t2 i0 || |s| nA o2 t2 i1 nC o3 t2 i1 nE o3 t2 i1 nE o5 t1 i0 || |s| nA o2 t2 i1 nC o3 t2 i1 nE o3 t2 i1 nE o5 t2
1,i1 nE o3 t2 i1 nG o3 t2 i1 nD o5 t2 i0 || |e| |s| nC o3 t2 i1 nE o3 t2 i1 nG o3 t2 i1 nD o5 t2 i0 || |s| nC o3 t2 i1 nE o3 t2 i1 nG o3 t2 i1 nD o5 t2 i0 || |s| nC o3 t2 i1 nE o3 t2 i1 nG o3 t2 i1 nD o5 t2 i0 || |s|
2,o4 t1 i0 || |s| nG o2 t2 i1 nC o3 t2 i1 nD o3 t2 i1 nG o3 t1 i0 || |e| |s| nA o2 t1 i1 nC o3 t1 i1 nE o3 t1 i1 nG o3 t1 i1 nG o4 t1 i0 || |s| nA o2 t2 i1 nC o3 t2 i1 nE o3 t2 i1 nG o3 t2 i1 nC o4 t1 i0 || |s| nA o2
3,t2 i0 || |s| nA o2 t2 i1 nC o3 t2 i1 nE o3 t2 i1 nA o3 t2 i0 || |e| |s| nA o2 t2 i1 nC o3 t2 i1 nE o3 t2 i1 nC o4 t1 i0 || |s| nA o2 t2 i1 nC o3 t2 i1 nE o3 t2 i1 nC o4 t2 i0 || |s| nA o2 t2 i1 nC o3 t2 i1 nE o3 t2
4,o4 t2 i0 || |s| nG o2 t2 i1 nB o2 t2 i1 nD o3 t2 i1 nG o4 t1 i0 || |s| nG o2 t2 i1 nB o2 t2 i1 nD o3 t2 i1 nG o4 t2 i0 || |e| |s| nG o2 t2 i1 nB o2 t2 i1 nD o3 t2 i1 nG o4 t1 i0 || |s| nG o2 t2 i1 nB o2 t2 i1 nD o3
