# Tokenize and save data

Data borrowed from here: https://github.com/mcleavey/musical-neural-net/  
http://www.christinemcleavey.com/files/notewise_piano_solo.tar.gz  
http://www.christinemcleavey.com/files/jazz.tar.gz  
http://www.christinemcleavey.com/files/notewise_chamber.tar.gz  

This notebook combines the approaches from:  
https://github.com/fastai/course-v3/blob/master/nbs/dl1/lesson3-imdb.ipynb  
https://github.com/mcleavey/musical-neural-net/blob/master/train.py

In [1]:
# Reload edited modules automatically before each cell runs, so changes to
# imported library code are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
from fastai.text import *

## Preparing the data

In [3]:
class MusicTokenizer():
    """Tokenizer for notewise music text: one token per space-separated word.

    An instance is passed as the `tokenizer` argument of `TokenizeProcessor`;
    presumably the processor drives it through `process_all` (confirm against
    the installed fastai version).
    """

    def __init__(self):
        super().__init__()
        # Number of worker processes used by `process_all`.
        self.n_cpus = num_cpus()

    def process_text(self, t:str) -> List[str]:
        "Split a single text `t` into tokens on the space character."
        # NOTE: splitting on a literal " " keeps empty strings for repeated
        # spaces and leaves other whitespace (e.g. newlines) attached to tokens.
        # Kept as-is so the tokens match any vocab already cached on disk.
        return t.split(" ")

    def _process_all_1(self, texts:Collection[str]) -> List[List[str]]:
        "Tokenize every text in `texts` sequentially in the current process."
        return [self.process_text(t) for t in texts]

    def process_all(self, texts:Collection[str]) -> List[List[str]]:
        """Tokenize `texts`, returning one token list per text in input order.

        Runs in-process when only one CPU is available; otherwise the texts
        are partitioned and tokenized in parallel worker processes.
        """
        if self.n_cpus <= 1: return self._process_all_1(texts)
        with ProcessPoolExecutor(self.n_cpus) as e:
            chunks = e.map(self._process_all_1, partition_by_cores(texts, self.n_cpus))
            # Flatten the per-partition results in a single linear pass;
            # `sum(chunks, [])` would re-copy the accumulated list for every chunk.
            # The results must be consumed while the executor is still open.
            return [tokens for chunk in chunks for tokens in chunk]


In [4]:
# Root folder of the notewise piano-solo dataset (relative to the notebook).
path = Path('data/composers/notewise/piano_solo/note_range38/sample_freq12')
# Peek at the first few entries to confirm the data is where we expect it.
path.ls()[:5]

[PosixPath('data/composers/notewise/piano_solo/note_range38/sample_freq12/jazz'),
 PosixPath('data/composers/notewise/piano_solo/note_range38/sample_freq12/ravel'),
 PosixPath('data/composers/notewise/piano_solo/note_range38/sample_freq12/bach'),
 PosixPath('data/composers/notewise/piano_solo/note_range38/sample_freq12/schumann'),
 PosixPath('data/composers/notewise/piano_solo/note_range38/sample_freq12/liszt')]

In [5]:
# Batch size handed to the language-model data bunch when it is loaded or built below.
bs=32

In [6]:
# Build the language-model data bunch once and cache it under `path/tmp`;
# later runs reload the cached version instead of re-tokenizing everything.
if not (path/'tmp/itos.pkl').exists():
    # Processing pipeline: read each file, tokenize it with the music
    # tokenizer, then map tokens to ids.
    p = [
        OpenFileProcessor(),
        TokenizeProcessor(tokenizer=MusicTokenizer(), chunksize=10),
        NumericalizeProcessor(vocab=None, max_vocab=500),
    ]

    items = TextList.from_folder(path, recurse=True, processor=p)
    # Hold out 5% of the files for validation, with a fixed seed.
    split = items.random_split_by_pct(0.05, seed=6)
    data = split.label_for_lm().databunch(bs=bs)
    data.save('tmp')
else:
    data = TextLMDataBunch.load(path, bs=bs)

vocab = data.train_ds.vocab
# Training items, validation items, vocabulary size.
len(data.train_ds), len(data.valid_ds), len(vocab.itos)

(1798, 94, 110)

The dataset is split into 1,798 training and 94 validation items, with a vocabulary of 110 tokens. Let's have a look at one training item.

In [7]:
# Take the input part of the first training example.
t = data.train_ds[0][0]
# Show the start of its text alongside its numericalized form (`t.data`).
t.text[:50], t.data

('xxbos wait25 wait25 wait25 wait25 wait25 wait25 wa',
 array([  2,  94,  94,  94, ...,   9,  53,   9, 109]))

In [8]:
# Display a few sample rows from a batch as text to sanity-check the tokenization.
data.show_batch()

idx,text
0,xxbos p11 wait5 endp11 wait1 p15 p18 wait6 p15 p18 wait5 endp15 endp18 wait1 p7 wait5 endp7 wait1 p15 p17 wait6 p15 p17 wait5 endp15 endp17 wait1 p11 wait5 endp11 wait1 p15 p18 wait6 p15 p18 wait5 endp15 endp18 wait1 p7 wait5 endp7 wait1 p15 p17 wait6 p15 p17 wait5 endp15 endp17 wait1 p11 wait5 endp11 wait1 p15 p18 wait6 p15 p18 wait5 endp15 endp18 wait1 p7 wait5 endp7 wait1
1,wait1 p6 wait8 p2 endp6 wait2 endp2 wait2 p3 p4 p6 p8 p11 wait3 endp11 wait2 endp3 endp6 endp8 wait3 p3 p6 p8 p11 wait1 p4 wait1 endp3 endp11 wait1 endp6 endp8 wait1 p3 p11 wait3 p6 p8 p11 wait2 endp3 wait3 endp4 endp6 endp8 endp11 wait4 p3 p4 p6 p8 p11 wait5 endp3 endp6 endp8 endp11 wait2 endp4 wait2 p2 p3 p6 p8 p11 wait2 endp2 wait1 p3 p4
2,endp18 wait1 p9 wait5 endp9 wait1 p1 wait6 p1 wait6 p1 wait5 endp1 endp25 endp33 wait1 p11 p23 p32 wait3 endp23 endp32 wait1 p25 p33 wait1 endp11 wait1 p17 endp25 endp33 wait2 p23 p32 wait3 endp17 endp23 endp32 wait1 p21 p25 p30 wait5 endp25 wait1 p17 wait5 endp17 wait1 p11 wait5 endp11 endp21 endp30 wait1 p1 p20 p29 wait5 endp1 endp20 endp29 wait1 p6 p21 p30 wait6 p1 wait1 endp6
3,endp36 wait1 p32 wait5 endp32 wait1 p33 wait5 endp33 wait1 p29 wait5 endp29 wait1 p28 wait11 endp28 wait1 p20 p23 p26 wait11 endp20 endp23 endp26 wait1 p21 p24 wait11 endp21 endp24 wait1 p29 wait11 endp29 wait1 p14 p21 p23 wait11 endp14 wait1 p15 p21 p23 wait11 endp15 endp21 endp23 wait1 p16 p20 p24 wait2 endp24 wait1 p23 wait1 p21 wait2 endp21 p23 wait5 endp16 endp20 endp23 wait1 p4 wait6 p23
4,endp5 endp12 wait3 p15 wait3 p9 endp15 wait3 endp9 p12 wait3 endp12 wait3 p3 p17 wait3 endp17 p21 wait3 p1 endp21 p22 wait1 endp3 wait4 endp22 wait1 p17 wait1 endp1 wait2 p0 endp17 p24 wait3 p17 wait2 endp24 wait1 endp17 wait1 endp0 wait2 p10 p25 wait3 p22 wait3 endp22 wait2 p20 wait3 endp20 wait1 p18 wait3 endp10 endp18 p20 wait3 endp20 wait1 endp25 wait1 p23 wait3 endp23 wait1 p17 wait3
