# Tokenize and save data

Data borrowed from here: https://github.com/mcleavey/musical-neural-net/  
http://www.christinemcleavey.com/files/notewise_piano_solo.tar.gz  
http://www.christinemcleavey.com/files/jazz.tar.gz  
http://www.christinemcleavey.com/files/notewise_chamber.tar.gz  

This notebook combines the implementations from:
https://github.com/fastai/course-v3/blob/master/nbs/dl1/lesson3-imdb.ipynb  
https://github.com/mcleavey/musical-neural-net/blob/master/train.py

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from fastai.text import *

## Preparing the data

In [3]:
class MusicTokenizer():
    """Tokenizer that splits pre-encoded music text on single spaces.

    No lower-casing, special-token insertion or other rules are applied here:
    every space-separated piece of the input becomes one token. An instance is
    passed as `tokenizer=` to `TokenizeProcessor` elsewhere in this notebook.
    """
    def __init__(self):
        super().__init__()
        # Worker count handed to `ProcessPoolExecutor` in `process_all`.
        self.n_cpus = num_cpus()

    def process_text(self, t:str) -> List[str]:
        "Split a single text `t` into tokens on the space character."
        # Note: unlike `str.split()`, `split(" ")` yields empty-string tokens
        # for consecutive spaces and does not split on tabs/newlines.
        return t.split(" ")

    def _process_all_1(self, texts:Collection[str]) -> List[List[str]]:
        "Tokenize every text in `texts` in the current process."
        return [self.process_text(t) for t in texts]

    def process_all(self, texts:Collection[str]) -> List[List[str]]:
        "Process a list of `texts`, returning one token list per text, in input order."
        if self.n_cpus <= 1: return self._process_all_1(texts)
        # Local import keeps this block self-contained.
        from itertools import chain
        with ProcessPoolExecutor(self.n_cpus) as e:
            # `e.map` yields one list-of-token-lists per partition, in order.
            # Flatten them in a single linear pass; `sum(..., [])` would copy
            # the growing result list once per partition.
            per_worker = e.map(self._process_all_1, partition_by_cores(texts, self.n_cpus))
            return list(chain.from_iterable(per_worker))


In [4]:
# Dataset folder; it is later passed to `TextList.from_folder` / `TextLMDataBunch.load`.
path = Path('../data/midi/transcribedc_v1/hooktheory/')
# Show at most the first five entries of the folder.
path.ls()[:5]

[PosixPath('../data/midi/transcribedc_v1/hooktheory/tmp'),
 PosixPath('../data/midi/transcribedc_v1/hooktheory/models'),
 PosixPath('../data/midi/transcribedc_v1/hooktheory/pianoroll')]

In [5]:
# Batch size, passed as `bs=` when the data bunch is loaded or built.
bs=32

In [6]:
# Load a previously saved data bunch if `tmp/itos.pkl` exists under `path`;
# otherwise build one from the files under `path` and save it.
# NOTE(review): presumably `tmp/itos.pkl` is written by `data.save('tmp')` below — confirm.
if (path/'tmp/itos.pkl').exists():
# if False:  # alternative condition left by the author; using it would always take the rebuild branch
    data = TextLMDataBunch.load(path, bs=bs)
else:
    # Processors handed to `TextList`: OpenFileProcessor, tokenization with
    # MusicTokenizer (chunksize=10), then numericalization with max_vocab=500.
    p = [OpenFileProcessor(), TokenizeProcessor(tokenizer=MusicTokenizer(), chunksize=10), NumericalizeProcessor(vocab=None, max_vocab=500)]

    # Collect files recursively, split off a random 0.05 fraction with a fixed
    # seed (the recorded run shows 10955 train / 576 valid items), label for
    # language modelling and batch with `bs`.
    data = (TextList.from_folder(path, recurse=True, processor=p)
            .random_split_by_pct(0.05, seed=6)
            .label_for_lm()
            .databunch(bs=bs))
    data.save('tmp')
vocab = data.train_ds.vocab
# Cell output: training set size, validation set size, vocabulary size.
len(data.train_ds), len(data.valid_ds), len(data.train_ds.vocab.itos)

(10955, 576, 36)

In [7]:
# Inspect the first training item: the first 50 characters of its text and
# its `data` (an integer array in the recorded output).
t = data.train_ds[0][0]
t.text[:50], t.data

('xxbos || |s| nG o3 t1 i0 || |s| nF# o3 t1 i0 || |s',
 array([ 2, 12, 13, 20, ...,  9, 12, 25, 12]))

### Testing

In [8]:
# Display a sample batch as text (see the idx/text table in the output).
data.show_batch()

idx,text
0,xxbos nF o3 t1 i0 nA o3 t1 i0 nC o4 t1 i0 || |s| nF o3 t2 i0 nA o3 t2 i0 nC o4 t2 i0 || |s| nF o3 t2 i0 nA o3 t2 i0 nC o4 t2 i0 || |s| nF o3 t2 i0 nA o3 t2 i0 nC o4 t2 i0 || |e| |s| nF o3 t2 i0 nA o3 t2 i0 nC o4 t2 i0
1,i0 nC o3 t2 i0 || |e| |s| nF o2 t2 i0 nA o2 t2 i0 nC o3 t2 i0 || |s| nF o2 t2 i0 nA o2 t2 i0 nC o3 t2 i0 || |s| nF o2 t2 i0 nA o2 t2 i0 nC o3 t2 i0 || |s| nF o2 t2 i0 nA o2 t2 i0 nC o3 t2 i0 || |e| |s| nF o2 t2 i0 nA
2,o3 t2 i0 nE o3 t2 i0 nG o3 t2 i0 || |e| |s| nC o3 t2 i0 nE o3 t2 i0 nG o3 t2 i0 || |s| nC o3 t2 i0 nE o3 t2 i0 nG o3 t2 i0 || |s| nC o3 t2 i0 nE o3 t2 i0 nG o3 t2 i0 || |s| nC o3 t2 i0 nE o3 t2 i0 nG o3 t2 i0 || |e|
3,i1 nB o2 t2 i1 nD o3 t2 i1 nD o4 t2 i0 || |s| nG o2 t2 i1 nB o2 t2 i1 nD o3 t2 i1 nD o4 t2 i0 || |s| nG o2 t2 i1 nB o2 t2 i1 nD o3 t2 i1 nD o4 t2 i0 || |s| nG o2 t2 i1 nB o2 t2 i1 nD o3 t2 i1 nD o4 t2 i0 || |e| |s|
4,nA o2 t2 i1 nC o3 t2 i1 nE o3 t2 i1 nC o5 t1 i0 || |s| nA o2 t2 i1 nC o3 t2 i1 nE o3 t2 i1 nC o5 t2 i0 || |s| nA o2 t2 i1 nC o3 t2 i1 nE o3 t2 i1 nA o3 t1 i0 || |s| nA o2 t2 i1 nC o3 t2 i1 nE o3 t2 i1 nA o3 t2 i0


In [9]:
# Grab a single batch; `ob[0][0]` is converted back to text in the next cell.
ob = data.one_batch()

In [10]:
# Convert `ob[0][0]` back to a token string with the vocab and drop the
# leading 'xxbos ' marker; the trailing `txt_out` displays the result.
txt_out = data.vocab.textify(ob[0][0]).replace('xxbos ', ''); txt_out

'nF o3 t1 i0 nA o3 t1 i0 nC o4 t1 i0 || |s| nF o3 t2 i0 nA o3 t2 i0 nC o4 t2 i0 || |s| nF o3 t2 i0 nA o3 t2 i0 nC o4 t2 i0 || |s| nF o3 t2 i0 nA o3 t2 i0 nC o4 t2 i0 || |e| |s| nF o3 t2 i0 nA o3 t2 i0 nC o4 t2 i0'

In [11]:
from encode_data import *

In [19]:
# First training item (same element inspected as `t` earlier).
a = data.train_ds[0][0]

In [30]:
# Token string of that item; input to `str2seq` / `str2stream` below.
atext = a.text

In [31]:
# NOTE(review): `str2seq` presumably comes from the `encode_data` star import — confirm.
# `seq` is not used by any of the cells shown here.
seq = str2seq(atext)

In [32]:
# Decode the token string into a stream object (the 'text' output below shows
# music21 `stream.Part` elements).
# NOTE(review): `str2stream` presumably comes from the `encode_data` star import — confirm.
s = str2stream(atext)

In [33]:
# Show the stream using the 'midi' format (no output was recorded for this cell).
s.show('midi')

In [36]:
# Dump the stream as text: one `{offset} <element>` line per entry, nested by part.
s.show('text')

{0.0} <music21.stream.Part 0x7fb8186c9b70>
    {0.0} <music21.instrument.Piano Piano>
    {0.0} <music21.meter.TimeSignature 4/4>
    {0.0} <music21.key.KeySignature of no sharps or flats>
    {0.0} <music21.chord.Chord G3>
    {0.25} <music21.chord.Chord F#3>
    {0.5} <music21.chord.Chord A2>
    {0.75} <music21.chord.Chord F3>
    {1.0} <music21.chord.Chord E3>
    {1.25} <music21.chord.Chord A2>
    {1.5} <music21.chord.Chord F3>
    {1.75} <music21.chord.Chord E3>
    {2.0} <music21.chord.Chord A2>
    {2.75} <music21.chord.Chord G3>
    {3.0} <music21.chord.Chord F#3>
    {3.25} <music21.chord.Chord G3>
    {3.5} <music21.chord.Chord C#4>
{3.75} <music21.stream.Part 0x7fb818178b38>
    {0.0} <music21.instrument.Piano Piano>
    {0.0} <music21.meter.TimeSignature 4/4>
    {0.0} <music21.key.KeySignature of no sharps or flats>
    {0.0} <music21.chord.Chord E3 G3 B3>
    {4.0} <music21.chord.Chord E3 G3 A3 C4>
    {6.0} <music21.chord.Chord E3 G3 B3>
    {8.0} <music21.chord.Chord 

In [38]:
# NOTE(review): in the recorded run this call raised
# `TypeError: append() argument must be xml.etree.ElementTree.Element, not Element`;
# possibly an environment/library-version issue in the default show path — confirm before relying on it.
s.makeMeasures().show()

TypeError: append() argument must be xml.etree.ElementTree.Element, not Element

In [35]:
# NOTE(review): in the recorded run this call raised the same
# `TypeError: append() argument must be xml.etree.ElementTree.Element, not Element`
# as `s.makeMeasures().show()`; root cause is not visible here — confirm.
s.flat.show()

TypeError: append() argument must be xml.etree.ElementTree.Element, not Element