# Tokenize and save data

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from fastai.text import *

In [3]:
import pandas as pd

## Preparing the data

In [4]:
# Name of the transcription variant; it is reused as the data folder name,
# the metadata CSV file name and (below) the CSV column holding the file paths.
source_dir = 'midi_transcribe_v1_simple'
# Root folder of the data for this variant.
path = Path(f'data/midi/{source_dir}/')
# Metadata CSV for this variant.
csv_path = Path(f'data/midi/metadata/{source_dir}.csv')
# Peek at the first few entries of the data folder.
path.ls()[:5]

[PosixPath('data/midi/midi_transcribe_v1_simple/ecomp'),
 PosixPath('data/midi/midi_transcribe_v1_simple/classic_piano'),
 PosixPath('data/midi/midi_transcribe_v1_simple/cprato'),
 PosixPath('data/midi/midi_transcribe_v1_simple/freemidi'),
 PosixPath('data/midi/midi_transcribe_v1_simple/midiworld')]

### Select file data

In [5]:
# Name under which the processed databunch is saved (see `databunch.save(cache_name)`).
cache_name = 'tmp_all'

In [6]:
# Load the metadata table, then keep only the rows that have a value in the `source_dir` column.
csv = pd.read_csv(csv_path)
csv = csv.loc[csv[source_dir].notna()];

  interactivity=interactivity, compiler=compiler, result=result)


In [7]:
# Sorted, NaN-free values of the `seconds` column; display the 100 largest.
sv = np.sort(csv.seconds.values[~np.isnan(csv.seconds.values)])
sv[-100:]

array([ 1551.452991,  1556.324786,  1572.264957,  1577.820513, ...,  3012.571429,  3126.908425,  3126.908425,
       23520.961538])

In [8]:
# Column of file paths for the selected variant (turned into `Path` objects in the next cell).
files = csv[source_dir]

In [9]:
# Wrap every listed entry in a `Path` and keep only the ones that exist on disk.
flist = [p for p in map(Path, files.values) if p.exists()]

### Create databunch

In [10]:
# Batch size handed to the databunch.
bs=32

In [11]:
class MusicTokenizer():
    "Tokenizer that splits encoded music strings on single spaces, optionally across processes."
    def __init__(self):
        super().__init__()
        # Number of worker processes `process_all` may use.
        self.n_cpus = num_cpus()

    def process_text(self, t:str) -> List[str]:
        "Split one encoded string on the single-space separator."
        return t.split(" ")

    def _process_all_1(self, texts:Collection[str]) -> List[List[str]]:
        "Tokenize `texts` one after another in the current process."
        return list(map(self.process_text, texts))

    def process_all(self, texts:Collection[str]) -> List[List[str]]:
        "Process a list of `texts`, using a process pool when more than one CPU is configured."
        if self.n_cpus > 1:
            with ProcessPoolExecutor(self.n_cpus) as pool:
                chunks = partition_by_cores(texts, self.n_cpus)
                # Flatten the per-chunk results back into one list, preserving order.
                return [toks for part in pool.map(self._process_all_1, chunks) for toks in part]
        return self._process_all_1(texts)


In [12]:
def lm_join_texts(texts:Collection[str]):
    "Return a new list where every text is prefixed with `BOS` and a single space."
    prefixed = []
    for text in texts:
        prefixed.append(f'{BOS} {text}')
    return prefixed

In [13]:
class LMOpenFileProcessor(OpenFileProcessor):
    "`OpenFileProcessor` whose bulk `process` keeps the results in a plain Python list."
    # Removing numpy array conversion to fix OOM error
    def process(self, ds:Collection):
        opened = []
        for item in ds.items:
            opened.append(self.process_one(item))
        ds.items = opened

In [14]:
class LMTokenizeProcessor(PreProcessor):
    """`PreProcessor` that tokenizes the texts in `ds`.

    Every text is prefixed with `BOS` (via `lm_join_texts`) before tokenization, on both the
    bulk path (`process`) and the single-item path (`process_one`), so a single item is
    encoded the same way as the items processed in bulk.

    Args:
        ds: not used by this processor; kept in the signature.
        tokenizer: object exposing `process_all` and `_process_all_1`; defaults to `Tokenizer()`.
        chunksize: number of texts handed to `tokenizer.process_all` per call.
    """
    def __init__(self, ds:ItemList=None, tokenizer:Tokenizer=None, chunksize:int=10000):
        self.tokenizer,self.chunksize = ifnone(tokenizer, Tokenizer()),chunksize

    def process_one(self, item):
        "Tokenize a single text, adding the same `BOS` prefix that `process` adds."
        return self.tokenizer._process_all_1(lm_join_texts([item]))[0]

    def process(self, ds):
        "Replace the texts in `ds.items` with their token lists, `chunksize` texts at a time."
        ds.items = lm_join_texts(ds.items)
        tokens = []
        # Chunking bounds how many texts are passed to `process_all` in one call.
        for i in progress_bar(range(0,len(ds),self.chunksize), leave=False):
            tokens += self.tokenizer.process_all(ds.items[i:i+self.chunksize])
        ds.items = tokens

In [16]:
def numericalize(data):
    "Map every token of every item to its id; `data` is a `(stoi, items)` pair."
    token_to_id, token_lists = data
    encoded = []
    for tokens in token_lists:
        encoded.append([token_to_id[tok] for tok in tokens])
    return encoded

In [17]:
def count_tokens(tokens):
    "Count how often each token occurs across all items of `tokens`."
    counts = Counter()
    for item in tokens:
        for tok in item:
            counts[tok] += 1
    return counts
def vocab_create_parallel(tokens:Tokens, max_vocab:int, min_freq:int) -> 'Vocab':
    "Create a vocabulary from a set of `tokens`, counting token frequencies in a process pool."
    workers = num_cpus()
    with ProcessPoolExecutor(workers) as pool:
        partial_counts = pool.map(count_tokens, partition_by_cores(tokens, workers))
        # Merge the per-partition counters into one.
        freq = sum(partial_counts, Counter())

    # Of the `max_vocab` most common tokens, keep those seen strictly more than `min_freq` times.
    itos = [tok for tok,count in freq.most_common(max_vocab) if count > min_freq]
    # Move the special tokens to the front, in their declared order and without duplicates.
    for special in reversed(defaults.text_spec_tok):
        if special in itos: itos.remove(special)
        itos.insert(0, special)
    return Vocab(itos)
# Monkey-patch: make `Vocab.create` point at the process-pool implementation.
Vocab.create = vocab_create_parallel

In [18]:
class LMNumericalizeProcessor(PreProcessor):
    "`PreProcessor` that numericalizes the tokens in `ds`, using a process pool for the bulk path."
    def __init__(self, ds:ItemList=None, vocab:Vocab=None, max_vocab:int=60000, min_freq:int=2):
        # When no vocab is passed, fall back to the one carried by `ds` (if any).
        fallback = ds.vocab if ds is not None else None
        self.vocab = ifnone(vocab, fallback)
        self.max_vocab = max_vocab
        self.min_freq = min_freq

    def process_one(self,item):
        "Numericalize a single tokenized item into an int64 array."
        ids = self.vocab.numericalize(item)
        return np.array(ids, dtype=np.int64)

    def process(self, ds):
        "Create the vocab from `ds.items` if none is set, then replace the items with their ids."
        if self.vocab is None:
            self.vocab = vocab_create_parallel(ds.items, self.max_vocab, self.min_freq)
        ds.vocab = self.vocab

        workers = num_cpus()
        chunks = partition_by_cores(ds.items, workers)
        # One `stoi` copy per chunk, paired up as the `(stoi, items)` argument of `numericalize`.
        jobs = [(ds.vocab.stoi.copy(), chunk) for chunk in chunks]
        with ProcessPoolExecutor(workers) as pool:
            encoded = sum(pool.map(numericalize, jobs), [])
        ds.items = array(encoded)

In [19]:
# Processors for the `TextList`: file opening, tokenizing with `MusicTokenizer`, numericalizing.
# The vocab is left as None so that it gets built from the data (capped at `max_vocab` tokens).
ps = [LMOpenFileProcessor(), 
     LMTokenizeProcessor(tokenizer=MusicTokenizer(), chunksize=num_cpus()*20),
     LMNumericalizeProcessor(vocab=None, max_vocab=500)]

In [24]:
# Item list over the files found on disk, with the custom processors attached.
data = TextList(items=flist, path=path, processor=ps)

In [25]:
# Random train/validation split (0.05 passed as the split fraction, fixed seed for repeatability).
data_split = data.random_split_by_pct(0.05, seed=6)

In [26]:
# Label the split items for language modelling.
data_lm = data_split.label_for_lm()

In [None]:
# Build the databunch first, then cache it. The save call used to sit in a cell
# *before* the one that creates `databunch`, which fails on a fresh kernel.
databunch = data_lm.databunch(bs=bs)
databunch.save(cache_name)

In [21]:
# Draw a single batch as a sanity check of the pipeline.
databunch.one_batch()

(tensor([[ 8, 16,  8,  ..., 13,  8, 12],
         [ 8, 20,  8,  ...,  9, 10, 40],
         [60,  8, 54,  ..., 87,  8, 10],
         ...,
         [ 8, 47,  8,  ..., 57,  8, 65],
         [ 8, 62,  8,  ...,  8, 21,  8],
         [19,  8, 44,  ..., 28,  8, 36]]),
 tensor([[16,  8, 10,  ...,  8, 12,  8],
         [20,  8, 19,  ..., 10, 40,  8],
         [ 8, 54,  9,  ...,  8, 10, 53],
         ...,
         [47,  8, 32,  ...,  8, 65,  8],
         [62,  8, 67,  ..., 21,  8, 12],
         [ 8, 44,  8,  ...,  8, 36,  8]]))

In [22]:
# Keep a handle on the vocabulary, then report the train size, validation size and vocab size.
vocab = databunch.train_ds.vocab
len(databunch.train_ds), len(databunch.valid_ds), len(databunch.train_ds.vocab.itos)

(28151, 1481, 135)

In [23]:
# Show the id -> token list of the training vocabulary.
databunch.train_ds.vocab.itos

['xxunk',
 'xxpad',
 'xxbos',
 'xxfld',
 'xxmaj',
 'xxup',
 'xxrep',
 'xxwrep',
 't2',
 't1',
 '||',
 'nG4',
 'nF4',
 'nD4',
 'nC5',
 'nD5',
 'nA4',
 'nC4',
 'nB-4',
 'nF5',
 'nE-4',
 'nE4',
 'nG5',
 'nG3',
 'nA3',
 'nB-3',
 'nE-5',
 'nE5',
 'nB4',
 'nF#4',
 'nG#4',
 'nB3',
 'nF3',
 'nA5',
 'nB-5',
 'nC#4',
 'nC#5',
 'nB-2',
 'nC6',
 'nD3',
 'nG#3',
 'nF#5',
 'nC3',
 'nD6',
 'nG#5',
 'nF#2',
 'nF#3',
 'nE3',
 'nE-3',
 'nB5',
 'nG2',
 'nC#3',
 'nA2',
 'nF2',
 'nF6',
 'nG#2',
 'nC#6',
 'nE-6',
 'nE2',
 'nB2',
 'nE6',
 'nD2',
 'nG6',
 'nC2',
 'nE-2',
 'nF#6',
 'nA6',
 'nG#6',
 'nB1',
 'nC7',
 'nB-6',
 'nC#2',
 'nB6',
 'nD7',
 'nC#7',
 'nE-7',
 'nB-1',
 'nE7',
 'nF7',
 'nA1',
 'nG1',
 'nG#1',
 'nG7',
 'nF#7',
 'nF1',
 'nF#1',
 'nA7',
 'nG#7',
 'nE1',
 'nB-7',
 'nE-1',
 'nB7',
 'nC8',
 'nD1',
 'nD8',
 'nC#8',
 'nC#1',
 'nC1',
 'nE-8',
 'nF8',
 'nE8',
 'nB-0',
 'nB0',
 'nG8',
 'nF#8',
 'nA0',
 'nA8',
 'nG#8',
 'nB-8',
 'nC',
 'nG0',
 'nE0',
 'nG#0',
 'nE-0',
 'nC0',
 'nB',
 'nB-',
 'nE9',
 '

In [9]:
# Reuse the cached databunch when `path/cache_name` already holds one; otherwise build it
# with the stock fastai processors and cache it under the same name.
if (path/f'{cache_name}/itos.pkl').exists():
    # Load from the same cache folder that is checked above and written below;
    # without `cache_name`, `load` would fall back to its default cache folder.
    data = TextLMDataBunch.load(path, cache_name=cache_name, bs=bs)
else:
    p = [OpenFileProcessor(),
         TokenizeProcessor(tokenizer=MusicTokenizer(), chunksize=10),
         NumericalizeProcessor(vocab=None, max_vocab=500)]

    data = (TextList.from_folder(path, recurse=True, processor=p)
            .random_split_by_pct(0.05, seed=6)
            .label_for_lm()
            .databunch(bs=bs))
    data.save(cache_name)
vocab = data.train_ds.vocab
len(data.train_ds), len(data.valid_ds), len(data.train_ds.vocab.itos)

MemoryError: 

In [10]:
%debug

> [0;32m/home/ubuntu/fastai/fastai/core.py[0m(262)[0;36marray[0;34m()[0m
[0;32m    260 [0;31m    [0;32mif[0m [0mnp[0m[0;34m.[0m[0mint_[0m[0;34m==[0m[0mnp[0m[0;34m.[0m[0mint32[0m [0;32mand[0m [0mdtype[0m [0;32mis[0m [0;32mNone[0m [0;32mand[0m [0mis_listy[0m[0;34m([0m[0ma[0m[0;34m)[0m [0;32mand[0m [0mlen[0m[0;34m([0m[0ma[0m[0;34m)[0m [0;32mand[0m [0misinstance[0m[0;34m([0m[0ma[0m[0;34m[[0m[0;36m0[0m[0;34m][0m[0;34m,[0m[0mint[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    261 [0;31m        [0mdtype[0m[0;34m=[0m[0mnp[0m[0;34m.[0m[0mint64[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 262 [0;31m    [0;32mreturn[0m [0mnp[0m[0;34m.[0m[0marray[0m[0;34m([0m[0ma[0m[0;34m,[0m [0mdtype[0m[0;34m=[0m[0mdtype[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    263 [0;31m[0;34m[0m[0m
[0m[0;32m    264 [0;31m[0;32mclass[0m [0mEmptyLa

In [None]:
# First training item: show the start of its text alongside its underlying data.
t = data.train_ds[0][0]
t.text[:50], t.data

### Testing

In [8]:
# Display a few rows of a batch decoded back to text.
data.show_batch()

idx,text
0,xxbos nF o3 t1 i0 nA o3 t1 i0 nC o4 t1 i0 || |s| nF o3 t2 i0 nA o3 t2 i0 nC o4 t2 i0 || |s| nF o3 t2 i0 nA o3 t2 i0 nC o4 t2 i0 || |s| nF o3 t2 i0 nA o3 t2 i0 nC o4 t2 i0 || |e| |s| nF o3 t2 i0 nA o3 t2 i0 nC o4 t2 i0
1,i0 nC o3 t2 i0 || |e| |s| nF o2 t2 i0 nA o2 t2 i0 nC o3 t2 i0 || |s| nF o2 t2 i0 nA o2 t2 i0 nC o3 t2 i0 || |s| nF o2 t2 i0 nA o2 t2 i0 nC o3 t2 i0 || |s| nF o2 t2 i0 nA o2 t2 i0 nC o3 t2 i0 || |e| |s| nF o2 t2 i0 nA
2,o3 t2 i0 nE o3 t2 i0 nG o3 t2 i0 || |e| |s| nC o3 t2 i0 nE o3 t2 i0 nG o3 t2 i0 || |s| nC o3 t2 i0 nE o3 t2 i0 nG o3 t2 i0 || |s| nC o3 t2 i0 nE o3 t2 i0 nG o3 t2 i0 || |s| nC o3 t2 i0 nE o3 t2 i0 nG o3 t2 i0 || |e|
3,i1 nB o2 t2 i1 nD o3 t2 i1 nD o4 t2 i0 || |s| nG o2 t2 i1 nB o2 t2 i1 nD o3 t2 i1 nD o4 t2 i0 || |s| nG o2 t2 i1 nB o2 t2 i1 nD o3 t2 i1 nD o4 t2 i0 || |s| nG o2 t2 i1 nB o2 t2 i1 nD o3 t2 i1 nD o4 t2 i0 || |e| |s|
4,nA o2 t2 i1 nC o3 t2 i1 nE o3 t2 i1 nC o5 t1 i0 || |s| nA o2 t2 i1 nC o3 t2 i1 nE o3 t2 i1 nC o5 t2 i0 || |s| nA o2 t2 i1 nC o3 t2 i1 nE o3 t2 i1 nA o3 t1 i0 || |s| nA o2 t2 i1 nC o3 t2 i1 nE o3 t2 i1 nA o3 t2 i0


In [9]:
# Grab a single batch for the decoding checks below.
ob = data.one_batch()

In [10]:
# Decode the first row of the batch's first element back to text and strip the BOS marker.
txt_out = data.vocab.textify(ob[0][0]).replace('xxbos ', '')
txt_out

'nF o3 t1 i0 nA o3 t1 i0 nC o4 t1 i0 || |s| nF o3 t2 i0 nA o3 t2 i0 nC o4 t2 i0 || |s| nF o3 t2 i0 nA o3 t2 i0 nC o4 t2 i0 || |s| nF o3 t2 i0 nA o3 t2 i0 nC o4 t2 i0 || |e| |s| nF o3 t2 i0 nA o3 t2 i0 nC o4 t2 i0'

In [11]:
from encode_data import *

In [19]:
# First item of the training set.
a = data.train_ds[0][0]

In [30]:
# Encoded text of that item.
atext = a.text

In [31]:
# Decode the text with `str2seq` (presumably provided by `encode_data`; return type not shown here).
seq = str2seq(atext)

In [32]:
# Decode the text with `str2stream`; judging by the output of `s.show('text')`, this is a music21 stream.
s = str2stream(atext)

In [33]:
# Render the decoded stream in the 'midi' format.
s.show('midi')

In [36]:
# Print the decoded stream's contents as text.
s.show('text')

{0.0} <music21.stream.Part 0x7fb8186c9b70>
    {0.0} <music21.instrument.Piano Piano>
    {0.0} <music21.meter.TimeSignature 4/4>
    {0.0} <music21.key.KeySignature of no sharps or flats>
    {0.0} <music21.chord.Chord G3>
    {0.25} <music21.chord.Chord F#3>
    {0.5} <music21.chord.Chord A2>
    {0.75} <music21.chord.Chord F3>
    {1.0} <music21.chord.Chord E3>
    {1.25} <music21.chord.Chord A2>
    {1.5} <music21.chord.Chord F3>
    {1.75} <music21.chord.Chord E3>
    {2.0} <music21.chord.Chord A2>
    {2.75} <music21.chord.Chord G3>
    {3.0} <music21.chord.Chord F#3>
    {3.25} <music21.chord.Chord G3>
    {3.5} <music21.chord.Chord C#4>
{3.75} <music21.stream.Part 0x7fb818178b38>
    {0.0} <music21.instrument.Piano Piano>
    {0.0} <music21.meter.TimeSignature 4/4>
    {0.0} <music21.key.KeySignature of no sharps or flats>
    {0.0} <music21.chord.Chord E3 G3 B3>
    {4.0} <music21.chord.Chord E3 G3 A3 C4>
    {6.0} <music21.chord.Chord E3 G3 B3>
    {8.0} <music21.chord.Chord 

In [38]:
# NOTE: in the recorded run this call failed with a TypeError about appending an XML element.
s.makeMeasures().show()

TypeError: append() argument must be xml.etree.ElementTree.Element, not Element

In [35]:
# NOTE: in the recorded run this call failed with the same TypeError about appending an XML element.
s.flat.show()

TypeError: append() argument must be xml.etree.ElementTree.Element, not Element