# Tokenize and save data

In [35]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [36]:
from fastai.text import *

In [37]:
import pandas as pd
from fastai_data import *

## Preparing the data

In [8]:
# `source_dir` names the transcription variant. It is reused as the data
# sub-directory, as the CSV file stem, and (in later cells) as the CSV column
# that holds the file paths.
# NOTE(review): the recorded output of this cell lists
# data/midi/midi_transcribe_v2_shortbin/..., not ..._shortcont, so the saved
# output appears to come from an earlier run with a different `source_dir`.
source_dir = 'midi_transcribe_v2_shortcont'
path = Path(f'data/midi/{source_dir}/')
csv_path = path/f'{source_dir}.csv'
path.ls()

[PosixPath('data/midi/midi_transcribe_v2_shortbin/midi_transcribe_v2_shortbin.csv'),
 PosixPath('data/midi/midi_transcribe_v2_shortbin/ecomp'),
 PosixPath('data/midi/midi_transcribe_v2_shortbin/classic_piano'),
 PosixPath('data/midi/midi_transcribe_v2_shortbin/cprato'),
 PosixPath('data/midi/midi_transcribe_v2_shortbin/freemidi'),
 PosixPath('data/midi/midi_transcribe_v2_shortbin/midiworld'),
 PosixPath('data/midi/midi_transcribe_v2_shortbin/hooktheory')]

In [9]:
# Load the index CSV and keep only the rows that have a value in the
# `source_dir` column (rows with a missing entry there are dropped).
csv = pd.read_csv(csv_path).loc[lambda frame: frame[source_dir].notna()]

In [10]:
def create_databunch(files, cache_name, vocab=None, batch_size=32, load_cached=True):
    """Return a language-model databunch together with its training vocab.

    When `load_cached` is true and `path/<cache_name>/itos.pkl` exists, the
    databunch is loaded with `TextLMDataBunch.load`. Otherwise it is built
    from `files` and saved under `cache_name`.

    Args:
        files: items handed to `TextList` on the build path.
        cache_name: cache name passed to `load`/`save`; the cache check looks
            for `itos.pkl` inside that folder under the module-level `path`.
        vocab: forwarded to `LMNumericalizeProcessor`. It is only used on the
            build path, so it has no effect when a cache is loaded.
        batch_size: passed as `bs` when creating or loading the databunch.
        load_cached: pass False to skip the cache lookup and always rebuild.

    Returns:
        Tuple `(data, vocab)` where `vocab` is `data.train_ds.vocab`.
    """
    # Same short-circuit order as a plain `load_cached and exists()` check:
    # the filesystem is only consulted when `load_cached` is true.
    if not load_cached or not (path/f'{cache_name}/itos.pkl').exists():
        processors = [
            LMOpenFileProcessor(),
            LMTokenizeProcessor(tokenizer=MusicTokenizer(), chunksize=num_cpus()*20),
            LMNumericalizeProcessor(vocab=vocab, max_vocab=500),
        ]
        text_list = TextList(items=files, path=path, processor=processors)
        # Random split with pct 0.01 and a fixed seed so the split is repeatable.
        split_lists = text_list.random_split_by_pct(0.01, seed=6)
        bunch = split_lists.label_for_lm().databunch(bs=batch_size)
        bunch.save(cache_name)
    else:
        bunch = TextLMDataBunch.load(path, bs=batch_size, cache_name=cache_name)
        # Swap the first processor of the validation inputs for a music tokenizer.
        bunch.valid_ds.x.processor[0] = TokenizeProcessor(tokenizer=MusicTokenizer())
    return bunch, bunch.train_ds.vocab

In [11]:
def get_files(csv):
    """Return the paths listed in `csv[source_dir]` that exist on disk.

    Every value of the `source_dir` column (`source_dir` is a module-level
    name) is wrapped in `Path`; entries whose path does not exist are dropped
    and the original order is kept.
    """
    candidates = map(Path, csv[source_dir].values)
    return [candidate for candidate in candidates if candidate.exists()]

### Create Pop Dataset

In [12]:
# Pop corpus: keep only the rows whose `source` is one of the pop collections,
# then resolve them to files that exist on disk.
pop_csv = csv[csv['source'].isin(['midiworld', 'hooktheory', 'freemidi', 'cprato'])]
pop_files = get_files(pop_csv)

In [13]:
# Build the pop databunch, or load it if a 'tmp_pop' cache already exists.
# `pop_vocab` is the vocab of its training set.
pop_data, pop_vocab = create_databunch(pop_files, cache_name='tmp_pop')

Numericalizing


### Create Classical Dataset

In [14]:
# csv_filter = csv.loc[csv[f'{source_dir}_timesteps'] < 20000]
# Classical corpus: keep the rows whose `source` is a classical collection.
# The data directory listing shows a `classic_piano` folder (not
# `classical_piano`), and every other source name used in this notebook matches
# its folder name, so the original 'classical_piano' spelling presumably
# matched nothing and silently dropped that corpus. Both spellings are accepted
# so the filter works whichever value the CSV actually holds.
# TODO confirm against csv.source.unique().
clc_csv = csv.loc[csv.source.isin(['classic_piano', 'classical_piano', 'ecomp'])]
clc_files = get_files(clc_csv)

Note: we reuse `pop_vocab` for the classical data because we ultimately want to generate pop music, so both datasets must be numericalized with the same vocabulary.

In [15]:
# load_cached=False forces a rebuild: create_databunch only applies `vocab`
# when it builds the databunch, so loading an existing 'tmp_clc' cache would
# not use pop_vocab.
clc_data, clc_vocab = create_databunch(clc_files, cache_name='tmp_clc', vocab=pop_vocab, load_cached=False)

Numericalizing


### Testing

In [16]:
# The inspection cells below operate on `data`; point it at the pop databunch.
data = pop_data

In [17]:
# Display a sample batch as an idx/text table to eyeball the tokenization.
data.show_batch()

idx,text
0,nG3 t-2 nB3 t-2 || nE3 t-2 nG3 t-2 nB3 t-2 || nE3 t-2 nG3 t-2 nB3 t-2 || nE3 t-2 nG3 t-2 nB3 t-2 || nE3 t-2 nG3 t-2 nB3 t-2 || nE3 t-2 nG3 t-2 nB3 t-2 || nE3 t-2 nG3 t-2 nB3 t-2 || nE3 t-2 nG3 t-2 nB3 t-2 || nE3 t-2 nG3 t-2 nB3 t-2 || nE3 t-2 nG3 t-2 nB3 t-2 || nE3 t-2
1,nG3 t-2 nC4 t-2 || nC3 t-2 nE3 t-2 nG3 t-2 nE5 t4 || nC3 t-2 nE3 t-2 nG3 t-2 nE5 t-2 || nC3 t-2 nE3 t-2 nG3 t-2 nE5 t-2 || nC3 t-2 nE3 t-2 nG3 t-2 nE5 t-2 || nF3 t16 nA3 t16 nC4 t16 nF4 t2 || nF3 t-2 nA3 t-2 nC4 t-2 nF4 t-2 || nF3 t-2 nA3 t-2 nC4 t-2 nF4 t2 || nF3 t-2
2,t-2 nE4 t-2 nA4 t2 || nA3 t-2 nC4 t-2 nE4 t-2 nA4 t-2 || nA3 t-2 nC4 t-2 nE4 t-2 || nA3 t-2 nC4 t-2 nE4 t-2 nG4 t1 || nF3 t8 nA3 t8 nC4 t8 || nF3 t-2 nA3 t-2 nC4 t-2 || nF3 t-2 nA3 t-2 nC4 t-2 nE4 t1 || nF3 t-2 nA3 t-2 nC4 t-2 || nF3 t-2 nA3 t-2 nC4 t-2 nA4 t2 ||
3,t-2 nG3 t-2 nC4 t-2 nE5 t-2 || nE3 t-2 nG3 t-2 nC4 t-2 nE5 t-2 || nE3 t-2 nG3 t-2 nC4 t-2 nE5 t-2 || nC3 t10 nE3 t10 nG3 t10 nC5 t4 || nC3 t-2 nE3 t-2 nG3 t-2 nC5 t-2 || nC3 t-2 nE3 t-2 nG3 t-2 nC5 t-2 || nC3 t-2 nE3 t-2 nG3 t-2 nC5 t-2 || nC3 t-2 nE3 t-2 nG3 t-2 nD5 t2
4,t-2 || nG3 t-2 nB3 t-2 nD4 t-2 nF4 t-2 || nG3 t-2 nB3 t-2 nD4 t-2 nF4 t-2 || nG3 t-2 nB3 t-2 nD4 t-2 nF4 t-2 || nG3 t-2 nB3 t-2 nD4 t-2 nF4 t-2 || nG3 t-2 nB3 t-2 nD4 t-2 nF4 t-2 || nD3 t8 nF#3 t8 nA3 t8 nC4 t8 || nD3 t-2 nF#3 t-2 nA3 t-2 nC4 t-2 || nD3 t-2 nF#3 t-2 nA3


In [18]:
# Pull a single batch from the databunch for manual inspection.
ob = data.one_batch()

In [19]:
# Turn the first item of the batch's first element back into text and strip
# every 'xxbos ' marker from it.
txt_out = data.vocab.textify(ob[0][0]).replace('xxbos ', '')
txt_out

'nG3 t-2 nB3 t-2 || nE3 t-2 nG3 t-2 nB3 t-2 || nE3 t-2 nG3 t-2 nB3 t-2 || nE3 t8 nG3 t8 nA3 t8 nC4 t8 || nE3 t-2 nG3 t-2 nA3 t-2 nC4 t-2 || nE3 t-2 nG3 t-2 nA3 t-2 nC4 t-2 || nE3 t-2 nG3 t-2 nA3 t-2 nC4 t-2 || nE3 t-2 nG3 t-2 nA3 t-2 nC4 t-2 || nE3 t-2 nG3 t-2 nA3 t-2'

In [20]:
from encode_data import *

In [21]:
# Text of the first element of the first training item.
one_text = data.train_ds[0][0].text

In [22]:
# Parse the token text with str2seq; the result displays as a list of
# per-step note lists.
seq = str2seq(one_text)
seq

[[],
 [G3t1],
 [F#3t1],
 [A2t1],
 [F3t1],
 [E3t1],
 [A2t1],
 [F3t1],
 [E3t1],
 [A2t3],
 [A2t-2],
 [A2t-2],
 [G3t1],
 [F#3t1],
 [G3t1],
 [C#4t1],
 [E3t16, G3t16, B3t16],
 [E3t-2, G3t-2, B3t-2],
 [E3t-2, G3t-2, B3t-2],
 [E3t-2, G3t-2, B3t-2],
 [E3t-2, G3t-2, B3t-2],
 [E3t-2, G3t-2, B3t-2],
 [E3t-2, G3t-2, B3t-2],
 [E3t-2, G3t-2, B3t-2],
 [E3t-2, G3t-2, B3t-2],
 [E3t-2, G3t-2, B3t-2],
 [E3t-2, G3t-2, B3t-2],
 [E3t-2, G3t-2, B3t-2],
 [E3t-2, G3t-2, B3t-2],
 [E3t-2, G3t-2, B3t-2],
 [E3t-2, G3t-2, B3t-2],
 [E3t-2, G3t-2, B3t-2],
 [E3t8, G3t8, A3t8, C4t8],
 [E3t-2, G3t-2, A3t-2, C4t-2],
 [E3t-2, G3t-2, A3t-2, C4t-2],
 [E3t-2, G3t-2, A3t-2, C4t-2],
 [E3t-2, G3t-2, A3t-2, C4t-2],
 [E3t-2, G3t-2, A3t-2, C4t-2],
 [E3t-2, G3t-2, A3t-2, C4t-2],
 [E3t-2, G3t-2, A3t-2, C4t-2],
 [E3t8, G3t8, B3t8],
 [E3t-2, G3t-2, B3t-2],
 [E3t-2, G3t-2, B3t-2],
 [E3t-2, G3t-2, B3t-2],
 [E3t-2, G3t-2, B3t-2],
 [E3t-2, G3t-2, B3t-2],
 [E3t-2, G3t-2, B3t-2],
 [E3t-2, G3t-2, B3t-2],
 [G2t24, B-2t24, D3t24],
 [G2t-2, B-2t

In [31]:
# Shape of the array form of `seq`; the recorded output is (129, 1, 127).
seq2numpy(seq).shape

(129, 1, 127)

In [32]:
# Convert the token text to a stream object (the text dump below shows it as
# a music21 Part).
s = str2stream(one_text)

In [33]:
# Show the stream in its 'midi' format.
s.show('midi')

In [34]:
# Show the stream contents as text, one element per line.
s.show('text')

{0.0} <music21.stream.Part 0x7fcfea0a44e0>
    {0.0} <music21.instrument.Piano Piano>
    {0.0} <music21.meter.TimeSignature 4/4>
    {0.0} <music21.tempo.MetronomeMark animato Quarter=120>
    {0.0} <music21.key.KeySignature of no sharps or flats>


In [26]:
# NOTE(review): the recorded output of this cell is
# "TypeError: append() argument must be xml.etree.ElementTree.Element, not Element",
# so the default show() currently fails here. Fix the environment or remove
# this cell so the notebook survives Restart & Run All.
s.flat.show()

TypeError: append() argument must be xml.etree.ElementTree.Element, not Element