# Tokenize and save data

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from fastai.text import *

In [3]:
import pandas as pd
from fastai_data import *

## Preparing the data

In [4]:
# Dataset version tag; every path below lives under data/midi/<version>.
version = 'v4'
data_path = Path('data/midi')
version_path = data_path/version

In [16]:
# Sub-directory (relative to version_path) holding the transcribed text files.
# The same string is used further down as a column name of the metadata CSV.
source_dir = 'midi_transcribe/longdur'
out_path = version_path/source_dir
# Plain string literal: the original f-prefix had no placeholders to fill.
csv_path = out_path/'midi_transcribe.csv'
# Show what is available under the version directory.
version_path.ls()

[PosixPath('data/midi/v4/metadata'),
 PosixPath('data/midi/v4/midi_sources'),
 PosixPath('data/midi/v4/midi_transcribe'),
 PosixPath('data/midi/v4/midi_npz'),
 PosixPath('data/midi/v4/midi_transform')]

In [17]:
# Load the metadata table, keeping only the rows that have an entry in the
# `source_dir` column.
csv = pd.read_csv(csv_path).dropna(subset=[source_dir])

In [21]:
def create_databunch(files, cache_name, vocab=None, batch_size=32, load_cached=True):
    """Build a language-model databunch for `files`, or reload it from cache.

    Args:
        files: paths of the text files to tokenize.
        cache_name: sub-directory of `out_path` the databunch is saved to and
            loaded from.
        vocab: optional vocabulary handed to the numericalize processor when the
            databunch is built from scratch. It is not used when a cached
            databunch is loaded.
        batch_size: batch size of the resulting databunch.
        load_cached: when True and `<out_path>/<cache_name>/itos.pkl` exists,
            load the cached databunch instead of re-tokenizing.

    Returns:
        A `(data, vocab)` tuple where `vocab` is the training set's vocabulary.
    """
    if load_cached and (out_path/f'{cache_name}/itos.pkl').exists():
        # Load from `out_path`: it is the directory the existence check above
        # inspects and the `path` the databunch below is built (and saved) with.
        # The original passed `path`, a name this notebook never defines.
        data = TextLMDataBunch.load(out_path, bs=batch_size, cache_name=cache_name)
        # Re-attach the music tokenizer to the validation set's first processor.
        # NOTE(review): presumably the cache does not restore the custom
        # tokenizer — confirm.
        data.valid_ds.x.processor[0] = TokenizeProcessor(tokenizer=MusicTokenizer())
    else:
        # Processing pipeline: open files -> tokenize -> numericalize.
        ps = [LMOpenFileProcessor(),
              LMTokenizeProcessor(tokenizer=MusicTokenizer(), chunksize=num_cpus()*20),
              LMNumericalizeProcessor(vocab=vocab, max_vocab=500)]

        data = (TextList(items=files, path=out_path, processor=ps)
                .random_split_by_pct(0.01, seed=6)  # 1% validation split, fixed seed
                .label_for_lm()
                .databunch(bs=batch_size))
        data.save(cache_name)
    vocab = data.train_ds.vocab
    return data, vocab

In [22]:
def get_files(csv):
    """Return the files listed in `csv[source_dir]` that exist under `version_path`.

    Each value of the `source_dir` column is joined onto `version_path`; paths
    that are missing on disk are left out of the returned list.
    """
    existing = []
    for rel_path in csv[source_dir].values:
        candidate = Path(version_path/rel_path)
        if candidate.exists():
            existing.append(candidate)
    return existing

### Create All Dataset

In [None]:
# Databunch over every file in the metadata table. Its vocabulary (`all_vocab`)
# is passed to the subset databunches created further down.
all_files = get_files(csv)
all_data, all_vocab = create_databunch(all_files, cache_name='tmp/all')

### Create Hooktheory Dataset

In [9]:
# Keep only rows whose `source` is hooktheory or cprato.
hook_csv = csv.loc[csv.source.isin(['hooktheory', 'cprato'])]
hook_files = get_files(hook_csv)

In [10]:
# Pass `all_vocab` so a fresh build numericalizes with the full-dataset vocabulary.
hook_data, hook_vocab = create_databunch(hook_files, cache_name='tmp/hook', vocab=all_vocab)

Numericalizing


### Create Pop Dataset

In [11]:
# Keep only rows whose `source` is midiworld, freemidi or wikifonia.
pop_csv = csv.loc[csv.source.isin(['midiworld', 'freemidi', 'wikifonia'])]
pop_files = get_files(pop_csv)

In [12]:
# Pass `all_vocab` so a fresh build numericalizes with the full-dataset vocabulary.
pop_data, pop_vocab = create_databunch(pop_files, cache_name='tmp/pop', vocab=all_vocab)

Numericalizing


### Create Classical Dataset

In [13]:
# Disabled filter, kept for reference: restrict to rows under 20000 timesteps.
# csv_filter = csv.loc[csv[f'{source_dir}_timesteps'] < 20000]
# Keep only rows whose `source` is classical_piano or ecomp.
clc_csv = csv.loc[csv.source.isin(['classical_piano', 'ecomp'])]
clc_files = get_files(clc_csv)

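Note: we pass `all_vocab` (the vocabulary built from the full dataset, not a separate classical one) so the classical data shares token ids with the pop data, because we ultimately want to generate pop music.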

In [14]:
# Uses `all_vocab`, exactly as the hook and pop databunches do.
clc_data, clc_vocab = create_databunch(clc_files, cache_name='tmp/clc', vocab=all_vocab)

Numericalizing


### Testing

In [12]:
# Run the sanity checks below against the pop databunch.
data = pop_data

In [13]:
# Display a few rows of decoded token text from the databunch.
data.show_batch()

idx,text
0,t-1 || nE o3 t-2 nG o3 t-2 nB o3 t-2 || nE o3 t-2 nG o3 t-2 nB o3 t-2 || nE o3 t-2 nG o3 t-2 nB o3 t-2 || nE o3 t-2 nG o3 t-2 nB o3 t-2 || nE o3 t-2 nG o3 t-2 nB o3 t-2 || nE o3 t-2 nG o3 t-2 nB o3 t-2 || nE o3 t-2 nG o3 t-2 nB o3
1,t-2 nC o3 t-2 nE o3 t-2 nG o5 t-1 || nA o2 t-2 nC o3 t-2 nE o3 t-2 nG o5 t-2 || nA o2 t-2 nC o3 t-2 nE o3 t-2 nG o5 t-1 || nA o2 t-2 nC o3 t-2 nE o3 t-2 nG o5 t-2 || nA o2 t-2 nC o3 t-2 nE o3 t-2 nG o5 t-2 || nA o2 t-2 nC o3 t-2 nE
2,nA o2 t-2 nC o3 t-2 nF o3 t-2 nG o3 t-2 || nA o2 t-2 nC o3 t-2 nF o3 t-2 nG o3 t-2 nF o5 t-1 || nA o2 t-2 nC o3 t-2 nF o3 t-2 nG o3 t-2 nF o5 t-2 || nA o2 t-2 nC o3 t-2 nF o3 t-2 nG o3 t-2 nF o5 t-1 || nA o2 t-2 nC o3 t-2 nF o3 t-2
3,o3 t-1 nG o3 t-1 nB- o3 t-1 || nE- o3 t-2 nG o3 t-2 nB- o3 t-2 || nE- o3 t-2 nG o3 t-2 nB- o3 t-2 || nE- o3 t-2 nG o3 t-2 nB- o3 t-2 || nE- o3 t-2 nG o3 t-2 nB- o3 t-2 || nE- o3 t-2 nG o3 t-2 nB- o3 t-2 || nE- o3 t-2 nG o3 t-2 nB- o3 t-2 || nE-
4,nB o2 t-2 nD o3 t-2 || nG o2 t-2 nB o2 t-2 nD o3 t-2 nG o4 t-1 || nG o2 t-2 nB o2 t-2 nD o3 t-2 nA o4 t-1 || nF o2 t-1 nA o2 t-1 nC o3 t-1 nE o3 t-1 nC o5 t-1 || nF o2 t-2 nA o2 t-2 nC o3 t-2 nE o3 t-2 nC o5 t-2 || nF o2 t-2 nA o2


In [14]:
# Pull a single batch; ob[0][0] is decoded back to text in the next cell.
ob = data.one_batch()

In [15]:
# Decode the first row of the batch back to token text, dropping the 'xxbos ' marker.
decoded = data.vocab.textify(ob[0][0])
txt_out = decoded.replace('xxbos ', '')
txt_out

't-2 || nE o3 t-2 nG o3 t-2 nB o3 t-2 || nE o3 t-2 nG o3 t-2 nB o3 t-2 || nE o3 t-2 nG o3 t-2 nB o3 t-2 || nE o3 t-2 nG o3 t-2 nB o3 t-2 || nE o3 t-2 nG o3 t-2 nB o3 t-2 || nE o3 t-2 nG o3 t-2 nB o3 t-2 || nE o3 t-2 nG o3 t-2 nB o3'

In [16]:
from encode_data import *

In [17]:
# Text of the first item in the training set.
one_text = data.train_ds[0][0].text

In [18]:
# Parse the token text into a sequence (a list of per-step note lists, per the output).
seq = str2seq(one_text)
seq

[[],
 [G3t-1],
 [F#3t-1],
 [A2t-1],
 [F3t-1],
 [E3t-1],
 [A2t-1],
 [F3t-1],
 [E3t-1],
 [A2t-1],
 [A2t-2],
 [A2t-2],
 [G3t-1],
 [F#3t-1],
 [G3t-1],
 [C#4t-1],
 [E3t-1, G3t-1, B3t-1],
 [E3t-2, G3t-2, B3t-2],
 [E3t-2, G3t-2, B3t-2],
 [E3t-2, G3t-2, B3t-2],
 [E3t-2, G3t-2, B3t-2],
 [E3t-2, G3t-2, B3t-2],
 [E3t-2, G3t-2, B3t-2],
 [E3t-2, G3t-2, B3t-2],
 [E3t-2, G3t-2, B3t-2],
 [E3t-2, G3t-2, B3t-2],
 [E3t-2, G3t-2, B3t-2],
 [E3t-2, G3t-2, B3t-2],
 [E3t-2, G3t-2, B3t-2],
 [E3t-2, G3t-2, B3t-2],
 [E3t-2, G3t-2, B3t-2],
 [E3t-2, G3t-2, B3t-2],
 [E3t-1, G3t-1, A3t-1, C4t-1],
 [E3t-2, G3t-2, A3t-2, C4t-2],
 [E3t-2, G3t-2, A3t-2, C4t-2],
 [E3t-2, G3t-2, A3t-2, C4t-2],
 [E3t-2, G3t-2, A3t-2, C4t-2],
 [E3t-2, G3t-2, A3t-2, C4t-2],
 [E3t-2, G3t-2, A3t-2, C4t-2],
 [E3t-2, G3t-2, A3t-2, C4t-2],
 [E3t-1, G3t-1, B3t-1],
 [E3t-2, G3t-2, B3t-2],
 [E3t-2, G3t-2, B3t-2],
 [E3t-2, G3t-2, B3t-2],
 [E3t-2, G3t-2, B3t-2],
 [E3t-2, G3t-2, B3t-2],
 [E3t-2, G3t-2, B3t-2],
 [E3t-2, G3t-2, B3t-2],
 [G2t-1, B-2t-1, D

In [19]:
# Convert the sequence to an array and check its shape.
# NOTE(review): axes presumably are (timesteps, parts, pitches) — confirm in encode_data.
seq2numpy(seq).shape

(129, 1, 127)

In [20]:
# Convert the token text to a music21 stream.
s = str2stream(one_text)

In [21]:
# Render the stream using music21's 'midi' output format.
s.show('midi')

In [22]:
# Dump the stream's elements and their offsets as text.
s.show('text')

{0.0} <music21.stream.Part 0x7f0626bf9b70>
    {0.0} <music21.instrument.Piano Piano>
    {0.0} <music21.meter.TimeSignature 4/4>
    {0.0} <music21.tempo.MetronomeMark animato Quarter=120>
    {0.0} <music21.key.KeySignature of no sharps or flats>
    {0.25} <music21.chord.Chord G3>
    {0.5} <music21.chord.Chord F#3>
    {0.75} <music21.chord.Chord A2>
    {1.0} <music21.chord.Chord F3>
    {1.25} <music21.chord.Chord E3>
    {1.5} <music21.chord.Chord A2>
    {1.75} <music21.chord.Chord F3>
    {2.0} <music21.chord.Chord E3>
    {2.25} <music21.chord.Chord A2>
    {3.0} <music21.chord.Chord G3>
    {3.25} <music21.chord.Chord F#3>
    {3.5} <music21.chord.Chord G3>
    {3.75} <music21.chord.Chord C#4>
    {4.0} <music21.chord.Chord E3 G3 B3>
    {8.0} <music21.chord.Chord E3 G3 A3 C4>
    {10.0} <music21.chord.Chord E3 G3 B3>
    {12.0} <music21.chord.Chord G2 B-2 D3>
    {18.0} <music21.chord.Chord G2 A2 C3 E3>
    {19.0} <music21.chord.Chord A2 B2 D3 F#3>
    {20.0} <music21.chord