# Tokenize and save data

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from fastai.text import *

In [3]:
import pandas as pd
from fastai_data import *

## Preparing the data

In [4]:
source_dir = 'midi_transcribe_v2_shortdur'
path = Path(f'data/midi/{source_dir}/')
csv_path = path/f'{source_dir}.csv'
path.ls()[:5]

[PosixPath('data/midi/midi_transcribe_v2_shortdur/ecomp'),
 PosixPath('data/midi/midi_transcribe_v2_shortdur/classic_piano'),
 PosixPath('data/midi/midi_transcribe_v2_shortdur/cprato'),
 PosixPath('data/midi/midi_transcribe_v2_shortdur/freemidi'),
 PosixPath('data/midi/midi_transcribe_v2_shortdur/midiworld')]

In [5]:
csv = pd.read_csv(csv_path)
csv = csv.loc[csv[source_dir].notna()];

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
def create_databunch(files, cache_name, vocab=None, batch_size=32, load_cached=True):
    if load_cached and (path/f'{cache_name}/itos.pkl').exists():
        data = TextLMDataBunch.load(path, bs=batch_size, cache_name=cache_name)
        data.valid_ds.x.processor[0] = TokenizeProcessor(tokenizer=MusicTokenizer())
    else:
        ps = [LMOpenFileProcessor(), 
             LMTokenizeProcessor(tokenizer=MusicTokenizer(), chunksize=num_cpus()*20),
             LMNumericalizeProcessor(vocab=vocab, max_vocab=500)]

        data = (TextList(items=files, path=path, processor=ps)
                .random_split_by_pct(0.01, seed=6)
                .label_for_lm()
                .databunch(bs=batch_size))
        data.save(cache_name)
    vocab = data.train_ds.vocab
    return data, vocab

In [7]:
def get_files(csv):
    files = csv[source_dir]
    flist = [Path(f) for f in files.values]
    flist = [f for f in flist if f.exists()]
    return flist

### Create Pop Dataset

In [8]:
pop_csv = csv.loc[csv.source.isin(['midiworld', 'hooktheory', 'freemidi', 'cprato'])]
pop_files = get_files(pop_csv)

In [9]:
pop_data, pop_vocab = create_databunch(pop_files, cache_name='tmp_pop')

Numericalizing


## Create Classical Dataset

In [10]:
# csv_filter = csv.loc[csv[f'{source_dir}_timesteps'] < 20000]
clc_csv = csv.loc[csv.source.isin(['classical_piano', 'ecomp'])]
clc_files = get_files(clc_csv)

Note: we are using pop_vocab because we ultimately want to generate pop music

In [11]:
clc_data, clc_vocab = create_databunch(clc_files, cache_name='tmp_clc', vocab=pop_vocab, load_cached=False)

Numericalizing


### Testing

In [12]:
data = pop_data

In [13]:
data.show_batch()

idx,text
0,t8 || t7 nE3 t8 nG3 t8 nB3 t8 || t7 nG2 t24 nB-2 t24 nD3 t24 || t23 nG2 t4 nA2 t4 nC3 t4 nE3 t4 || t3 nA2 t4 nB2 t4 nD3 t4 nF#3 t4 || t3 nC3 t16 nE-3 t16 nG3 t16 || t15 nF3 t8 nG#3 t8 nC4 t8 nD4 t8 || t7 nC3 t8 nE-3 t8 nG3 t8 || t7 nE-3 t16 nF#3 t16 nB-3
1,t5 nA2 t32 nC#3 t32 nE3 t32 || t3 nE-5 t1 || t0 nD5 t1 || t0 nC5 t1 || t0 nA4 t1 || t0 nD5 t2 || t1 nC5 t1 || t0 nA4 t2 || t1 nG4 t1 || t0 nG4 t1 || t0 nA4 t2 || t1 nD4 t1 || t0 nC4 t1 || t0 nA3 t2 || t1 nG3 t3 xxbos || t0 nF2 t16 nA2 t16
2,nA4 t4 || t3 nF3 t4 nA3 t4 nC4 t4 nC5 t2 || t1 nA4 t6 || t1 nF3 t4 nA3 t4 nC4 t4 || t3 nF3 t4 nA3 t4 nC4 t4 nG4 t2 || t1 nF4 t6 || t1 nF3 t4 nA3 t4 nC4 t4 || t3 nF3 t4 nA3 t4 nC4 t2 nE4 t2 || t3 nF3 t4 nA3 t4 nC4 t2 || t1 nD4 t4 || t1
3,nA5 t4 || t3 nB5 t2 || t1 nE2 t10 nG2 t10 nB2 t10 nD3 t10 nF#5 t4 || t3 nD5 t2 || t1 nB4 t2 || t1 nD5 t2 || t1 nA2 t6 nC3 t6 nE3 t6 nG3 t6 nF#5 t2 || t1 nG5 t2 || t1 nA5 t2 || t1 nD2 t10 nF#2 t10 nC3 t10 nE3 t10 nB5 t2 || t1 nC6 t1 || t0 nB5 t1
4,|| t0 nF#3 t1 nA4 t2 nA5 t4 || t0 nF#2 t2 nD3 t2 nF#3 t1 nG5 t4 nB-5 t2 || t0 nF#3 t1 nA4 t2 || t0 nD2 t2 nE2 t8 nF#2 t2 nF#3 t1 nF5 t4 nA5 t2 nB-5 t2 || t0 nF#3 t1 nA4 t2 || t0 nF#2 t2 nD3 t2 nF#3 t1 nE5 t4 nG5 t1 nB-5 t2 || t0 nF#3 t1 nA4 t1 || t0


In [14]:
ob = data.one_batch()

In [15]:
txt_out = data.vocab.textify(ob[0][0]).replace('xxbos ', ''); txt_out

't16 || t0 nC4 t2 || t1 nE5 t2 || t1 nG4 t2 || t1 nC5 t2 || t1 nC4 t2 || t1 nE5 t2 || t1 nG4 t2 || t1 nC5 t2 || t1 nC3 t2 nE3 t4 nG3 t8 || t5 nF3 t2 || t1 nB2 t8 nD3 t8 nG3 t4 || t1 nE3 t2 || t1 nA3 t2 || t1 nB3 t2 || t1 nA2 t8'

In [16]:
from encode_data import *

In [17]:
one_text = data.train_ds[0][0].text

In [18]:
seq = str2seq(one_text); seq

[[],
 [],
 [G3t1],
 [F#3t1],
 [A2t1],
 [F3t1],
 [E3t1],
 [A2t1],
 [F3t1],
 [E3t1],
 [A2t3],
 [],
 [],
 [G3t1],
 [F#3t1],
 [G3t1],
 [C#4t1],
 [E3t16, G3t16, B3t16],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [E3t8, G3t8, A3t8, C4t8],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [E3t8, G3t8, B3t8],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [G2t24, B-2t24, D3t24],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [G2t4, A2t4, C3t4, E3t4],
 [],
 [],
 [],
 [A2t4, B2t4, D3t4, F#3t4],
 [],
 [],
 [],
 [C3t16, E-3t16, G3t16],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [F3t8, G#3t8, C4t8, D4t8],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [C3t8, E-3t8, G3t8],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [E-3t16, F#3t16, B-3t16]]

In [19]:
s = str2stream(one_text)

In [20]:
s.show('midi')

In [21]:
s.show('text')

{0.0} <music21.stream.Part 0x7ff3d36ff358>
    {0.0} <music21.instrument.Piano Piano>
    {0.0} <music21.meter.TimeSignature 4/4>
    {0.0} <music21.tempo.MetronomeMark animato Quarter=120>
    {0.0} <music21.key.KeySignature of no sharps or flats>
    {0.5} <music21.chord.Chord G3>
    {0.75} <music21.chord.Chord F#3>
    {1.0} <music21.chord.Chord A2>
    {1.25} <music21.chord.Chord F3>
    {1.5} <music21.chord.Chord E3>
    {1.75} <music21.chord.Chord A2>
    {2.0} <music21.chord.Chord F3>
    {2.25} <music21.chord.Chord E3>
    {2.5} <music21.chord.Chord A2>
    {3.25} <music21.chord.Chord G3>
    {3.5} <music21.chord.Chord F#3>
    {3.75} <music21.chord.Chord G3>
    {4.0} <music21.chord.Chord C#4>
    {4.25} <music21.chord.Chord E3 G3 B3>
    {8.25} <music21.chord.Chord E3 G3 A3 C4>
    {10.25} <music21.chord.Chord E3 G3 B3>
    {12.25} <music21.chord.Chord G2 B-2 D3>
    {18.25} <music21.chord.Chord G2 A2 C3 E3>
    {19.25} <music21.chord.Chord A2 B2 D3 F#3>
    {20.25} <music21

In [22]:
s.flat.show()

TypeError: append() argument must be xml.etree.ElementTree.Element, not Element