# Tokenize and save data

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from fastai.text import *

In [3]:
import pandas as pd
from fastai_data import *

## Preparing the data

In [4]:
# Name of the preprocessed dataset. It is reused as the data sub-directory,
# the stem of the index CSV, and the CSV column that holds each file path.
source_dir = 'midi_transcribe_v1_simple'
path = Path(f'data/midi/{source_dir}/')
csv_path = path/f'{source_dir}.csv'
# Peek at the first few directory entries to confirm the data is in place.
path.ls()[:5]

[PosixPath('data/midi/midi_transcribe_v1_simple/ecomp'),
 PosixPath('data/midi/midi_transcribe_v1_simple/classic_piano'),
 PosixPath('data/midi/midi_transcribe_v1_simple/tmp_classical'),
 PosixPath('data/midi/midi_transcribe_v1_simple/cprato'),
 PosixPath('data/midi/midi_transcribe_v1_simple/freemidi')]

In [5]:
# Load the dataset index.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk. NOTE(review): the warning fragment saved in this
# cell's output looks like pandas' mixed-type DtypeWarning, which this option
# addresses — confirm on the next run.
csv = pd.read_csv(csv_path, low_memory=False)
# Keep only the rows that have a value in the `source_dir` column.
csv = csv.loc[csv[source_dir].notna()]

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
def create_databunch(files, cache_name, vocab=None, batch_size=32, load_cached=True):
    """Load or build a language-model databunch for `files`.

    When `load_cached` is true and `path/<cache_name>/itos.pkl` exists, the
    databunch is loaded from that cache and the `vocab` argument is not
    consulted. Otherwise the files are run through the open-file, tokenize
    and numericalize processors, split with
    `random_split_by_pct(0.01, seed=6)`, labelled for language modelling,
    turned into a databunch and saved under `cache_name`.

    Args:
        files: items handed to `TextList`.
        cache_name: cache folder name, relative to the global `path`.
        vocab: optional vocab forwarded to `LMNumericalizeProcessor`.
        batch_size: batch size of the resulting databunch.
        load_cached: set to False to rebuild even if a cache exists.

    Returns:
        Tuple `(data, vocab)` where `vocab` is `data.train_ds.vocab`.
    """
    # Short-circuit keeps the filesystem check from running when caching is off.
    if load_cached and (path/f'{cache_name}/itos.pkl').exists():
        cached = TextLMDataBunch.load(path, bs=batch_size, cache_name=cache_name)
        return cached, cached.train_ds.vocab

    open_files = LMOpenFileProcessor()
    tokenize = LMTokenizeProcessor(tokenizer=MusicTokenizer(), chunksize=num_cpus()*20)
    numericalize = LMNumericalizeProcessor(vocab=vocab, max_vocab=500)

    item_list = TextList(items=files, path=path, processor=[open_files, tokenize, numericalize])
    labelled = item_list.random_split_by_pct(0.01, seed=6).label_for_lm()
    built = labelled.databunch(bs=batch_size)
    built.save(cache_name)
    return built, built.train_ds.vocab

In [7]:
def get_files(csv):
    """Return the paths listed in `csv[source_dir]` that exist on disk.

    Every value of the column is wrapped in `Path`; entries whose path does
    not exist are dropped. The original row order is preserved.
    """
    candidates = map(Path, csv[source_dir].values)
    return [candidate for candidate in candidates if candidate.exists()]

### Create Pop Dataset

In [8]:
# Select the rows whose `source` is one of the pop collections,
# then keep only the files that are actually present on disk.
pop_csv = csv.loc[csv.source.isin(['midiworld', 'hooktheory', 'freemidi', 'cprato'])]
pop_files = get_files(pop_csv)

In [9]:
# Build the pop databunch, or load it from the 'tmp_pop' cache if one exists.
# pop_vocab is reused when the classical dataset is numericalized.
pop_data, pop_vocab = create_databunch(pop_files, cache_name='tmp_pop')

Numericalizing


### Create Classical Dataset

In [10]:
# Optional length filter, currently disabled:
# csv_filter = csv.loc[csv[f'{source_dir}_timesteps'] < 20000]
# NOTE(review): the directory listing printed earlier shows `classic_piano`,
# while this filter matches 'classical_piano' — confirm the value actually
# stored in csv.source, otherwise those rows are silently excluded.
clc_csv = csv.loc[csv.source.isin(['classical_piano', 'ecomp'])]
clc_files = get_files(clc_csv)

Note: we numericalize the classical data with `pop_vocab` because we ultimately want to generate pop music.

In [11]:
# load_cached=False skips any existing 'tmp_clc' cache, so the classical data
# is always re-processed with the pop vocabulary passed in here.
clc_data, clc_vocab = create_databunch(clc_files, cache_name='tmp_clc', vocab=pop_vocab, load_cached=False)

Numericalizing


### Testing

In [16]:
# Use the pop databunch for the sanity checks that follow.
data = pop_data

In [17]:
# Show a sample batch rendered as token text.
data.show_batch()

idx,text
0,t2 || nD4 t2 nF4 t2 nG4 t2 nB-4 t2 || nD4 t2 nF4 t2 nG4 t2 nB-4 t2 || nD4 t1 nF4 t1 nA4 t1 || nD4 t2 nF4 t2 nA4 t2 || nD4 t2 nF4 t2 nA4 t2 || nD4 t2 nF4 t2 nA4 t2 || nD4 t2 nF4 t2 nA4 t2 || nD4 t2 nF4 t2 nA4 t2 || nD4 t2 nF4 t2 nA4 t2 || nD4
1,nA4 t2 nF#5 t2 || nD4 t2 nF#4 t2 nA4 t2 nF#5 t1 || nD4 t2 nF#4 t2 nA4 t2 nF#5 t2 || nD4 t2 nF#4 t2 nA4 t2 nF#5 t1 || nD4 t2 nF#4 t2 nA4 t2 nF#5 t2 || nD4 t2 nF#4 t2 nA4 t2 nE5 t1 || nD4 t2 nF#4 t2 nA4 t2 nE5 t2 || nF#3 t1 nB-3 t1 nC#4 t1 || nF#3 t2 nB-3 t2
2,t2 nB3 t2 nD5 t2 || nE3 t2 nG3 t2 nB3 t2 nB4 t1 || nE3 t2 nG3 t2 nB3 t2 nB4 t2 || nE3 t2 nG3 t2 nB3 t2 nE5 t1 || nE3 t2 nG3 t2 nB3 t2 nE5 t2 || nD4 t1 nG4 t1 nB4 t1 nE5 t2 || nD4 t2 nG4 t2 nB4 t2 nE5 t2 || nD4 t2 nG4 t2 nB4 t2 nD5 t1 || nD4
3,t2 nE-5 t1 || nG#3 t2 nB3 t2 nE-4 t2 nG#5 t1 || nG#3 t2 nB3 t2 nE-4 t2 nG#5 t2 || nG#3 t2 nB3 t2 nE-4 t2 nG#5 t2 || nG#3 t2 nB3 t2 nE-4 t2 nG#5 t2 || nG#3 t2 nB3 t2 nE-4 t2 nE-5 t1 || nG#3 t2 nB3 t2 nE-4 t2 nE-5 t2 || nG#3 t2 nB3 t2 nE-4 t2 nE-5 t2 || nG#3 t2 nB3
4,nC4 t2 nF4 t2 || nB-3 t2 nC4 t2 nF4 t2 || nB-3 t2 nC4 t2 nF4 t2 || nB-3 t2 nC4 t2 nF4 t2 || nB-3 t2 nC4 t2 nF4 t2 || nB-3 t2 nC4 t2 nF4 t2 || nB-3 t2 nC4 t2 nF4 t2 || nB-3 t2 nC4 t2 nF4 t2 || nB-3 t2 nC4 t2 nF4 t2 || nB-3 t2 nC4 t2 nF4 t2 || nB-3 t2


In [18]:
# Grab a single batch for inspection.
ob = data.one_batch()

In [19]:
# Decode the first sequence of the batch back to token text and strip any
# 'xxbos ' markers, then display the result.
txt_out = data.vocab.textify(ob[0][0]).replace('xxbos ', '')
txt_out

't2 nF4 t2 nA4 t2 || nF3 t1 nG#3 t1 nC4 t1 || nF3 t2 nG#3 t2 nC4 t2 || nF3 t2 nG#3 t2 nC4 t2 || nF3 t2 nG#3 t2 nC4 t2 || nF3 t2 nG#3 t2 nC4 t2 || nF3 t2 nG#3 t2 nC4 t2 || nF3 t2 nG#3 t2 nC4 t2 || nF3 t2 nG#3 t2 nC4 t2 || nF3 t2 nG#3 t2 nC4 t2 || nF3'

In [20]:
from encode_data import *

In [21]:
# Token text of the first item in the training set.
one_text = data.train_ds[0][0].text

In [23]:
# Convert the token text with str2seq (from encode_data) and display the result.
seq = str2seq(one_text)
seq

In [33]:
# Convert the token text with str2stream (from encode_data); the text dump
# further down shows the result is a music21 stream.
s = str2stream(one_text)

In [None]:
# Show the stream in 'midi' format (presumably an inline player — this cell has no saved output).
s.show('midi')

In [35]:
# Print the stream's elements together with their offsets.
s.show('text')

{0.0} <music21.stream.Part 0x7f23957414a8>
    {0.0} <music21.instrument.Piano Piano>
    {0.0} <music21.meter.TimeSignature 4/4>
    {0.0} <music21.key.KeySignature of no sharps or flats>
    {0.0} <music21.chord.Chord F4>
    {0.25} <music21.chord.Chord E4>
    {0.5} <music21.chord.Chord G3>
    {0.75} <music21.chord.Chord E-4>
    {1.0} <music21.chord.Chord D4>
    {1.25} <music21.chord.Chord G3>
    {1.5} <music21.chord.Chord E-4>
    {1.75} <music21.chord.Chord D4>
    {2.0} <music21.chord.Chord G3>
    {2.75} <music21.chord.Chord F4>
    {3.0} <music21.chord.Chord E4>
    {3.25} <music21.chord.Chord F4>
    {3.5} <music21.chord.Chord B4>
    {3.75} <music21.chord.Chord D4 F4 A4>
    {7.75} <music21.chord.Chord D4 F4 G4 B-4>
    {9.75} <music21.chord.Chord D4 F4 A4>
    {11.75} <music21.chord.Chord F3 G#3 C4>
    {17.75} <music21.chord.Chord F3 G3 B-3 D4>
    {18.75} <music21.chord.Chord G3 A3 C4 E4>
    {19.75} <music21.chord.Chord B-3 C#4 F4>
    {23.75} <music21.chord.Chord E-4

In [None]:
# Show the flattened stream in the default format.
# NOTE(review): newer music21 releases deprecate `.flat` in favour of `.flatten()` — confirm against the installed version.
s.flat.show()