In [27]:
%reload_ext autoreload
%autoreload 2

In [28]:
import errno
import os
import random
import shutil

from fastai.text import * 

# Data structuring

First, we need to split our data into training and validation sets. (TODO: strip out duplicates — duplicates in different keys are OK, though.)

In [29]:
# Directory of encoded input files; the (currently commented-out) split cells
# list this directory and copy files out of it.
DATA_DIR_ALL = Path('data/encoded')

In [30]:
# Root of the split dataset; the (currently commented-out) copy loop writes
# into OUT_DIR/'train' and OUT_DIR/'valid'.
OUT_DIR = Path('data/split')

In [31]:
def ensure_dir_exists(path):
    """Ensure that a directory exists at ``path``, creating parents as needed.

    Args:
        path: Directory path (``str`` or path-like object) to create.

    Raises:
        FileExistsError: If ``path`` already exists but is not a directory.
        OSError: For any other failure reported by ``os.makedirs``.
    """
    # exist_ok=True tolerates an existing *directory* only. The previous
    # errno.EEXIST check also swallowed the error when a regular file sat at
    # `path`, so the function returned even though no directory was there.
    os.makedirs(path, exist_ok=True)

In [32]:
# all_fnames = [encoded for encoded in os.listdir(DATA_DIR_ALL)]
# RANDOM_SEED = 42
# random.Random(RANDOM_SEED).shuffle(all_fnames)

In [33]:
# training_set = all_fnames[: int(len(all_fnames) * 0.8)]
# validation_set = all_fnames[len(training_set):]

In [34]:
# for set_dir, data in zip(('train', 'valid'), (training_set, validation_set)):
#     ensure_dir_exists(OUT_DIR/set_dir)
#     for fname in data:
#         shutil.copy(DATA_DIR_ALL/fname, OUT_DIR/set_dir)

# Model training

In [35]:
# Model training reads from the split output directory.
DATA_DIR = OUT_DIR

In [36]:
# Token list: one 'S-<n>' entry for every n in 24..75 (52 tokens) followed by
# the single token 'R', 53 entries in total.
# NOTE(review): the generated sequences recorded in this notebook also contain
# 'H-<n>' tokens, and VOCAB is not referenced by any other visible cell —
# confirm whether this list is still meant to describe the full vocabulary.
VOCAB = [*map('S-{}'.format, range(24, 76)), 'R']
len(VOCAB)

53

We have a very limited 'vocabulary' of musical tokens. Tokenization simply involves splitting each comma-delimited string into its individual tokens.

In [37]:
class MIDISeqTokenizer(BaseTokenizer):
    """Tokenizer that treats every comma-separated field of a string as one token."""

    # NOTE(review): the prompts passed to `learn.predict` / `generate_seq`
    # later in this notebook are space-separated, not comma-separated; if they
    # go through this tokenizer they would not be split at all — confirm the
    # intended delimiter.
    def tokenizer(self, text):
        """Return the list of substrings of ``text`` found between commas."""
        pieces = text.split(',')
        return pieces
    
class CustomTokenizer(Tokenizer):
    """Put together rules and a tokenizer function to tokenize text with multiprocessing.

    Specialisation of ``Tokenizer`` that always tokenizes with
    ``MIDISeqTokenizer``. Constructor arguments are forwarded to the base
    class instead of being silently discarded. Any rule or special-case list
    that is omitted defaults to an empty list, so ``CustomTokenizer()`` is
    configured exactly as before.

    Args:
        lang: Language code forwarded to the base ``Tokenizer``.
        pre_rules: Rules forwarded as ``pre_rules``; ``None`` means no rules.
        post_rules: Rules forwarded as ``post_rules``; ``None`` means no rules.
        special_cases: Special-case tokens; ``None`` means none.
        n_cpus: Worker count forwarded to the base ``Tokenizer``; ``None``
            leaves the choice to the base class.
    """
    def __init__(self, lang:str='en', pre_rules:ListRules=None,
                 post_rules:ListRules=None, special_cases:Collection[str]=None, n_cpus:int=None):
        super().__init__(
            tok_func=MIDISeqTokenizer,
            lang=lang,
            # Explicit `is None` checks keep "not provided" distinct from a
            # caller deliberately passing an empty list.
            pre_rules=[] if pre_rules is None else pre_rules,
            post_rules=[] if post_rules is None else post_rules,
            special_cases=[] if special_cases is None else special_cases,
            n_cpus=n_cpus,
        )

In [43]:
# Build the language-model data bunch from the files under DATA_DIR, tokenizing
# with CustomTokenizer and with BOS/EOS marker insertion switched off.
# NOTE(review): the cells that populate DATA_DIR (data/split) are commented
# out, so this only works if that directory already exists on disk.
data_lm = TextLMDataBunch.from_folder(DATA_DIR, tokenizer=CustomTokenizer(), include_bos=False, include_eos=False)

In [45]:
# Export the processed data bunch so later sessions can reload it.
# presumably written under DATA_DIR, since the next cell loads it from there — confirm
data_lm.save('data_lm_export.pkl')

In [46]:
# Reload the exported data bunch from DATA_DIR (replaces the in-memory one).
# NOTE(review): the '.pkl' name suggests a pickle-based export — only load
# files produced by this notebook.
data_lm = load_data(DATA_DIR, 'data_lm_export.pkl')

In [47]:
# Create an AWD_LSTM language-model learner with no pretrained weights
# (pretrained=False) and drop_mult=0.5, then train it with one one-cycle
# epoch at a rate of 1e-2.
learn = language_model_learner(data_lm, AWD_LSTM, drop_mult=0.5, pretrained=False)
learn.fit_one_cycle(1, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,0.381596,0.347482,0.90148,09:54


In [48]:
# Checkpoint the learner after the first epoch.
learn.save('model_v1_1_epoch')

In [52]:
# Sanity check: ask the model to continue a short space-separated seed.
# The recorded output shows the seed with one extra token appended.
learn.predict('S-40 H-40 S-42 S-60')

'S-40 H-40 S-42 S-60 S-67'

In [58]:
def generate_seq(start, learner, length=32):
    """Extend ``start`` with model predictions until it holds ``length`` tokens.

    Tokens are counted by splitting on single spaces, which matches the
    space-joined strings ``learner.predict`` returns in this notebook.

    Args:
        start: Space-separated seed sequence.
        learner: Object exposing ``predict(text, n_words=...)`` that returns
            ``text`` followed by the newly predicted tokens.
        length: Target number of space-separated tokens in the result.

    Returns:
        The extended sequence. ``start`` is returned unchanged when it already
        contains ``length`` or more tokens.
    """
    seq = start
    while True:
        missing = length - len(seq.split(' '))
        if missing <= 0:
            return seq
        # Request every missing token in a single call instead of one call per
        # token: each call re-encodes the whole (growing) string, so the
        # per-token loop did quadratic work.
        extended = learner.predict(seq, n_words=missing)
        if len(extended.split(' ')) <= len(seq.split(' ')):
            # The learner made no progress; stop instead of spinning forever.
            return extended
        seq = extended

In [60]:
# Generate from a two-token seed using generate_seq's default target length.
generate_seq('S-27 H-27', learn)

'S-27 H-27 S-25 S-60 S-67 S-60 S-62 S-67 H-50 S-62 S-65 S-60 S-58 S-67 S-62 S-60 S-67 S-60 S-67 S-67 S-63 H-60 H-39 H-54 S-67 S-35 S-67 S-62 S-24 S-52 S-60 S-43'

## More training

In [62]:
# Restore the 1-epoch checkpoint into the existing learner before training
# further; requires `learn` to already be defined in the kernel.
learn = learn.load('model_v1_1_epoch')

In [63]:
# Two more one-cycle epochs at the same 1e-2 rate, on top of the restored
# 1-epoch checkpoint.
learn.fit_one_cycle(2, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,0.343286,0.459055,0.881116,09:56
1,0.313737,0.275368,0.917603,09:58


In [64]:
# Checkpoint the learner after the additional two epochs.
learn.save('model_v1_3_epoch')

In [89]:
# Generate a shorter, 16-token sequence with the further-trained model.
generate_seq('S-60 S-60', learn, length=16)

'S-60 S-60 S-67 S-60 H-47 S-64 S-60 S-29 S-60 S-67 S-67 S-69 S-67 S-60 S-62 S-60'