# Toy training of text generation models

In [None]:
%matplotlib inline

## Global config

Name of the corpus file, including its extension

In [None]:
# File name only (with extension): the 'corpus/' directory prefix and the
# '.enc' / '.h5' output suffixes are added in the "Process config" cell.
corpusname = "chupitos_bpe.json"

Corpus loader method to use

In [None]:
from neurowriter.corpus import Corpus

# Keep a reference to the loader itself (it is not called here); it is
# invoked with the corpus file path in the "Load corpus" section.
corpusloader = Corpus.load_json

Tokenizer object to use (can be None if text is already tokenized)

In [None]:
from neurowriter.tokenizer import CharTokenizer, WordTokenizer, SubwordTokenizer

# None is passed straight through to the Encoder as its tokenizer argument.
# NOTE(review): to tokenize raw text, presumably assign one of the tokenizers
# imported above -- confirm whether Encoder expects a class or an instance.
tokenizer = None

Network architecture class to use

In [None]:
from neurowriter.models import DilatedConvModel, WavenetModel, LSTMModel

# The architecture is handed uninstantiated to trainmodel as its first
# argument; the parameter list in the next cell must describe this architecture.
architecture = LSTMModel

Architecture parameters

In [None]:
from keras.optimizers import Adam, RMSprop, Nadam

# Model parameter lists, keyed by the architecture they belong to. Keeping
# them in a mapping (instead of reassigning `params` three times and relying
# on the last assignment) guarantees that the list handed to trainmodel
# always matches the architecture selected above.
paramsbyarchitecture = {
    WavenetModel: [
        8, # kernels
        1, # wavenetblocks
        0.1, # dropout
        32 # embedding
    ],
    DilatedConvModel: [
        1, # convlayers
        8, # kernels
        0.1, # convdrop
        1, # denselayers
        16, # dense units
        0.1, # densedrop
        32 # size of the embedding
    ],
    LSTMModel: [
        1, # layers
        16, # units
        0, # dropout
        32 # embedding
    ],
}

# Fail loudly when an architecture without a configured parameter list is
# selected, rather than silently training it with another model's parameters.
if architecture not in paramsbyarchitecture:
    raise ValueError("No parameters configured for architecture %r" % (architecture,))

# Parameter list passed to trainmodel as `modelparams`.
params = paramsbyarchitecture[architecture]

# Training options, expanded into trainmodel as keyword arguments.
optparams = {
    "batchsize" : 64,
    "inputtokens" : 8,
    "learningrate" : 0.001,
    "optimizerclass" : Adam
}

### Process config

Get all relevant file names

In [None]:
# Derive every path used below from the corpus name: the corpus is read from
# the corpus/ directory, while the encoder and model file names are the
# corpus name with an extra suffix (relative to the working directory).
corpusfile = ''.join(('corpus/', corpusname))
encodername = ''.join((corpusname, '.enc'))
modelname = ''.join((corpusname, '.h5'))

## Load corpus

In [None]:
# Load the corpus with the configured loader from the path built in "Process config".
corpus = corpusloader(corpusfile)

In [None]:
# Preview the first 1000 elements of the first corpus entry.
corpus[0][0:1000]

## Encoding

In [None]:
%%time
# Build the encoder from the corpus and the configured tokenizer, then
# persist it under the encoder file name derived from the corpus name.
from neurowriter.encoding import Encoder
encoder = Encoder(corpus, tokenizer)
encoder.save(encodername)

In [None]:
# Inspect the encoder's char2index attribute.
# NOTE(review): presumably the token -> index vocabulary; confirm in neurowriter.encoding.
encoder.char2index

## Model training

Train the generator model, trying different hyperparameters and selecting the model that produces the lowest loss on a validation split of the data.

Note that this might take a very long time, so temporary versions of the model are saved during the optimization.

In [None]:
%%time
from neurowriter.optimizer import trainmodel

model, train_history = trainmodel(architecture, encoder=encoder, corpus=corpus, verbose=2, maxepochs=1000, 
                                  modelparams=params, valmask=[False]*3+[True], patience=10, **optparams)
model.save(modelname)

In [None]:
# Lowest validation loss recorded in the training history.
val_losses = train_history.history['val_loss']
bestloss = min(val_losses)
bestloss

In [None]:
# Show the summary of the model returned by trainmodel.
model.summary()

## Generation test

In [None]:
from neurowriter.writer import Writer
from neurowriter.encoding import END

# Wrap the trained model and its encoder in a Writer for text generation.
# NOTE(review): beamsize=1 presumably disables beam search and creativity
# presumably controls sampling randomness -- confirm in neurowriter.writer.
writer = Writer(model, encoder, beamsize=1, batchsize=1, creativity=0.5)

# Run the (empty) seed text through the encoder's tokenizer and join the
# resulting tokens back into the seed string.
# NOTE(review): `tokenizer` is configured as None above; if Encoder stores it
# unchanged, encoder.tokenizer.transform raises AttributeError -- confirm.
tokens = encoder.tokenizer.transform("")
seedtxt = "".join(tokens)
print("Seed:", seedtxt)
print("Generated:")
print(seedtxt, end='')
# Print each generated token without a trailing newline; after an END token,
# print a line break plus a blank line as separator.
for token in writer.generate(seedtxt):
    print(token, end='')
    if token == END:
        print('\n')