# Training of text generation models

In [None]:
%matplotlib inline

## Global config

Name of corpus file (without txt extension)

In [None]:
# Base name of the corpus; the corpus, encoder and model file names are all derived from it.
corpusname = "toyseries"

Name of the tokenizer to use

In [None]:
# Tokenizer name handed to the Encoder when it is built from the corpus.
tokenizer = "word"

Number of past input tokens to use for generation

In [None]:
# Passed to hypertrain as the input size and reused to cut the seed for the generation test.
inputtokens = 16

Network architecture to use

In [None]:
# Architecture name that modelbyname resolves to a model class.
architecture = "wavenet"

Number of hyperoptimization trials (recommended at least 15)

In [None]:
# Forwarded to hypertrain as n_calls.
hypertrials = 15

### Process config

Get all relevant file names

In [None]:
# Every file name used below is derived from the corpus base name.
corpusfile = 'corpus/{}.txt'.format(corpusname)
encodername = '{}.enc'.format(corpusname)
modelname = '{}.h5'.format(corpusname)

Obtain model class

In [None]:
from neurowriter.models import modelbyname
# Resolve the configured architecture name to the model class that is later passed to hypertrain.
modelclass = modelbyname(architecture)

## Load corpus

In [None]:
# Read the whole corpus into memory as a single string. The encoding is
# pinned so the text decodes the same way on every platform instead of
# depending on the locale's default encoding.
# NOTE(review): assumes the corpus file is stored as UTF-8 — confirm.
with open(corpusfile, encoding="utf-8") as f:
    corpus = f.read()

In [None]:
# Preview the first 1000 characters; slicing already clamps to the corpus length.
corpus[:1000]

## Encoding

In [None]:
from neurowriter.encoding import Encoder
# Build the encoder from the full corpus using the configured tokenizer name.
encoder = Encoder(corpus, tokenizer)
# Persist it under the encoder file name derived from the corpus name.
encoder.save(encodername)

## Model training

Train the generator model, trying different hyperparameters and selecting the model that produces the lowest loss on a validation split of the data.

Note that this might take a very long time, so temporary versions of the model are saved during the optimization.

In [None]:
from neurowriter.optimizer import hypertrain

# Run the hyperparameter search for `hypertrials` calls. The model file name
# is passed as `savemodel` (presumably where the temporary models are
# written during the search — confirm in hypertrain).
model, train_history = hypertrain(modelclass, inputtokens, encoder, corpus, n_calls=hypertrials, savemodel=modelname)
# Save the returned model under the same file name.
model.save(modelname)

## Generation test

In [None]:
from neurowriter.writer import Writer

writer = Writer(model, encoder, creativity=0.1)
# Tokenize the corpus and keep its first `inputtokens` tokens as the seed.
tokens = encoder.tokenizer.transform(corpus)
seed = tokens[:inputtokens]
# Rebuild the seed text by joining the tokens with the tokenizer's separator.
seedtxt = encoder.tokenizer.intertoken.join(seed)
print(seedtxt)
# NOTE(review): the generated tokens are joined with '' while the seed is
# joined with `intertoken` — confirm this is intended for tokenizers whose
# separator is not the empty string.
''.join(writer.write(seed=seedtxt))

## Possible improvements

* Try training with SGD and the full pecera corpus for a large number of iterations

From Facebook's convolutional translation paper:
* Tokens are represented with embeddings instead of a one-hot encoding.
* The position of each token is also added as a parallel embedding.
* Dropout is applied to the embeddings and to the input of each convolutional block.

## References

* WaveNet paper: https://arxiv.org/pdf/1609.03499.pdf
* A Keras implementation of WaveNet: https://github.com/usernaamee/keras-wavenet/blob/master/simple-generative-model.py
* Another one: https://github.com/basveeling/wavenet/blob/master/wavenet.py
* Facebook's convolutional translation paper: https://arxiv.org/pdf/1705.03122.pdf

## Scrapyard

def sampletext(logs):
    """Print a seed taken from the corpus followed by text written from it.

    Intended to be used as a Keras callback; ``logs`` is accepted to fit the
    callback signature and is not used.
    """
    generator = Writer(model, encoder, creativity=0.1)
    seedtext = corpus[:inputtokens]
    print(seedtext)
    generated = generator.write(seed=seedtext)
    print(''.join(generated))

# Instantiate the network from the input size, the encoder and the chosen hyperparameters
model = modelkind(inputtokens, encoder, *bestparams)

# Callbacks: print a text sample when training ends, checkpoint only the
# best model to the model file, and stop early with the configured patience
callbacks = [
    LambdaCallback(on_train_end=sampletext),
    ModelCheckpoint(filepath=modelname, save_best_only=True),
    EarlyStopping(patience=patience),
]

# Both step counts are a fraction of the same quantity, scaled by the batch size
nsamples = len(corpus) - inputtokens + 1
trainsteps = int((1 - val) * nsamples / batchsize)
valsteps = int(val * nsamples / batchsize)

# Fit the model from the generators
model.fit_generator(
    traingenerator,
    steps_per_epoch=trainsteps,
    validation_data=valgenerator,
    validation_steps=valsteps,
    epochs=maxepochs,
    verbose=2,
    callbacks=callbacks,
)