# Training of text generation models

In [None]:
%matplotlib inline

## Global config

Name of corpus file (without txt extension)

In [None]:
# Base name of the corpus: the notebook reads 'corpus/<corpusname>.txt' and
# reuses the same base name for the saved encoder (.enc) and model (.h5).
corpusname = "toyseries"

Name of the tokenizer to use

In [None]:
# Tokenizer name; passed as the second argument to Encoder when the corpus is encoded.
tokenizer = "subword"

Number of past input tokens to use for generation

In [None]:
# Number of past tokens fed to the model; passed to hypertrain and also used
# to cut the seed for the generation test.
inputtokens = 32

Network architecture to use

In [None]:
# Architecture name; resolved to a model class through modelbyname.
architecture = "lstm"

Number of hyperoptimization trials (recommended at least 15)

In [None]:
# Number of hyperoptimization trials; forwarded to hypertrain as n_calls.
hypertrials = 15

### Process config

Get all relevant file names

In [None]:
# Derive every file name used by the notebook from the corpus base name.
corpusfile = 'corpus/{}.txt'.format(corpusname)  # input text, read below
encodername = '{}.enc'.format(corpusname)        # where the encoder is saved
modelname = '{}.h5'.format(corpusname)           # where the model is saved

Obtain model class

In [None]:
from neurowriter.models import modelbyname
# Look up the model class matching the configured architecture name; the
# result is later handed to hypertrain.
modelclass = modelbyname(architecture)

## Load corpus

In [None]:
# Read the whole corpus into memory as a single string.
# The encoding is pinned so the text decodes the same way on every platform
# instead of following the locale default.
# NOTE(review): assumes the corpus file is UTF-8 encoded — adjust if it is not.
with open(corpusfile, encoding="utf-8") as f:
    corpus = f.read()

In [None]:
# Preview the first 1000 characters; slicing already clamps to the string
# length, so shorter corpora are shown in full.
corpus[:1000]

## Encoding

In [None]:
from neurowriter.encoding import Encoder
# Build the encoder from the corpus text and the configured tokenizer name.
encoder = Encoder(corpus, tokenizer)
# Save the encoder under the '<corpusname>.enc' file name.
encoder.save(encodername)

## Model training

Train the generator model, trying different hyperparameters and selecting the model that achieves the lowest loss on a validation split of the data.

Note that this might take a very long time, so temporary versions of the model are saved during the optimization.

In [None]:
from neurowriter.optimizer import hypertrain

# Run the hyperparameter search for the chosen model class. The call returns
# the selected model together with the training history.
# savemodel receives the model file name (presumably where the intermediate
# models mentioned in the notebook text are written — confirm in hypertrain).
model, train_history = hypertrain(
    modelclass,
    inputtokens,
    encoder,
    corpus,
    n_calls=hypertrials,
    savemodel=modelname,
    verbose=True,
)

# Save the returned model under the same file name.
model.save(modelname)

## Generation test

In [None]:
from neurowriter.writer import Writer

# Wrap the trained model and the encoder in a Writer; creativity=0.1 is
# passed through as given (its exact effect is defined by Writer).
writer = Writer(model, encoder, creativity=0.1)
# Tokenize the corpus with the encoder's tokenizer and keep the first
# `inputtokens` tokens as the seed.
tokens = encoder.tokenizer.transform(corpus)
seed = tokens[:inputtokens]
# Glue the seed tokens back into text (the join requires string tokens).
seedtxt = "".join(seed)
print(seedtxt)
# Join whatever writer.write yields into one string; as the last expression
# of the cell it is shown as the cell output.
''.join(writer.write(seed=seedtxt))