In [6]:
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM
from keras.layers import Dropout
from keras.optimizers import RMSprop
from keras_tqdm import TQDMNotebookCallback
from keras.callbacks import ModelCheckpoint

import numpy as np
import random
import sys
import re

In [7]:
# Load the corpus, lowercase it, collapse every whitespace run to a single
# space, and drop any character outside a-z, 0-9, "!?-.,:;" and space.
path = "corpus.txt"
with open(path) as corpus_file:  # context manager closes the handle promptly
    text = corpus_file.read().lower()
text = re.sub(r'\s+', ' ', text)
text = re.sub(r'[^a-z0-9!?\-\.,:; ]', '', text)
print('corpus length:', len(text))

# Sorted character vocabulary plus lookup tables in both directions
# (character -> index and index -> character).
chars = sorted(set(text))
print('total chars:', len(chars))
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = {i: c for i, c in enumerate(chars)}

corpus length: 2885179
total chars: 44


In [14]:
# Slide a window of `maxlen` characters across the text, moving `step`
# characters at a time. Each window becomes a training sequence and the
# character immediately after it becomes that sequence's target.
maxlen = 32
step = 3
window_starts = range(0, len(text) - maxlen, step)
sentences = [text[start : start + maxlen] for start in window_starts]
next_chars = [text[start + maxlen] for start in window_starts]
print('nb sequences:', len(sentences))

nb sequences: 961716


In [15]:
print('Vectorization...')
# One-hot encode the data:
#   X[i, t, k] is set when character t of sequence i has vocabulary index k
#   y[i, k]    is set when the character following sequence i has index k
# The builtin `bool` is used as dtype: the `np.bool` alias it replaces was
# deprecated in NumPy 1.20 and removed in 1.24.
X = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

Vectorization...


In [26]:
# Network: two stacked 512-unit LSTM layers, each followed by dropout at
# rate 0.2, then a dense layer with one unit per vocabulary character and a
# softmax activation. The first LSTM uses return_sequences=True so that the
# second LSTM receives an output for every timestep.
n_chars = len(chars)
layer_stack = [
    LSTM(512, return_sequences=True, input_shape=(maxlen, n_chars)),
    Dropout(0.2),
    LSTM(512),
    Dropout(0.2),
    Dense(n_chars),
    Activation('softmax'),
]

model = Sequential()
for layer in layer_stack:
    model.add(layer)
model.summary()

model.compile(loss='categorical_crossentropy', optimizer='adam')

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
lstm_12 (LSTM)                   (None, 32, 512)       1140736     lstm_input_9[0][0]               
____________________________________________________________________________________________________
dropout_9 (Dropout)              (None, 32, 512)       0           lstm_12[0][0]                    
____________________________________________________________________________________________________
lstm_13 (LSTM)                   (None, 512)           2099200     dropout_9[0][0]                  
____________________________________________________________________________________________________
dropout_10 (Dropout)             (None, 512)           0           lstm_13[0][0]                    
___________________________________________________________________________________________

In [22]:
def sample(preds, temperature=1.0):
    """Sample an index from a probability array, reshaped by ``temperature``.

    The probabilities are converted to log space, divided by ``temperature``,
    re-normalised with a softmax, and a single index is drawn from the result.

    Args:
        preds: array-like of probabilities (e.g. one softmax output row).
        temperature: values below 1 sharpen the distribution towards the most
            likely index, values above 1 flatten it. A temperature of 0 or
            less picks the most likely index deterministically.

    Returns:
        The selected index into ``preds``.
    """
    preds = np.asarray(preds).astype('float64')
    if temperature <= 0:
        # The limit temperature -> 0 is greedy selection; handling it here
        # avoids a division by zero that would yield NaN probabilities.
        return np.argmax(preds)
    # Zero probabilities become -inf logits, which exp() maps back to 0, so
    # the divide-by-zero warning from log(0) is expected and silenced.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature
    # Shift by the maximum so the largest term is exp(0) == 1. Without the
    # shift, small temperatures underflow every term to 0 and the
    # normalisation below becomes 0 / 0.
    exp_preds = np.exp(logits - np.max(logits))
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

def generate(diversities=(0.2, 0.5, 1.0), length=120):
    """Print text generated by ``model`` from a random seed taken from ``text``.

    One seed of ``maxlen`` characters is chosen at random from the corpus and
    reused for every diversity value, so the printed samples differ only in
    sampling temperature. Output is streamed to stdout character by character.

    Relies on the module-level names ``text``, ``maxlen``, ``chars``,
    ``char_indices``, ``indices_char``, ``model`` and ``sample``.

    Args:
        diversities: iterable of temperatures passed to ``sample``; one block
            of text is generated per value.
        length: number of characters to generate after the seed.
    """
    start_index = random.randint(0, len(text) - maxlen - 1)
    seed = text[start_index: start_index + maxlen]

    for diversity in diversities:
        print()
        print('----- diversity:', diversity)

        sentence = seed
        print('----- Generating with seed: "' + sentence + '"')
        sys.stdout.write(sentence)

        for _ in range(length):
            # One-hot encode the current window as a batch of size one.
            x = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(sentence):
                x[0, t, char_indices[char]] = 1.

            preds = model.predict(x, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_char = indices_char[next_index]

            # Slide the window: drop the oldest character, append the new one.
            sentence = sentence[1:] + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()

In [25]:
# Print sample text from the model in its current state, once for each of
# generate()'s diversity settings.
generate()


----- diversity: 0.2
----- Generating with seed: "yet to believe that any such thi"
yet to believe that any such thin the the cathe the the the conce the the the the the se the the the the the sere the the the the se cere the the the th

----- diversity: 0.5
----- Generating with seed: "yet to believe that any such thi"
yet to believe that any such thin the soment an the son ind fithe ce e coris en the gare comancirate in the in on in the the sof the certhenl ghert as u

----- diversity: 1.0
----- Generating with seed: "yet to believe that any such thi"
yet to believe that any such this bitat of estet ho sag eng riice the jsse sce seve rezcencescoverdtp gitilit 1vipekaenre, soqrereyill if of roonand tag


In [None]:
# Train on the one-hot data for 15 epochs in batches of 128, with
# validation_split=0.2. Keras' own console logging is turned off (verbose=0);
# progress is reported by the tqdm notebook callback instead.
progress_bar = TQDMNotebookCallback()

# Write a weights-only checkpoint whenever the monitored val_loss improves;
# the epoch number and val_loss are embedded in the file name.
checkpointer = ModelCheckpoint(
    'weights.{epoch:02d}-{val_loss:.2f}.hdf5',
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    save_weights_only=True,
    mode='auto',
)

history = model.fit(
    X,
    y,
    batch_size=128,
    nb_epoch=15,
    validation_split=0.2,
    verbose=0,
    callbacks=[progress_bar, checkpointer],
)


In [10]:
# Save the model to 'model.h5' in the current working directory.
model.save('model.h5')