In [1]:
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM
from keras.layers import Dropout
from keras.optimizers import RMSprop
from keras_tqdm import TQDMNotebookCallback
from keras.callbacks import ModelCheckpoint

import numpy as np
import random
import sys
import re

Using TensorFlow backend.


In [2]:
path = "corpus.txt"
# Read the corpus through a context manager so the file handle is closed
# as soon as the contents are in memory (the bare open().read() leaked it).
with open(path, encoding='utf8') as corpus_file:
    text = corpus_file.read().lower()
# Collapse every run of whitespace (newlines, tabs, ...) into a single space.
text = re.sub(r'\s+', ' ', text)
# Drop every character outside the alphabet the model is trained on:
# lowercase letters, digits, a few punctuation marks and the space.
text = re.sub(r'[^a-z0-9!?\-\.,:; ]', '', text)
print('corpus length:', len(text))

# Sorted vocabulary plus the two lookup tables used for one-hot encoding
# (char -> index) and for decoding sampled indices (index -> char).
chars = sorted(set(text))
print('total chars:', len(chars))
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = {i: c for i, c in enumerate(chars)}

corpus length: 2885179
total chars: 44


In [3]:
# Slice the corpus into overlapping windows of `maxlen` characters, taking a
# new window every `step` characters; the character that immediately follows
# each window is kept as its prediction target.
maxlen = 64
step = 3
window_starts = range(0, len(text) - maxlen, step)
sentences = [text[start : start + maxlen] for start in window_starts]
next_chars = [text[start + maxlen] for start in window_starts]
print('nb sequences:', len(sentences))

nb sequences: 961705


In [4]:
print('Vectorization...')
# One-hot encode the training data:
#   X[i, t, k] is True when character t of window i has vocabulary index k
#   y[i, k]    is True when the character following window i has index k
# The builtin `bool` is used as dtype: the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
X = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

Vectorization...


In [5]:
# Character-level language model: two stacked 512-unit LSTM layers, each
# followed by 20% dropout, feeding a softmax over the vocabulary.
# The first LSTM returns the full sequence so the second one can consume it.
model = Sequential([
    LSTM(512, return_sequences=True, input_shape=(maxlen, len(chars))),
    Dropout(0.2),
    LSTM(512),
    Dropout(0.2),
    Dense(len(chars)),
    Activation('softmax'),
])
model.summary()

model.compile(loss='categorical_crossentropy', optimizer='adam')

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
lstm_1 (LSTM)                    (None, 64, 512)       1140736     lstm_input_1[0][0]               
____________________________________________________________________________________________________
dropout_1 (Dropout)              (None, 64, 512)       0           lstm_1[0][0]                     
____________________________________________________________________________________________________
lstm_2 (LSTM)                    (None, 512)           2099200     dropout_1[0][0]                  
____________________________________________________________________________________________________
dropout_2 (Dropout)              (None, 512)           0           lstm_2[0][0]                     
___________________________________________________________________________________________

In [13]:
def sample(preds, temperature=1.0):
    """Sample an index from a probability array after temperature rescaling.

    Args:
        preds: 1-D array-like of probabilities (e.g. a softmax output).
        temperature: rescaling factor applied to the log-probabilities.
            Values below 1 sharpen the distribution, values above 1 flatten
            it. A temperature of exactly 0 is treated as greedy decoding.

    Returns:
        The sampled index into `preds` (a NumPy integer).
    """
    preds = np.asarray(preds).astype('float64')
    if temperature == 0:
        # Limit case of the rescaling: all mass on the most likely entry.
        # Avoids the division by zero the formula below would hit.
        return np.argmax(preds)
    # Zero probabilities legitimately map to -inf here; silence the warning.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature
    # Shift by the max before exponentiating: softmax is invariant to the
    # shift, and it keeps small temperatures from underflowing every term
    # to 0 (which would produce 0/0 = NaN probabilities).
    logits = logits - np.max(logits)
    exp_preds = np.exp(logits)
    preds = exp_preds / np.sum(exp_preds)
    # Draw a single one-hot sample and return the position of the 1.
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

def generate(length=400, diversities=(0.2, 0.5, 1.0)):
    """Print text sampled from the model, starting from a random corpus seed.

    One seed window of `maxlen` characters is picked at random from `text`
    and reused for every diversity value, so the outputs can be compared.
    Generated characters are streamed to stdout as they are sampled.

    Args:
        length: number of characters to generate per diversity value.
        diversities: iterable of sampling temperatures passed to `sample`.
    """
    start_index = random.randint(0, len(text) - maxlen - 1)
    seed = text[start_index: start_index + maxlen]

    for diversity in diversities:
        print()
        print('----- diversity:', diversity)

        # `sentence` is the sliding input window; it always holds the most
        # recent `maxlen` characters (seed first, then generated text).
        sentence = seed
        print('----- Generating with seed: "' + sentence + '"')
        sys.stdout.write(sentence)

        for _ in range(length):
            # One-hot encode the current window as a batch of size 1.
            x = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(sentence):
                x[0, t, char_indices[char]] = 1.

            preds = model.predict(x, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_char = indices_char[next_index]

            # Slide the window: drop the oldest char, append the new one.
            sentence = sentence[1:] + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()

In [10]:
# Restore a checkpoint written by the ModelCheckpoint callback of the training
# cell (file name pattern 'weights.{epoch:02d}-{val_loss:.2f}.hdf5'); this
# file must already exist on disk for the cell to run.
# NOTE(review): the training cell appears later in the notebook than this one,
# so on a fresh kernel this only works with a checkpoint from an earlier run.
model.load_weights('weights.07-1.30.hdf5')

In [15]:
# Print sampled continuations of a random corpus seed, one per diversity value.
generate()


----- diversity: 0.2
----- Generating with seed: "h higher probability to psychic powers than does naturalism, bec"
h higher probability to psychic powers than does naturalism, because the conscious design is that the substance of the problem with the standard mind that the standard model of the statement of the sun that strange consequences i have a belief that the ai design to the ai design to the distance of the sun that the problem is that the answer is that the standard model of a single distance of the first place to the design that the starting probability of the sub

----- diversity: 0.5
----- Generating with seed: "h higher probability to psychic powers than does naturalism, bec"
h higher probability to psychic powers than does naturalism, because it should be experienced that the standard model of starting the correct answer is a traditional rationalist who should seem to predict the box. and the probability of the experiment is that sure depends on people who point to desig

In [25]:

# Progress is reported through the notebook progress-bar callback, which is
# why fit() itself runs with verbose=0.
progress_bar = TQDMNotebookCallback()

# Write the weights to a new file whenever val_loss improves on the best
# value seen so far; the file name encodes the epoch and the validation loss.
checkpointer = ModelCheckpoint(
    'weights.{epoch:02d}-{val_loss:.2f}.hdf5',
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    save_weights_only=True,
    mode='auto',
)

history = model.fit(
    X, y,
    batch_size=128,
    nb_epoch=12,
    validation_split=0.2,
    verbose=0,
    callbacks=[progress_bar, checkpointer],
)


Epoch 00000: val_loss improved from inf to 1.49448, saving model to weights.00-1.49.hdf5
Epoch 00001: val_loss improved from 1.49448 to 1.38234, saving model to weights.01-1.38.hdf5
Epoch 00002: val_loss improved from 1.38234 to 1.34064, saving model to weights.02-1.34.hdf5
Epoch 00003: val_loss improved from 1.34064 to 1.32300, saving model to weights.03-1.32.hdf5
Epoch 00004: val_loss improved from 1.32300 to 1.30645, saving model to weights.04-1.31.hdf5
Epoch 00005: val_loss improved from 1.30645 to 1.30380, saving model to weights.05-1.30.hdf5
Epoch 00006: val_loss did not improve
Epoch 00007: val_loss improved from 1.30380 to 1.30309, saving model to weights.07-1.30.hdf5
Epoch 00008: val_loss did not improve
Epoch 00009: val_loss did not improve
Epoch 00010: val_loss did not improve
Epoch 00011: val_loss did not improve
