In [1]:
from __future__ import print_function
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import SimpleRNN, LSTM, GRU
from keras.optimizers import RMSprop
from keras.initializers import RandomNormal, Identity
from keras.datasets import mnist
from keras.utils import np_utils
import numpy as np
import random

import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
# Ask TensorFlow to allocate GPU memory on demand instead of reserving it
# all up front, then register the resulting session with Keras.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
session = tf.Session(config=config)
set_session(session)

Using TensorFlow backend.


In [2]:
def load_text(path):
    """Return the contents of the file at ``path`` decoded as UTF-8.

    The file is opened in binary mode, so line endings are returned exactly
    as stored (no newline translation happens before decoding).

    Args:
        path: location of the text file to read.

    Returns:
        The decoded file contents as a single string.

    Raises:
        UnicodeDecodeError: if the bytes are not valid UTF-8.
    """
    with open(path, 'rb') as handle:
        raw_bytes = handle.read()
    return raw_bytes.decode('utf-8')

In [4]:
# Read the whole corpus, report its size, then keep only the first
# 600000 characters as the working text for the rest of the notebook.
corpus = load_text('FBS.txt')
print('corpus length:', len(corpus))
text = corpus[:600000]

# Peek at the beginning of the working text.
text[:30]

corpus length: 5870769


'《斗破苍穹》天蚕土豆\r\n\r\n严正声明：本书为丫丫小说网(ww'

In [5]:
### Build the character vocabulary and the char <-> id lookup tables
# Every distinct character of `text`, in sorted order; a character's id is
# its position in this list.
chars = sorted(set(text))
print('total chars:', len(chars))
char_indices = {ch: idx for idx, ch in enumerate(chars)}
indices_char = {idx: ch for idx, ch in enumerate(chars)}

total chars: 3132


In [6]:
# Show the first five (character, id) pairs of the vocabulary.
first_pairs = list(char_indices.items())[:5]
print(first_pairs)

[('\t', 0), ('\n', 1), ('\r', 2), (' ', 3), ('!', 4)]


In [7]:
### Cut the text into fixed-length windows
# Each training example is `maxlen` consecutive characters; its label is the
# single character that follows the window. Windows start every `step`
# characters.
maxlen = 20
step = 3
window_starts = range(0, len(text) - maxlen, step)
sentences = [text[start: start + maxlen] for start in window_starts]
next_chars = [text[start + maxlen] for start in window_starts]
print('nb sequences:', len(sentences))

nb sequences: 199994


In [8]:
from copy import deepcopy  # no longer needed by this cell; kept in case other cells use the name

# Train on the first 40000 windows only. Slicing already produces a new list
# and the elements are immutable strings, so no deep copy is required.
# `next_chars` is truncated in lockstep so that sentence i and label i stay
# paired and both lists have the same length.
n_keep = 40000
sentences = sentences[:n_keep]
next_chars = next_chars[:n_keep]
len(sentences)

40000

In [9]:
def Generator(batch_size=128):
    """Yield one-hot encoded (x, y) training batches forever.

    Reads the module-level ``sentences``, ``next_chars``, ``maxlen``,
    ``chars`` and ``char_indices``. Each pass walks over every *full* batch
    of ``batch_size`` consecutive sentences in order; a trailing partial
    batch is skipped. After the last full batch the pass starts over.

    Args:
        batch_size: number of sentences per yielded batch; must be >= 1.

    Yields:
        Tuple ``(x, y)`` of boolean arrays:
        ``x`` has shape ``(batch_size, maxlen, len(chars))`` with
        ``x[j, k, c]`` set when character ``k`` of sentence ``j`` has id ``c``;
        ``y`` has shape ``(batch_size, len(chars))`` with the id of the
        character following sentence ``j`` set in row ``j``.

    Raises:
        ValueError: on the first ``next()`` if ``batch_size`` is < 1 or if
            there are fewer than ``batch_size`` sentences (the loop would
            otherwise spin forever without yielding anything).
    """
    if batch_size < 1:
        raise ValueError('batch_size must be >= 1, got %r' % (batch_size,))

    while True:
        # Include the last full batch: range(n // batch_size), not "- 1".
        n_batches = len(sentences) // batch_size
        if n_batches == 0:
            raise ValueError(
                'need at least batch_size=%d sentences, got %d'
                % (batch_size, len(sentences)))

        for i in range(n_batches):
            lo, hi = i * batch_size, (i + 1) * batch_size
            batch_sentences = sentences[lo:hi]
            batch_labels = next_chars[lo:hi]

            # Plain `bool` instead of the removed `np.bool` alias.
            x = np.zeros((batch_size, maxlen, len(chars)), dtype=bool)
            y = np.zeros((batch_size, len(chars)), dtype=bool)
            for j, sentence in enumerate(batch_sentences):
                for k, char in enumerate(sentence):
                    x[j, k, char_indices[char]] = True
                y[j, char_indices[batch_labels[j]]] = True
            yield x, y

In [10]:
print('Vectorization...')
# One-hot encode every sentence and its following character in one go.
# x[i, t, c] is set when character t of sentence i has id c;
# y[i, c] is set when the character following sentence i has id c.
# NOTE: x holds len(sentences) * maxlen * len(chars) one-byte booleans, so
# it grows quickly with the vocabulary size and the number of sentences.
# The builtin `bool` is used for both arrays; the `np.bool` alias is
# deprecated and no longer exists in some NumPy releases.
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

Vectorization...


In [11]:
# Fill the one-hot arrays x and y from `sentences` / `next_chars`.
# Each write only sets an entry, so running this more than once leaves
# x and y unchanged.
for i, sentence in enumerate(sentences):
    y[i, char_indices[next_chars[i]]] = True
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = True

In [12]:
print('Build model...')
# Single LSTM layer over one-hot character windows, followed by a softmax
# over the vocabulary that predicts the next character.
model = Sequential([
    LSTM(128, input_shape=(maxlen, len(chars))),
    Dense(len(chars)),
    Activation('softmax'),
])

optimizer = RMSprop(lr=0.01)
model.compile(optimizer=optimizer, loss='categorical_crossentropy')

Build model...
Instructions for updating:
Colocations handled automatically by placer.


In [13]:
def sample(preds, diversity=1.0):
    """Draw one index from a probability vector after rescaling it.

    The probabilities are converted to float64, their logarithms are divided
    by ``diversity``, and the result is exponentiated and renormalised; one
    multinomial draw is then taken from that distribution.

    Args:
        preds: array-like of probabilities.
        diversity: divisor applied to the log-probabilities. Values below 1
            sharpen the distribution, values above 1 flatten it.

    Returns:
        The index selected by the draw.
    """
    probs = np.asarray(preds).astype(np.float64)
    scaled_logits = np.log(probs) / diversity
    weights = np.exp(scaled_logits)
    draw = np.random.multinomial(1, weights / np.sum(weights), 1)
    return np.argmax(draw)

In [14]:
def on_epoch_end(model, epoch):
    """Print text generated by ``model`` at several sampling diversities.

    A random window of ``maxlen`` characters from the module-level ``text``
    is used as the seed. For each diversity, 400 characters are generated
    one at a time: the current window is one-hot encoded, the model predicts
    the next-character distribution, ``sample`` picks an index, and the
    window slides forward by one character. The seed plus the generated
    characters are printed.

    Args:
        model: object whose ``predict(x, verbose=0)`` returns one row of
            probabilities per input row.
        epoch: epoch number, used only in the printed header.
    """
    print()
    print('----- Generating text after Epoch: %d -----' % epoch)

    vocab_size = len(chars)
    seed_start = random.randint(0, len(text) - maxlen - 1)
    seed = text[seed_start: seed_start + maxlen]

    for diversity in (0.2, 0.5, 1.0, 1.2):
        print('----- diversity: -----', diversity)
        print('----- Generating with seed: "' + seed + '" -----')

        window = seed        # the characters currently fed to the model
        pieces = [seed]      # seed followed by every generated character
        for _ in range(400):
            # One-hot encode the current window as a batch of one.
            x_pred = np.zeros((1, maxlen, vocab_size))
            for t, char in enumerate(window):
                x_pred[0, t, char_indices[char]] = 1.

            preds = model.predict(x_pred, verbose=0)[0]
            next_char = indices_char[sample(preds, diversity)]

            pieces.append(next_char)
            # Drop the oldest character and append the new one.
            window = window[1:] + next_char

        print()
        print(''.join(pieces))
        print()

In [None]:
# Train one epoch at a time so that sample text can be printed after each.
epochs = 60
batch_size = 128
generator = Generator(batch_size)
steps = len(sentences) // batch_size
for epoch in range(epochs):
    # In-memory alternative: model.fit(x, y, batch_size=128, epochs=1)
    model.fit_generator(generator, steps_per_epoch=steps, epochs=1)
    on_epoch_end(model, epoch)

Instructions for updating:
Use tf.cast instead.
Epoch 1/1