In [1]:
'''Example script to generate text from Nietzsche's writings.
At least 20 epochs are required before the generated text
starts sounding coherent.
It is recommended to run this script on GPU, as recurrent
networks are quite computationally intensive.
If you try this script on new data, make sure your corpus
has at least ~100k characters. ~1M is better.
'''

from __future__ import print_function
from keras.callbacks import LambdaCallback
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import numpy as np
import random
import sys
import io

Using TensorFlow backend.


In [2]:
path = get_file('nietzsche.txt', origin='https://s3.amazonaws.com/text-datasets/nietzsche.txt')
with io.open(path, encoding='utf-8') as f:
    text = f.read().lower()
print('corpus length:', len(text))

corpus length: 600893


In [3]:
import re
from collections import Counter
from random import random, randint

In [4]:
text = re.sub('[ ]+', ' ', text.lower())
text = re.sub('[\n]+', '\n', text.lower())
lines = text.splitlines()
lines_split = [re.split(' ', l) for l in lines]
common_words = [w for (w, _) in Counter(re.split('\s+', text)).most_common()[:1000]]

def random_word(avoid):
    chosen = avoid
    while chosen == avoid:
        chosen = common_words[int(random()*len(common_words))]
    return chosen


In [5]:
def modify_split_lines(ratio):
    return [[random_word(avoid=w) if random() < ratio and w in common_words else w
            for w in l] for l in lines_split[:100]]

ratio = 0.1

modified_split_lines = modify_split_lines(ratio)

modified_lines = [' '.join(l) for l in modified_split_lines]

# Change probability ratio here
modified_text = '\n'.join(modified_lines)

print(text[:1000])
print('-------')
print(modified_text[:1000])



preface
supposing that truth is a woman--what then? is there not ground
for suspecting that all philosophers, in so far as they have been
dogmatists, have failed to understand women--that the terrible
seriousness and clumsy importunity with which they have usually paid
their addresses to truth, have been unskilled and unseemly methods for
winning a woman? certainly she has never allowed herself to be won; and
at present every kind of dogma stands with sad and discouraged mien--if,
indeed, it stands at all! for there are scoffers who maintain that it
has fallen, that all dogma lies on the ground--nay more, that it is at
its last gasp. but to speak seriously, there are good grounds for hoping
that all dogmatizing in philosophy, whatever solemn, whatever conclusive
and decided airs it has assumed, may have been only a noble puerilism
and tyronism; and probably the time is at hand when it will be once
and again understood what has actually sufficed for the basis of such
imposing and absolu

In [6]:
chars = sorted(list(set(text)))
print('total chars:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

total chars: 57


In [7]:
# cut the text in semi-redundant sequences of maxlen characters
maxlen = 40
step = 3
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])
print('nb sequences:', len(sentences))

nb sequences: 199646


In [8]:
print('Vectorization...')
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

Vectorization...


In [9]:
# build the model: a single LSTM
print('Build model...')
model = Sequential()
model.add(LSTM(128, input_shape=(maxlen, len(chars))))
model.add(Dense(len(chars)))
model.add(Activation('softmax'))

optimizer = RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

Build model...


In [11]:
def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)


def on_epoch_end(epoch, logs):
    # Function invoked at end of each epoch. Prints generated text.
    print()
    print('----- Generating text after Epoch: %d' % epoch)

    start_index = randint(0, len(text) - maxlen - 1)
    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print('----- diversity:', diversity)

        generated = ''
        sentence = text[start_index: start_index + maxlen]
        generated += sentence
        print('----- Generating with seed: "' + sentence + '"')
        sys.stdout.write(generated)

        for i in range(400):
            x_pred = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(sentence):
                x_pred[0, t, char_indices[char]] = 1.

            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_char = indices_char[next_index]

            generated += next_char
            sentence = sentence[1:] + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()

In [22]:
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

from keras.models import load_model
model = load_model('nietzsche.h5')
model.fit(x, y,
          batch_size=128,
          epochs=60,
          callbacks=[print_callback])

Epoch 1/60
  5760/199646 [..............................] - ETA: 7:13 - loss: 3.2481 

KeyboardInterrupt: 

In [16]:
model.save('nietzsche.h5')


In [18]:
def previous_sentence_of_line(number, use_modified_lines = True):
    sentence = ''
    prepended = number - 1
    ls = modified_lines if use_modified_lines else lines
    while len(sentence) < maxlen:
        sentence = modified_lines[prepended] + '\n' + sentence
        prepended = prepended - 1
    return sentence[-maxlen:]

In [19]:
def compute_entropy_of_line(number, use_modified_lines):
    sentence = previous_sentence_of_line(2, use_modified_lines)
    ls = modified_lines if use_modified_lines else lines
    line = ls[number]
    i_next_char = 0
    err = 0
    for i, c in enumerate(line):
        x_pred = np.zeros((1, maxlen, len(chars)))
        for t, char in enumerate(sentence):
            x_pred[0, t, char_indices[char]] = 1
        y_pred = np.zeros((1, len(chars)), dtype=np.bool)
        next_char = line[i + 1] if i < len(line) - 1 else '\n' 
        y_pred[0, char_indices[next_char]] = 1.
        err += model.evaluate(x_pred, y_pred, verbose=0)
        sentence = sentence[1:] + next_char
    return err / len(line)

In [20]:
for i in range(2, 10):
    if lines[i] == modified_lines[i]:
        continue
    n = compute_entropy_of_line(i, use_modified_lines=False)
    m = compute_entropy_of_line(i, use_modified_lines=True)
    print(i, lines[i], '---',  modified_lines[i])
    print(n, m)

6 winning a woman? certainly she has never allowed herself to be won; and --- winning a woman? certainly she has never allowed herself to superiority won; and
3.11738399385 3.0840727061
7 at present every kind of dogma stands with sad and discouraged mien--if, --- at had every kind of dogma stands us, sad and discouraged mien--if,
3.27161880996 3.35718474815
8 indeed, it stands at all! for there are scoffers who maintain that it --- indeed, it stands at all! for there looks scoffers who maintain that it
2.97823480247 3.0342672207
