In [1]:
# Downloading and parsing the initial text file
import keras
import numpy as np

path = keras.utils.get_file(
    'nietzsche.txt',
    origin='https://s3.amazonaws.com/text-datasets/nietzsche.txt')
text = open(path).read().lower()
print('Corpus length:', len(text))


Using TensorFlow backend.


Downloading data from https://s3.amazonaws.com/text-datasets/nietzsche.txt
('Corpus length:', 600901)


In [2]:
# Vectorizing sequences of characters

maxlen = 60
step = 3
sentences = []
next_chars = []

for i in range(0,len(text) - maxlen, step):
    sentences.append(text[i:i+maxlen])
    next_chars.append(text[i+maxlen])
    
print('Number of sequences:', len(sentences))

chars = sorted(list(set(text)))
print('Unique characters:', len(chars))
char_indices = dict((char, chars.index(char)) for char in chars)

print('Vectoriziation...')
x = np.zeros((len(sentences),maxlen,len(chars)), dtype=np.bool)
y = np.zeros((len(sentences), len(chars)), dtype = np.bool)

for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1
    

('Number of sequences:', 200281)
('Unique characters:', 59)
Vectoriziation...


In [3]:
# Building single-layer LSTM model for next-character prediction
from keras import layers

model = keras.models.Sequential()
model.add(layers.LSTM(128, input_shape=(maxlen, len(chars))))
model.add(layers.Dense(len(chars), activation='softmax'))

In [4]:
# Model compilation configuration
optimizer = keras.optimizers.RMSprop(lr=0.01)
model.compile(loss ='categorical_crossentropy', optimizer = optimizer)

Given a trained model and a seed text snippet, new text can be generated by repeating:
1. Draw from the model a probability distribution for the next character, given the generated text available so far
2. Reweight the distribution to a certain temperature
3. Sample the next character at random according to the reweighted distribution
4. Add the new character at the end of the available text

In [5]:
# Function to sample the next character given the model's predictions
def sample(preds, temperature = 1.0):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds)/temperature
    exp_preds=np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1,preds,1)
    return np.argmax(probas)

In [None]:
# Text-generation loop
import random
import sys

for epoch in range(1,60):
    print('epoch', epoch)
    model.fit(x,y,batch_size=128, epochs=1)
    start_index = random.randint(0,len(text)-maxlen-1)
    generated_text = text[start_index:start_index+maxlen]
    print('--- Generating with seed: "' + generated_text+'"')
    
    for temperature in [0.2,0.5,1.0,1.2]:
        print('--- temperature: ',temperature)
        sys.stdout.write(generated_text)
        
    for i in range(400): #generating 400 characters starting from the seed text
        sampled = np.zeros((1,maxlen, len(chars)))
        for t, char in enumerate(generated_text):
            sampled[0,t,char_indices[char]]=1.
            
        preds = model.predict(sampled,verbose=0)[0]
        next_index=sample(preds,temperature)
        next_char = chars[next_index]
        
        generated_text += next_char
        generated_text = generated_text[1:]
        
        sys.stdout.write(next_char)
                    

('epoch', 1)
Epoch 1/1
--- Generating with seed: ", not because you have deceived me, but because i can
no lon"
('--- temperature: ', 0.2)
, not because you have deceived me, but because i can
no lon('--- temperature: ', 0.5)
, not because you have deceived me, but because i can
no lon('--- temperature: ', 1.0)
, not because you have deceived me, but because i can
no lon('--- temperature: ', 1.2)
, not because you have deceived me, but because i can
no longerure on natesthysser, nou
 frean upon the lerd whearnatine: which restratiea. tond, of bithingses, any
dimal,e.
cintle in the supriywerstind--which longer feopety
toops,
that m speciining in vortematy," bus we nout
a
but hapsmery, and pave ily for at preasimation!,
thailve simpor; wretherin to the knode which last dadiminits,ably i 3adear exility his 
slive be of    napoly,o gon incamposayo, ava('epoch', 2)
Epoch 1/1
--- Generating with seed: "paths ascend to the highest steeps in order to laugh to scor"
('--- temperature: ', 0.2)
p

## Wrapping up

It is possible to generate discrete sequence data by training a model to predict the next tokens, given the precvious tokens.
In the case of text, such a model is called a *language model*. It can be based on either words or characters. Sampling the next token requires balance between adhering to what the model judges likely and introducing randomness.
This can be done by softmax temperature. ! Always experiement with different temperatures to find the right one. 