# Text Generation using LSTMs

In [1]:
import os
import urllib.request

import numpy as np

from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils

## Download the data

The best place to access books that are no longer under Copyright is [Project Gutenberg](https://www.gutenberg.org/). Today we recommend using [Alice’s Adventures in Wonderland by Lewis Carroll](https://www.gutenberg.org/files/11/11-0.txt) for consistency. Of course you can experiment with other books as well.

In [2]:
data_url = 'https://www.gutenberg.org/files/219/219-0.txt'
fname = 'heart_of_darkness.txt'

if fname not in os.listdir():
    urllib.request.urlretrieve(data_url, fname)

## Load data and create character to integer mappings

- Open the text file, read the data then convert it to lowercase letters.
- Map each character to a respective number. Keep 2 dictionaries in order to have more easily access to the mappings both ways around.

In [3]:
# Load data
def read_doc(document):
    with open (document, encoding='UTF-8') as f:
        return f.read().lower()

# Characters to integers
def chars(text):

    txt = ''
    
    for word in text:
        for char in word:
            txt += char
    
    char_to_idx = {}
    idx_to_char = {}
    for i, char in enumerate(set(txt)):
        char_to_idx[char] = i + 1
        idx_to_char[i+1] = char
    
    return char_to_idx, idx_to_char


In [4]:
doc_path = os.path.join(os.getcwd(), fname)
text = read_doc(doc_path)

char_to_idx, idx_to_char = chars(text)

## Prepare the data
- We are "thinking" in sequences of 100 characters: 99 characters in the input and 1 in the output.  
E.g. for the sequence *\['h', 'e', 'l', 'l'\]* as input, we will have *\['o'\]* as the expected output.
- Reshape X such that it has the shape expected by a LSTM: \[samples, time steps, features\].
  - samples: number of data points (len(X));
  - time steps: number of time-dependent steps that are in a single data point (100);
  - features: number of variables for the true value in Y (1).
- Scale the values in X to be in \[0, 1\].
- One-hot encode the true values in Y_modified.

In [5]:
# Initialize the input and output with empty lists
seq_x = []
seq_y = []
n_chars = len(text)
for i in range(0, n_chars - 100, 1):
    # Consider sequences of 99 characters starting from i
    input = text[i:i+100]
    # The 100th character is the label
    output = text[i+100]

    # Append to the input the list of ints corresponding to the characters in the current sequence
    seq_x.append([char_to_idx[char] for char in input])
    # Append to the output the int corresponding to the label (as list)
    seq_y.append(char_to_idx[output])
# Re-shape the inputs
X = np.array(seq_x).reshape(len(seq_x), 100, 1)
# Scale the inputs
X_norm = (X.astype('float32') / 49.)
# One-hot encode labels
y = np_utils.to_categorical(seq_y)

## Define the LSTM model

- Instantiate the model: a linear stack of layers.
- First layer: LSTM with 256 memory units, input shape from X_new (1st and 2nd). Make sure that this layer returns sequences, such that the next LSTM layer receives sequences and not just random data.
- Second layer: dropout 20% of the neurons of the previous layer in order to avoid overfitting.

****** 
Optional:
- Third layer: LSTM(256).
- Fourth layer: dropout 20% of the neurons.
******
- Last layer: fully connected with a 'softmax' activation function, and as many neurons as the number of unique characters (the output is one-hot encoded).


Compile the model: categorical_crossentropy, adam.

In [6]:
# Instantiate the model
model = Sequential()
# Add LSTM layer
model.add(LSTM(256, return_sequences=True, input_shape=(X_norm.shape[1], X_norm.shape[2])))
# Add dropout
# model.add(Dropout(0.2))
model.add(Dropout(0.2))
# Add another LSTM layer
model.add(LSTM(256, return_sequences=True))
# Add dropout
model.add(Dropout(0.2))
# Add a Dense layer
model.add(LSTM(256))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))
# Compile the model
model.compile(optimizer='adam', loss = 'categorical_crossentropy', metrics=['accuracy'])

## Train the model and generate characters

Fit the model for over 100 epochs as the batch size is 30 (ideally). In this case, given the time constraints, we are going to use 5 epochs and a batch size of 128. 

Fix a random seed and start generating characters.  The prediction from the model gives out the character encoding of the predicted character, it is then decoded back to the character value and appended to the pattern.  

After enough training time it is going to look like something.

In [7]:
from keras.callbacks import ModelCheckpoint

filepath="weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=2, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

# fit the model
model.fit(X_norm, y, epochs=50, batch_size=30)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x1f64dc830d0>

In [9]:
# pick a random seed
begin = np.random.randint(0, len(seq_x)-1)
pattern = seq_x[begin]
print("\"", ''.join([idx_to_char[value] for value in pattern]), "\"")

for i in range(1000):
	
	#predict
	x = np.reshape(pattern, (1, len(pattern), 1))
	x = x / float(len(char_to_idx))
	prediction = model.predict(x, verbose=0)
	index = np.argmax(prediction)
	result = idx_to_char[index]
	seq_in = [idx_to_char[value] for value in pattern]
	sys.stdout.write(result)
	pattern.append(index)
	pattern = pattern[1:len(pattern)]
	print(result)
print("\nDone.")

" lies within the shell of a cracked nut. but marlow was not typical (if
his propensity to spin yarns  "
oo
ff
  
tt
hh
ee
  
ss
tt
aa
tt
ii
oo
nn
  
ww
aa
ss
  
aa
  
ll
ii
tt
tt
ll
ee
  
tt
aa
cc
kk
  
oo
ff
  
tt
hh
ee
  
ss
tt
aa
tt
ii
oo
nn
  
ww
aa
ss
  
aa
  
ll
ii
tt
tt
ll
ee
  
tt
aa
cc
kk
  
oo
ff
  
tt
hh
ee
  
ss
tt
aa
tt
ii
oo
nn
  
ww
aa
ss
  
aa
  
ll
ii
tt
tt
ll
ee
  
tt
aa
cc
kk
  
oo
ff
  
tt
hh
ee
  
ss
tt
aa
tt
ii
oo
nn
  
ww
aa
ss
  
aa
  
ll
ii
tt
tt
ll
ee
  
tt
aa
cc
kk
  
oo
ff
  
tt
hh
ee
  
ss
tt
aa
tt
ii
oo
nn
  
ww
aa
ss
  
aa
  
ll
ii
tt
tt
ll
ee
  
tt
aa
cc
kk
  
oo
ff
  
tt
hh
ee
  
ss
tt
aa
tt
ii
oo
nn
  
ww
aa
ss
  
aa
  
ll
ii
tt
tt
ll
ee
  
tt
aa
cc
kk
  
oo
ff
  
tt
hh
ee
  
ss
tt
aa
tt
ii
oo
nn
  
ww
aa
ss
  
aa
  
ll
ii
tt
tt
ll
ee
  
tt
aa
cc
kk
  
oo
ff
  
tt
hh
ee
  
ss
tt
aa
tt
ii
oo
nn
  
ww
aa
ss
  
aa
  
ll
ii
tt
tt
ll
ee
  
tt
aa
cc
kk
  
oo
ff
  
tt
hh
ee
  
ss
tt
aa
tt
ii
oo
nn
  
ww
aa
ss
  
aa
  
ll
ii
tt
tt
ll
ee
  
tt
aa
cc
kk
  
oo
f

# Bonus: Words as features

Code here:

https://machinelearningmastery.com/how-to-develop-a-word-level-neural-language-model-in-keras/ 