# Generating Text with LSTM

## 1. Preparation

In [62]:
import os

import numpy as np
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, Reshape, LSTM, Dropout, Flatten
from keras.callbacks import ModelCheckpoint


# Fix NumPy's global random seed so NumPy-driven randomness in this notebook
# (e.g. the np.random.randint call that picks the generation seed) is
# repeatable across runs.
# NOTE(review): this seeds NumPy only; whether the Keras backend's own
# randomness follows it depends on the backend in use -- confirm.
seed = 7
np.random.seed(seed)


# Settings
# look_back: number of consecutive characters in each input window. It is
# passed to create_dataset() and fixes the input length of the model.
look_back = 100

## 2. Prepare Dataset

In [63]:
# load text
filename = "./data_set/wonderland.txt"
# Read through a context manager so the file handle is closed right away.
with open(filename) as text_file:
    raw_text = text_file.read()
# Lower-case the text so upper/lower variants share one vocabulary entry.
raw_text = raw_text.lower()
print(len(raw_text))

# show alphabets: the sorted list of distinct characters in the text
chars = sorted(set(raw_text))
print(len(chars))
print(chars)

163182
57
['\n', ' ', '!', '"', '$', '%', "'", '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', '@', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [64]:
# Create the lookup tables between characters and integer codes.
# A character's code is its position in the sorted `chars` list.
char_to_int = {c: i for i, c in enumerate(chars)}
int_to_char = {i: c for i, c in enumerate(chars)}
print(char_to_int)

{'\n': 0, '!': 2, ' ': 1, '"': 3, '%': 5, '$': 4, "'": 6, ')': 8, '(': 7, '*': 9, '-': 11, ',': 10, '/': 13, '.': 12, '1': 15, '0': 14, '3': 17, '2': 16, '5': 19, '4': 18, '7': 21, '6': 20, '9': 23, '8': 22, ';': 25, ':': 24, '?': 26, '@': 27, '[': 28, ']': 29, '_': 30, 'a': 31, 'c': 33, 'b': 32, 'e': 35, 'd': 34, 'g': 37, 'f': 36, 'i': 39, 'h': 38, 'k': 41, 'j': 40, 'm': 43, 'l': 42, 'o': 45, 'n': 44, 'q': 47, 'p': 46, 's': 49, 'r': 48, 'u': 51, 't': 50, 'w': 53, 'v': 52, 'y': 55, 'x': 54, 'z': 56}


Create the dataset of inputs and outputs from `raw_text`.

In [65]:
# create dataset function 
def create_dataset(sequence, look_back=1):
    """Slice a sequence into sliding windows and their next-element targets.

    Window ``i`` is ``sequence[i: i + look_back]`` and its target is the
    element that immediately follows it, ``sequence[i + look_back]``.

    Args:
        sequence: indexable, sliceable sequence (e.g. a list of integer
            character codes).
        look_back: number of consecutive elements in each input window.
            Must not be negative.

    Returns:
        A tuple ``(X, Y)`` of NumPy arrays. For a flat sequence of length
        ``n``, ``X`` has shape ``(n - look_back, look_back)`` and ``Y`` has
        shape ``(n - look_back,)``. If the sequence is too short to yield a
        single window, ``X`` has shape ``(0, look_back)`` and ``Y`` has
        shape ``(0,)`` so downstream shape handling keeps working.

    Raises:
        ValueError: if ``look_back`` is negative.
    """
    if look_back < 0:
        raise ValueError("look_back must be non-negative, got %r" % (look_back,))

    n_samples = len(sequence) - look_back
    if n_samples <= 0:
        # np.array([]) would collapse to shape (0,); keep the window axis.
        return np.empty((0, look_back)), np.empty((0,))

    X = [sequence[i: i + look_back] for i in range(n_samples)]
    Y = [sequence[i + look_back] for i in range(n_samples)]
    return np.array(X), np.array(Y)

# create dataset: encode every character as its integer code, then window it
int_text = [char_to_int[c] for c in raw_text]
X, Y = create_dataset(int_text, look_back)
print("%s %s" % (X.shape, Y.shape))

# translate Y to one-hot vectors
# Pass the vocabulary size explicitly: without it the width is inferred from
# the largest code that happens to occur in Y, which could be narrower than
# the vocabulary and would then no longer line up with int_to_char.
Y = np_utils.to_categorical(Y, len(chars))
print(Y.shape)

# get some variables we will use
# len_y_vec: width of the one-hot target, i.e. the number of output classes
len_y_vec = Y.shape[1]

(163082, 100) (163082,)
(163082, 57)


## 2. Build an LSTM Model

In [66]:
def create_lstm_model():
    """Build and compile the character-level LSTM classifier.

    Reads the notebook-level ``look_back`` (input window length) and
    ``len_y_vec`` (number of output classes).

    Returns:
        A compiled Keras ``Sequential`` model that takes inputs of shape
        ``(batch, look_back)`` and outputs a softmax over ``len_y_vec``
        classes.
    """
    model = Sequential()
    # Add a trailing feature axis: each timestep carries one value.
    model.add(Reshape((look_back, 1), input_shape=(look_back, )))
    # Keep the output of every timestep; Flatten feeds them all to the
    # classifier layer.
    model.add(LSTM(256, return_sequences=True))
    model.add(Flatten())
    model.add(Dense(len_y_vec, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    # summary() prints the table itself and returns None, so wrapping it in
    # print() would emit an extra "None" line.
    model.summary()
    return model

lstm = create_lstm_model()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
reshape_11 (Reshape)             (None, 100, 1)        0           reshape_input_8[0][0]            
____________________________________________________________________________________________________
lstm_9 (LSTM)                    (None, 100, 256)      264192      reshape_11[0][0]                 
____________________________________________________________________________________________________
flatten_9 (Flatten)              (None, 25600)         0           lstm_9[0][0]                     
____________________________________________________________________________________________________
dense_11 (Dense)                 (None, 57)            1459257     flatten_9[0][0]                  
Total params: 1723449
_____________________________________________________________________

## 3. Train Model

The network is slow to train (about 300 seconds per epoch on an Nvidia K520 GPU). Because of the slowness and because of our optimization requirements, we will use model checkpointing to record all of the network weights to file each time an improvement in loss is observed at the end of the epoch. We will use the best set of weights (lowest loss) to instantiate our generative model in the next section.

In [68]:
filepath = "./models/c28/best_model.hdf5"
# Make sure the target directory exists before training starts, so the
# checkpoint written at the end of an epoch does not fail on a missing path.
checkpoint_dir = os.path.dirname(filepath)
if not os.path.isdir(checkpoint_dir):
    os.makedirs(checkpoint_dir)

# Write the weights to `filepath` only when the monitored training loss
# improves (lower is better).
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')

lstm.fit(X, Y, nb_epoch=2, batch_size=128, callbacks=[checkpoint], verbose=1)

Epoch 1/2
 12160/163082 [=>............................] - ETA: 1360s - loss: 3.8778

KeyboardInterrupt: 

## 4. Generating Text

In [28]:
# load model
# create_lstm_model() already returns a compiled model (same loss and
# optimizer), so only the saved weights need to be restored afterwards.
# The architecture must match the checkpoint: look_back has to hold the same
# value it had when the weights were trained.
model = create_lstm_model()
model.load_weights(filepath)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
reshape_5 (Reshape)              (None, 2, 1)          0           reshape_input_4[0][0]            
____________________________________________________________________________________________________
lstm_4 (LSTM)                    (None, 2, 256)        264192      reshape_5[0][0]                  
____________________________________________________________________________________________________
flatten_3 (Flatten)              (None, 512)           0           lstm_4[0][0]                     
____________________________________________________________________________________________________
dense_3 (Dense)                  (None, 57)            29241       flatten_3[0][0]                  
Total params: 293433
______________________________________________________________________

In [39]:
# pick a random seed
# np.random.randint excludes its upper bound, so passing len(X) makes every
# window, including the last one, a possible starting point.
start = np.random.randint(0, len(X))
start_x = X[start].reshape(look_back, )
print([int_to_char[i] for i in start_x])  # change back to char

['n', "'"]


In [60]:
# generate text
nb_gen_chars = 1000
x = start_x
gene_text = ''
for i in range(nb_gen_chars):
    # Add a leading batch axis of size 1 for predict().
    input_x = x.reshape(1, len(x), )
    next_int_vec = model.predict(input_x, verbose=0)
    # Greedy decoding: always take the highest-scoring character code.
    next_int = np.argmax(next_int_vec)
    next_char = int_to_char[next_int]
    gene_text += next_char
    # Slide the window: append the prediction and drop the oldest character.
    x = np.append(x, next_int)[1: ]
print("\nDone")


Done


In [61]:
# Show the generated text.
print(gene_text)

t the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the th