In [1]:
# First practice with text generation, to familiarise myself with it:
# character-level prediction (an LSTM predicts the next character).

import numpy
import sys
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint


def tokenize_words(input):
    """Lowercase *input*, split it into word tokens and drop English stop words.

    Args:
        input: The raw text to clean. (The name shadows the builtin
            ``input``; it is kept so existing keyword callers still work.)

    Returns:
        A single string containing the remaining tokens joined by one space.
        An empty string is returned when no tokens survive the filtering.
    """
    # lowercase everything to standardize it
    text = input.lower()

    # r'\w+' keeps runs of word characters, so punctuation and whitespace
    # are discarded by the tokenizer.
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)

    # Build the stop-word set ONCE. The previous version called
    # stopwords.words('english') inside the filter lambda, i.e. once per
    # token, and then did a linear list scan for every membership test.
    # NOTE(review): presumably requires the NLTK "stopwords" corpus to be
    # downloaded beforehand - confirm in the target environment.
    stop_words = set(stopwords.words('english'))

    return " ".join(token for token in tokens if token not in stop_words)



def main():
    """Train a character-level LSTM on one text file and print generated text.

    The function:
      1. reads ``data/1952.txt`` and cleans it with ``tokenize_words``;
      2. maps every distinct character to an integer and builds sliding
         windows of ``seq_length`` characters, each labelled with the
         character that follows it;
      3. trains a three-layer LSTM, checkpointing the lowest-loss weights
         to ``model_weights_saved.hdf5``;
      4. reloads those weights, picks a random window as a seed and writes
         1000 greedily predicted characters to stdout.

    Raises:
        ValueError: If the processed text is not longer than ``seq_length``,
            in which case no training window can be built.
    """
    # TODO: make this read all files, not just one.
    # Use a context manager so the file handle is closed deterministically.
    with open("data/1952.txt") as corpus_file:
        raw_text = corpus_file.read()
    processed = tokenize_words(raw_text)

    # Sorted so the character <-> integer mapping is stable between runs.
    chars = sorted(set(processed))
    char_to_num = {c: i for i, c in enumerate(chars)}
    num_to_char = {i: c for i, c in enumerate(chars)}

    input_len = len(processed)
    vocab_len = len(chars)
    print('total characters: ', input_len)
    print('total vocab: ', vocab_len)

    seq_length = 100
    if input_len <= seq_length:
        raise ValueError(
            "processed text has %d characters; more than %d are needed to "
            "build at least one training sequence" % (input_len, seq_length)
        )

    # Each sample is seq_length encoded characters; its label is the
    # encoded character immediately after that window.
    x_data = []
    y_data = []
    for i in range(input_len - seq_length):
        in_seq = processed[i:i + seq_length]
        out_char = processed[i + seq_length]
        x_data.append([char_to_num[char] for char in in_seq])
        y_data.append(char_to_num[out_char])

    n_patterns = len(x_data)
    print('total patterns ', n_patterns)

    # Shape (samples, time steps, features), scaled into [0, 1).
    x = numpy.reshape(x_data, (n_patterns, seq_length, 1))
    x = x / float(vocab_len)
    # Pass num_classes explicitly: otherwise the width is inferred from the
    # largest label present in y_data, which is smaller than the vocabulary
    # whenever the highest-numbered character never occurs as a target.
    y = np_utils.to_categorical(y_data, num_classes=vocab_len)

    model = Sequential()
    model.add(LSTM(256, input_shape=(x.shape[1], x.shape[2]), return_sequences=True))
    model.add(Dropout(0.2))
    model.add(LSTM(256, return_sequences=True))
    model.add(Dropout(0.2))
    model.add(LSTM(128))
    model.add(Dropout(0.2))
    model.add(Dense(vocab_len, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam')

    filepath = "model_weights_saved.hdf5"
    checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
    desired_callbacks = [checkpoint]

    # Increase epochs for better results.
    model.fit(x, y, epochs=4, batch_size=256, callbacks=desired_callbacks)

    # Restore the best (lowest-loss) checkpoint written during training.
    # The model is already compiled, so no second compile() is needed.
    model.load_weights(filepath)

    # randint's upper bound is exclusive, so n_patterns lets every pattern
    # (including the last one) be chosen and also works for a single pattern.
    start = numpy.random.randint(0, n_patterns)
    # Copy the seed so the generation loop never mutates the training data.
    pattern = list(x_data[start])
    print("random seed: ")
    print("\"", ''.join([num_to_char[value] for value in pattern]), "\"")

    for _ in range(1000):
        net_input = numpy.reshape(pattern, (1, len(pattern), 1))
        net_input = net_input / float(vocab_len)
        prediction = model.predict(net_input, verbose=0)
        # Greedy decoding: always take the most probable next character.
        index = int(numpy.argmax(prediction))

        sys.stdout.write(num_to_char[index])

        # Slide the window: append the prediction, drop the oldest character.
        pattern.append(index)
        pattern = pattern[1:]

    # Make sure the generated text is actually emitted before returning.
    sys.stdout.flush()

    
# Run the training/generation pipeline only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()


Using TensorFlow backend.
W0719 09:28:04.939786   744 deprecation_wrapper.py:119] From C:\Users\annah\Anaconda4\lib\site-packages\keras\backend\tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0719 09:28:05.010596   744 deprecation_wrapper.py:119] From C:\Users\annah\Anaconda4\lib\site-packages\keras\backend\tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0719 09:28:05.030542   744 deprecation_wrapper.py:119] From C:\Users\annah\Anaconda4\lib\site-packages\keras\backend\tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.



total characters:  2150
total vocab:  26
total patterns  2050


W0719 09:28:05.822425   744 deprecation_wrapper.py:119] From C:\Users\annah\Anaconda4\lib\site-packages\keras\backend\tensorflow_backend.py:133: The name tf.placeholder_with_default is deprecated. Please use tf.compat.v1.placeholder_with_default instead.

W0719 09:28:05.836389   744 deprecation.py:506] From C:\Users\annah\Anaconda4\lib\site-packages\keras\backend\tensorflow_backend.py:3445: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
W0719 09:28:06.673152   744 deprecation_wrapper.py:119] From C:\Users\annah\Anaconda4\lib\site-packages\keras\optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W0719 09:28:06.720025   744 deprecation_wrapper.py:119] From C:\Users\annah\Anaconda4\lib\site-packages\keras\backend\tensorflow_backend.py:3295: T

Epoch 1/4

Epoch 00001: loss improved from inf to 3.10040, saving model to model_weights_saved.hdf5
Epoch 2/4

Epoch 00002: loss improved from 3.10040 to 2.97121, saving model to model_weights_saved.hdf5
Epoch 3/4

Epoch 00003: loss improved from 2.97121 to 2.95645, saving model to model_weights_saved.hdf5
Epoch 4/4

Epoch 00004: loss improved from 2.95645 to 2.95605, saving model to model_weights_saved.hdf5
random seed: 
" rnal message christmas desire us coronation next june shall dedicate anew service shall presence gre "
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     