In [2]:
#first practice of text generation - to familiarise myself
#character prediction 

import numpy
import sys
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint


def tokenize_words(input):
    """Lowercase *input*, split it into word tokens and drop English stop words.

    Parameters
    ----------
    input : str
        Raw text to normalise.  (The parameter name shadows the builtin but
        is kept so existing callers that pass it by keyword keep working.)

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    # lowercase everything to standardize it
    text = input.lower()

    # r'\w+' keeps runs of word characters, so punctuation and whitespace
    # are discarded by the tokenizer
    tokens = RegexpTokenizer(r'\w+').tokenize(text)

    # Fetch the stop-word list once and store it as a set: the previous
    # version called stopwords.words('english') for every token and then
    # scanned the resulting list linearly.
    stop_words = set(stopwords.words('english'))

    # keep only the tokens that are not stop words
    return " ".join(token for token in tokens if token not in stop_words)



def main(data_path="data/1952.txt", seq_length=100, epochs=20, batch_size=256,
         n_generate=1000, weights_path="model_weights_saved.hdf5"):
    """Train a character-level LSTM on one text file and print generated text.

    The text is cleaned with ``tokenize_words``, cut into overlapping windows
    of ``seq_length`` characters (each labelled with the character that
    follows it), used to train a three-layer LSTM, and finally a random
    training window is used as the seed for greedy character generation.

    Parameters
    ----------
    data_path : str
        Text file to train on.
    seq_length : int
        Number of characters in each input window.
    epochs : int
        Training epochs; increase for better results.
    batch_size : int
        Mini-batch size passed to ``model.fit``.
    n_generate : int
        Number of characters written to stdout after training.
    weights_path : str
        File the best (lowest training loss) weights are saved to and
        reloaded from before generation.

    Raises
    ------
    ValueError
        If the processed text is not longer than ``seq_length``, in which
        case no training pattern can be built.
    """
    # TODO: read every file in the data directory instead of a single one
    # Use a context manager so the file handle is closed deterministically.
    with open(data_path) as handle:
        raw_text = handle.read()
    processed = tokenize_words(raw_text)

    # Vocabulary: every distinct character, sorted so the mapping is stable.
    chars = sorted(set(processed))
    char_to_num = {c: i for i, c in enumerate(chars)}
    num_to_char = dict(enumerate(chars))

    input_len = len(processed)
    vocab_len = len(chars)
    print('total characters: ', input_len)
    print('total vocab: ', vocab_len)

    if input_len <= seq_length:
        raise ValueError(
            "processed text has %d characters; more than seq_length=%d are "
            "needed to build at least one training pattern"
            % (input_len, seq_length)
        )

    # Encode the text once, then slide a window over it: window i is the
    # input and the character right after it is the target.
    encoded = [char_to_num[char] for char in processed]
    x_data = [encoded[i:i + seq_length] for i in range(input_len - seq_length)]
    y_data = encoded[seq_length:]

    n_patterns = len(x_data)
    print('total patterns ', n_patterns)

    # LSTM input is (samples, time steps, features); scale indices to [0, 1).
    X = numpy.reshape(x_data, (n_patterns, seq_length, 1))
    X = X / float(vocab_len)
    # Fix the number of classes to the vocabulary size so the output layer
    # does not depend on which characters happen to occur as targets.
    y = np_utils.to_categorical(y_data, num_classes=vocab_len)

    model = Sequential()
    model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))
    model.add(Dropout(0.2))
    model.add(LSTM(256, return_sequences=True))
    model.add(Dropout(0.2))
    model.add(LSTM(128))
    model.add(Dropout(0.2))
    model.add(Dense(y.shape[1], activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam')

    # Keep only the weights of the epoch with the lowest training loss.
    checkpoint = ModelCheckpoint(weights_path, monitor='loss', verbose=1,
                                 save_best_only=True, mode='min')
    model.fit(X, y, epochs=epochs, batch_size=batch_size, callbacks=[checkpoint])

    # Restore the best checkpoint; the model is already compiled above, so
    # no second compile is needed before predicting.
    model.load_weights(weights_path)

    # randint's upper bound is exclusive, so len(x_data) makes every pattern
    # (including the last one) eligible and also works for a single pattern.
    start = numpy.random.randint(0, len(x_data))
    # Copy the seed so generation never mutates the training data.
    pattern = list(x_data[start])
    print("random seed: ")
    print("\"", ''.join([num_to_char[value] for value in pattern]), "\"")

    for _ in range(n_generate):
        x = numpy.reshape(pattern, (1, len(pattern), 1))
        x = x / float(vocab_len)
        prediction = model.predict(x, verbose=0)
        # Greedy decoding: always take the most probable next character.
        index = int(numpy.argmax(prediction))
        sys.stdout.write(num_to_char[index])

        # Slide the window: drop the oldest character, append the new one.
        pattern = pattern[1:] + [index]
   
    
    
    #increase epochs for better results

    
# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()


total characters:  2150
total vocab:  26
total patterns  2050
Epoch 1/20

Epoch 00001: loss improved from inf to 3.11444, saving model to model_weights_saved.hdf5
Epoch 2/20

Epoch 00002: loss improved from 3.11444 to 2.99798, saving model to model_weights_saved.hdf5
Epoch 3/20

Epoch 00003: loss improved from 2.99798 to 2.97527, saving model to model_weights_saved.hdf5
Epoch 4/20

Epoch 00004: loss improved from 2.97527 to 2.95593, saving model to model_weights_saved.hdf5
Epoch 5/20

Epoch 00005: loss did not improve from 2.95593
Epoch 6/20

Epoch 00006: loss improved from 2.95593 to 2.94874, saving model to model_weights_saved.hdf5
Epoch 7/20

Epoch 00007: loss improved from 2.94874 to 2.93636, saving model to model_weights_saved.hdf5
Epoch 8/20

Epoch 00008: loss did not improve from 2.93636
Epoch 9/20

Epoch 00009: loss did not improve from 2.93636
Epoch 10/20

Epoch 00010: loss did not improve from 2.93636
Epoch 11/20

Epoch 00011: loss improved from 2.93636 to 2.92338, saving mod