In [None]:
import numpy as np
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM

# Separator line printed before and after blocks of verbose diagnostic output.
BORDER = "==============================================================="

In [None]:
def load_data(filename):
    '''
    Read a text file and return its content as a list of lines.

    Leading and trailing whitespace of the whole file is stripped before
    splitting, so the result never starts or ends with blank lines; blank
    lines in the middle of the file are kept as empty strings.

    Input: filename, path of the text file to read

    Output: list of strings, one per line, without newline characters
    '''
    # The context manager closes the file handle even if read() raises.
    with open(filename) as f:
        raw_text = f.read()
    return raw_text.strip().split('\n')

In [None]:
def remove_int(text):
    '''
    Strip every digit character out of each string in text.

    Input: iterable of strings

    Output: new list with one entry per input string, each being the input
            string without the characters for which str.isdigit() is true.
            All other characters (letters, whitespace, punctuation) are kept.
    '''
    def strip_digits(line):
        # Keep only the characters that are not digits.
        return ''.join(ch for ch in line if not ch.isdigit())

    return [strip_digits(line) for line in text]

In [None]:
def remove_empty(text):
    '''
    Drop the entries of text that are exactly the empty string.

    Input: iterable of strings

    Output: new list holding the remaining entries in their original order.
            Whitespace-only strings are not considered empty and are kept.
    '''
    return [entry for entry in text if entry != '']

In [None]:
def process_data_RNN(text, verbose=0, seq_length=40):
    '''
    Create fixed length, one-hot encoded character sequences from the corpus.

    Digits and empty lines are removed from text, the remaining lines are
    joined with newlines, and a window of seq_length characters is slid over
    the result one character at a time. Every window is an input sequence
    and the character right after it is the target.

    Input:
        text: corpus as a list of lines (strings)
        verbose: 1 prints the processed text and a summary; any other value
                 only prints the progress message
        seq_length: number of characters per input sequence (default 40)

    Output: X, y, dataX, dataY, int_to_char, char_to_int
        X: array of shape (n_patterns, seq_length, n_vocab), one-hot inputs
        y: array of shape (n_patterns, n_vocab), one-hot next character
        dataX: list of input sequences as lists of character indices
        dataY: list of next-character indices, aligned with dataX
        int_to_char: dict mapping index -> character
        char_to_int: dict mapping character -> index

    If the processed text has seq_length characters or fewer, no pattern can
    be built and X, y, dataX and dataY are empty.

    Raises: ValueError if seq_length is smaller than 1
    '''
    if seq_length < 1:
        raise ValueError("seq_length must be at least 1")

    print("Processing datafile....")

    new_text_list = remove_int(text)
    new_text_list = remove_empty(new_text_list)
    new_text = '\n'.join(new_text_list)

    if verbose == 1:
        print(BORDER)
        print("Processed text")
        print(new_text)
        print(BORDER)

    # create mapping of unique chars to integers, and a reverse mapping
    chars = sorted(set(new_text))
    char_to_int = {c: i for i, c in enumerate(chars)}
    int_to_char = {i: c for i, c in enumerate(chars)}

    # summarize the loaded data
    n_chars = len(new_text)
    n_vocab = len(chars)

    # Encode the text once, then slice it into overlapping windows. Every
    # slice is a fresh list, so the entries of dataX do not share storage.
    encoded = [char_to_int[char] for char in new_text]
    n_patterns = max(n_chars - seq_length, 0)
    dataX = [encoded[i:i + seq_length] for i in range(n_patterns)]
    dataY = encoded[seq_length:]

    if verbose == 1:
        print(BORDER)
        print("Processed Text Summary")
        print("Total Characters: ", n_chars)
        print("Total Vocab: ", n_vocab)
        print("Total Patterns: ", n_patterns)
        print(BORDER)

    X = np.zeros((n_patterns, seq_length, n_vocab))
    y = np.zeros((n_patterns, n_vocab))
    if n_patterns:
        # One-hot encode with integer-array indexing: the three index arrays
        # broadcast to (n_patterns, seq_length), setting X[i, t, dataX[i][t]].
        rows = np.arange(n_patterns)
        X[rows[:, None], np.arange(seq_length), np.asarray(dataX)] = 1
        y[rows, np.asarray(dataY)] = 1

    return X, y, dataX, dataY, int_to_char, char_to_int

In [None]:
def train_LSTM(X, y, verbose=0, epochs=50, batch_size=128, n_units=128):
    '''
    Build a single-layer LSTM followed by a softmax layer and fit it on X, y.

    Input:
        X : one-hot encoded input sequences; X.shape[1] (sequence length)
            and X.shape[2] (vocabulary size) define the LSTM input shape
        y : one-hot encoding of the character coming after each sequence;
            y.shape[1] sets the size of the softmax output layer
        verbose : verbosity level forwarded to model.fit
        epochs : number of training epochs (default 50)
        batch_size : number of sequences per gradient update (default 128)
        n_units : number of units in the LSTM layer (default 128)

    Output: the fitted model

    To train on a subset of the sequences, slice before calling, e.g.
    train_LSTM(X[0::10], y[0::10]).
    '''

    print("Building Model...")

    # define the LSTM model
    model = Sequential()
    model.add(LSTM(n_units, input_shape=(X.shape[1], X.shape[2])))
    model.add(Dense(y.shape[1], activation='softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer='RMSprop', metrics=['accuracy'])

    # fit the model
    model.fit(X, y, epochs=epochs, batch_size=batch_size, verbose=verbose)
    return model



In [None]:
def sample(preds, temperature=1.0):
    '''
    Draw one index at random from a probability array.

    The probabilities are re-weighted by taking their logarithm, dividing by
    temperature, exponentiating and renormalising, so a temperature below 1
    sharpens the distribution and a temperature above 1 flattens it.

    Input:
        preds: array-like of probabilities
        temperature: re-weighting factor (default 1.0)

    Output: the sampled index
    '''
    probabilities = np.asarray(preds).astype('float64')
    reweighted = np.exp(np.log(probabilities) / temperature)
    distribution = reweighted / np.sum(reweighted)
    # A single multinomial trial yields a one-hot row; argmax locates the hit.
    draw = np.random.multinomial(1, distribution, 1)
    return np.argmax(draw)

In [None]:
def generate_text(model, dataX, int_to_char, verbose=0, n_chars=600,
                  diversity=0.2):
    '''
    Generate text character by character with the trained model.

    A random sequence from dataX is used as the seed. At every step the
    current window is one-hot encoded, passed to model.predict, and the next
    character is drawn from the prediction with sample(); the window then
    slides forward by one character.

    Input:
        model: the LSTM model that we trained
        dataX: list of sequences of character indices (left unmodified)
        int_to_char: a dictionary matching integer to specific character
        verbose: 1 prints the seed and every generation step; the value is
                 also forwarded to model.predict
        n_chars: number of characters to generate (default 600)
        diversity: temperature passed to sample() (default 0.2)

    Output: the seed followed by the generated characters, as one string

    Raises: ValueError if dataX is empty
    '''
    if len(dataX) == 0:
        raise ValueError("dataX must contain at least one seed sequence")

    print("Generating text...")

    n_vocab = len(int_to_char)

    # pick a random seed; the upper bound of randint is exclusive, so using
    # len(dataX) makes every sequence, including the last one, eligible
    start = np.random.randint(0, len(dataX))
    # copy the seed so the sliding window never modifies the caller's dataX
    pattern = list(dataX[start])
    seq = [int_to_char[value] for value in pattern]
    size = len(pattern)

    if verbose == 1:
        print("Seed: ", ''.join(seq))

    # generate characters
    for i in range(n_chars):
        # One-hot encode the current window as input of the RNN
        x = np.zeros((1, size, n_vocab))
        for t, char in enumerate(pattern):
            x[0, t, char] = 1

        # Make prediction using trained model
        prediction = model.predict(x, verbose=verbose)[0]
        # plain int keeps the window a list of builtin integers
        index = int(sample(prediction, diversity))

        # Convert prediction to character
        result = int_to_char[index]

        # Add prediction to pattern and trim it back to the window size
        pattern.append(index)
        pattern = pattern[1:1 + size]

        # Add result to seq
        seq.append(result)

        if verbose == 1:
            print(BORDER)
            print("character ", i)
            print("selected char: ", result)
            print("new pattern: ",
                  ''.join([int_to_char[value] for value in pattern]))
            print(BORDER)

    # Return seq as string
    return ''.join(seq)

In [None]:
def save_textfile(filename, text):
    '''
    Given filename and text, save text in file

    Any existing content of the file is replaced. Missing parent directories
    of filename are created before writing.

    Input: filename and text as string

    Output: 0 once the text has been written
    '''
    import os  # local import: the notebook's import cell does not provide os

    print("Saving generated text...")

    parent = os.path.dirname(filename)
    if parent:
        os.makedirs(parent, exist_ok=True)

    # The context manager flushes and closes the file even if write() raises.
    with open(filename, 'w') as f:
        f.write(text)
    return 0

In [None]:
# Input corpus and destination of the generated text.
file = 'data/shakespeare.txt'
save = 'generated/shakespeare_2.txt'
# Verbosity flag handed to train_LSTM and generate_text in the later cells.
verbose = 1

# Read the corpus as a list of lines, then build the one-hot training data.
# verbose is not forwarded here, so process_data_RNN runs with its default 0.
text_list = load_data(file)
(X, y, dataX, dataY, int_to_char, char_to_int) = process_data_RNN(text_list)

In [None]:
# Fit the LSTM on the one-hot encoded sequences produced above.
model = train_LSTM(X, y, verbose=verbose)

In [None]:
# Generate text from the trained model, write it to the path in `save`,
# and print it.
generated = generate_text(model, dataX, int_to_char, verbose=verbose)
save_textfile(save, generated)
print(generated)