In [1]:
import datetime
import string

import h5py
import numpy as np
from keras.layers import Dense
from keras.layers import LSTM
from keras.models import Sequential
from keras.models import load_model
from keras.optimizers import Adam
from keras.utils import np_utils

BORDER = "==============================================================="

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
def load_data(filename):
    '''
    Read a text file and return its content as a list of lines.

    Leading and trailing whitespace of the whole file is stripped before the
    text is split on newline characters, so blank lines in the middle of the
    file are kept as empty strings.

    Input: filename -- path of the text file to read

    Output: list of strings, one per line of the stripped text
    '''
    # The context manager closes the handle even when read() raises.
    with open(filename) as text_file:
        raw_text = text_file.read()
    return raw_text.strip().split('\n')

In [3]:
def remove_int(text):
    '''
    Strip every digit character out of each line.

    Input: text -- iterable of strings

    Output: new list with one entry per input line, holding that line
    without the characters for which str.isdigit() is true
    '''
    return [
        ''.join(char for char in line if not char.isdigit())
        for line in text
    ]


def remove_empty(text):
    '''
    Drop the lines that consist solely of whitespace characters.

    A zero-length string is kept, because ''.isspace() is False; only lines
    holding at least one character, all of them whitespace, are removed.

    Input: text -- iterable of strings

    Output: new list with the remaining lines in their original order
    '''
    return [line for line in text if not line.isspace()]


def lowercase(text):
    '''
    Convert every line to lowercase.

    Input: text -- iterable of strings

    Output: new list with str.lower() applied to each line
    '''
    return [line.lower() for line in text]


def remove_punctuation(text):
    '''
    Remove ASCII punctuation from every line.

    The characters removed are exactly those in string.punctuation; case,
    digits and whitespace are left untouched.

    Input: text -- iterable of lines (strings, or iterables of string pieces
    that are joined into one string first)

    Output: new list with one punctuation-free string per input line
    '''
    # The deletion table is identical for every line, so build it only once.
    strip_table = str.maketrans('', '', string.punctuation)
    return [''.join(line).translate(strip_table) for line in text]

In [4]:
def separate_sonnets(text):
    '''
    Split the corpus string into sonnets.

    Two sonnets are separated wherever three newline characters follow each
    other directly.

    Input: text -- the whole corpus as one string

    Output: list of sonnet strings
    '''
    sonnet_break = '\n' * 3
    return text.split(sonnet_break)

def separate_paragraphs(text):
    '''
    Split the corpus string into paragraphs, one per line.

    Input: text -- the whole corpus as one string

    Output: list with the pieces of text found between newline characters
    '''
    paragraph_break = '\n'
    return text.split(paragraph_break)

In [5]:
def process_data_RNN(text_list, verbose=0, seq_length=40):
    '''
    Build fixed-length, one-hot encoded character windows from the sonnet
    corpus.

    The lines are cleaned (digits removed, whitespace-only lines dropped,
    everything lowercased), joined with newlines and split into sonnets. A
    window of seq_length characters is slid over each sonnet one character at
    a time; the character right after the window is the prediction target.
    Windows never span two sonnets.

    Input:
        text_list: list of strings, one per line of the corpus
        verbose: 1 prints the processed text and a summary
        seq_length: number of characters per window (default 40)

    Output: X, y, dataX, dataY, dataStart, int_to_char, char_to_int
        X: one-hot array of shape (n_patterns, seq_length, n_vocab)
        y: one-hot array of shape (n_patterns, n_vocab)
        dataX: list of windows, each a list of character indices
        dataY: index of the character that follows each window
        dataStart: first window of every sonnet longer than seq_length
        int_to_char, char_to_int: index <-> character lookup tables
    '''

    print("Processing datafile....")

    # Preprocessing
    text_list = remove_int(text_list)
    text_list = remove_empty(text_list)
    text_list = lowercase(text_list)

    new_text = '\n'.join(text_list)

    # Separate into sonnets
    sonnets = separate_sonnets(new_text)

    if verbose == 1:
        print(BORDER)
        print("Processed text")
        for i, sonnet in enumerate(sonnets):
            print(BORDER)
            print("Sonnet ", i, ": ")
            print(sonnet)
            print(BORDER)
        print(new_text)
        print(BORDER)

    # create mapping of unique chars to integers, and a reverse mapping
    chars = sorted(set(new_text))
    char_to_int = {c: i for i, c in enumerate(chars)}
    int_to_char = {i: c for i, c in enumerate(chars)}

    # summarize the loaded data
    n_chars = len(new_text)
    n_vocab = len(chars)
    n_sonnets = len(sonnets)

    # prepare the dataset of input to output pairs encoded as integers
    dataX = []
    dataY = []
    dataStart = []
    for sonnet in sonnets:
        # Empty range for sonnets of seq_length characters or fewer.
        for i in range(0, len(sonnet) - seq_length):
            window = [char_to_int[char] for char in sonnet[i:i + seq_length]]
            dataX.append(window)
            dataY.append(char_to_int[sonnet[i + seq_length]])

            if i == 0:
                # Store a separate list so that a caller changing a seed
                # cannot alter the training window in dataX.
                dataStart.append(list(window))

    n_patterns = len(dataX)

    if verbose == 1:
        print(BORDER)
        print("Processed Text Summary")
        print("Total Characters: ", n_chars)
        print("Total Vocab: ", n_vocab)
        print("Total Patterns: ", n_patterns)
        print("Number of Sonnets: ", n_sonnets)
        print(BORDER)

    # One-hot encode the windows and their targets.
    X = np.zeros((n_patterns, seq_length, n_vocab))
    y = np.zeros((n_patterns, n_vocab))
    for i, sentence in enumerate(dataX):
        for t, ind in enumerate(sentence):
            X[i, t, ind] = 1
        y[i, dataY[i]] = 1

    return X, y, dataX, dataY, dataStart, int_to_char, char_to_int


In [6]:
def process_data_Naruto(text_list, verbose=0, seq_length=40):
    '''
    Build fixed-length, one-hot encoded character windows from the paragraph
    corpus.

    The lines are cleaned (digits removed, whitespace-only lines dropped,
    everything lowercased) and every line is treated as one paragraph that
    ends in a newline. A window of seq_length characters is slid over each
    paragraph one character at a time; the character right after the window
    is the prediction target. Windows never span two paragraphs.

    Input:
        text_list: list of strings, one per line of the corpus
        verbose: 1 prints the processed text and a summary
        seq_length: number of characters per window (default 40)

    Output: X, y, dataX, dataY, dataStart, int_to_char, char_to_int
        X: one-hot array of shape (n_patterns, seq_length, n_vocab)
        y: one-hot array of shape (n_patterns, n_vocab)
        dataX: list of windows, each a list of character indices
        dataY: index of the character that follows each window
        dataStart: first window of every paragraph longer than seq_length
            (counting its trailing newline)
        int_to_char, char_to_int: index <-> character lookup tables
    '''

    print("Processing datafile....")

    # Preprocessing
    text_list = remove_int(text_list)
    text_list = remove_empty(text_list)
    text_list = lowercase(text_list)

    new_text = '\n'.join(text_list)

    # Separate into paragraphs and make sure each one ends in a new line, so
    # the model can learn where a paragraph stops.
    paragraphs = [paragraph + "\n"
                  for paragraph in separate_paragraphs(new_text)]

    if verbose == 1:
        print(BORDER)
        print("Processed text")
        for i, paragraph in enumerate(paragraphs):
            print(BORDER)
            print("paragraph ", i, ": ")
            print(paragraph)
            print(BORDER)
        print(new_text)
        print(BORDER)

    # create mapping of unique chars to integers, and a reverse mapping.
    # '\n' is added explicitly: every paragraph ends with it, but a corpus of
    # a single line has no newline in new_text and would otherwise fail with
    # a KeyError when the final target character is encoded.
    chars = sorted(set(new_text) | {'\n'})
    char_to_int = {c: i for i, c in enumerate(chars)}
    int_to_char = {i: c for i, c in enumerate(chars)}

    # summarize the loaded data
    n_chars = len(new_text)
    n_vocab = len(chars)
    n_paragraphs = len(paragraphs)

    # prepare the dataset of input to output pairs encoded as integers
    dataX = []
    dataY = []
    dataStart = []
    for paragraph in paragraphs:
        # Empty range for paragraphs of seq_length characters or fewer.
        for i in range(0, len(paragraph) - seq_length):
            window = [char_to_int[char]
                      for char in paragraph[i:i + seq_length]]
            dataX.append(window)
            dataY.append(char_to_int[paragraph[i + seq_length]])

            if i == 0:
                # Store a separate list so that a caller changing a seed
                # cannot alter the training window in dataX.
                dataStart.append(list(window))

    n_patterns = len(dataX)

    if verbose == 1:
        print(BORDER)
        print("Processed Text Summary")
        print("Total Characters: ", n_chars)
        print("Total Vocab: ", n_vocab)
        print("Total Patterns: ", n_patterns)
        print("Number of paragraphs: ", n_paragraphs)
        print(BORDER)

    # One-hot encode the windows and their targets.
    X = np.zeros((n_patterns, seq_length, n_vocab))
    y = np.zeros((n_patterns, n_vocab))
    for i, sentence in enumerate(dataX):
        for t, ind in enumerate(sentence):
            X[i, t, ind] = 1
        y[i, dataY[i]] = 1

    return X, y, dataX, dataY, dataStart, int_to_char, char_to_int

In [7]:
def train_LSTM(X, y, verbose=0, units=180, learning_rate=0.01,
               epochs=65, batch_size=512):
    '''
    Build a single-layer LSTM character model, fit it on X and y and return
    the fitted model.

    Input:
        X : one-hot array of input windows; X.shape[1] and X.shape[2] are
            used as the input shape of the LSTM layer
        y : one-hot array of the character following each window; y.shape[1]
            sets the width of the softmax output layer
        verbose : passed on to model.fit
        units : number of LSTM units (default 180)
        learning_rate : learning rate of the Adam optimizer (default 0.01)
        epochs : number of training epochs (default 65)
        batch_size : mini-batch size (default 512)

    Output: the fitted keras Sequential model
    '''

    print("Building Model...")

    # define the LSTM model
    model = Sequential()
    model.add(LSTM(units, input_shape=(X.shape[1], X.shape[2])))
    model.add(Dense(y.shape[1], activation='softmax'))

    # The `lr` keyword is kept as in the original call.
    # NOTE(review): confirm the keyword name against the installed Keras
    # version before upgrading.
    optimizer = Adam(lr=learning_rate)
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer, metrics=['accuracy'])

    # fit the model
    model.fit(X, y, epochs=epochs, batch_size=batch_size, verbose=verbose)
    return model


In [8]:
def sample(preds, temperature=1.0):
    '''
    Draw one index at random from a probability array.

    The log-probabilities are divided by temperature, exponentiated and
    renormalised before a single multinomial draw is made; the position of
    the drawn entry is returned.

    Input:
        preds: array-like of probabilities
        temperature: divisor applied to the log-probabilities

    Output: index of the sampled entry
    '''
    probabilities = np.asarray(preds).astype('float64')
    weights = np.exp(np.log(probabilities) / temperature)
    distribution = weights / np.sum(weights)
    draw = np.random.multinomial(1, distribution, 1)
    return np.argmax(draw)

In [9]:
def generate_text(model, dataStart, int_to_char, char_to_int,
                  diversity=1.0, seed=0, verbose=0):
    '''
    Generate text character by character with a trained model until 13
    newline characters have been produced.

    Input:
        model: the trained LSTM model; model.predict is called once per
            generated character
        dataStart: list of start windows (lists of character indices); the
            window length is taken from dataStart[0]
        int_to_char: dictionary mapping an integer to its character
        char_to_int: dictionary mapping a character to its integer
        diversity: temperature handed to sample()
        seed: 0 starts from a randomly chosen entry of dataStart, any other
            value starts from a fixed built-in sentence
        verbose: 1 prints the seed and every generation step; the value is
            also passed to model.predict

    Output: the seed followed by the generated text, as one string
    '''

    print("Generating text...")

    n_vocab = len(int_to_char)
    size = len(dataStart[0])

    if seed == 0:
        # pick a random seed; randint's upper bound is exclusive, so every
        # entry (including the last one) can be chosen
        start = np.random.randint(len(dataStart))
        # Work on a copy: the loop below appends to the pattern and must not
        # lengthen the caller's start window.
        pattern = list(dataStart[start])

    else:
        seed = 'shall i compare thee to a summer\'s day?\n'
        pattern = [char_to_int[char] for char in seed]

    # Never feed the model more characters than one window holds.
    if len(pattern) > size:
        pattern = pattern[:size]

    seq = [int_to_char[value] for value in pattern]

    if verbose == 1:
        print("Seed: ", ''.join(seq))

    # generate characters
    num_lines = 1
    max_num_lines = 14
    n_generated = 0
    while num_lines < max_num_lines:

        # One-hot encode the current pattern as the input of the RNN
        x = np.zeros((1, size, n_vocab))
        for t, char in enumerate(pattern):
            x[0, t, char] = 1

        # Make prediction using trained model
        prediction = model.predict(x, verbose=verbose)[0]
        index = sample(prediction, diversity)

        # Convert prediction to character
        result = int_to_char[index]

        # Add prediction to pattern and drop the oldest character
        pattern.append(index)
        pattern = pattern[1:1 + size]

        # Add result to seq
        seq.append(result)

        if result == '\n':
            num_lines += 1

        if verbose == 1:
            print(BORDER)
            print("character ", n_generated)
            print("selected char: ", result)
            print("new pattern: ",
                  ''.join([int_to_char[value] for value in pattern]))
            print(BORDER)

        n_generated += 1

    # Return seq as string
    return ''.join(seq)

In [10]:
def generate_paragraph(model, dataStart, int_to_char, char_to_int,
                       diversity=1.0, seed=0, verbose=0):
    '''
    Generate text character by character with a trained model until the
    first newline character is produced.

    Input:
        model: the trained LSTM model; model.predict is called once per
            generated character
        dataStart: list of start windows (lists of character indices); the
            window length is taken from dataStart[0]
        int_to_char: dictionary mapping an integer to its character
        char_to_int: dictionary mapping a character to its integer
        diversity: temperature handed to sample()
        seed: 0 starts from a randomly chosen entry of dataStart, any other
            value starts from a fixed built-in phrase
        verbose: 1 prints the seed, the diversity and every generated
            character

    Output: the seed followed by the generated paragraph, as one string
    '''

    print("Generating text...")

    n_vocab = len(int_to_char)
    size = len(dataStart[0])

    if seed == 0:
        # pick a random seed; randint's upper bound is exclusive, so every
        # entry (including the last one) can be chosen
        start = np.random.randint(len(dataStart))
        # Work on a copy: the loop below appends to the pattern and must not
        # lengthen the caller's start window.
        pattern = list(dataStart[start])

    else:
        seed = 'Believe it!'
        # The vocabularies built in this notebook come from lowercased text
        # and hold no uppercase letters, so fall back to the lowercase form
        # of a character that is not in the vocabulary.
        pattern = [char_to_int[char] if char in char_to_int
                   else char_to_int[char.lower()]
                   for char in seed]
        # NOTE(review): this seed is shorter than one window, so the rest of
        # the model input stays all-zero -- confirm that this is intended.

    # Never feed the model more characters than one window holds.
    if len(pattern) > size:
        pattern = pattern[:size]

    seq = [int_to_char[value] for value in pattern]

    if verbose == 1:
        print(BORDER)
        print("Seed: ", ''.join(seq))
        print("Diversity: {}".format(diversity))
        print(BORDER)

    # generate characters
    no_new_line = True
    while no_new_line:

        # One-hot encode the current pattern as the input of the RNN
        x = np.zeros((1, size, n_vocab))
        for t, char in enumerate(pattern):
            x[0, t, char] = 1

        # Make prediction using trained model
        prediction = model.predict(x, verbose=0)[0]
        index = sample(prediction, diversity)

        # Convert prediction to character
        result = int_to_char[index]

        # Add prediction to pattern and drop the oldest character
        pattern.append(index)
        pattern = pattern[1:1 + size]

        # Add result to seq
        seq.append(result)

        if result == '\n':
            no_new_line = False

        if verbose == 1:
            print(result, end='')

    # Return seq as string
    return ''.join(seq)

In [11]:
def save_textfile(filename, text):
    '''
    Write text to the file called filename, replacing any existing content.

    Input: filename and text as string

    Output: 0 once the text has been written
    '''
    print("Saving generated text...")
    # The context manager closes the file even when write() raises.
    with open(filename, 'w') as text_file:
        text_file.write(text)
    return 0

In [12]:
# Pick the corpus: the text is read from data/<datasetName>.txt
datasetName = 'naruto'
file = 'data/{}.txt'.format(datasetName)

verbose = 0

# Read the raw lines and turn them into one-hot training data
text_list = load_data(file)
X, y, dataX, dataY, dataStart, int_to_char, char_to_int = process_data_Naruto(
    text_list, verbose=verbose)

Processing datafile....


In [13]:
# How many start windows were collected
print(len(dataStart))

# Decode the first ten start windows back into text as a sanity check
for i in range(10):
    line = "".join(int_to_char[value] for value in dataStart[i])
    print("line {0}: {1}".format(i, line))

5225
line 0: naruto has been hearing it for as long a
line 1: it's not a voice or anything, just a gro
line 2: naruto knows better than to talk about i
line 3: he doesn't sleep much on those nights. h
line 4: he's seven when he tries to talk to it f
line 5: it's one of those dark, lonely nights ag
line 6: he's lying on his bed, running his eyes 
line 7: naruto closes his eyes, reaches back int
line 8: there's no response, but the growling st
line 9: but he's not looking at his bedroom ceil


In [14]:
# Fit the LSTM on the one-hot windows; verbose=1 is handed through to model.fit.
model = train_LSTM(X, y, verbose=1)

Building Model...
Epoch 1/65
Epoch 2/65
Epoch 3/65
Epoch 4/65
Epoch 5/65
Epoch 6/65
Epoch 7/65
Epoch 8/65
Epoch 9/65
Epoch 10/65
Epoch 11/65
Epoch 12/65
Epoch 13/65
Epoch 14/65
Epoch 15/65
Epoch 16/65
Epoch 17/65
Epoch 18/65
Epoch 19/65
Epoch 20/65
Epoch 21/65
Epoch 22/65
Epoch 23/65
Epoch 24/65
Epoch 25/65
Epoch 26/65
Epoch 27/65
Epoch 28/65
Epoch 29/65
Epoch 30/65
Epoch 31/65
Epoch 32/65
Epoch 33/65
Epoch 34/65
Epoch 35/65
Epoch 36/65
Epoch 37/65
Epoch 38/65
Epoch 39/65
Epoch 40/65
Epoch 41/65
Epoch 42/65
Epoch 43/65
Epoch 44/65
Epoch 45/65
Epoch 46/65
Epoch 47/65
Epoch 48/65
Epoch 49/65
Epoch 50/65
Epoch 51/65
Epoch 52/65
Epoch 53/65
Epoch 54/65
Epoch 55/65
Epoch 56/65
Epoch 57/65
Epoch 58/65
Epoch 59/65
Epoch 60/65
Epoch 61/65
Epoch 62/65
Epoch 63/65
Epoch 64/65
Epoch 65/65


In [16]:

# Timestamp of this run, reused below for the names of the output files
now = datetime.datetime.now()
datestamp = "{0.year}.{0.month}.{0.day}_{0.hour}.{0.minute}.{0.second}".format(now)

# Save model out
filename = "{}_{}_LSTM_model.h5".format(datestamp, datasetName)
model.save(filename)

In [20]:


verbose = 0
seed = 0

minDiversity = 0.05
stepSize = 0.05

diversity = 0.3

output = ""

# Generate 20 paragraphs, each preceded by a border line
for i in range(20):
    generated = generate_paragraph(
        model, dataStart, int_to_char, char_to_int,
        diversity=diversity, seed=seed, verbose=1)

    currentPoem = "{}\n{}\n".format(BORDER, generated)
    output += currentPoem

# The file name reuses the datestamp of the cell that saved the model
now = datetime.datetime.now()
filename = "{}_{}_seeded.txt".format(datestamp, datasetName)
save_textfile(filename, output)
    

Generating text...
Seed:  sasuke has been compulsively checking hi
Diversity: 0.3
s fingers to him and settles over his head and has no move to the one that had been still an emotion, and he doesn't move the time the same than a couple of strength is a power of the same part of the whole of the body distance and his throat the strongest fingers and the dead as the forest of the second in the back of his head.
Generating text...
Seed:  "same bet on my team. ,." kurenai adds w
Diversity: 0.3
ith a hand on his lap and starts to see the second from it.
Generating text...
Seed:  there's an adhesive bandage with another
Diversity: 0.3
 seals and dead to get him and the same entirely the same statement are a second for the silence of the part of the fire is still still an interest and the possible little steps of strength and the words of a second of the same way he still looks to the blood and the first finger of sasuke's face, and h

  after removing the cwd from sys.path.


e can see the same than a pain and where the second of the most stone he doesn't see the same thing it's a few seconds and the stuff starts to do with a strength and starts to protect him and settles to his head and reaching for the side of the scrolls and darts the same a particular hours of the shinobi back and starts to start them to say hard to the single stripping the chakra in the stone of the space of the sight of the strength and the same thing he has to see the same soft hand to the back of his head. "i don't see the counter of the back of his head.
Generating text...
Seed:  iruka freezes immediately, his mug halfw
Diversity: 0.3
ay to the scroll of his eyes off one of the stream of his feet.
Generating text...
Seed:  naruto crosses his legs, placing a hand 
Diversity: 0.3
over the shinobi sense of the deep body and the shower of the black and the most of the same soft his the same of the arena with the water of his feet.
Generating text...
Seed:  "worst hit, my ass." sasuke m

his own off and strength with the strongest being the entire chakra signatures of a strength between them and the point of his face. "i don't have a black black stone and the second of a few seconds of the back of his head and the second that has been still the second part of the second of the same answer is the shadows and strongest as well.
Generating text...
Seed:  "ikari-sensei says that's just an old wi
Diversity: 0.3
nd of shinobi of the forest of the rest of the second of a finger and powerful strength has been able to pull the more than shikamaru's shoulders that he doesn't have to do the same than a second and the way he doesn't look at the shinobi training he has to start them to the wall and reaching for the rest of the end of his chest with a second of the stuff in the back of his back and his brat and sasuke now have the sharp statue of the flutter of chakra in the most being and the same as he starts to disturb the shinobi space before they were a second and an arm and st

y off to the floor of his fingers and the last of them with the water of his shoulders and has not already took the second of his body and the rest of the surface of the stuff in the second has been probably any of the same seconds of the second chakra seal to the single space and stretches the strange particular state of a second to his fingers that he starts to see a few seconds of the end of the same complex of the sight of some weakned blue of the shinobi student has been a part of the back of his head, and he should be a long for a heart and starts to go off the shinobi statement and strange hands on his eyes and an anchor of the shadow shoulders to the same soft back in right and the shinobi parts of the strength that has to start the same chakra in the corner of his head. he can do it. he's still finished and pain and reaching for a strange shinobi with a read and an angry and with the same fact that he doesn't matter the rest of the back of his back and his fingers and not to s

0

In [None]:
# Generate one more paragraph at diversity 0.5; `seed` keeps the value set in an earlier cell.
generated = generate_paragraph(model, dataStart, int_to_char, char_to_int, diversity=0.5, seed=seed, verbose=1)
print(generated)

In [None]:
# Load a previously saved model from disk.
# NOTE(review): this file name has no dataset name in it, unlike the
# "{datestamp}_{datasetName}_LSTM_model.h5" pattern used when saving above --
# confirm which run produced this file.
newModel = load_model('2018.3.8_0.9.27_LSTM_model.h5')

In [None]:
# seed=1 makes generate_text start from its built-in seed sentence instead of a random start window.
# NOTE(review): dataStart and the lookup tables in scope were built from the
# corpus loaded above -- confirm they match the vocabulary newModel was trained on.
generated = generate_text(newModel, dataStart, int_to_char, char_to_int, diversity=0.5, seed=1, verbose=verbose)
print(generated)