Scrape the source plaintext from the webpage

In [4]:
from bs4 import BeautifulSoup
import urllib.request as urllib

In [11]:
# sym is a symbolic link to help portability
def source_from_html(url):
    html = urllib.urlopen(url)
    soup = BeautifulSoup(html, 'html.parser')
    # if url == "file:/sym/ML/LOTR_fellowship_of_ring.html":
    if url == "file:/home/iamroot/LOTR_temp/LOTR_fellowship_of_ring.htm":
        source_tagged = soup.h3.parent.find_next_siblings('p')
    else:
        source_tagged = soup.h3.parent.parent.find_next_siblings('p')
    source = ' '.join([t.get_text().replace('\r\n', ' ').replace('\n', ' ') for t in source_tagged[:-2]])
    return source

In [12]:
# TODO: is removing punctuation necessary?  first try it without
# doing so and see what happens
source_urls = [
    "file:/home/iamroot/LOTR_temp/LOTR_fellowship_of_ring.htm",
    "file:/home/iamroot/LOTR_temp/LOTR_two_towers.htm",
    "file:/home/iamroot/LOTR_temp/LOTR_return_of_king.htm"]
    #"file:/sym/ML/LOTR_fellowship_of_ring.html",
    #"file:/sym/ML/LOTR_two_towers.html",
    #"file:/sym/ML/LOTR_return_of_king.html"]
sources = [source_from_html(source_url) for source_url in source_urls]
source = ' '.join(sources)

In [38]:
source_len = len(source)

Split the source text into overlapping word sequences

In [13]:
# split the source into overlapping subsequences of words
# each subsequence will be converted into a vector input for the NN
# step defines the offset of the overlapping
def split_source(source, sequence_length, step):
    words = source.split()
    num_words = len(words)
    for i in range(0, num_words-sequence_length, step):
        yield ' '.join(words[i:i+sequence_length])


In [198]:
sequence_length = 40
step = 3
sequences = list(split_source(source, sequence_length, step))

In [199]:
# don't forget sequences is a generator and so must be explicitly
# cast to a list
print(sequences[:3])

['This book is largely concerned with Hobbits, and from its pages a reader may discover much of their character and a little of their history. Further information will also be found in the selection from the Red Book of Westmarch', 'largely concerned with Hobbits, and from its pages a reader may discover much of their character and a little of their history. Further information will also be found in the selection from the Red Book of Westmarch that has already', 'Hobbits, and from its pages a reader may discover much of their character and a little of their history. Further information will also be found in the selection from the Red Book of Westmarch that has already been published, under']


convert the word sequences into numerical vectors to feed into the RNN

In [189]:
# NOTE: CANNOT USE TOKENIZER THAT ONLY WORKS FOR BAG OF WORDS
# AND IGNORES WORD ORDER, HAVE TO DO IT THE WAY THE EXAMPLE
# DOES INSTEAD, MANUALLY
# TODO: the above, what the example does is creates a dictionary
# of (key, word) pairs, then from there creates a 3d vector where:
#     dimension 1 is the sequences
#     dimension 2 is the words in a given sequence (of length sequence_length)
#     dimension 3 contains all 0s except for a 1 in the index corresponding to
#      the given word's dictionary key value
# that may not be exactly how the example does it (it seems to do stuff with
#  characters too), but that's how I should probably do it,
# the result will be the input tensor that can be fed into the model

In [200]:
# assign each word to a unique number as a dict key
words = source.split()
word_indices = dict((w, i) for i, w in enumerate(words))
index_words = dict((i, w) for i, w in enumerate(words))
corpus_size = sum(word_indices.values())

In [26]:
import numpy as np

In [214]:
# convert the word sequences to vectors
# TODO: it gives a memory error, because the matrix is too big!
# need to split it up into separate training batches
print(len(sequences))
print(sequence_length)
print(corpus_size)
input_vectors = np.zeros((len(sequences), sequence_length, corpus_size))

# TODO: set the index values of the words to 1 as in the example code

155720
40
9933778244


MemoryError: 

since words are too much, use characters instead:

In [23]:
# assign each char to a number as a dict key
chars = sorted(list(set(source)))
char_count = len(chars)
print('char count:', char_count)
char_to_index = dict((c, i) for i, c in enumerate(chars))
index_to_char = dict((i, c) for i, c in enumerate(chars))

char count: 102


In [24]:
# split the source into overlapping sequences of characters
sequence_length = 40 # all sequences are this length if the source is
sequence_step = 3 # amount offset from each sequence to the next
sequences = []
ground_truth = []
for i in range(0, len(source)-sequence_length, sequence_step):
    sequences.append(source[i: i+sequence_length])
    ground_truth.append(source[i+sequence_length])
sequence_count = len(sequences)
print('sequence count:', sequence_count)

sequence count: 822057


In [21]:
# convert sequences to a 3d vector where:
#     dimension 1 is the sequences
#     dimension 2 is the chars in a given sequence (of length sequence_length)
#     dimension 3 contains all 0s except for a 1 in the index corresponding to
#      the given char's dictionary key value

In [39]:
# the given sequences
x_train = np.zeros((sequence_count, sequence_length, char_count), dtype=np.bool)
# what the next char after each sequence is
y_train = np.zeros((sequence_count, char_count), dtype=np.bool)
for i, sequence in enumerate(sequences):
    for t, char in enumerate(sequence): # t for time!
        x_train[i, t, char_to_index[char]] = 1
    y_train[i, char_to_index[ground_truth[i]]] = 1

specify the model for the RNN

In [40]:
from keras.models import Sequential
from keras.layers import Dense, Activation, LSTM
from keras.optimizers import RMSprop
# TODO: I saw another example that used 3 LSTM layers, which
# would likely allow it to capture higher level abstractions,
# at least on a grammatical level.  run it with 1 first since
# it will train quicker and then once I have an MVP, improve it
out_dim = 128

model = Sequential()
model.add(LSTM(out_dim, input_shape=(sequence_length, char_count)))
model.add(Dense(char_count))
model.add(Activation('softmax'))

optimizer = RMSprop(lr=.01)
# we use crossentropy because it's a form of classification problem,
# where the classes are which character comes next
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

define helper functions for generating text and displaying progress

In [43]:
import random

# get the highest probability index based on our predictions, the best guess from our network
def sample_index(predictions, temperature=1.0):
    # not sure what all the math is for, hopefully this will
    # become more clear as I continue to work with it
    predictions = np.asarray(predictions).astype('float64')
    predictions = np.log(predictions) / temperature
    exp_predictions = np.exp(predictions)
    predictions = exp_predictions / np.sum(exp_predictions)
    probabilities = np.random.multinomial(1, predictions, 1)
    return np.argmax(probabilities)

    
def generate_text(output_len):
    start_index = random.randint(0, source_len - sequence_length - 1)
    for diversity in [.2, .5, 1.0, 1.2]:
        
        # grab an input sequence
        sequence = source[start_index: start_index+sequence_length]
        print('generating with input: "', str(sequence), '"')
        
        # vectorize it
        # TODO: make this a function
        for i in range(output_len):
            x_pred = np.zeros((1, sequence_length, char_count))
            for t, char in enumerate(sequence):
                x_pred[0, t, char_to_index[char]] = 1
        
        # predict the next character
        # not sure what the [0] is for, an added dimension for the number of vectors?  in this case, we only have one
        predictions = model.predict(x_pred, verbose=0)[0]
        
        # and add it to the sequence, cutting off the earliest char so it stays the same length
        # next_index is our best guess as to what character's index comes next
        # predictions finds that for us from the predictions array
        next_index = sample_index(predictions, diversity)
        next_char = index_to_char[next_index]
        sequence = sequence[1:] + next_char
        

def report_epoch_progress(epoch, logs):
    print('\n===EPOCH COMPLETE===')
    print('end of epoch', epoch)
    print('generating sample text...')
    generate_text(400)

run the RNN with the vector input to learn word usage patterns

In [46]:
from keras.callbacks import LambdaCallback

progress_callback = LambdaCallback(on_epoch_end=report_epoch_progress)

In [None]:
batch_size = 128
epochs=60
callbacks=[progress_callback]
model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, callbacks=callbacks)

Make predictions with the RNN to generate new text