In [1]:
from gutenbergpy.gutenbergcache import GutenbergCache
# One-time setup: uncomment the next line to build the local cache from scratch.
# GutenbergCache.create(refresh=False, download=True, unpack=True, parse=True, cache=True, deleteTemp=True)

# Cache handle; the cells below run cache.query(...) against it.
cache = GutenbergCache.get_cache()

In [2]:
# cache.query takes its filters as keyword arguments whose values are lists, e.g.:
cache.query(authors=['Kant, Immanuel'], languages=['en'])
# TODO: the result is a list of book IDs only (see the output below); find out how to get the titles as well.

[5637, 28800, 50922, 48433, 5682, 46060, 26585, 5683, 5684, 52821, 4280, 59023]

In [3]:
from gutenbergpy.textget import get_text_by_id, strip_headers

In [4]:
def get_author_text(author):
    """Return all of `author`'s English plain-text books as one string.

    Queries the module-level `cache` for the author's book IDs, downloads each
    book with `get_text_by_id`, passes it through `strip_headers`, and
    concatenates the results (no separator between books) with line breaks
    replaced by spaces. A progress line is printed for every book.

    Args:
        author: Author name as expected by `cache.query`, e.g. 'Kant, Immanuel'.

    Returns:
        A single `str` holding the text of every matching book.

    Raises:
        AssertionError: if the module-level `cache` is falsy.
    """
    assert cache, "Gutenberg cache is not loaded; run GutenbergCache.get_cache() first"
    book_ids = cache.query(authors=[author], languages=['en'], downloadtype=['application/plain', 'text/plain'])
    books = []
    for book_id in book_ids:
        print("concatenating book id " + str(book_id) + " ...")
        raw = strip_headers(get_text_by_id(book_id))
        # str() on a bytes object yields its repr ("b'...'" with escaped newlines),
        # so the newline replacement below would never match. Decode instead.
        # Assumes the books are UTF-8; undecodable bytes become U+FFFD -- TODO confirm per book.
        if isinstance(raw, (bytes, bytearray)):
            raw = raw.decode('utf-8', errors='replace')
        books.append(str(raw))
    # Join once instead of growing a string inside the loop.
    corpus = "".join(books)
    return corpus.replace('\r\n', ' ').replace('\r', ' ').replace('\n', ' ')

In [5]:
from nltk.tokenize import TreebankWordTokenizer
import re
#from nlpia.loaders import get_data
#word_vectors = get_data('wv')

In [6]:
# Subset (8 of the 12) of the book IDs returned by the cache.query(...) call above.
kant_book_ids = [4280, 5682, 5683, 5684, 46060, 48433, 50922, 52821]
kant_book_ids_kingsmill = [5682, 5683, 5684]  # subset of kant_book_ids; translator: Kingsmill (per original note)

In [7]:
def get_books_from_ids(l):
    """Download every book whose ID is in `l` and return them as one string.

    Each book is fetched with `get_text_by_id`, passed through `strip_headers`,
    converted with `str()`, and appended in the order given. A progress line
    is printed before each book is fetched. An empty `l` yields "".
    """
    pieces = []
    for book_id in l:
        print("concatenating book id " + str(book_id) + " ...")
        pieces.append(str(strip_headers(get_text_by_id(book_id))))
    return "".join(pieces)

In [8]:
def clean_text(text):
    """Tidy up text that still contains literal escape sequences.

    First pass: every run of literal backslash-n, backslash-t, or
    backslash-x-plus-two-characters sequences collapses into one space.
    Second pass: every remaining backslash and every quote-b-quote seam
    (presumably left where two bytes reprs were concatenated) is deleted.
    """
    escape_runs = re.compile(r'((\\n)+)|((\\x..)+)|((\\t)+)')
    leftovers = re.compile(r"(\\)|'b'|\"b'|'b\"")
    spaced = escape_runs.sub(' ', text)
    return leftovers.sub('', spaced)

In [10]:
def tokenize_and_vectorize(dataset):
    """Tokenize each sample and map its tokens to word vectors.

    Tokens come from NLTK's TreebankWordTokenizer and are looked up in the
    module-level `word_vectors` mapping, which must be defined before this is
    called (its loader is commented out in the import cell). Tokens without a
    vector are dropped from their sample and reported once at the end.

    Args:
        dataset: iterable of strings.

    Returns:
        A list with one entry per sample; each entry is the list of vectors
        found for that sample's tokens.
    """
    tokenizer = TreebankWordTokenizer()
    vectorized_data = []
    # Collected across ALL samples. It used to be reset inside the loop, so only
    # the last sample's misses were printed and an empty dataset hit a NameError.
    skipped_tokens = set()
    for text_sample in dataset:
        sample_vecs = []
        for token in tokenizer.tokenize(text_sample):
            try:
                sample_vecs.append(word_vectors[token])
            except KeyError:
                skipped_tokens.add(token)
        vectorized_data.append(sample_vecs)
    print("skipped tokens:")
    print(skipped_tokens)
    return vectorized_data

In [9]:
# Build the training corpus: download the selected Kant books, concatenate them, and clean the result.
kant = clean_text(get_books_from_ids(kant_book_ids))

concatenating book id 4280 ...
concatenating book id 5682 ...
concatenating book id 5683 ...
concatenating book id 5684 ...
concatenating book id 46060 ...
concatenating book id 48433 ...
concatenating book id 50922 ...
concatenating book id 52821 ...


In [12]:
# Word-level tokens of the whole corpus; the inline word-model cell further down starts from these (tokens = kant_tokens).
tokenizer = TreebankWordTokenizer()
kant_tokens = tokenizer.tokenize(kant)

In [10]:
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import random

In [14]:
# following tut https://keras.io/examples/generative/lstm_character_level_text_generation/
def train_character_model(text,   # string
                          lower=False,  # make it lowercase
                          print_progress=True,
                          save_model=False,
                          save_checkpoints=False,
                          epochs=40,
                          batch_size=128,
                          maxlen=40,  # characters per input window
                          step=3  # stride between consecutive windows
                         ):
    """Train a character-level GRU that predicts the next character of `text`.

    The text is cut into overlapping windows of `maxlen` characters taken every
    `step` characters; each window is one-hot encoded and the model learns to
    predict the character that follows it.

    Args:
        text: Training corpus as a single string; must be longer than `maxlen`.
        lower: If True, lowercase the corpus before building the vocabulary.
        print_progress: If True, train one epoch at a time and print 400
            generated characters at diversities 0.2, 0.5 and 1.0 after each
            epoch (uses the notebook's `sample` helper). If False, train all
            epochs in one `fit` call.
        save_model: If True, save the trained model under
            saved_models/kant_character_model.
        save_checkpoints: If True, write weight checkpoints under checkpoints/
            through a ModelCheckpoint callback.
        epochs: Number of training epochs.
        batch_size: Batch size passed to `model.fit`.
        maxlen: Window length in characters.
        step: Offset between the starts of consecutive windows.

    Returns:
        The trained `keras.Sequential` model.

    Raises:
        ValueError: if `text` is not longer than `maxlen`, so no training
            window can be built.
    """
    if lower:
        text = text.lower()
    if len(text) <= maxlen:
        raise ValueError(
            "text must be longer than maxlen (%d characters) to build any training sequence" % maxlen
        )

    chars = sorted(set(text))
    char_indices = {c: i for i, c in enumerate(chars)}
    indices_char = dict(enumerate(chars))

    # Slide a window over the text; the target is the character right after each window.
    sentences = []
    next_chars = []
    for i in range(0, len(text) - maxlen, step):
        sentences.append(text[i : i + maxlen])
        next_chars.append(text[i + maxlen])
    print("Number of sequences:", len(sentences))

    # One-hot encode inputs and targets. The built-in `bool` is used because the
    # `np.bool` alias was deprecated and later removed from NumPy.
    x = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
    y = np.zeros((len(sentences), len(chars)), dtype=bool)
    for i, sentence in enumerate(sentences):
        for t, char in enumerate(sentence):
            x[i, t, char_indices[char]] = 1
        y[i, char_indices[next_chars[i]]] = 1

    model = keras.Sequential()
    model.add(keras.layers.GRU(128, input_shape=(maxlen, len(chars))))
    model.add(keras.layers.Dense(len(chars)))
    model.add(keras.layers.Activation('softmax'))
    # `learning_rate` replaces the deprecated `lr` alias (and matches train_word_model).
    optimizer = keras.optimizers.RMSprop(learning_rate=0.01)
    model.compile(loss="categorical_crossentropy", optimizer=optimizer)
    model.summary()

    # A single callbacks list keeps the fit calls below free of duplicated branches.
    callbacks = []
    if save_checkpoints:
        callbacks.append(keras.callbacks.ModelCheckpoint(
            filepath="checkpoints/kant_character_model_{epoch:02d}_{loss:.2f}",
            save_weights_only=True,
            monitor='loss',
            mode='min',
        ))

    if print_progress:
        for epoch in range(epochs):
            # Train exactly one epoch, but tell Keras its real number so the logs
            # and the {epoch} placeholder in the checkpoint name advance instead
            # of staying at 01 on every iteration.
            model.fit(x, y, batch_size=batch_size, epochs=epoch + 1,
                      initial_epoch=epoch, callbacks=callbacks)
            print()
            print("Generating text after epoch: %d" % epoch)

            start_index = random.randint(0, len(text) - maxlen - 1)
            for diversity in [0.2, 0.5, 1.0]:
                print("...Diversity:", diversity)

                generated_chars = []
                sentence = text[start_index : start_index + maxlen]
                print('...Generating with seed: "' + sentence + '"')

                for _ in range(400):
                    x_pred = np.zeros((1, maxlen, len(chars)))
                    for t, char in enumerate(sentence):
                        x_pred[0, t, char_indices[char]] = 1.0
                    preds = model.predict(x_pred, verbose=0)[0]
                    next_index = sample(preds, diversity)
                    next_char = indices_char[next_index]
                    # Drop the oldest character and append the new one to keep the window length.
                    sentence = sentence[1:] + next_char
                    generated_chars.append(next_char)

                print("...Generated: ", "".join(generated_chars))
                print()
    else:
        model.fit(x, y, batch_size=batch_size, epochs=epochs, callbacks=callbacks)

    if save_model:
        # Local import: nothing else in the notebook imports `os`, so the
        # original call raised NameError whenever save_model=True.
        import os
        keras.models.save_model(model, os.path.join('saved_models', 'kant_character_model'))
    return model

In [12]:
def sample(preds, temperature=1.0):
    """Draw one index from the probability array `preds`.

    The probabilities are re-weighted by `temperature` (log, divide,
    exponentiate, renormalise) before a single multinomial draw is made;
    the index of the drawn category is returned.
    """
    scaled_logits = np.log(np.asarray(preds).astype("float64")) / temperature
    weights = np.exp(scaled_logits)
    distribution = weights / np.sum(weights)
    one_hot_draw = np.random.multinomial(1, distribution, 1)
    return np.argmax(one_hot_draw)

In [15]:
# Train the character-level model on the cleaned Kant corpus with the default arguments.
# The saved output below ends in KeyboardInterrupt: the run was stopped by hand after the first generated sample.
char_model = train_character_model(kant)

Number of sequences: 1109853
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_1 (GRU)                  (None, 128)               82944     
_________________________________________________________________
dense_1 (Dense)              (None, 86)                11094     
_________________________________________________________________
activation (Activation)      (None, 86)                0         
Total params: 94,038
Trainable params: 94,038
Non-trainable params: 0
_________________________________________________________________
Train on 1109853 samples

Generating text after epoch: 0
...Diversity: 0.2
...Generating with seed: " suppose the case of a righteous man [_e"
...Generated:  mpirical principles of the constitution of the condition of the condition of the condition of the condition of the conception of the condition of the condition of the condition of the condition

KeyboardInterrupt: 

In [15]:
def train_word_model(text, print_progress=True, maxlen=40, step=3, epochs=40, batch_size=128):
    """Train a word-level LSTM that predicts the next token of `text`.

    The text is tokenized with NLTK's TreebankWordTokenizer and cut into
    overlapping windows of `maxlen` tokens taken every `step` tokens; each
    window is one-hot encoded over the vocabulary and the model learns to
    predict the token that follows it.

    NOTE: the dense one-hot input has shape
    (num_sequences, maxlen, vocabulary_size) with one byte per entry. For the
    corpus in this notebook the shape cell reports (206887, 40, 20590), i.e.
    roughly 170 GB, which presumably explains the slow epochs and the Jupyter
    crashes noted in the original comment. Use a small corpus, or switch to
    integer inputs with an Embedding layer, before running this at scale.

    Args:
        text: Training corpus as a single string; must contain more than
            `maxlen` tokens.
        print_progress: If True, train one epoch at a time and print 400
            generated tokens at diversities 0.2, 0.5, 1.0 and 1.2 after each
            epoch (uses the notebook's `sample` helper). If False, train all
            epochs in one `fit` call.
        maxlen: Window length in tokens.
        step: Offset between the starts of consecutive windows.
        epochs: Number of training epochs.
        batch_size: Batch size passed to `model.fit`.

    Returns:
        The trained `keras.Sequential` model.

    Raises:
        ValueError: if the text has `maxlen` tokens or fewer, so no training
            window can be built.
    """
    tokenizer = TreebankWordTokenizer()
    tokens = tokenizer.tokenize(text)
    if len(tokens) <= maxlen:
        raise ValueError(
            "text must contain more than maxlen (%d) tokens to build any training sequence" % maxlen
        )
    unique_tokens = sorted(set(tokens))
    token_indices = {tok: i for i, tok in enumerate(unique_tokens)}
    indices_tokens = dict(enumerate(unique_tokens))

    # Slide a window over the tokens; the target is the token right after each window.
    sentences = []
    next_tokens = []
    for i in range(0, len(tokens) - maxlen, step):
        sentences.append(tokens[i : i + maxlen])
        next_tokens.append(tokens[i + maxlen])
    print("Number of sequences:", len(sentences))

    # One-hot encoded inputs and targets. The built-in `bool` is used because the
    # `np.bool` alias was deprecated and later removed from NumPy.
    x = np.zeros((len(sentences), maxlen, len(unique_tokens)), dtype=bool)
    y = np.zeros((len(sentences), len(unique_tokens)), dtype=bool)
    for i, sentence in enumerate(sentences):
        for t, token in enumerate(sentence):
            x[i, t, token_indices[token]] = 1
        y[i, token_indices[next_tokens[i]]] = 1

    model = keras.Sequential(
        [
            keras.Input(shape=(maxlen, len(unique_tokens))),
            layers.LSTM(128),
            layers.Dense(len(unique_tokens), activation="softmax"),
        ]
    )
    optimizer = keras.optimizers.RMSprop(learning_rate=0.01)
    model.compile(loss="categorical_crossentropy", optimizer=optimizer)

    if print_progress:
        for epoch in range(epochs):
            model.fit(x, y, batch_size=batch_size, epochs=1)
            print()
            print("Generating text after epoch: %d" % epoch)

            start_index = random.randint(0, len(tokens) - maxlen - 1)
            for diversity in [0.2, 0.5, 1.0, 1.2]:
                print("...Diversity:", diversity)

                generated_tokens = []
                # `sentence` is a LIST of tokens here. The original code concatenated it
                # with strings, which raised TypeError; join it for display and extend
                # it with a one-element list instead.
                sentence = tokens[start_index : start_index + maxlen]
                print('...Generating with seed: "' + " ".join(sentence) + '"')

                for _ in range(400):
                    x_pred = np.zeros((1, maxlen, len(unique_tokens)))
                    for t, token in enumerate(sentence):
                        x_pred[0, t, token_indices[token]] = 1.0
                    preds = model.predict(x_pred, verbose=0)[0]
                    next_index = sample(preds, diversity)
                    next_token = indices_tokens[next_index]
                    # Drop the oldest token and append the new one to keep the window length.
                    sentence = sentence[1:] + [next_token]
                    generated_tokens.append(next_token)

                # Separate tokens with spaces so the generated words do not run together.
                print("...Generated: ", " ".join(generated_tokens))
                print()
    else:
        model.fit(x, y, batch_size=batch_size, epochs=epochs)
    return model

In [None]:
# Inline (cell-level) version of train_word_model, run on the pre-computed kant_tokens.
# It leaves x, y and unique_tokens in the notebook namespace for inspection in the next cell.
tokens = kant_tokens
maxlen=40
step=3
print_progress=True
epochs=40
batch_size=128
unique_tokens = sorted(set(tokens))
token_indices = {tok: idx for idx, tok in enumerate(unique_tokens)}
indices_tokens = dict(enumerate(unique_tokens))
# Slide a window over the tokens; the target is the token right after each window.
sentences = []
next_tokens = []
for i in range(0, len(tokens)-maxlen, step):
    sentences.append(tokens[i : i + maxlen])
    next_tokens.append(tokens[i + maxlen])
print("Number of sequences:", len(sentences))

# One-hot encoded inputs (sequence, position, vocabulary) and targets (sequence, vocabulary).
# The built-in `bool` is used because the `np.bool` alias was deprecated and later removed from NumPy.
x = np.zeros((len(sentences), maxlen, len(unique_tokens)), dtype=bool)
y = np.zeros((len(sentences), len(unique_tokens)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, token in enumerate(sentence):
        x[i, t, token_indices[token]] = 1
    y[i, token_indices[next_tokens[i]]] = 1

model = keras.Sequential(
    [
        keras.Input(shape=(maxlen, len(unique_tokens))),
        layers.LSTM(128),
        layers.Dense(len(unique_tokens), activation="softmax"),
    ]
)
optimizer = keras.optimizers.RMSprop(learning_rate=0.01)
model.compile(loss="categorical_crossentropy", optimizer=optimizer)

if print_progress:
    for epoch in range(epochs):
        model.fit(x, y, batch_size=batch_size, epochs=1)
        print()
        print("Generating text after epoch: %d" % epoch)

        start_index = random.randint(0, len(tokens) - maxlen - 1)
        for diversity in [0.2, 0.5, 1.0, 1.2]:
            print("...Diversity:", diversity)

            generated_tokens = []
            # `sentence` is a LIST of tokens. The original code concatenated it with
            # strings, which raised TypeError; join it for display and extend it
            # with a one-element list instead.
            sentence = tokens[start_index : start_index + maxlen]
            print('...Generating with seed: "' + " ".join(sentence) + '"')

            for i in range(400):
                x_pred = np.zeros((1, maxlen, len(unique_tokens)))
                for t, token in enumerate(sentence):
                    x_pred[0, t, token_indices[token]] = 1.0
                preds = model.predict(x_pred, verbose=0)[0]
                next_index = sample(preds, diversity)
                next_token = indices_tokens[next_index]
                # Drop the oldest token and append the new one to keep the window length.
                sentence = sentence[1:] + [next_token]
                generated_tokens.append(next_token)

            # Separate tokens with spaces so the generated words do not run together.
            generated = " ".join(generated_tokens)
            print("...Generated: ", generated)
            print()

Number of sequences: 206887

In [18]:
# Report the one-hot input shape, the target shape, and the vocabulary size, one per line.
print(x.shape, y.shape, len(unique_tokens), sep="\n")

(206887, 40, 20590)
(206887, 20590)
20590


In [None]:
# Same training as the inline cell above, but through the function, which tokenizes the raw corpus string itself.
# The saved output below stops right after the sequence count.
word_model = train_word_model(kant)

Number of sequences: 206887
