Brennan Duff


Assignment 5: Text Generation Using LSTM on Project Gutenberg Training Data

In [1]:
import numpy as np
import re
import string
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks, losses
import requests

# Constants
VOCAB_SIZE = 10000
MAX_LEN = 200
EMBEDDING_DIM = 100
N_UNITS = 128
VALIDATION_SPLIT = 0.2
SEED = 42
LOAD_MODEL = False
BATCH_SIZE = 32
EPOCHS = 25

# List of URLs for additional texts (Shakespeare plays)
urls = [
    "https://www.gutenberg.org/files/1524/1524-0.txt",  # Hamlet
    "https://www.gutenberg.org/files/1533/1533-0.txt",  # Macbeth
    "https://www.gutenberg.org/files/1112/1112-0.txt"   # Othello
]

# Preprocessing function to remove Gutenberg headers and footers
def clean_text(text):
    start_idx = text.find("*** START OF THIS PROJECT GUTENBERG EBOOK")
    end_idx = text.find("*** END OF THIS PROJECT GUTENBERG EBOOK")
    if start_idx != -1 and end_idx != -1:
        text = text[start_idx:end_idx]
    return text.replace("\r", "").strip()

# Download and preprocess texts
all_text = ""
for url in urls:
    response = requests.get(url)
    text = response.text
    all_text += clean_text(text) + "\n\n"

# Save combined text for confirmation
file_path = "combined_shakespeare.txt"
with open(file_path, "w", encoding="utf-8") as file:
    file.write(all_text)

# Tokenize the entire text into sentences
sentences = all_text.split("\n")
sentences = [s for s in sentences if len(s) > 0]  # Filter out empty lines

# Pad punctuation
def pad_punctuation(s):
    s = re.sub(f"([{string.punctuation}])", r" \1 ", s)
    s = re.sub(" +", " ", s)
    return s

text_data = [pad_punctuation(sentence.lower()) for sentence in sentences]

# Convert to TensorFlow Dataset
text_ds = tf.data.Dataset.from_tensor_slices(text_data).batch(BATCH_SIZE).shuffle(1000, seed=SEED)

# Create a vectorization layer
vectorize_layer = layers.TextVectorization(
    standardize="lower",
    max_tokens=VOCAB_SIZE,
    output_mode="int",
    output_sequence_length=MAX_LEN + 1,
)
vectorize_layer.adapt(text_ds)

# Tokenize data for training
def prepare_inputs(text):
    text = tf.expand_dims(text, -1)
    tokenized_sentences = vectorize_layer(text)
    x = tokenized_sentences[:, :-1]
    y = tokenized_sentences[:, 1:]
    return x, y

train_ds = text_ds.map(prepare_inputs)

# Model Definition
inputs = layers.Input(shape=(None,), dtype="int32")
x = layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM)(inputs)
x = layers.LSTM(N_UNITS, return_sequences=True)(x)
outputs = layers.Dense(VOCAB_SIZE, activation="softmax")(x)
lstm = models.Model(inputs, outputs)
lstm.summary()

# Compile the model
loss_fn = losses.SparseCategoricalCrossentropy()
lstm.compile(optimizer="adam", loss=loss_fn)

# Callback for generating text during training
class TextGenerator(callbacks.Callback):
    def __init__(self, index_to_word, top_k=10):
        self.index_to_word = index_to_word
        self.word_to_index = {
            word: index for index, word in enumerate(index_to_word)
        }

    def sample_from(self, probs, temperature):
        probs = probs ** (1 / temperature)
        probs = probs / np.sum(probs)
        return np.random.choice(len(probs), p=probs), probs

    def generate(self, start_prompt, max_tokens, temperature):
        start_tokens = [
            self.word_to_index.get(x, 1) for x in start_prompt.split()
        ]
        sample_token = None
        info = []
        while len(start_tokens) < max_tokens and sample_token != 0:
            x = np.array([start_tokens])
            y = self.model.predict(x, verbose=0)
            sample_token, probs = self.sample_from(y[0][-1], temperature)
            info.append({"prompt": start_prompt, "word_probs": probs})
            start_tokens.append(sample_token)
            start_prompt = start_prompt + " " + self.index_to_word[sample_token]
        print(f"\nGenerated text:\n{start_prompt}\n")
        return info

    def on_epoch_end(self, epoch, logs=None):
        self.generate("to be or not to be", max_tokens=100, temperature=1.0)

# Prepare callback
vocab = vectorize_layer.get_vocabulary()
text_generator = TextGenerator(vocab)

# Train the model
lstm.fit(
    train_ds,
    epochs=EPOCHS,
    callbacks=[text_generator],
)


Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding (Embedding)       (None, None, 100)         1000000   
                                                                 
 lstm (LSTM)                 (None, None, 128)         117248    
                                                                 
 dense (Dense)               (None, None, 10000)       1290000   
                                                                 
Total params: 2407248 (9.18 MB)
Trainable params: 2407248 (9.18 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/25
Generated text:
to be or not to be delights 

Epoch 2/25
Generated text:
to be or not to be yesty thy confess longer . 

Epoch 3/25
Generated te

<keras.src.callbacks.History at 0x7ef5b85bacb0>

In [10]:
def print_probs(info, vocab, top_k=5):
    for i in info:
        print(f"\nPROMPT: {i['prompt']}")
        word_probs = i["word_probs"]
        p_sorted = np.sort(word_probs)[::-1][:top_k]
        i_sorted = np.argsort(word_probs)[::-1][:top_k]
        for p, i in zip(p_sorted, i_sorted):
            print(f"{vocab[i]}:   \t{np.round(100*p,2)}%")
        print("--------\n")

In [11]:
info = text_generator.generate(
    "To be or not to be:", max_tokens=50, temperature=0.2
)
print_probs(info, vocab)


Generated text:
To be or not to be: ; for it is a man . 


PROMPT: To be or not to be:
.:   	92.88%
;:   	6.4%
,:   	0.72%
of:   	0.0%
::   	0.0%
--------


PROMPT: To be or not to be: ;
for:   	86.23%
i:   	11.22%
and:   	0.67%
so:   	0.61%
the:   	0.51%
--------


PROMPT: To be or not to be: ; for
the:   	42.58%
it:   	17.01%
my:   	13.54%
you:   	10.95%
your:   	9.53%
--------


PROMPT: To be or not to be: ; for it
is:   	98.44%
not:   	1.48%
be:   	0.05%
i:   	0.02%
will:   	0.01%
--------


PROMPT: To be or not to be: ; for it is
a:   	90.64%
not:   	8.37%
the:   	0.82%
too:   	0.11%
no:   	0.06%
--------


PROMPT: To be or not to be: ; for it is a
man:   	72.46%
king:   	18.56%
tale:   	4.57%
little:   	1.31%
day:   	0.61%
--------


PROMPT: To be or not to be: ; for it is a man
.:   	98.93%
,:   	1.07%
;:   	0.0%
::   	0.0%
:   	0.0%
--------


PROMPT: To be or not to be: ; for it is a man .
:   	100.0%
i:   	0.0%
—:   	0.0%
but:   	0.0%
you:   	0.0%
--------



In [29]:
info = text_generator.generate(
    "A rose by any other name would ", max_tokens=100, temperature=0.2
)
print_probs(info, vocab)


Generated text:
A rose by any other name would  set the work 


PROMPT: A rose by any other name would 
set:   	77.61%
:   	12.36%
any:   	5.06%
the:   	4.59%
online:   	0.17%
--------


PROMPT: A rose by any other name would  set
the:   	46.84%
,:   	46.46%
forth:   	4.94%
a:   	1.29%
:   	0.33%
--------


PROMPT: A rose by any other name would  set the
:   	48.57%
work:   	31.27%
project:   	19.99%
day:   	0.16%
capulets:   	0.01%
--------


PROMPT: A rose by any other name would  set the work
:   	99.06%
,:   	0.94%
;:   	0.0%
::   	0.0%
!:   	0.0%
--------



In [17]:
info = text_generator.generate(
    "Brevity is the", max_tokens=100, temperature=0.2
)
print_probs(info, vocab)


Generated text:
Brevity is the very coinage of my brain . 


PROMPT: Brevity is the
very:   	72.93%
king:   	26.48%
great:   	0.16%
most:   	0.13%
more:   	0.1%
--------


PROMPT: Brevity is the very
witching:   	35.29%
flame:   	17.38%
coinage:   	16.45%
king:   	13.08%
ecstasy:   	5.0%
--------


PROMPT: Brevity is the very coinage
of:   	100.0%
-:   	0.0%
,:   	0.0%
o’:   	0.0%
.:   	0.0%
--------


PROMPT: Brevity is the very coinage of
my:   	99.95%
the:   	0.03%
his:   	0.01%
a:   	0.01%
your:   	0.01%
--------


PROMPT: Brevity is the very coinage of my
brain:   	87.56%
life:   	5.82%
blood:   	2.72%
heart:   	2.23%
thoughts:   	0.54%
--------


PROMPT: Brevity is the very coinage of my brain
.:   	99.68%
,:   	0.31%
;:   	0.01%
?:   	0.0%
!:   	0.0%
--------


PROMPT: Brevity is the very coinage of my brain .
:   	100.0%
you:   	0.0%
i:   	0.0%
but:   	0.0%
—:   	0.0%
--------



Using 128 units per layer worked the best in terms time-to-train and learning capabilty for this case. Others resulted in either incoherence or disconnected runtimes.

The temperature set at .2 worked the best for coherence, the other temperatures I tested were either incoherent, or inferior to this temperature. "Brevity is the very coinage of my brain" somewhat captures the style of Shakespeare (being generous), so I thought this temperature worked the best.

The generated text overall was decent at .2 and lackluster at other temperatures. Other temperatures were not coherent.