# Imports

In [1]:
import numpy as np
import json
import re
import string

import tensorflow as tf
from tensorflow.keras import layers, models, callbacks, losses

# Params

In [2]:
VOCAB_SIZE = 10000  # max_tokens of the TextVectorization layer; also the Embedding input size and the Dense output size
MAX_LEN = 300  # chunk_size of the text splitter (measured with len() on a str) and, as MAX_LEN + 1, the vectorised sequence length
EMBEDDING_DIM = 100  # output width of the Embedding layer
N_UNITS = 128  # number of LSTM units
VALIDATION_SPLIT = 0.2  # NOTE(review): not referenced anywhere in the visible cells
SEED = 42  # NOTE(review): not referenced anywhere in the visible cells
LOAD_MODEL = False  # NOTE(review): not referenced anywhere in the visible cells
BATCH_SIZE = 32  # batch size of the tf.data pipeline
EPOCHS = 25  # number of epochs passed to lstm.fit

# Download and Condense Data from Project Gutenberg

In [3]:
import re
import requests

# Accumulates the cleaned text of every download; written to disk after the download loop
all_text = ""

# Project Gutenberg plain-text downloads, one URL per play
urls = [
        "https://www.gutenberg.org/cache/epub/1533/pg1533.txt",
        "https://www.gutenberg.org/cache/epub/1531/pg1531.txt",
        "https://www.gutenberg.org/cache/epub/1524/pg1524.txt",
        "https://www.gutenberg.org/cache/epub/1526/pg1526.txt",
        "https://www.gutenberg.org/cache/epub/1514/pg1514.txt",
        ]

# Needs to be run individually on downloaded plays
def clean_text_before_act(text):
    """Drop everything up to and including the first ``ACT I`` heading.

    The heading must be followed by a whitespace character, so ``ACT II`` or
    ``ACT I.`` do not count. The function is meant to be applied to one
    downloaded play at a time; the notebook calls it twice per play because
    each play contains the heading twice before the actual text starts.

    Args:
        text: Text of a single play.

    Returns:
        The text following the first ``ACT I`` heading, or ``text`` unchanged
        when no such heading exists.
    """
    heading = re.search(r'(ACT I\s)', text)
    if heading is None:
        return text
    # Everything after the end of the first match is the part we keep
    return text[heading.end():]

# Download every play, strip the Project Gutenberg header/footer and the front
# matter before the play itself, and append the remainder to all_text.
for url in urls:
  # Bound the wait and fail loudly on HTTP errors: without this a hung connection blocks
  # the notebook forever and an error page would silently end up in the training corpus
  response = requests.get(url, timeout=30)
  response.raise_for_status()
  text = response.text

  # Regex to clean the junk and notes from the top
  match_start = re.search(r'\*\*\* START OF THE PROJECT GUTENBERG EBOOK.*\*\*\*', text)

  # Split at the match and only take what's after (the actual ebook)
  if match_start:
    text = text.split(match_start.group(0), 1)[1]

  # Split at the match and only take before their legal stuff at the end
  match_end = re.search(r'\*\*\* END OF THE PROJECT GUTENBERG EBOOK.*\*\*\*', text)
  if match_end:
    text = text.split(match_end.group(0), 1)[0]

  # Called twice on purpose: each play contains the "ACT I" heading twice
  # before the text proper, and each call strips up to the next one
  text = clean_text_before_act(text)
  text = clean_text_before_act(text)

  # Add all
  all_text += text + "\n\n"

# Save to file
with open("combined_shakespeare.txt", "w", encoding="utf-8") as file:
    file.write(all_text)

# Loading Data

In [29]:
# Read back the combined corpus written by the download cell, so the cells below
# can run without downloading again once the file exists
with open("combined_shakespeare.txt", "r", encoding="utf-8") as file:
    text = file.read()

# Tokenize Data

In [6]:
!pip install langchain

Collecting langchain
  Downloading langchain-0.3.7-py3-none-any.whl.metadata (7.1 kB)
Collecting SQLAlchemy<3,>=1.4 (from langchain)
  Downloading SQLAlchemy-2.0.36-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.7 kB)
Collecting aiohttp<4.0.0,>=3.8.3 (from langchain)
  Downloading aiohttp-3.11.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Collecting async-timeout<5.0.0,>=4.0.0 (from langchain)
  Downloading async_timeout-4.0.3-py3-none-any.whl.metadata (4.2 kB)
Collecting langchain-core<0.4.0,>=0.3.15 (from langchain)
  Downloading langchain_core-0.3.19-py3-none-any.whl.metadata (6.3 kB)
Collecting langchain-text-splitters<0.4.0,>=0.3.0 (from langchain)
  Downloading langchain_text_splitters-0.3.2-py3-none-any.whl.metadata (2.3 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.144-py3-none-any.whl.metadata (14 kB)
Collecting aiohappyeyeballs>=2.3.0 (from aiohttp<4.0.0,>=3.8.3->langchain)
  Downl

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# We need to do a few things to make this text not awful:
# 1) Pad punctuation because LSTM models are awful if you don't
# 2) Remove stage directions & act stuff since it will confuse and are not useful.
# 3) Set to lowercase so it has less vocab to consider
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import matplotlib.pyplot as plt

# STEP 1
def pad_punctuation(s):
    """Surround every ASCII punctuation character with spaces.

    This makes each punctuation mark its own whitespace-separated token.
    Runs of spaces created by the padding are collapsed to a single space;
    other whitespace (newlines, tabs) is left untouched.

    Args:
        s: Text to pad.

    Returns:
        The padded text. A trailing punctuation mark leaves a trailing space.
    """
    # re.escape is required here: interpolated raw, the `\]` inside string.punctuation
    # is read as an escaped bracket, so a literal backslash was never padded
    s = re.sub(f"([{re.escape(string.punctuation)}])", r" \1 ", s)
    s = re.sub(" +", " ", s)  # Remove multiple spaces
    return s

# STEP 2 & 3
def clean_text(text):
    """Strip play scaffolding and turn line breaks into ``<newline>`` tokens.

    Expects text that is already lowercased and punctuation-padded (the
    speaker pattern below relies on a space before the full stop).

    Steps, in order:
      1. remove ``act <roman numeral>`` / ``scene <roman numeral>`` markers;
      2. remove bracketed stage directions, including ones spanning lines;
      3. replace each run of line breaks with a single `` <newline> `` token;
      4. remove single-word speaker lines such as ``hamlet .``;
      5. collapse whitespace and strip both ends.

    Args:
        text: Lowercased, punctuation-padded text.

    Returns:
        The cleaned text on a single line.
    """
    # Word boundaries keep the marker from firing inside ordinary words ("fact is" used to
    # become "fs"), and a numeral is mandatory so a bare "scene " in dialogue is kept
    text = re.sub(r'\b(?:act|scene) [ivxlcdm]+\b', '', text)  # Remove act & scene markers
    # [^\[\]]* also crosses line breaks (".*?" stopped at a newline) and refuses to run past
    # another bracket, which limits the damage of an unbalanced "["
    text = re.sub(r'\[[^\[\]]*\]', '', text)  # Remove stage directions and related
    text = re.sub(r'\n+', ' <newline> ', text) # Replace new line with token to preserve structure
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space

    # Remove single word speaker introductions
    # Some classic characters like "second clown" are too hard to distinguish from short lines
    # (the match, both <newline> tokens included, is replaced by nothing)
    text = re.sub(r'<newline> [A-Za-z]+ \. <newline>', '', text)
    text = re.sub(r'\s+', ' ', text)  # Fix awkward speaker removal
    return text.strip() # Strip just normal stuff

# Normalise the raw corpus: pad punctuation, lowercase, then strip play scaffolding
text = pad_punctuation(text).lower()
text = clean_text(text)

# Chunk to our max length
# NOTE(review): length_function=len measures chunk_size in characters of the string, while
# MAX_LEN + 1 is later used as the token length of the vectorised sequences — confirm this
# mismatch is intended, since short chunks end up mostly padding
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=MAX_LEN,
    chunk_overlap=1,
    length_function=len,
)

chunks = text_splitter.split_text(text) # Split into MAX_LEN chunks for LSTM to read easy

# From here on `text` is a list of chunk strings rather than one long string
text = list(filter(None, chunks)) # Filter any funky chunks

# Preview only: printing the whole list would dump the entire corpus into the cell output
print(f"{len(text)} chunks")
print(text[:3])

In [31]:
# Create a vectorisation layer
# standardize="lower" only lowercases the input; punctuation was already split off
# into separate tokens by the cleaning cell. Sequences come out MAX_LEN + 1 long so
# that inputs and targets of length MAX_LEN can be sliced from them afterwards.
vectorize_layer = layers.TextVectorization(
    standardize="lower",
    max_tokens=VOCAB_SIZE,
    output_mode="int",
    output_sequence_length=MAX_LEN + 1,
)

# Convert for tensorflow and batch
# NOTE(review): shuffle is applied after batch, so it reorders whole batches rather
# than individual chunks, and no seed is passed even though SEED is defined
text_ds = (
    tf.data.Dataset.from_tensor_slices(text)
    .batch(BATCH_SIZE)
    .shuffle(1000)
)


# Adapt the layer to the training set
vectorize_layer.adapt(text_ds)
# Index -> token list; per the recorded output index 0 is the empty padding token and 1 is [UNK]
vocab = vectorize_layer.get_vocabulary()

# Display some token:word mappings
for i, word in enumerate(vocab[:10]):
    print(f"{i}: {word}")



0: 
1: [UNK]
2: ,
3: <newline>
4: .
5: the
6: and
7: i
8: to
9: of


In [32]:
def prepare_inputs(text):
    """Turn a batch of text chunks into (input, target) token-id pairs.

    The batch is vectorised with ``vectorize_layer`` and sliced along the
    second axis: inputs are every position except the last, targets are every
    position except the first, i.e. the same sequence shifted by one token.

    Args:
        text: Batch of strings as yielded by ``text_ds``.

    Returns:
        Tuple ``(x, y)`` of token-id tensors.
    """
    token_ids = vectorize_layer(text)
    # Target at position t is the token that follows input position t
    return token_ids[:, :-1], token_ids[:, 1:]

# Dataset of (input, target) pairs used for training
train_ds = text_ds.map(prepare_inputs)

# Build The LSTM

In [33]:
# Next-token language model: token ids -> embedding -> LSTM -> dropout -> softmax over the vocabulary.
# shape=(None,) leaves the sequence length open, so the same model can be fed
# prompts of any length at generation time.
inputs = layers.Input(shape=(None,), dtype="int32")
x = layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM)(inputs)
# return_sequences=True keeps one output per position, so every position gets a next-token prediction
x = layers.LSTM(N_UNITS, return_sequences=True)(x)
x = layers.Dropout(0.2)(x) # Add Dropout
outputs = layers.Dense(VOCAB_SIZE, activation="softmax")(x)
lstm = models.Model(inputs, outputs)
lstm.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding_1 (Embedding)     (None, None, 100)         1000000   
                                                                 
 lstm_1 (LSTM)               (None, None, 128)         117248    
                                                                 
 dropout_1 (Dropout)         (None, None, 128)         0         
                                                                 
 dense_1 (Dense)             (None, None, 10000)       1290000   
                                                                 
Total params: 2407248 (9.18 MB)
Trainable params: 2407248 (9.18 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


# Training LSTM

In [34]:
# Sparse variant because the targets are integer token ids rather than one-hot vectors;
# the model's final layer already applies softmax, so the loss is created with its defaults
loss_fn = losses.SparseCategoricalCrossentropy()
lstm.compile("adam", loss_fn)

In [35]:
# Create a TextGenerator checkpoint
class TextGenerator(callbacks.Callback):
    """Keras callback that samples text from the model after every epoch.

    It can also be used directly through :meth:`generate` once training is done.

    Args:
        index_to_word: Vocabulary as a sequence where position ``i`` holds the
            token with id ``i``.
        top_k: Stored on the instance for interface compatibility; the
            sampling code does not use it.
    """

    def __init__(self, index_to_word, top_k=10):
        # Let the Keras base class set up its own state before adding ours
        super().__init__()
        self.index_to_word = index_to_word
        self.top_k = top_k
        # Reverse lookup: token -> id
        self.word_to_index = {
            word: index for index, word in enumerate(index_to_word)
        }

    def sample_from(self, probs, temperature):
        """Sample a token id from ``probs`` rescaled by ``temperature``.

        Args:
            probs: 1-D array of probabilities over the vocabulary.
            temperature: Values below 1 sharpen the distribution, values
                above 1 flatten it. Must be non-zero.

        Returns:
            Tuple ``(token_id, rescaled_probs)``.
        """
        # Work in float64: model outputs are float32, and after renormalising in float32
        # the sum can miss 1.0 by more than np.random.choice tolerates
        probs = np.asarray(probs, dtype=np.float64) ** (1 / temperature)
        probs = probs / np.sum(probs)
        return np.random.choice(len(probs), p=probs), probs

    def generate(self, start_prompt, max_tokens, temperature):
        """Extend ``start_prompt`` one sampled token at a time and print the result.

        Generation stops when the sequence (prompt included) reaches
        ``max_tokens`` tokens or when token id 0 is sampled.

        Args:
            start_prompt: Whitespace-separated prompt. Words are looked up as
                given, then lowercased; anything still unknown becomes id 1.
                Punctuation attached to a word is not split off.
            max_tokens: Upper bound on the total sequence length.
            temperature: Sampling temperature, see :meth:`sample_from`.

        Returns:
            One dict per generated token with the ``prompt`` so far and the
            ``word_probs`` the token was sampled from.
        """
        # Fall back to the lowercased word before giving up: with a lowercase vocabulary,
        # capitalised prompt words ("Now", "Is", "I") would otherwise all map to id 1
        start_tokens = [
            self.word_to_index.get(x, self.word_to_index.get(x.lower(), 1))
            for x in start_prompt.split()
        ]
        sample_token = None
        info = []
        while len(start_tokens) < max_tokens and sample_token != 0:
            x = np.array([start_tokens])
            y = self.model.predict(x, verbose=0)
            # Only the distribution at the last position predicts the next token
            sample_token, probs = self.sample_from(y[0][-1], temperature)
            info.append({"prompt": start_prompt, "word_probs": probs})
            start_tokens.append(sample_token)
            start_prompt = start_prompt + " " + self.index_to_word[sample_token]
        print(f"\ngenerated text:\n{start_prompt}\n")
        return info

    def on_epoch_end(self, epoch, logs=None):
        """Print a sample after each epoch to monitor training progress."""
        self.generate("Now is the winter of our discontent", max_tokens=100, temperature=1.0)


In [37]:
# The generator needs the vocabulary to map sampled token ids back to words
text_generator = TextGenerator(vocab)

# Train; the callback prints a generated sample at the end of every epoch
lstm.fit(train_ds, epochs=EPOCHS, callbacks=[text_generator])

Epoch 1/25
generated text:
Now is the winter of our discontent has 

Epoch 2/25
generated text:
Now is the winter of our discontent 

Epoch 3/25
generated text:
Now is the winter of our discontent fear 

Epoch 4/25
generated text:
Now is the winter of our discontent kisses come too ’tis <newline> that ” prays 

Epoch 5/25
generated text:
Now is the winter of our discontent nigh the such purgatory ; this <newline> maria’s sorrow i amen , true what unless thy madam frame , <newline> ? enter not kindnesses is of go witness , take invention a thing out <newline> is . help whom tomorrow <newline> advis’d 

Epoch 6/25
generated text:
Now is the winter of our discontent legs enter better wits and _ child <newline> his carried enter toby would never in make we’ll cold aerial that the , gold hast not be —put sport thy master ‘behold too <newline> rest ? her <newline> ranker <newline> must here <newline> dangerous engines 

Epoch 7/25
generated text:
Now is the winter of our discontent hold hele

<keras.src.callbacks.History at 0x7ce528446f20>

# Generating Text

In [36]:
def print_probs(info, vocab, top_k=5):
    """Print the most likely next words for every generation step.

    Args:
        info: List of dicts as returned by ``TextGenerator.generate``; each
            has a ``prompt`` string and a ``word_probs`` array.
        vocab: Sequence mapping token id to word.
        top_k: Number of candidates to show per step.
    """
    for step in info:
        print(f"\nPROMPT: {step['prompt']}")
        word_probs = np.asarray(step["word_probs"])
        # Token ids ordered from most to least likely, cut to the top_k best
        best_ids = np.argsort(word_probs)[::-1][:top_k]
        for token_id in best_ids:
            print(f"{vocab[token_id]}:   \t{np.round(100*word_probs[token_id],2)}%")
        print("--------\n")


# Slowly Decreasing Temperature Testing

There are a few interesting notes. It seems two things show up more and more as the temperature goes down: religion and \<newline>.
I still believe leaving \<newline> in the text is a good choice to try and keep the structure of the play intact.

However, it is quite apparent that at lower temperatures the generator becomes very reliant on new lines. Because they show up so often and so consistently throughout the plays, the generator considers them a "safe" pick. Overall this hurts the generator at lower temperatures, but I believe it equally enriches the output at higher temperatures.

At lower temperatures it also seems to always introduce God into the prompt, which I believe happens for the same reason as the new line issue.

Overall I believe the output is not exactly coherent, but the prose does at times feel like poetry, especially at higher temperatures.

The text output does feel *very* Shakespearean to me as a casual reader. There aren't any moments where it feels too off, and if it were slightly more coherent I would believe it to be Shakespeare.

Finally, I think the model does stay relevant to the story it's telling. The high temperature output focused on the connection between journeys of oneself and, despite the pain, finding magic in love. At least until it brought in the clown line.

In [53]:
# Temperature 1.0: generate until the sequence (prompt included) reaches 100 tokens or
# token 0 is sampled, then show the top-5 next-word candidates for every step
info = text_generator.generate("to be or not to be", max_tokens=100, temperature=1.0)
print_probs(info, vocab)


generated text:
to be or not to be more falls . as so sins troubled now , pitiful to the voyage . o thisbe the stars . <newline> come , the fustian encounter of any vixen loving ; there’s magic , hath hearts him with pyramus ? if despite , give him up me my clown . for with this once 


PROMPT: to be or not to be
.:   	3.82%
,:   	3.62%
the:   	2.3%
<newline>:   	1.93%
a:   	1.53%
--------


PROMPT: to be or not to be more
.:   	12.08%
,:   	10.18%
;:   	4.27%
of:   	2.67%
?:   	2.49%
--------


PROMPT: to be or not to be more falls
.:   	30.1%
,:   	21.29%
;:   	7.91%
of:   	4.66%
?:   	4.08%
--------


PROMPT: to be or not to be more falls .
<newline>:   	46.81%
and:   	6.16%
i:   	3.5%
but:   	2.03%
what:   	1.22%
--------


PROMPT: to be or not to be more falls . as
i:   	3.71%
the:   	3.65%
<newline>:   	2.28%
and:   	2.0%
you:   	1.99%
--------


PROMPT: to be or not to be more falls . as so
,:   	7.25%
is:   	3.37%
.:   	1.41%
in:   	1.4%
of:   	1.33%
--------


PROMPT: to be o

In [49]:
# Temperature 0.9: same procedure with a slightly sharpened distribution
info = text_generator.generate("Is this a dagger which I see before me, the handle toward my hand?", max_tokens=100, temperature=0.9)
print_probs(info, vocab)


generated text:
Is this a dagger which I see before me, the handle toward my hand? to the beard with her , a desdemona but they it . prey <newline> free , <newline> othello with a pilot ! i you do do that me your mind <newline> sheathe for children . for them about absolute a soul of witch . i my heart , <newline> the lord 


PROMPT: Is this a dagger which I see before me, the handle toward my hand?
<newline>:   	18.33%
,:   	5.23%
and:   	4.17%
the:   	3.89%
of:   	3.57%
--------


PROMPT: Is this a dagger which I see before me, the handle toward my hand? to
the:   	11.15%
a:   	5.17%
his:   	3.36%
my:   	2.95%
be:   	2.28%
--------


PROMPT: Is this a dagger which I see before me, the handle toward my hand? to the
man:   	0.94%
lord:   	0.92%
heart:   	0.83%
love:   	0.83%
soul:   	0.82%
--------


PROMPT: Is this a dagger which I see before me, the handle toward my hand? to the beard
,:   	27.04%
.:   	20.92%
of:   	9.88%
<newline>:   	7.03%
;:   	3.7%
--------


PROMPT: Is this a 

In [45]:
# Temperature 0.8: same procedure, sharper again
info = text_generator.generate("How sharper than a serpent's too it is to have a thankless child", max_tokens=100, temperature=0.8)
print_probs(info, vocab)


generated text:
How sharper than a serpent's too it is to have a thankless child and so in many more . <newline> the sail and the sight , <newline> enter , i my are moans , <newline> thy have that but good that have more of news of the father ; <newline> whose couldst is as so my more - bootless black lost 


PROMPT: How sharper than a serpent's too it is to have a thankless child
.:   	41.04%
,:   	35.87%
<newline>:   	6.86%
of:   	3.33%
;:   	2.69%
--------


PROMPT: How sharper than a serpent's too it is to have a thankless child and
the:   	11.55%
<newline>:   	9.47%
a:   	5.16%
his:   	3.81%
my:   	3.53%
--------


PROMPT: How sharper than a serpent's too it is to have a thankless child and so
,:   	21.28%
.:   	21.03%
;:   	5.9%
of:   	3.58%
?:   	3.33%
--------


PROMPT: How sharper than a serpent's too it is to have a thankless child and so in
the:   	15.96%
a:   	6.43%
his:   	5.24%
<newline>:   	4.93%
my:   	3.69%
--------


PROMPT: How sharper than a serpent's too it is to 

In [47]:
# Temperature 0.6: same procedure; the most likely tokens start to dominate the samples
info = text_generator.generate("We are such stuff as dreams are made on, and our little life is rounded with a sleep", max_tokens=100, temperature=0.6)
print_probs(info, vocab)


generated text:
We are such stuff as dreams are made on, and our little life is rounded with a sleep , <newline> <newline> but <newline> if <newline> i as a lord of have so , <newline> and time . i do be you , <newline> the advice ; a king him , this , <newline> and the lord , you , i hear not tell me 


PROMPT: We are such stuff as dreams are made on, and our little life is rounded with a sleep
.:   	40.15%
,:   	36.11%
<newline>:   	9.91%
of:   	6.01%
;:   	2.09%
--------


PROMPT: We are such stuff as dreams are made on, and our little life is rounded with a sleep ,
<newline>:   	91.35%
and:   	3.81%
i:   	0.71%
but:   	0.38%
to:   	0.32%
--------


PROMPT: We are such stuff as dreams are made on, and our little life is rounded with a sleep , <newline>
<newline>:   	31.72%
and:   	14.45%
i:   	9.85%
the:   	5.34%
that:   	3.67%
--------


PROMPT: We are such stuff as dreams are made on, and our little life is rounded with a sleep , <newline> <newline>
i:   	16.76%
<newline>:   	13.

In [48]:
# Temperature 0.4: the lowest temperature tried in this notebook
info = text_generator.generate("What’s in a name? A rose by any other name would smell as sweet", max_tokens=100, temperature=0.4)
print_probs(info, vocab)


generated text:
What’s in a name? A rose by any other name would smell as sweet more . <newline> i have you , <newline> <newline> <newline> the other of not is a wit , <newline> the most king , <newline> i am you so . <newline> you , <newline> we be not and you , <newline> i am not you the man . <newline> sir , 


PROMPT: What’s in a name? A rose by any other name would smell as sweet
,:   	37.16%
.:   	29.04%
the:   	4.67%
<newline>:   	2.58%
to:   	2.5%
--------


PROMPT: What’s in a name? A rose by any other name would smell as sweet more
.:   	51.46%
,:   	38.4%
;:   	4.55%
of:   	1.72%
?:   	1.16%
--------


PROMPT: What’s in a name? A rose by any other name would smell as sweet more .
<newline>:   	98.68%
and:   	0.83%
i:   	0.27%
but:   	0.06%
what:   	0.02%
--------


PROMPT: What’s in a name? A rose by any other name would smell as sweet more . <newline>
i:   	30.33%
<newline>:   	28.45%
and:   	15.2%
the:   	5.16%
but:   	3.16%
--------


PROMPT: What’s in a name? A rose by 