<a href="https://colab.research.google.com/github/autumnWindigo/GenAI/blob/main/HW5/Problem1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [1]:
import numpy as np
import json
import re
import string

import tensorflow as tf
from tensorflow.keras import layers, models, callbacks, losses

# Params

In [2]:
VOCAB_SIZE = 10000
MAX_LEN = 300
EMBEDDING_DIM = 100
N_UNITS = 128
VALIDATION_SPLIT = 0.2
SEED = 42
LOAD_MODEL = False
BATCH_SIZE = 32
EPOCHS = 25

# Download and Condense Data from gutenberg

In [3]:
import re
import requests

all_text = ""

urls = [
        "https://www.gutenberg.org/cache/epub/1533/pg1533.txt",
        "https://www.gutenberg.org/cache/epub/1531/pg1531.txt",
        "https://www.gutenberg.org/cache/epub/1524/pg1524.txt",
        "https://www.gutenberg.org/cache/epub/1526/pg1526.txt",
        "https://www.gutenberg.org/cache/epub/1514/pg1514.txt",
        ]

# Needs to be run individually on downloaded plays
def clean_text_before_act(text):
    # Remove everything before the first "ACT"
    # Each play has two "ACT" after which the text follows
    match_start = re.search(r'(ACT I\s)', text)
    if match_start:
        text = text.split(match_start.group(0), 1)[1]
    return text

for url in urls:
  response = requests.get(url)
  text = response.text

  # Regex to clean the junk and notes from the top
  match_start = re.search(r'\*\*\* START OF THE PROJECT GUTENBERG EBOOK.*\*\*\*', text)

  # Split at the match and only take what's after (the actual ebook)
  if match_start:
    text = text.split(match_start.group(0), 1)[1]

  # Split at the match and only take before their legal stuff at the end
  match_end = re.search(r'\*\*\* END OF THE PROJECT GUTENBERG EBOOK.*\*\*\*', text)
  if match_end:
    text = text.split(match_end.group(0), 1)[0]

  text = clean_text_before_act(text)
  text = clean_text_before_act(text)

  # Add all
  all_text += text + "\n\n"

# Save to file
with open("combined_shakespeare.txt", "w", encoding="utf-8") as file:
    file.write(all_text)

# Loading Data

In [59]:
with open("combined_shakespeare.txt", "r", encoding="utf-8") as file:
    text = file.read()

# Tokenize Data

In [6]:
!pip install langchain

Collecting langchain
  Downloading langchain-0.3.7-py3-none-any.whl.metadata (7.1 kB)
Collecting SQLAlchemy<3,>=1.4 (from langchain)
  Downloading SQLAlchemy-2.0.36-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.7 kB)
Collecting aiohttp<4.0.0,>=3.8.3 (from langchain)
  Downloading aiohttp-3.11.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Collecting async-timeout<5.0.0,>=4.0.0 (from langchain)
  Downloading async_timeout-4.0.3-py3-none-any.whl.metadata (4.2 kB)
Collecting langchain-core<0.4.0,>=0.3.15 (from langchain)
  Downloading langchain_core-0.3.19-py3-none-any.whl.metadata (6.3 kB)
Collecting langchain-text-splitters<0.4.0,>=0.3.0 (from langchain)
  Downloading langchain_text_splitters-0.3.2-py3-none-any.whl.metadata (2.3 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.144-py3-none-any.whl.metadata (14 kB)
Collecting aiohappyeyeballs>=2.3.0 (from aiohttp<4.0.0,>=3.8.3->langchain)
  Downl

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# We need to do a few things to make this text not awful:
# 1) Pad punctiation because LSTM models are awful if you don't
# 2) Remove stage directions & act stuff since it will confuse and are not useful.
# 3) Set to lowercase so it has less vocab to consider
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import matplotlib.pyplot as plt

# STEP 1
def pad_punctuation(s):
    s = re.sub(f"([{string.punctuation}])", r" \1 ", s)
    s = re.sub(" +", " ", s)  # Remove multiple spaces
    return s

# STEP 2 & 3
def clean_text(text):
    text = re.sub(r'(act [ivxlcdm]+|scene [ivxlcdm]*)', '', text)  # Remove act & scene markers
    text = re.sub(r'\[.*?\]', '', text)  # Remove stage directions and related
    text = re.sub(r'\n+', ' <newline> ', text) # Replace new line with token to preserve structure
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space

    # Remove single word speaker introductions
    # Some classic characters like "second clown" are too hard to distinguish from short lines
    text = re.sub(r'<newline> [A-Za-z]+ \. <newline>', '', text)
    text = re.sub(r'\s+', ' ', text)  # Fix awkward speaker removal
    return text.strip() # Strip just normal stuff

text = pad_punctuation(text).lower()
text = clean_text(text)

# Chunk to our max length
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=MAX_LEN,
    chunk_overlap=1,
    length_function=len,
)

chunks = text_splitter.split_text(text) # Split into MAX_LEN chunks for LSTM to read easy

text = list(filter(None, chunks)) # Filter any funky chunks

# Happy with this
print(text)

In [61]:
# Create a vectorisation layer
vectorize_layer = layers.TextVectorization(
    standardize="lower",
    max_tokens=VOCAB_SIZE,
    output_mode="int",
    output_sequence_length=MAX_LEN + 1,
)

# Convert for tensorflow and batch
text_ds = (
    tf.data.Dataset.from_tensor_slices(text)
    .batch(BATCH_SIZE)
    .shuffle(1000)
)


# Adapt the layer to the training set
vectorize_layer.adapt(text_ds)
vocab = vectorize_layer.get_vocabulary()

# Display some token:word mappings
for i, word in enumerate(vocab[:10]):
    print(f"{i}: {word}")



0: 
1: [UNK]
2: ,
3: <newline>
4: .
5: the
6: and
7: i
8: to
9: of


In [62]:
def prepare_inputs(text):
    tokenized_sentences = vectorize_layer(text)

    x = tokenized_sentences[:, :-1]
    y = tokenized_sentences[:, 1:]

    return x, y

train_ds = text_ds.map(prepare_inputs)

# Build The LSTM

In [63]:
inputs = layers.Input(shape=(None,), dtype="int32")
x = layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM)(inputs)
x = layers.LSTM(128, return_sequences=True, dropout=0.2)(x)  # Dropout to avoid overfitting <newline>
x = layers.LSTM(128, return_sequences=True, dropout=0.2)(x)  # Also 128 units
x = layers.LSTM(128, return_sequences=True, dropout=0.2)(x)
outputs = layers.Dense(VOCAB_SIZE, activation="softmax")(x)
lstm = models.Model(inputs, outputs)
lstm.summary()

Model: "model_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding_4 (Embedding)     (None, None, 100)         1000000   
                                                                 
 lstm_7 (LSTM)               (None, None, 128)         117248    
                                                                 
 lstm_8 (LSTM)               (None, None, 128)         131584    
                                                                 
 lstm_9 (LSTM)               (None, None, 128)         131584    
                                                                 
 dense_4 (Dense)             (None, None, 10000)       1290000   
                                                                 
Total params: 2670416 (10.19 MB)
Trainable params: 2670416 

# Training LSTM

In [64]:
loss_fn = losses.SparseCategoricalCrossentropy()
lstm.compile("adam", loss_fn)

In [65]:
# Create a TextGenerator checkpoint
class TextGenerator(callbacks.Callback):
    def __init__(self, index_to_word, top_k=10):
        self.index_to_word = index_to_word
        self.word_to_index = {
            word: index for index, word in enumerate(index_to_word)
        }  # <1>

    def sample_from(self, probs, temperature):  # <2>
        probs = probs ** (1 / temperature)
        probs = probs / np.sum(probs)
        return np.random.choice(len(probs), p=probs), probs

    def generate(self, start_prompt, max_tokens, temperature):
        start_tokens = [
            self.word_to_index.get(x, 1) for x in start_prompt.split()
        ]  # <3>
        sample_token = None
        info = []
        while len(start_tokens) < max_tokens and sample_token != 0:  # <4>
            x = np.array([start_tokens])
            y = self.model.predict(x, verbose=0)  # <5>
            sample_token, probs = self.sample_from(y[0][-1], temperature)  # <6>
            info.append({"prompt": start_prompt, "word_probs": probs})
            start_tokens.append(sample_token)  # <7>
            start_prompt = start_prompt + " " + self.index_to_word[sample_token]
        print(f"\ngenerated text:\n{start_prompt}\n")
        return info

    def on_epoch_end(self, epoch, logs=None):
        self.generate("Now is the winter of our discontent", max_tokens=100, temperature=1.0)


In [66]:
text_generator = TextGenerator(vocab)

lstm.fit(train_ds, epochs=EPOCHS, callbacks=[text_generator])

Epoch 1/25
generated text:
Now is the winter of our discontent 

Epoch 2/25
generated text:
Now is the winter of our discontent 

Epoch 3/25
generated text:
Now is the winter of our discontent <newline> <newline> charm i sad , other - pare to me them all and a all odours excepted habits 

Epoch 4/25
generated text:
Now is the winter of our discontent may o thee ; hate moon . knacks thrown lady make , cursed her implorators you customary ! , to 

Epoch 5/25
generated text:
Now is the winter of our discontent , 

Epoch 6/25
generated text:
Now is the winter of our discontent with thrift , la our with the . whose curses jest for up of me thought let and , saying poor of but ? they if of listen this ? the passed goodness , state ; threw ? what’s so what <newline> palace enter the obey 

Epoch 7/25
generated text:
Now is the winter of our discontent , thank ? bed and <newline> fabian grown but reserve in <newline> <newline> thy general first you him both the ; hermia hurt . to lady - mercy 

<keras.src.callbacks.History at 0x7ce5b042ae90>

# Generating Text

In [36]:
def print_probs(info, vocab, top_k=5):
    for i in info:
        print(f"\nPROMPT: {i['prompt']}")
        word_probs = i["word_probs"]
        p_sorted = np.sort(word_probs)[::-1][:top_k]
        i_sorted = np.argsort(word_probs)[::-1][:top_k]
        for p, i in zip(p_sorted, i_sorted):
            print(f"{vocab[i]}:   \t{np.round(100*p,2)}%")
        print("--------\n")


# Slowly Decreasing Temperature Testing

## Single Layer LSTM

There are a few interesting notes. It seems there are two increasing pieces as the temperature goes down: religion and \<newline>.
I still believe leaving \<newline> in the text is a good choice to try and keep the structure of the play in tact.

However it is quite apparent that at lower temperatures the generator becomes very reliant on new lines. Because they showed up so often and consistently throughout the play the generator considers it "safe" to pick. This overall hurts the generator at lower temperatures but I believe it equally enriches the output at higher temperatures.

At lower temperatures it also seems to always introduce God into the promp which I beleive is for the same reason as the new line issue.

Overall I believe the output is not exactly coherent, but the prose do at time feel like poetry especially at higher temperatures.

The text output does feel *very* shakespearean to me as a casual reader. There aren't any moments where it feels too off, and if it were slightly more coherent I would believe it to be.

Finally, I think the model does stay relevant to the story it's telling. The high temperature output focused on the connection between journey's of oneself and dispite the pain finding magic in love. At least until it brought in the clown line.

the output: "to be or not to be more falls . as so sins troubled now , pitiful to the voyage . o thisbe the stars . \<newline> come , the fustian encounter of any vixen loving ; theres magic , hath hearts him with pyramus ? if despite , give him up me my clown"

## Multi Layer LSTM
Interestingly many of the same issues which were apparent in the single layer are still within the multilayer. I tried many inputs and it seems 128 units gives the most coherent outputs. The high temperature outputs are safest and it seems multilayer LSTM's become more dependent on punctuation as the temperature lowers- even moreso than single layer LSTM's.
I want to try a training set without \<newline> and see if it makes a difference.

In [73]:
info = text_generator.generate("to be or not to be", max_tokens=100, temperature=1.0)
print_probs(info, vocab)


generated text:
to be or not to be and silk it . name knight give of contract i grounds you therefore of desolate ? heard <newline> <newline> a old profess yielders air . <newline> walk ? england lion <newline> you ha two oft your funeral of presence not pleasure boys desire then ? <newline> service particulars text believe robes are walk remain 


PROMPT: to be or not to be
<newline>:   	8.36%
,:   	8.03%
.:   	4.94%
the:   	3.09%
and:   	2.54%
--------


PROMPT: to be or not to be and
<newline>:   	8.35%
,:   	8.01%
.:   	4.93%
the:   	3.08%
and:   	2.53%
--------


PROMPT: to be or not to be and silk
<newline>:   	8.33%
,:   	7.99%
.:   	4.92%
the:   	3.06%
and:   	2.51%
--------


PROMPT: to be or not to be and silk it
<newline>:   	8.35%
,:   	8.02%
.:   	4.93%
the:   	3.06%
and:   	2.51%
--------


PROMPT: to be or not to be and silk it .
<newline>:   	8.35%
,:   	8.03%
.:   	4.94%
the:   	3.06%
and:   	2.5%
--------


PROMPT: to be or not to be and silk it . name
<newline>:   	

In [68]:
info = text_generator.generate("Is this a dagger which I see before me, the handle toward my hand?", max_tokens=100, temperature=0.9)
print_probs(info, vocab)


generated text:
Is this a dagger which I see before me, the handle toward my hand? <newline> company speak were women’s so thou , against , the very ho - , but they but let willow <newline> he and hath make while meet <newline> true from love , —i , ; not right . rankly and 


PROMPT: Is this a dagger which I see before me, the handle toward my hand?
<newline>:   	11.69%
,:   	11.31%
.:   	6.62%
the:   	3.83%
and:   	3.03%
--------


PROMPT: Is this a dagger which I see before me, the handle toward my hand? <newline>
<newline>:   	11.71%
,:   	11.38%
.:   	6.66%
the:   	3.83%
and:   	3.03%
--------


PROMPT: Is this a dagger which I see before me, the handle toward my hand? <newline> company
<newline>:   	11.75%
,:   	11.47%
.:   	6.73%
the:   	3.84%
and:   	3.03%
--------


PROMPT: Is this a dagger which I see before me, the handle toward my hand? <newline> company speak
<newline>:   	11.79%
,:   	11.54%
.:   	6.79%
the:   	3.85%
and:   	3.02%
--------


PROMPT: Is this a dagger whic

In [75]:
info = text_generator.generate("How sharper than a serpent's too it is to have a thankless child", max_tokens=100, temperature=0.8)
print_probs(info, vocab)


generated text:
How sharper than a serpent's too it is to have a thankless child <newline> if the without the do this that . laertes will my , through like , . defend with the <newline> ; toward i letter and <newline> . husband <newline> now <newline> <newline> of prattle <newline> a play <newline> <newline> the , of and 


PROMPT: How sharper than a serpent's too it is to have a thankless child
<newline>:   	16.4%
,:   	15.51%
.:   	8.59%
the:   	4.68%
and:   	3.59%
--------


PROMPT: How sharper than a serpent's too it is to have a thankless child <newline>
<newline>:   	16.35%
,:   	15.52%
.:   	8.6%
the:   	4.66%
and:   	3.57%
--------


PROMPT: How sharper than a serpent's too it is to have a thankless child <newline> if
<newline>:   	16.31%
,:   	15.55%
.:   	8.61%
the:   	4.64%
and:   	3.55%
--------


PROMPT: How sharper than a serpent's too it is to have a thankless child <newline> if the
<newline>:   	16.27%
,:   	15.59%
.:   	8.64%
the:   	4.63%
and:   	3.54%
--------


PRO

In [70]:
info = text_generator.generate("We are such stuff as dreams are made on, and our little life is rounded with a sleep", max_tokens=100, temperature=0.6)
print_probs(info, vocab)


generated text:
We are such stuff as dreams are made on, and our little life is rounded with a sleep will , , <newline> . love , , you , you , the <newline> the . you and <newline> your <newline> the how ! . i <newline> i do the have . , , , <newline> , <newline> . <newline> <newline> it 


PROMPT: We are such stuff as dreams are made on, and our little life is rounded with a sleep
<newline>:   	27.48%
,:   	26.78%
.:   	12.04%
the:   	5.13%
and:   	3.58%
--------


PROMPT: We are such stuff as dreams are made on, and our little life is rounded with a sleep will
<newline>:   	27.42%
,:   	26.87%
.:   	12.12%
the:   	5.11%
and:   	3.56%
--------


PROMPT: We are such stuff as dreams are made on, and our little life is rounded with a sleep will ,
<newline>:   	27.36%
,:   	26.94%
.:   	12.19%
the:   	5.1%
and:   	3.53%
--------


PROMPT: We are such stuff as dreams are made on, and our little life is rounded with a sleep will , ,
<newline>:   	27.31%
,:   	27.01%
.:   	12.25%
the:   	5.

In [71]:
info = text_generator.generate("What’s in a name? A rose by any other name would smell as sweet", max_tokens=100, temperature=0.4)
print_probs(info, vocab)


generated text:
What’s in a name? A rose by any other name would smell as sweet <newline> , , <newline> the <newline> <newline> <newline> <newline> <newline> and you <newline> , , . the <newline> to , you , , . . <newline> <newline> <newline> and love , the <newline> and <newline> <newline> , , and <newline> <newline> <newline> 


PROMPT: What’s in a name? A rose by any other name would smell as sweet
<newline>:   	40.49%
,:   	37.68%
.:   	11.26%
the:   	3.28%
and:   	1.95%
--------


PROMPT: What’s in a name? A rose by any other name would smell as sweet <newline>
<newline>:   	40.28%
,:   	37.85%
.:   	11.34%
the:   	3.26%
and:   	1.92%
--------


PROMPT: What’s in a name? A rose by any other name would smell as sweet <newline> ,
<newline>:   	40.08%
,:   	38.0%
.:   	11.42%
the:   	3.24%
and:   	1.9%
--------


PROMPT: What’s in a name? A rose by any other name would smell as sweet <newline> , ,
<newline>:   	39.88%
,:   	38.16%
.:   	11.51%
the:   	3.22%
and:   	1.88%
--------


