# Macbeth meets RNN
We'll be using the story "Macbeth" by William Shakeaspeare.
The story is in public domain; the text file was obtained from nltk.

In [1]:
import nltk
import spacy
from spacy import displacy
import random
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense,LSTM,Embedding
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from pickle import dump,load

Using TensorFlow backend.


## Functions

In [2]:
# desconsider the punctuation
def separate_punc(doc_text):
    return [token.text.lower() for token in nlp(doc_text) \
            if token.text not in ' \n\n   \n \n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n ']

# create the LSTM model
def create_model(vocab_size, seq_len):
    model = Sequential()
    model.add(Embedding(vocab_size,seq_len,input_length = seq_len))
    model.add(LSTM(seq_len*2,return_sequences=True))
    model.add(LSTM(seq_len*2))
    model.add(Dense(50,activation='relu'))
    model.add(Dense(vocab_size,activation='softmax'))
    model.compile(loss = 'categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    model.summary()
    return model

# the function takes in the 'model' the tokenizer which has knowledge about the vocabulary in what ID number goes
# with what word. Some seed text you want to start off with. And this is robust enough to have shorter seed text
# or longer seed text and sequence length.

# for better results, you should make the text the same length as what was trained on. Otherwise, you have to pad it
# and then the number of words we want to generate.

def generator_text(model, tokenizer,seq_len, seed_text,num_gen_words):
    
    # the output text which is the final output
    output_text = []
    input_text = seed_text # 25 words
    
    for i in range(num_gen_words):
        # take the input text string and encode it to be a sequence
        # we transform those raw text data into sequences of numbers.
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]
        
        # then if the seed text is too short or to long I may need to pad it (cut off or add it)
        pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating = 'pre')
        # after that I'm going to predict the class probabilities for each word.
        # the line below is essentially going to throw out the entire vocabulary.
        # Assign a probability to the most likely next word.
        pred_word_ind = model.predict_classes(pad_encoded, verbose = 0)[0] #[0] return the index of that particular word
        pred_word = tokenizer.index_word[pred_word_ind]
        # then we're going to take in the input text and I'm going to add the predicted word
        input_text += ' ' + pred_word
        output_text.append(pred_word)
    
    return ' '.join(output_text)

## Data Preprocessing

In [3]:
# import the "Macbeth" by William Shakeaspeare
path = nltk.data.find('corpora/gutenberg/shakespeare-macbeth.txt')
macbeth = open(path, 'r').read()

In [4]:
# setting the basics configs
nlp = spacy.load('en_core_web_sm', disable=['parser','tagger','ner'])
nlp.max_length = 1198623
doc = nlp(macbeth)
tokens = separate_punc(str(doc))

In [5]:
len(tokens)

18089

In [6]:
# pass 25 words --> network predict 26th
train_len = 25+1
text_sequences = []
for i in range(train_len, len(tokens)):
    seq = tokens[i-train_len:i]
    text_sequences.append(seq)

In [7]:
' '.join(text_sequences[0])

'the tragedie of macbeth by william shakespeare 1603 actus primus scoena prima thunder and lightning enter three witches 1 when shall we three meet againe in'

In [8]:
# So what we`ve done here is we`ve essentially replaced our original text sequences which are just sequences
# of text (26 long) to sequences that happen to have numbers in place of those words
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)
sequences = tokenizer.texts_to_sequences(text_sequences)

In [9]:
tokenizer.index_word

{1: 'the',
 2: 'and',
 3: 'to',
 4: 'of',
 5: 'i',
 6: 'a',
 7: 'that',
 8: 'my',
 9: 'you',
 10: 'in',
 11: 'not',
 12: 'is',
 13: 'it',
 14: 'with',
 15: 'his',
 16: 'be',
 17: 'macb',
 18: "'s",
 19: 'your',
 20: 'our',
 21: 'haue',
 22: 'but',
 23: 'what',
 24: "'",
 25: 'me',
 26: 'he',
 27: 'for',
 28: 'this',
 29: 'all',
 30: 'so',
 31: 'him',
 32: 'as',
 33: 'thou',
 34: 'we',
 35: 'enter',
 36: 'which',
 37: 'are',
 38: 'will',
 39: 'they',
 40: 'shall',
 41: 'no',
 42: 'then',
 43: 'do',
 44: 'their',
 45: 'thee',
 46: 'macbeth',
 47: 'vpon',
 48: 'on',
 49: 'macd',
 50: 'from',
 51: 'yet',
 52: 'thy',
 53: 'king',
 54: 'come',
 55: 'vs',
 56: 'there',
 57: 'now',
 58: 'hath',
 59: 'at',
 60: 'more',
 61: 'who',
 62: 'good',
 63: 'rosse',
 64: 'by',
 65: 'them',
 66: 'lady',
 67: 'would',
 68: 'time',
 69: 'can',
 70: 'was',
 71: 'like',
 72: 'her',
 73: 'if',
 74: 'let',
 75: 'should',
 76: 'did',
 77: 'when',
 78: 'where',
 79: 'say',
 80: 'were',
 81: 'make',
 82: 'banquo'

In [10]:
for i in sequences[0]:
    print(f"{i} : {tokenizer.index_word[i]}")

1 : the
3526 : tragedie
4 : of
46 : macbeth
64 : by
3525 : william
3524 : shakespeare
3523 : 1603
505 : actus
3522 : primus
3521 : scoena
504 : prima
294 : thunder
2 : and
1376 : lightning
35 : enter
213 : three
331 : witches
100 : 1
77 : when
40 : shall
34 : we
213 : three
359 : meet
136 : againe
10 : in


In [11]:
# it also has things like word counts which essentially counts how many times these words show up
tokenizer.word_counts

OrderedDict([('the', 16774),
             ('tragedie', 4),
             ('of', 8740),
             ('macbeth', 1564),
             ('by', 1253),
             ('william', 6),
             ('shakespeare', 7),
             ('1603', 8),
             ('actus', 113),
             ('primus', 10),
             ('scoena', 11),
             ('prima', 116),
             ('thunder', 221),
             ('and', 14152),
             ('lightning', 41),
             ('enter', 2096),
             ('three', 326),
             ('witches', 200),
             ('1', 825),
             ('when', 1034),
             ('shall', 1763),
             ('we', 2143),
             ('meet', 180),
             ('againe', 545),
             ('in', 5200),
             ('or', 962),
             ('raine', 26),
             ('2', 520),
             ('hurley', 26),
             ('burley', 26),
             ("'s", 3406),
             ('done', 910),
             ('battaile', 26),
             ('lost', 156),
             ('wonne',

In [12]:
vocab_size = len(tokenizer.word_counts)
vocab_size

3527

In [13]:
sequences = np.array(sequences)
sequences

array([[   1, 3526,    4, ...,  359,  136,   10],
       [3526,    4,   46, ...,  136,   10,  294],
       [   4,   46,   64, ...,   10,  294, 1376],
       ...,
       [   2,  224,   30, ..., 1379, 3527,    1],
       [ 224,   30,  503, ..., 3527,    1, 3526],
       [  30,  503,    3, ...,    1, 3526,    4]])

In [14]:
# grab everthing excepts the last column
X = sequences[:,:-1]
# grab the last column
y = sequences[:,-1]
# the plus 1 is because the way Keras padding works. It needs an extra one to holds zero
y = to_categorical(y,num_classes = vocab_size+1)
seq_len = X.shape[1]
X.shape
# 18711 sequences and so those were essentially those shifted 25 words sentences and then how many words are in each sentence

(18063, 25)

## Build the Neural Network

In [15]:
model = create_model(vocab_size+1,seq_len)

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 25, 25)            88200     
_________________________________________________________________
lstm_1 (LSTM)                (None, 25, 50)            15200     
_________________________________________________________________
lstm_2 (LSTM)                (None, 50)                20200     
_________________________________________________________________
dense_1 (Dense)              (None, 50)                2550      
_________________________________________________________________
dense_2 (Dense)              (None, 3528)              179928    
Total params: 306,078
Trainable params: 306,078
Non-trainable params: 0
_________________________________________________________________


In [16]:
# previously I was using the terms train test split it wasn't really a train split
# it was actually features labels split. Because there's nothing to test
# There's no right answer as far as what text generated should look like.
# instead we are really just texting these features against predicted label

# the batch_size is how many sequences you want to passen a time.
model.fit(X,y,batch_size=128,epochs=390,verbose=1)

Instructions for updating:
Use tf.cast instead.
Epoch 1/390
Epoch 2/390
Epoch 3/390


Epoch 4/390
Epoch 5/390


Epoch 6/390
Epoch 7/390


Epoch 8/390
Epoch 9/390


Epoch 10/390
Epoch 11/390


Epoch 12/390
Epoch 13/390


Epoch 14/390
Epoch 15/390


Epoch 16/390
Epoch 17/390


Epoch 18/390
Epoch 19/390


Epoch 20/390
Epoch 21/390


Epoch 22/390
Epoch 23/390


Epoch 24/390
Epoch 25/390


Epoch 26/390
Epoch 27/390


Epoch 28/390
Epoch 29/390


Epoch 30/390
Epoch 31/390


Epoch 32/390
Epoch 33/390


Epoch 34/390
Epoch 35/390


Epoch 36/390
Epoch 37/390


Epoch 38/390
Epoch 39/390


Epoch 40/390
Epoch 41/390


Epoch 42/390
Epoch 43/390


Epoch 44/390
Epoch 45/390


Epoch 46/390
Epoch 47/390


Epoch 48/390
Epoch 49/390


Epoch 50/390
Epoch 51/390


Epoch 52/390
Epoch 53/390


Epoch 54/390
Epoch 55/390


Epoch 56/390
Epoch 57/390


Epoch 58/390
Epoch 59/390


Epoch 60/390
Epoch 61/390


Epoch 62/390
Epoch 63/390


Epoch 64/390
Epoch 65/390


Epoch 66/390
Epoch 67/390


Epoch 68/390
Epoch 69/390


Epoch 70/390
Epoch 71/390


Epoch 72/390
Epoch 73/390


Epoch 74/390
Epoch 75/390


Epoch 76/390
Epoch 77/390


Epoch 78/390
Epoch 79/390


Epoch 80/390
Epoch 81/390


Epoch 82/390
Epoch 83/390


Epoch 84/390
Epoch 85/390


Epoch 86/390
Epoch 87/390


Epoch 88/390
Epoch 89/390


Epoch 90/390
Epoch 91/390


Epoch 92/390
Epoch 93/390


Epoch 94/390
Epoch 95/390


Epoch 96/390
Epoch 97/390


Epoch 98/390
Epoch 99/390


Epoch 100/390
Epoch 101/390


Epoch 102/390
Epoch 103/390


Epoch 104/390
Epoch 105/390


Epoch 106/390
Epoch 107/390


Epoch 108/390
Epoch 109/390


Epoch 110/390
Epoch 111/390


Epoch 112/390
Epoch 113/390


Epoch 114/390
Epoch 115/390


Epoch 116/390
Epoch 117/390


Epoch 118/390
Epoch 119/390


Epoch 120/390
Epoch 121/390


Epoch 122/390
Epoch 123/390


Epoch 124/390
Epoch 125/390


Epoch 126/390
Epoch 127/390


Epoch 128/390
Epoch 129/390


Epoch 130/390
Epoch 131/390


Epoch 132/390
Epoch 133/390


Epoch 134/390
Epoch 135/390


Epoch 136/390
Epoch 137/390


Epoch 138/390
Epoch 139/390


Epoch 140/390
Epoch 141/390


Epoch 142/390
Epoch 143/390


Epoch 144/390
Epoch 145/390


Epoch 146/390
Epoch 147/390


Epoch 148/390
Epoch 149/390


Epoch 150/390
Epoch 151/390


Epoch 152/390
Epoch 153/390


Epoch 154/390
Epoch 155/390


Epoch 156/390
Epoch 157/390


Epoch 158/390
Epoch 159/390


Epoch 160/390
Epoch 161/390


Epoch 162/390
Epoch 163/390


Epoch 164/390
Epoch 165/390


Epoch 166/390
Epoch 167/390


Epoch 168/390
Epoch 169/390


Epoch 170/390
Epoch 171/390


Epoch 172/390
Epoch 173/390


Epoch 174/390
Epoch 175/390


Epoch 176/390
Epoch 177/390


Epoch 178/390
Epoch 179/390


Epoch 180/390
Epoch 181/390


Epoch 182/390
Epoch 183/390


Epoch 184/390
Epoch 185/390


Epoch 186/390
Epoch 187/390


Epoch 188/390
Epoch 189/390


Epoch 190/390
Epoch 191/390


Epoch 228/390
Epoch 229/390


Epoch 230/390
Epoch 231/390


Epoch 250/390
Epoch 251/390


Epoch 252/390
Epoch 253/390


Epoch 254/390
Epoch 255/390


Epoch 256/390
Epoch 257/390


Epoch 258/390
Epoch 259/390


Epoch 260/390
Epoch 261/390


Epoch 262/390
Epoch 263/390


Epoch 264/390
Epoch 265/390


Epoch 266/390
Epoch 267/390


Epoch 268/390
Epoch 269/390


Epoch 270/390
Epoch 271/390


Epoch 272/390
Epoch 273/390


Epoch 274/390
Epoch 275/390


Epoch 276/390
Epoch 277/390


Epoch 278/390
Epoch 279/390


Epoch 280/390
Epoch 281/390


Epoch 282/390
Epoch 283/390


Epoch 284/390
Epoch 285/390


Epoch 286/390
Epoch 287/390


Epoch 288/390
Epoch 289/390


Epoch 290/390
Epoch 291/390


Epoch 292/390
Epoch 293/390


Epoch 294/390
Epoch 295/390


Epoch 296/390
Epoch 297/390


Epoch 298/390
Epoch 299/390


Epoch 300/390
Epoch 301/390


Epoch 302/390
Epoch 303/390


Epoch 304/390
Epoch 305/390


Epoch 306/390
Epoch 307/390


Epoch 308/390
Epoch 309/390


Epoch 310/390
Epoch 311/390


Epoch 312/390
Epoch 313/390


Epoch 314/390
Epoch 315/390


Epoch 316/390
Epoch 317/390


Epoch 318/390
Epoch 319/390


Epoch 320/390
Epoch 321/390


Epoch 322/390
Epoch 323/390


Epoch 324/390
Epoch 325/390


Epoch 326/390
Epoch 327/390


Epoch 328/390
Epoch 329/390


Epoch 330/390
Epoch 331/390


Epoch 332/390
Epoch 333/390


Epoch 334/390
Epoch 335/390


Epoch 336/390
Epoch 337/390


Epoch 338/390
Epoch 339/390


Epoch 340/390
Epoch 341/390


Epoch 342/390
Epoch 343/390


Epoch 344/390
Epoch 345/390


Epoch 346/390
Epoch 347/390


Epoch 348/390
Epoch 349/390


Epoch 350/390
Epoch 351/390


Epoch 352/390
Epoch 353/390


Epoch 354/390
Epoch 355/390


Epoch 356/390
Epoch 357/390


Epoch 358/390
Epoch 359/390


Epoch 360/390
Epoch 361/390


Epoch 362/390
Epoch 363/390


Epoch 364/390
Epoch 365/390


Epoch 366/390
Epoch 367/390


Epoch 368/390
Epoch 369/390


Epoch 370/390
Epoch 371/390


Epoch 372/390
Epoch 373/390


Epoch 374/390
Epoch 375/390


Epoch 376/390
Epoch 377/390


Epoch 378/390
Epoch 379/390


Epoch 380/390
Epoch 381/390


Epoch 382/390
Epoch 383/390


Epoch 384/390
Epoch 385/390


Epoch 386/390
Epoch 387/390


Epoch 388/390
Epoch 389/390


Epoch 390/390


<keras.callbacks.History at 0x224eb9e1ef0>

In [17]:
# Saving the model and the tokenizer
model.save('macbeth_model.h5')
dump(tokenizer,open('my_macbeth_tokenizer','wb'))

## Predict the next words

In [18]:
# using random sequences
random.seed(101)
random_pick = random.randint(0,len(text_sequences))
random_seed_text = text_sequences[random_pick]
seed_text = ' '.join(random_seed_text)

In [19]:
# generate the next word
model = load_model('macbeth_model.h5')
tokenizer = load(open('my_macbeth_tokenizer','rb'))
generator_text(model, tokenizer,seq_len, seed_text=seed_text, num_gen_words=25)

"know it further before where they vanish'd you in no lesse materiall in me naught in the winde macb i without your sad bosomes empty"