# Shakespeare Play Generator

In [2]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import Sequential
import tensorflow.keras.utils as ku 

# set seeds for reproducibility
import tensorflow as tf
from numpy.random import seed
tf.random.set_seed(2)
seed(1)

import pandas as pd
import numpy as np
import string, os 

import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
# Load the Shakespeare dataset from the local archive folder; its `PlayerLine`
# column supplies the raw text lines for the corpus.
play_df = pd.read_csv('./archive/Shakespeare_data.csv')

In [4]:
# Materialize every entry of the PlayerLine column as a plain Python list.
all_lines = list(play_df["PlayerLine"])
def clean_text(txt):
    """Normalize one line of text before tokenization.

    Removes every character listed in ``string.punctuation``, lower-cases the
    remainder, and drops any character that is not representable in ASCII.

    Args:
        txt: The raw text line (a ``str``).

    Returns:
        The cleaned, lower-case, ASCII-only string.
    """
    punctuation_remover = str.maketrans("", "", string.punctuation)
    lowered = txt.translate(punctuation_remover).lower()
    # Round-trip through UTF-8 bytes so non-ASCII characters are silently discarded.
    ascii_only = lowered.encode("utf8").decode("ascii", "ignore")
    return ascii_only

# Clean every raw line, then preview the first ten normalized entries.
corpus = list(map(clean_text, all_lines))
corpus[:10]

['act i',
 'scene i london the palace',
 'enter king henry lord john of lancaster the earl of westmoreland sir walter blunt and others',
 'so shaken as we are so wan with care',
 'find we a time for frighted peace to pant',
 'and breathe shortwinded accents of new broils',
 'to be commenced in strands afar remote',
 'no more the thirsty entrance of this soil',
 'shall daub her lips with her own childrens blood',
 'nor more shall trenching war channel her fields']

In [5]:
# Notebook-level tokenizer shared across cells: it is fitted inside
# get_sequence_of_tokens and read again by generate_text.
tokenizer = Tokenizer()

def get_sequence_of_tokens(corpus, max_lines=7000, text_tokenizer=None):
    """Fit a tokenizer on the corpus and expand each line into n-gram prefixes.

    Args:
        corpus: Sequence of cleaned text lines (must support slicing).
        max_lines: Number of leading lines of ``corpus`` to use. Defaults to
            7000, the value that was previously hard-coded; pass ``None`` to
            use every line.
        text_tokenizer: Object exposing ``fit_on_texts``, ``word_index`` and
            ``texts_to_sequences``. When omitted, the notebook-level
            ``tokenizer`` is used (and fitted in place).

    Returns:
        A tuple ``(input_sequences, total_words)``. ``input_sequences`` holds,
        for every line, each token prefix from length 2 up to the whole line;
        ``total_words`` is ``len(word_index) + 1``.
    """
    if text_tokenizer is None:
        # Fall back to the shared notebook-level instance so existing calls keep working.
        text_tokenizer = tokenizer

    ## tokenization
    corpus = corpus[:max_lines]
    text_tokenizer.fit_on_texts(corpus)
    # Keras Tokenizer word indices start at 1 (0 is reserved), hence the +1.
    total_words = len(text_tokenizer.word_index) + 1

    ## convert data to sequence of tokens
    input_sequences = []
    # Tokenize all lines in one call instead of one call per line.
    for token_list in text_tokenizer.texts_to_sequences(corpus):
        # Lines with fewer than two tokens yield no training sequence.
        for end in range(2, len(token_list) + 1):
            input_sequences.append(token_list[:end])
    return input_sequences, total_words

# Build the n-gram token sequences and the vocabulary size from the cleaned
# corpus, then preview the first ten sequences.
inp_sequences, total_words = get_sequence_of_tokens(corpus)
inp_sequences[:10]

[[523, 4],
 [142, 4],
 [142, 4, 339],
 [142, 4, 339, 1],
 [142, 4, 339, 1, 670],
 [53, 41],
 [53, 41, 84],
 [53, 41, 84, 29],
 [53, 41, 84, 29, 124],
 [53, 41, 84, 29, 124, 3]]

In [6]:
def generate_padded_sequences(input_sequences, num_classes=None):
    """Pad the token sequences and split them into predictors and one-hot labels.

    Args:
        input_sequences: Non-empty list of token-id lists.
        num_classes: Width of each one-hot label vector. When omitted, the
            notebook-level ``total_words`` is used.

    Returns:
        A tuple ``(predictors, label, max_sequence_len)``: ``predictors`` is
        every padded sequence without its last token, ``label`` is the one-hot
        encoding of that last token, and ``max_sequence_len`` is the length of
        the longest input sequence.

    Raises:
        ValueError: If ``input_sequences`` is empty.
    """
    if len(input_sequences) == 0:
        raise ValueError("input_sequences must contain at least one sequence")
    if num_classes is None:
        # Fall back to the notebook-level vocabulary size so existing calls keep working.
        num_classes = total_words

    max_sequence_len = max(len(x) for x in input_sequences)
    # pad_sequences already returns a NumPy array, so no extra np.array copy is needed.
    padded = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')

    # The last column is the word to predict; everything before it is the context.
    predictors, label = padded[:, :-1], padded[:, -1]
    label = ku.to_categorical(label, num_classes=num_classes)
    return predictors, label, max_sequence_len

# Pad the sequences, split off each sequence's final token as its label, and
# display the resulting array shapes.
predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)
predictors.shape, label.shape

((45584, 33), (45584, 6543))

In [7]:
def create_model(max_sequence_len, total_words, embedding_dim=10, lstm_units=512, dropout_rate=0.4):
    """Build and compile the next-word prediction network.

    The network is Embedding -> LSTM -> Dropout -> softmax Dense, compiled
    with categorical cross-entropy loss and the Adam optimizer.

    Args:
        max_sequence_len: Length of the padded sequences including the label
            token; the model input length is this value minus one.
        total_words: Vocabulary size, used for both the embedding input
            dimension and the number of output classes.
        embedding_dim: Size of each word embedding vector (default 10).
        lstm_units: Number of units in the LSTM layer (default 512).
        dropout_rate: Dropout rate applied after the LSTM (default 0.4).

    Returns:
        The compiled ``Sequential`` model.
    """
    # The final token of each padded sequence is the label, not an input.
    input_len = max_sequence_len - 1

    model = Sequential([
        # Input embedding layer
        Embedding(total_words, embedding_dim, input_length=input_len),
        # Hidden layer: single LSTM followed by dropout
        LSTM(lstm_units),
        Dropout(dropout_rate),
        # Output layer: one probability per vocabulary entry
        Dense(total_words, activation='softmax'),
    ])

    model.compile(loss='categorical_crossentropy', optimizer='adam')

    return model

# Instantiate the network for this notebook's sequence length and vocabulary
# size, then print its layer-by-layer summary.
model = create_model(max_sequence_len, total_words)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 33, 10)            65430     
_________________________________________________________________
lstm (LSTM)                  (None, 512)               1071104   
_________________________________________________________________
dropout (Dropout)            (None, 512)               0         
_________________________________________________________________
dense (Dense)                (None, 6543)              3356559   
Total params: 4,493,093
Trainable params: 4,493,093
Non-trainable params: 0
_________________________________________________________________


In [8]:
# Train for 20 epochs on the full predictor/label set. No validation data is
# passed, so only the training loss is reported (one log line per epoch).
model.fit(predictors, label, epochs=20, verbose=2)

Epoch 1/20
1425/1425 - 549s - loss: 6.8415
Epoch 2/20
1425/1425 - 495s - loss: 6.5198
Epoch 3/20
1425/1425 - 477s - loss: 6.3824
Epoch 4/20
1425/1425 - 315s - loss: 6.2359
Epoch 5/20
1425/1425 - 287s - loss: 6.0600
Epoch 6/20
1425/1425 - 295s - loss: 5.8758
Epoch 7/20
1425/1425 - 256s - loss: 5.6699
Epoch 8/20
1425/1425 - 309s - loss: 5.4338
Epoch 9/20
1425/1425 - 321s - loss: 5.1848
Epoch 10/20
1425/1425 - 236s - loss: 4.9108
Epoch 11/20
1425/1425 - 243s - loss: 4.6376
Epoch 12/20
1425/1425 - 254s - loss: 4.3579
Epoch 13/20
1425/1425 - 244s - loss: 4.1049
Epoch 14/20
1425/1425 - 226s - loss: 3.8681
Epoch 15/20
1425/1425 - 214s - loss: 3.6492
Epoch 16/20
1425/1425 - 199s - loss: 3.4520
Epoch 17/20
1425/1425 - 210s - loss: 3.2831
Epoch 18/20
1425/1425 - 219s - loss: 3.1145
Epoch 19/20
1425/1425 - 228s - loss: 2.9807
Epoch 20/20
1425/1425 - 196s - loss: 2.8438


<tensorflow.python.keras.callbacks.History at 0x68d5c3b90>

In [9]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    """Extend ``seed_text`` by repeatedly appending the model's most likely next word.

    Args:
        seed_text: Starting text; it is tokenized with the notebook-level
            ``tokenizer``.
        next_words: Number of words to append.
        model: Trained model whose ``predict`` returns one probability per
            vocabulary index.
        max_sequence_len: Padded sequence length used in training (including
            the label token); inputs are padded/truncated to this minus one.

    Returns:
        The seed text followed by the generated words, title-cased.
    """
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        # `predict_classes` is deprecated and removed in later TensorFlow releases;
        # take the argmax of the softmax output instead, as a plain int.
        probabilities = model.predict(token_list, verbose=0)
        predicted = int(np.argmax(probabilities, axis=-1)[0])

        # `index_word` is the reverse of `word_index`. Index 0 (padding) has no
        # word, in which case an empty string is appended, as before.
        output_word = tokenizer.index_word.get(predicted, "")
        seed_text += " "+output_word
    return seed_text.title()

In [10]:
# Generate a 20-word continuation for each seed phrase, numbering results from 1.
seed_phrases = ["Julius", "Thou", "King is", "Death of", "The Princess", "Thanos"]
for number, phrase in enumerate(seed_phrases, start=1):
    print (str(number) + ". ", generate_text(phrase, 20, model, max_sequence_len))

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).
1.  Julius If I Have Dealt Ill Make Him If I Do Not If I Am My Blood If I Can Not
2.  Thou Art The Most Mortimer Of The Commons Gates Out Of England And The Sea Of A Coats Then Within Two
3.  King Is Much Myself And Never Evident Are Me You I Know Not Be Old Out Of The Living Burly Of The
4.  Death Of The Days Of The Land Month At Me And The Fifth Then Then And Their Half And C It Thunders
5.  The Princess Of The Day Of Hosts He Fought Of The Rest Of The Ladder Of The Gallows Stars And A Feast
6.  Thanos Me I Am Withered As Back I Am Purge To A Cat Of My Own Lord Of London It And
