# Shakespeare

### Import Libraries

In [1]:
import pandas as pd
import numpy as np

from keras.preprocessing.text import text_to_word_sequence, Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers import Dense

Using TensorFlow backend.


### Import Data

In [2]:
# Load the per-line Shakespeare dataset (play, act/scene/line, speaker and the
# line text itself) into a DataFrame for inspection.
sd = pd.read_csv('Shakespeare_data.csv')

In [3]:
# Preview the first rows to check the column layout.
sd.head()

Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine
0,1,Henry IV,,,,ACT I
1,2,Henry IV,,,,SCENE I. London. The palace.
2,3,Henry IV,,,,"Enter KING HENRY, LORD JOHN OF LANCASTER, the ..."
3,4,Henry IV,1.0,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,"
4,5,Henry IV,1.0,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,"


In [4]:
# Report column dtypes and non-null counts to spot missing values.
sd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 111396 entries, 0 to 111395
Data columns (total 6 columns):
Dataline            111396 non-null int64
Play                111396 non-null object
PlayerLinenumber    111393 non-null float64
ActSceneLine        105153 non-null object
Player              111389 non-null object
PlayerLine          111396 non-null object
dtypes: float64(1), int64(1), object(4)
memory usage: 5.1+ MB


In [5]:
# Read the whole corpus into `text`, then split a lower-cased copy into
# individual lines. `text` itself keeps its original casing.
with open('alllines.txt', 'r') as corpus_file:
    text = corpus_file.read()

lines = text.lower().split('\n')

In [6]:
# Fit a tokenizer on every word of the corpus, then encode each line as a
# list of integer word indices.
tokenizer = Tokenizer()
words = text_to_word_sequence(text)
tokenizer.fit_on_texts(words)
sequences = tokenizer.texts_to_sequences(lines)

# One more than the number of distinct words: index 0 is never assigned to a
# word by the Tokenizer, so valid ids run from 1 to len(word_index).
vocabulary_size = len(tokenizer.word_index) + 1

# Expand every encoded line into all of its prefixes that contain at least two
# tokens, so each prefix provides "context words + next word" for training.
# Lines with fewer than two tokens contribute nothing.
subsequences = [
    encoded_line[:end]
    for encoded_line in sequences
    for end in range(2, len(encoded_line) + 1)
]

In [7]:
# Padding
# The longest encoded line fixes the common length; shorter prefixes are
# padded at the front ('pre') so the word to predict is always the last column.
sequence_length = max(map(len, sequences))
# NOTE: `sequences` is rebound here from the list of encoded lines to the
# padded 2-D array of prefixes.
sequences = pad_sequences(subsequences, padding='pre', maxlen=sequence_length)

In [8]:
# Encoding
# Inputs: every column but the last (the context words).
x = sequences[:, :-1]
# Targets: the last column (the next word), one-hot encoded over the vocabulary.
y = to_categorical(sequences[:, -1], num_classes=vocabulary_size, dtype=np.int8)

### Model

In [9]:
# Next-word prediction network:
#   word ids -> embedding -> LSTM -> dropout -> softmax over the vocabulary.
EMBEDDING_DIM = 100   # size of each learned word vector
LSTM_UNITS = 100      # width of the LSTM state
DROPOUT_RATE = 0.1    # fraction of LSTM outputs dropped during training

model = Sequential()
# Inputs are left-padded with 0, an index that is never assigned to a word
# (vocabulary_size is len(word_index) + 1). mask_zero=True makes the LSTM skip
# those padding timesteps instead of treating them as a real token; layer
# shapes and parameter counts are unchanged.
model.add(Embedding(vocabulary_size,
                    EMBEDDING_DIM,
                    input_length=sequence_length - 1,
                    mask_zero=True))
model.add(LSTM(LSTM_UNITS))
model.add(Dropout(DROPOUT_RATE))
# One output probability per vocabulary entry.
model.add(Dense(units=vocabulary_size, activation='softmax'))

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [10]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 166, 100)          2557600   
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 25576)             2583176   
Total params: 5,221,176
Trainable params: 5,221,176
Non-trainable params: 0
_________________________________________________________________


In [11]:
# Compile Model
# Categorical cross-entropy matches the one-hot targets; accuracy is tracked
# alongside the loss during training.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [12]:
# Train Model
# A long run (e.g. epochs=500) takes far too long here, so only a single epoch
# is trained. The returned History object is kept in `history` so the recorded
# loss/accuracy stay available to later cells instead of being discarded as
# anonymous cell output.
history = model.fit(x, y, epochs=1)

Instructions for updating:
Use tf.cast instead.
Epoch 1/1


<keras.callbacks.History at 0x22e34b996a0>