In [53]:
import pickle
import numpy
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

# keras module for building LSTM 
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku 

# set seeds for reproducibility
from tensorflow import set_random_seed
from numpy.random import seed
set_random_seed(2)
seed(1)

import pandas as pd
import numpy as np
import string, os 

import warnings
import string

In [54]:
# Load the pickled dialogues object. The `with` block guarantees the file
# handle is closed once the object has been read (the bare open() call left
# it open until garbage collection).
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load 'dialogues.pkl' if it comes from a trusted source.
with open('dialogues.pkl', 'rb') as dialogues_file:
    dialogues_dict = pickle.load(dialogues_file)

In [55]:
# Take a copy of the 'DUMBLEDORE' entry so later edits to dumbledore_quotes
# do not modify the object stored in dialogues_dict.
dumbledore_quotes = dialogues_dict['DUMBLEDORE'].copy()

In [56]:
# Bare expression: shows the quotes as this cell's output for a visual check.
dumbledore_quotes

['I should have known that you would be here...Professor McGonagall.',
 "I'm afraid so, Professor. The good, and the bad.",
 'Hagrid is bringing him.',
 'Ah, Professor, I would trust Hagrid with my life.',
 'No problems, I trust, Hagrid?',
 'The only family he has.',
 "Exactly. He's better off growing up away from all that. Until he is ready.",
 "There, there, Hagrid. It's not really good-bye, after all.",
 'Good luck...Harry Potter.',
 'I have a few start of term notices I wish to announce. The first years please note that the dark forest is strictly forbidden to all students. Also, our caretaker, Mr. Filch  has asked me to remind you that the 3rd floor corridor on the right hand side is out of bounds to everyone who does not wish to die a most painful death. Thank you.',
 'Let the feast...begin.',
 'SILLLLLEEENNNNCEEEEE!  Everyone will please, not panic. Now, Prefects will lead their houses back to the dormitories. Teachers will follow me to the dungeons.',
 'Back again, Harry?  I se

In [70]:
# Concatenate every quote into one string, separated by single spaces.
dumbledore_corpus = ' '.join(dumbledore_quotes)

# Bare expression: shows the joined corpus as this cell's output.
dumbledore_corpus

"I should have known that you would be here...Professor McGonagall. I'm afraid so, Professor. The good, and the bad. Hagrid is bringing him. Ah, Professor, I would trust Hagrid with my life. No problems, I trust, Hagrid? The only family he has. Exactly. He's better off growing up away from all that. Until he is ready. There, there, Hagrid. It's not really good-bye, after all. Good luck...Harry Potter. I have a few start of term notices I wish to announce. The first years please note that the dark forest is strictly forbidden to all students. Also, our caretaker, Mr. Filch  has asked me to remind you that the 3rd floor corridor on the right hand side is out of bounds to everyone who does not wish to die a most painful death. Thank you. Let the feast...begin. SILLLLLEEENNNNCEEEEE!  Everyone will please, not panic. Now, Prefects will lead their houses back to the dormitories. Teachers will follow me to the dungeons. Back again, Harry?  I see that you, like so many before you, have discove

In [73]:
# Turn each ellipsis into a sentence break so the words on either side of it
# are still separated by whitespace once punctuation is stripped.
dumbledore_corpus = dumbledore_corpus.replace('...', '. ')

# Translation table that deletes every character in string.punctuation.
table = str.maketrans('', '', string.punctuation)

# Whitespace-split the corpus, strip punctuation from each piece, keep only the
# pieces that are purely alphabetic, and lower-case what remains.
tokens = [
    cleaned.lower()
    for cleaned in (raw.translate(table) for raw in dumbledore_corpus.split())
    if cleaned.isalpha()
]

In [74]:
# Preview the first 200 tokens, then report corpus length and vocabulary size.
print(tokens[:200])
n_tokens = len(tokens)
n_unique_tokens = len(set(tokens))
print('Total Tokens: %d' % n_tokens)
print('Unique Tokens: %d' % n_unique_tokens)

['i', 'should', 'have', 'known', 'that', 'you', 'would', 'be', 'here', 'professor', 'mcgonagall', 'im', 'afraid', 'so', 'professor', 'the', 'good', 'and', 'the', 'bad', 'hagrid', 'is', 'bringing', 'him', 'ah', 'professor', 'i', 'would', 'trust', 'hagrid', 'with', 'my', 'life', 'no', 'problems', 'i', 'trust', 'hagrid', 'the', 'only', 'family', 'he', 'has', 'exactly', 'hes', 'better', 'off', 'growing', 'up', 'away', 'from', 'all', 'that', 'until', 'he', 'is', 'ready', 'there', 'there', 'hagrid', 'its', 'not', 'really', 'goodbye', 'after', 'all', 'good', 'luck', 'harry', 'potter', 'i', 'have', 'a', 'few', 'start', 'of', 'term', 'notices', 'i', 'wish', 'to', 'announce', 'the', 'first', 'years', 'please', 'note', 'that', 'the', 'dark', 'forest', 'is', 'strictly', 'forbidden', 'to', 'all', 'students', 'also', 'our', 'caretaker', 'mr', 'filch', 'has', 'asked', 'me', 'to', 'remind', 'you', 'that', 'the', 'floor', 'corridor', 'on', 'the', 'right', 'hand', 'side', 'is', 'out', 'of', 'bounds', 't

In [75]:
# Slide a fixed-size window over the token stream and store every window as a
# single space-joined line: 50 context words followed by 1 target word.
length = 50 + 1
# NOTE: the range ends at len(tokens) - 1, so the window that would end on the
# very last token of the corpus is not produced.
sequences = [
    ' '.join(tokens[end - length:end])
    for end in range(length, len(tokens))
]
print('Total Sequences: %d' % len(sequences))

Total Sequences: 5505


In [76]:
def save_doc(lines, filename):
    """Write ``lines`` to ``filename``, one entry per line.

    Args:
        lines: iterable of strings; they are joined with a newline character,
            so no trailing newline is written after the last entry.
        filename: path of the text file to create or overwrite.
    """
    data = '\n'.join(lines)
    # The context manager closes (and flushes) the file even if write() raises.
    with open(filename, 'w') as file:
        file.write(data)

In [77]:
# save sequences to file
# Each element of `sequences` becomes one line of 'dumbledore_sequences.txt'.
out_filename = 'dumbledore_sequences.txt'
save_doc(sequences, out_filename)

In [78]:
# load doc into memory
def load_doc(filename):
    """Return the full contents of the text file ``filename`` as one string.

    Args:
        filename: path of the file to read.

    Returns:
        The file's text, exactly as read (no splitting or stripping).
    """
    # The context manager closes the file even if read() raises.
    with open(filename, 'r') as file:
        return file.read()
 
# load
# Read the saved sequences file back and split it on newlines, giving one
# training line (a space-joined window of words) per list element.
in_filename = 'dumbledore_sequences.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')

In [79]:
# Fit a word-level tokenizer on the training lines, then encode every line as
# a list of integer word indices.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
# NOTE: this rebinds `sequences` from the list of text lines built earlier to
# their integer encoding.
sequences = tokenizer.texts_to_sequences(lines)

In [80]:
# vocabulary size
# The +1 leaves room for index 0: the Keras Tokenizer numbers words from 1 and
# never assigns 0 to a word, so valid indices run from 0 to len(word_index).
vocab_size = len(tokenizer.word_index) + 1

In [141]:
# Bare expression: shows the vocabulary size as this cell's output.
vocab_size

1362

In [84]:
# Split every encoded window into its context words (input) and its final
# word (output).
sequences = np.array(sequences)
# All columns except the last are the model input.
X = sequences[:, :-1]
# The last column is the target word index, one-hot encoded over vocab_size classes.
y = np_utils.to_categorical(sequences[:, -1], num_classes=vocab_size)
# Number of context words per sample (the width of X).
seq_length = X.shape[1]

In [142]:
# define model: embedding -> two stacked LSTMs -> dense layer -> softmax over the vocabulary
model = Sequential()
# Map each word index to a 50-dimensional vector; inputs are seq_length words long.
model.add(Embedding(vocab_size, 50, input_length=seq_length))
# return_sequences=True so the second LSTM receives the output for every time step.
model.add(LSTM(200, return_sequences=True))
model.add(LSTM(100))
model.add(Dense(100, activation='relu'))
model.add(Dropout(0.2))
# One output unit per word index; softmax turns them into next-word probabilities.
model.add(Dense(vocab_size, activation='softmax'))
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line after the table.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_17 (Embedding)     (None, 50, 50)            68100     
_________________________________________________________________
lstm_22 (LSTM)               (None, 50, 200)           200800    
_________________________________________________________________
lstm_23 (LSTM)               (None, 100)               120400    
_________________________________________________________________
dense_24 (Dense)             (None, 100)               10100     
_________________________________________________________________
dropout_2 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_25 (Dense)             (None, 1362)              137562    
Total params: 536,962
Trainable params: 536,962
Non-trainable params: 0
_________________________________________________________________
None

In [143]:
# Configure training: categorical cross-entropy against the one-hot targets,
# Adam optimizer, and accuracy reported alongside the loss.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
# fit model
# Last expression of the cell, so the returned History object is displayed.
model.fit(X, y, epochs=100, batch_size=128)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0xb4719af60>

In [137]:
# save the model to file

model.save('char_word_model.h5')
# save the tokenizer
# generate_seq needs the same word <-> index mapping the model was trained
# with, so the fitted tokenizer is stored next to the model. The `with` block
# closes the file deterministically, ensuring the pickle is fully flushed to
# disk (the bare open() call never closed its handle).
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

In [162]:
# Pick one of the training lines at random to prime the generator, and show it.
seed_index = np.random.randint(0, len(lines))
seed_text = lines[seed_index]
print(seed_text + '\n')

must not be seen and you would i feel do well to return before this last chime if not well the consequences are really too ghastly to discuss three turns should do it i think if you succeed more than one innocent life may be spared tonight by the way when



In [97]:
# Encode the seed text as a list of integer word indices.
# NOTE(review): generate_seq performs this same encoding internally on its own
# local variable; this module-level `encoded` is not read by any later cell
# shown in this notebook.
encoded = tokenizer.texts_to_sequences([seed_text])[0]


In [163]:
def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
    """Generate ``n_words`` words by repeatedly predicting the next word.

    Starting from ``seed_text``, the text so far is encoded, trimmed or padded
    to ``seq_length`` word indices, and fed to ``model``. The most probable
    word index is mapped back to a word, appended to the running text, and the
    process repeats.

    Args:
        model: trained model whose ``predict`` returns, for each input row,
            one score per word index (e.g. a softmax over the vocabulary).
        tokenizer: fitted tokenizer providing ``texts_to_sequences`` and
            ``word_index`` (word -> integer index).
        seq_length: number of word indices the model expects per input.
        seed_text: text used to prime the generation.
        n_words: number of words to generate.

    Returns:
        The generated words joined by single spaces (the seed text is not
        included). An index with no entry in ``word_index`` yields an empty
        string for that position.
    """
    # Build the index -> word lookup once instead of scanning word_index for
    # every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    result = []
    in_text = seed_text
    # generate a fixed number of words
    for _ in range(n_words):
        # encode the text as integer
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # keep only the last seq_length indices (shorter inputs are padded)
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        # Sequential.predict_classes no longer exists in current Keras; taking
        # the argmax over the predicted scores gives the same class index for
        # a multi-class output and works on old and new versions alike.
        probabilities = model.predict(encoded, verbose=0)
        predicted_index = int(np.argmax(probabilities, axis=-1)[0])
        # map predicted word index to word
        out_word = index_to_word.get(predicted_index, '')
        # append to input
        in_text += ' ' + out_word
        result.append(out_word)
    return ' '.join(result)

In [164]:

# generate new text
# Continue the seed text for 200 words with the trained model and print them.

generated = generate_seq(model, tokenizer, seq_length, seed_text, 200)
print(generated)

i think is is is is a evening harry to have a few old evening when you did that i have a old friend when i think its been very evening when you see i have a old luck of the maze placed the maze continued the students will have moved to the dormitories rule to retire in the staff of the castle however i have a evening harry i want it i have a old friend for the loudspeaker that you see i think i think its been petrified the triwizard tournament harry in the castle i been very evening harry i have a old luck of the maze placed the maze continued you will also not been afraid harry i think its voldemort our evening mr well been very evening when and i think i think i think i think a evening placed you the floor corridor i have a old friend for the maze placed the maze continued you are i have a evening i have been very old friend for the loudspeaker the loudspeaker the loudspeaker the evening harry you did the loudspeaker that you see i think i have a old friend for the loudspeaker
