In [1]:
import time
import string
from collections import namedtuple

import numpy as np
import tensorflow as tf
from keras.preprocessing.text import text_to_word_sequence
from keras.utils import to_categorical
from keras.models import Sequential
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense,LSTM,Dropout,Embedding

Using TensorFlow backend.


In [0]:
# Read the data
with open('The_Da_Vinci_Code.txt', encoding="utf8", errors='ignore') as f:
    text=f.read()

In [0]:
def clean_doc(doc):
	# replace '--' with a space ' '
	doc = doc.replace('--', ' ')
	# split into tokens by white space
	tokens = doc.split()
	# remove punctuation from each token
	table = str.maketrans('', '', string.punctuation)
	tokens = [w.translate(table) for w in tokens]
	# remove remaining tokens that are not alphabetic
	tokens = [word for word in tokens if word.isalpha()]
	# make lower case
	tokens = [word.lower() for word in tokens]
	return tokens

In [0]:
# Get text as tokens
tokens = clean_doc(text)
# vocabulary size
total_words = len(tokens)

In [0]:
# Length of unique tokens
unique_words =len(set(tokens))

In [0]:
# Integer encode the tokens
vocab = sorted(set(tokens))
vocab_to_int = {c: i for i, c in enumerate(vocab)}
int_to_vocab = dict(enumerate(vocab))
encoded = np.array([vocab_to_int[c] for c in tokens], dtype=np.int32)

In [0]:
np.save('int_to_vocab.npy', int_to_vocab)
np.save('vocab_to_int.npy', vocab_to_int)

In [8]:
tokens[:20]

['the',
 'da',
 'vinci',
 'code',
 'dan',
 'brown',
 'for',
 'blythe',
 'again',
 'more',
 'than',
 'ever',
 'prologue',
 'louvre',
 'museum',
 'paris',
 'pm',
 'renowned',
 'curator',
 'jacques']

In [0]:
from keras.utils import to_categorical
seq_len = 10
X=[]
y = np.zeros((total_words-seq_len,unique_words), dtype=np.bool)
for i in range(seq_len,total_words):
  words_in = tokens[i-seq_len:i]
  word_out = tokens[i]
  X.append([vocab_to_int[word] for word in words_in])
  y[i-seq_len,vocab_to_int[word_out]]=1

In [0]:
X = np.asarray(X)

In [11]:
X.shape

(137995, 10)

In [12]:
# define model
model = Sequential()
model.add(Embedding(X.shape[0], 10, input_length=seq_len))
model.add(LSTM(512))
model.add(Dropout(0.5))
model.add(Dense(y.shape[1], activation='softmax'))
print(model.summary())
# compile model
model.compile(loss='categorical_crossentropy', optimizer="Adam")
# fit model
filepath='best_model.h5'
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
model.fit(X,y, batch_size=128, epochs=100,callbacks=[checkpoint])
# save the model to file

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 10, 10)            1379950   
_________________________________________________________________
lstm_1 (LSTM)                (None, 512)               1071104   
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 11495)             5896935   
Total params: 8,347,989
Trainable params: 8,347,989
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/100
 13312/137995 [=>............................] - ETA: 1:06 - loss: 7.4697


Epoch 00001: loss improved from inf to 6.94970, saving model to best_model.h5
Epoch 2/100
 15616/137995 [==>...........................] - ETA: 51s - loss: 6.6987


Epoch 00002: loss improved from 6.94970 to 6.67559, saving model to best_model.h5
Epoch 3/100
 16128/137995 [==>...........................] - ETA: 51s - loss: 6.5147


Epoch 00003: loss improved from 6.67559 to 6.49972, saving model to best_model.h5
Epoch 4/100
 16128/137995 [==>...........................] - ETA: 51s - loss: 6.3252


Epoch 00004: loss improved from 6.49972 to 6.31588, saving model to best_model.h5
Epoch 5/100
 16512/137995 [==>...........................] - ETA: 51s - loss: 6.1066


Epoch 00005: loss improved from 6.31588 to 6.11483, saving model to best_model.h5
Epoch 6/100
 16512/137995 [==>...........................] - ETA: 51s - loss: 5.9440


Epoch 00006: loss improved from 6.11483 to 5.92262, saving model to best_model.h5
Epoch 7/100
 16384/137995 [==>...........................] - ETA: 51s - loss: 5.7196


Epoch 00007: loss improved from 5.92262 to 5.73850, saving model to best_model.h5
Epoch 8/100
 16512/137995 [==>...........................] - ETA: 51s - loss: 5.5093


Epoch 00008: loss improved from 5.73850 to 5.55917, saving model to best_model.h5
Epoch 9/100
 16384/137995 [==>...........................] - ETA: 52s - loss: 5.2951


Epoch 00009: loss improved from 5.55917 to 5.37556, saving model to best_model.h5
Epoch 10/100
 16512/137995 [==>...........................] - ETA: 50s - loss: 5.1162


Epoch 00010: loss improved from 5.37556 to 5.18827, saving model to best_model.h5
Epoch 11/100
 16256/137995 [==>...........................] - ETA: 50s - loss: 4.9017


Epoch 00011: loss improved from 5.18827 to 4.98666, saving model to best_model.h5
Epoch 12/100
 17408/137995 [==>...........................] - ETA: 50s - loss: 4.6877


Epoch 00012: loss improved from 4.98666 to 4.77804, saving model to best_model.h5
Epoch 13/100
 16768/137995 [==>...........................] - ETA: 50s - loss: 4.4428


Epoch 00013: loss improved from 4.77804 to 4.56654, saving model to best_model.h5
Epoch 14/100
 17152/137995 [==>...........................] - ETA: 50s - loss: 4.1874


Epoch 00014: loss improved from 4.56654 to 4.35286, saving model to best_model.h5
Epoch 15/100
 16896/137995 [==>...........................] - ETA: 50s - loss: 3.9738


Epoch 00015: loss improved from 4.35286 to 4.14227, saving model to best_model.h5
Epoch 16/100
 16512/137995 [==>...........................] - ETA: 51s - loss: 3.7487


Epoch 00016: loss improved from 4.14227 to 3.95022, saving model to best_model.h5
Epoch 17/100
 16256/137995 [==>...........................] - ETA: 50s - loss: 3.5863


Epoch 00017: loss improved from 3.95022 to 3.75852, saving model to best_model.h5
Epoch 18/100
 16256/137995 [==>...........................] - ETA: 50s - loss: 3.4145


Epoch 00018: loss improved from 3.75852 to 3.59512, saving model to best_model.h5
Epoch 19/100
 16128/137995 [==>...........................] - ETA: 51s - loss: 3.2471


Epoch 00019: loss improved from 3.59512 to 3.43515, saving model to best_model.h5
Epoch 20/100
 16384/137995 [==>...........................] - ETA: 51s - loss: 3.1104


Epoch 00020: loss improved from 3.43515 to 3.29009, saving model to best_model.h5
Epoch 21/100
 16512/137995 [==>...........................] - ETA: 51s - loss: 3.0080


Epoch 00021: loss improved from 3.29009 to 3.15767, saving model to best_model.h5
Epoch 22/100
 16256/137995 [==>...........................] - ETA: 51s - loss: 2.8620


Epoch 00022: loss improved from 3.15767 to 3.03658, saving model to best_model.h5
Epoch 23/100
 16128/137995 [==>...........................] - ETA: 51s - loss: 2.7576


Epoch 00023: loss improved from 3.03658 to 2.92057, saving model to best_model.h5
Epoch 24/100
 16384/137995 [==>...........................] - ETA: 51s - loss: 2.6557


Epoch 00024: loss improved from 2.92057 to 2.82436, saving model to best_model.h5
Epoch 25/100
 16512/137995 [==>...........................] - ETA: 51s - loss: 2.5506


Epoch 00025: loss improved from 2.82436 to 2.72216, saving model to best_model.h5
Epoch 26/100
 16256/137995 [==>...........................] - ETA: 51s - loss: 2.4360


Epoch 00026: loss improved from 2.72216 to 2.63233, saving model to best_model.h5
Epoch 27/100
 16256/137995 [==>...........................] - ETA: 51s - loss: 2.3811


Epoch 00027: loss improved from 2.63233 to 2.55237, saving model to best_model.h5
Epoch 28/100
 16256/137995 [==>...........................] - ETA: 51s - loss: 2.2838


Epoch 00028: loss improved from 2.55237 to 2.47470, saving model to best_model.h5
Epoch 29/100
 16128/137995 [==>...........................] - ETA: 51s - loss: 2.2339


Epoch 00029: loss improved from 2.47470 to 2.40573, saving model to best_model.h5
Epoch 30/100
 16256/137995 [==>...........................] - ETA: 51s - loss: 2.1487


Epoch 00030: loss improved from 2.40573 to 2.33130, saving model to best_model.h5
Epoch 31/100
 16128/137995 [==>...........................] - ETA: 52s - loss: 2.0890


Epoch 00031: loss improved from 2.33130 to 2.26708, saving model to best_model.h5
Epoch 32/100
 16256/137995 [==>...........................] - ETA: 51s - loss: 2.0480


Epoch 00032: loss improved from 2.26708 to 2.20849, saving model to best_model.h5
Epoch 33/100
 16256/137995 [==>...........................] - ETA: 51s - loss: 1.9915


Epoch 00033: loss improved from 2.20849 to 2.15168, saving model to best_model.h5
Epoch 34/100
 16512/137995 [==>...........................] - ETA: 50s - loss: 1.9324


Epoch 00034: loss improved from 2.15168 to 2.09556, saving model to best_model.h5
Epoch 35/100
 16128/137995 [==>...........................] - ETA: 52s - loss: 1.8974


Epoch 00035: loss improved from 2.09556 to 2.05444, saving model to best_model.h5
Epoch 36/100
 16128/137995 [==>...........................] - ETA: 51s - loss: 1.8261


Epoch 00036: loss improved from 2.05444 to 2.00453, saving model to best_model.h5
Epoch 37/100
 16256/137995 [==>...........................] - ETA: 51s - loss: 1.8009


Epoch 00037: loss improved from 2.00453 to 1.95596, saving model to best_model.h5
Epoch 38/100
 16128/137995 [==>...........................] - ETA: 51s - loss: 1.7365


Epoch 00038: loss improved from 1.95596 to 1.92106, saving model to best_model.h5
Epoch 39/100
 16256/137995 [==>...........................] - ETA: 51s - loss: 1.7545


Epoch 00039: loss improved from 1.92106 to 1.88074, saving model to best_model.h5
Epoch 40/100
 16128/137995 [==>...........................] - ETA: 50s - loss: 1.6929


Epoch 00040: loss improved from 1.88074 to 1.84261, saving model to best_model.h5
Epoch 41/100
 16000/137995 [==>...........................] - ETA: 51s - loss: 1.6569


Epoch 00041: loss improved from 1.84261 to 1.81156, saving model to best_model.h5
Epoch 42/100
 16256/137995 [==>...........................] - ETA: 50s - loss: 1.6187


Epoch 00042: loss improved from 1.81156 to 1.77121, saving model to best_model.h5
Epoch 43/100
 16384/137995 [==>...........................] - ETA: 51s - loss: 1.5774


Epoch 00043: loss improved from 1.77121 to 1.74584, saving model to best_model.h5
Epoch 44/100
 16256/137995 [==>...........................] - ETA: 51s - loss: 1.5636


Epoch 00044: loss improved from 1.74584 to 1.71281, saving model to best_model.h5
Epoch 45/100
 16256/137995 [==>...........................] - ETA: 51s - loss: 1.5432


Epoch 00045: loss improved from 1.71281 to 1.68684, saving model to best_model.h5
Epoch 46/100
 16256/137995 [==>...........................] - ETA: 50s - loss: 1.5174


Epoch 00046: loss improved from 1.68684 to 1.65996, saving model to best_model.h5
Epoch 47/100
 16128/137995 [==>...........................] - ETA: 50s - loss: 1.4793


Epoch 00047: loss improved from 1.65996 to 1.63312, saving model to best_model.h5
Epoch 48/100
 16128/137995 [==>...........................] - ETA: 51s - loss: 1.4759


Epoch 00048: loss improved from 1.63312 to 1.60730, saving model to best_model.h5
Epoch 49/100
 16128/137995 [==>...........................] - ETA: 50s - loss: 1.4734


Epoch 00049: loss improved from 1.60730 to 1.58938, saving model to best_model.h5
Epoch 50/100
 16000/137995 [==>...........................] - ETA: 51s - loss: 1.4210


Epoch 00050: loss improved from 1.58938 to 1.56220, saving model to best_model.h5
Epoch 51/100
 16000/137995 [==>...........................] - ETA: 51s - loss: 1.3763


Epoch 00051: loss improved from 1.56220 to 1.53854, saving model to best_model.h5
Epoch 52/100
 16384/137995 [==>...........................] - ETA: 51s - loss: 1.3835


Epoch 00052: loss improved from 1.53854 to 1.52782, saving model to best_model.h5
Epoch 53/100
 16256/137995 [==>...........................] - ETA: 51s - loss: 1.3600


Epoch 00053: loss improved from 1.52782 to 1.50572, saving model to best_model.h5
Epoch 54/100
 16256/137995 [==>...........................] - ETA: 50s - loss: 1.3548


Epoch 00054: loss improved from 1.50572 to 1.48593, saving model to best_model.h5
Epoch 55/100
 16128/137995 [==>...........................] - ETA: 51s - loss: 1.3338


Epoch 00055: loss improved from 1.48593 to 1.46281, saving model to best_model.h5
Epoch 56/100
 16128/137995 [==>...........................] - ETA: 51s - loss: 1.3104


Epoch 00056: loss improved from 1.46281 to 1.44880, saving model to best_model.h5
Epoch 57/100
 16256/137995 [==>...........................] - ETA: 51s - loss: 1.3167


Epoch 00057: loss improved from 1.44880 to 1.43808, saving model to best_model.h5
Epoch 58/100
 16128/137995 [==>...........................] - ETA: 51s - loss: 1.2949


Epoch 00058: loss improved from 1.43808 to 1.41684, saving model to best_model.h5
Epoch 59/100
 16256/137995 [==>...........................] - ETA: 51s - loss: 1.2694


Epoch 00059: loss improved from 1.41684 to 1.40362, saving model to best_model.h5
Epoch 60/100
 16128/137995 [==>...........................] - ETA: 51s - loss: 1.2612


Epoch 00060: loss improved from 1.40362 to 1.38837, saving model to best_model.h5
Epoch 61/100
 16256/137995 [==>...........................] - ETA: 51s - loss: 1.2431


Epoch 00061: loss improved from 1.38837 to 1.37984, saving model to best_model.h5
Epoch 62/100
 16384/137995 [==>...........................] - ETA: 50s - loss: 1.2339


Epoch 00062: loss improved from 1.37984 to 1.37228, saving model to best_model.h5
Epoch 63/100
 16128/137995 [==>...........................] - ETA: 50s - loss: 1.2205


Epoch 00063: loss improved from 1.37228 to 1.35424, saving model to best_model.h5
Epoch 64/100
 16384/137995 [==>...........................] - ETA: 51s - loss: 1.2173


Epoch 00064: loss improved from 1.35424 to 1.33918, saving model to best_model.h5
Epoch 65/100
 16384/137995 [==>...........................] - ETA: 51s - loss: 1.2073


Epoch 00065: loss improved from 1.33918 to 1.32831, saving model to best_model.h5
Epoch 66/100
 16256/137995 [==>...........................] - ETA: 51s - loss: 1.2042


Epoch 00066: loss improved from 1.32831 to 1.31504, saving model to best_model.h5
Epoch 67/100
 16384/137995 [==>...........................] - ETA: 50s - loss: 1.1969


Epoch 00067: loss improved from 1.31504 to 1.31287, saving model to best_model.h5
Epoch 68/100
 16128/137995 [==>...........................] - ETA: 51s - loss: 1.1986


Epoch 00068: loss improved from 1.31287 to 1.29988, saving model to best_model.h5
Epoch 69/100
 16128/137995 [==>...........................] - ETA: 51s - loss: 1.1881


Epoch 00069: loss improved from 1.29988 to 1.29530, saving model to best_model.h5
Epoch 70/100
 16128/137995 [==>...........................] - ETA: 51s - loss: 1.1602


Epoch 00070: loss improved from 1.29530 to 1.28027, saving model to best_model.h5
Epoch 71/100
 16000/137995 [==>...........................] - ETA: 52s - loss: 1.1457


Epoch 00071: loss improved from 1.28027 to 1.27193, saving model to best_model.h5
Epoch 72/100
 16256/137995 [==>...........................] - ETA: 51s - loss: 1.1410


Epoch 00072: loss improved from 1.27193 to 1.26491, saving model to best_model.h5
Epoch 73/100
 16128/137995 [==>...........................] - ETA: 51s - loss: 1.1438


Epoch 00073: loss improved from 1.26491 to 1.25019, saving model to best_model.h5
Epoch 74/100
 16128/137995 [==>...........................] - ETA: 51s - loss: 1.1231


Epoch 00074: loss improved from 1.25019 to 1.24047, saving model to best_model.h5
Epoch 75/100
 16128/137995 [==>...........................] - ETA: 52s - loss: 1.1224


Epoch 00075: loss improved from 1.24047 to 1.23282, saving model to best_model.h5
Epoch 76/100
 16256/137995 [==>...........................] - ETA: 51s - loss: 1.0907


Epoch 00076: loss improved from 1.23282 to 1.21715, saving model to best_model.h5
Epoch 77/100
 16256/137995 [==>...........................] - ETA: 51s - loss: 1.1168


Epoch 00077: loss did not improve
Epoch 78/100
 20224/137995 [===>..........................] - ETA: 50s - loss: 1.1061


Epoch 00078: loss improved from 1.21715 to 1.21173, saving model to best_model.h5
Epoch 79/100
 17280/137995 [==>...........................] - ETA: 51s - loss: 1.1086


Epoch 00079: loss improved from 1.21173 to 1.20544, saving model to best_model.h5
Epoch 80/100
 16384/137995 [==>...........................] - ETA: 51s - loss: 1.0816


Epoch 00080: loss improved from 1.20544 to 1.19425, saving model to best_model.h5
Epoch 81/100
 16256/137995 [==>...........................] - ETA: 51s - loss: 1.0984


Epoch 00081: loss improved from 1.19425 to 1.19177, saving model to best_model.h5
Epoch 82/100
 16256/137995 [==>...........................] - ETA: 51s - loss: 1.0691


Epoch 00082: loss improved from 1.19177 to 1.18501, saving model to best_model.h5
Epoch 83/100
 16256/137995 [==>...........................] - ETA: 51s - loss: 1.0753


Epoch 00083: loss improved from 1.18501 to 1.17905, saving model to best_model.h5
Epoch 84/100
 16256/137995 [==>...........................] - ETA: 51s - loss: 1.0473


Epoch 00084: loss improved from 1.17905 to 1.16758, saving model to best_model.h5
Epoch 85/100
 16256/137995 [==>...........................] - ETA: 50s - loss: 1.0598


Epoch 00085: loss did not improve
Epoch 86/100
 20480/137995 [===>..........................] - ETA: 49s - loss: 1.0625


Epoch 00086: loss improved from 1.16758 to 1.16399, saving model to best_model.h5
Epoch 87/100
 17280/137995 [==>...........................] - ETA: 51s - loss: 1.0484


Epoch 00087: loss improved from 1.16399 to 1.15273, saving model to best_model.h5
Epoch 88/100
 16512/137995 [==>...........................] - ETA: 51s - loss: 1.0487


Epoch 00088: loss improved from 1.15273 to 1.14856, saving model to best_model.h5
Epoch 89/100
 16128/137995 [==>...........................] - ETA: 51s - loss: 1.0072


Epoch 00089: loss improved from 1.14856 to 1.12914, saving model to best_model.h5
Epoch 90/100
 16128/137995 [==>...........................] - ETA: 51s - loss: 1.0530


Epoch 00090: loss did not improve
Epoch 91/100
 20096/137995 [===>..........................] - ETA: 49s - loss: 1.0313


Epoch 00091: loss did not improve
Epoch 92/100
 21632/137995 [===>..........................] - ETA: 49s - loss: 1.0296


Epoch 00092: loss improved from 1.12914 to 1.12386, saving model to best_model.h5
Epoch 93/100
 17792/137995 [==>...........................] - ETA: 50s - loss: 1.0143


Epoch 00093: loss did not improve
Epoch 94/100
 20992/137995 [===>..........................] - ETA: 49s - loss: 0.9951


Epoch 00094: loss improved from 1.12386 to 1.11072, saving model to best_model.h5
Epoch 95/100
 17664/137995 [==>...........................] - ETA: 50s - loss: 1.0356


Epoch 00095: loss did not improve
Epoch 96/100
 20992/137995 [===>..........................] - ETA: 49s - loss: 1.0121


Epoch 00096: loss improved from 1.11072 to 1.10606, saving model to best_model.h5
Epoch 97/100
 17536/137995 [==>...........................] - ETA: 51s - loss: 1.0340


Epoch 00097: loss did not improve
Epoch 98/100
 20608/137995 [===>..........................] - ETA: 50s - loss: 0.9898


Epoch 00098: loss improved from 1.10606 to 1.10004, saving model to best_model.h5
Epoch 99/100
 17536/137995 [==>...........................] - ETA: 50s - loss: 0.9795


Epoch 00099: loss did not improve
Epoch 100/100
 20608/137995 [===>..........................] - ETA: 49s - loss: 1.0120


Epoch 00100: loss improved from 1.10004 to 1.09481, saving model to best_model.h5


<keras.callbacks.History at 0x7f986ff0e7b8>

In [0]:
from google.colab import files
files.download('best_model.h5')

In [0]:
from keras.models import load_model
model=load_model('best_model.h5')

In [0]:
from keras.preprocessing.sequence import pad_sequences
def generate_seq(model, seq_length, seed_text, n_words):
    result = list()
    in_text = seed_text
    # generate a fixed number of words
    for _ in range(n_words):
      encoded=[]
      # encode the text as integer
      for word in in_text.split():
        encoded.append(vocab_to_int[word])
      # truncate sequences to a fixed length
      encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
      # predict probabilities for each word
      yhat = model.predict_classes(encoded, verbose=0)
      # map predicted word index to word
      out_word = ''
      for word, index in vocab_to_int.items():
        if index == yhat:
          out_word = word
          break
      # append to input
      in_text += ' ' + out_word
      result.append(out_word)
    return ' '.join(result)

In [16]:
seed=' '.join(tokens[:100])
generated = generate_seq(model,seq_len,seed,1000)
print(generated)

a moment gasping for breath taking stock i am still alive he crawled out from under the canvas and scanned the cavernous space for someplace to hide a voice spoke chillingly close do not move on the floor and then it is the knights thing of the priory of sion and the grail that is the holy grail that the documents simply had been told the grail in the chapel she was certain he had just yet the line of the painful fallingout she was not in the moment he thought she had been given the teacher to the end of days a little sophie and be finished in the side of the church and a lone thought of wine and the most feminine of this of the most world the holy grail is a grail but when the documents documents a knight that the grail stopped at the tomb of the french police langdon looked back at the mona lisa before the door im a message with the keystone he felt a startled and that news teabing had been often in the hurry of the priory and yet the true man was not being one is the holy grail is a