## Level 2 - Word Prediction using LSTM

Follow the same steps as in Char Prediction (Level 1) but at the word level than at the Char Level. 

In [2]:
import nltk
from nltk.tokenize import RegexpTokenizer
import numpy as np
import pandas as pd
import keras
from keras.models import Sequential
from keras.layers import LSTM,Embedding, Dense
from sklearn.model_selection import train_test_split
from keras.utils import np_utils
from keras.optimizers import RMSprop

Using TensorFlow backend.


In [3]:
# loading dracula story
dracula_story = open('Dracula', 'r+')
dracula = dracula_story.read()

In [4]:
# for breaking it into words
tokenizer = RegexpTokenizer(r'\w+')

In [5]:
dracula_words = tokenizer.tokenize(dracula)

In [6]:
len(dracula_words)

163422

In [7]:
dracula_words

['DRACULA',
 'CHAPTER',
 'I',
 'JONATHAN',
 'HARKER',
 'S',
 'JOURNAL',
 '_Kept',
 'in',
 'shorthand',
 '_',
 '_3',
 'May',
 'Bistritz',
 '_',
 'Left',
 'Munich',
 'at',
 '8',
 '35',
 'P',
 'M',
 'on',
 '1st',
 'May',
 'arriving',
 'at',
 'Vienna',
 'early',
 'next',
 'morning',
 'should',
 'have',
 'arrived',
 'at',
 '6',
 '46',
 'but',
 'train',
 'was',
 'an',
 'hour',
 'late',
 'Buda',
 'Pesth',
 'seems',
 'a',
 'wonderful',
 'place',
 'from',
 'the',
 'glimpse',
 'which',
 'I',
 'got',
 'of',
 'it',
 'from',
 'the',
 'train',
 'and',
 'the',
 'little',
 'I',
 'could',
 'walk',
 'through',
 'the',
 'streets',
 'I',
 'feared',
 'to',
 'go',
 'very',
 'far',
 'from',
 'the',
 'station',
 'as',
 'we',
 'had',
 'arrived',
 'late',
 'and',
 'would',
 'start',
 'as',
 'near',
 'the',
 'correct',
 'time',
 'as',
 'possible',
 'The',
 'impression',
 'I',
 'had',
 'was',
 'that',
 'we',
 'were',
 'leaving',
 'the',
 'West',
 'and',
 'entering',
 'the',
 'East',
 'the',
 'most',
 'western',
 

In [8]:
# word to number
word_to_number = {word:i for i,word in enumerate(dracula_words)}

count = 0
for word,i in word_to_number.items():
    word_to_number[word] = count
    count+=1
    
int_to_word = {i:word for word,i in word_to_number.items()}
# len(word_to_number)

In [9]:
# # word to number
# def word_to_number(words):
#     word_int_dict = { word:i for i,word in enumerate(words) }
#     return word_int_di
word_to_number

{'unhinged': 0,
 'Winchesters': 1,
 'though': 2,
 'To': 3,
 'mutilate': 4,
 'eastward': 5,
 'lethal': 6,
 'wanted': 7,
 'Fears': 8,
 'blinked': 9,
 'directly': 10,
 'prodigal': 11,
 'Krone': 12,
 'nicely': 13,
 'Lane': 14,
 'judgment': 15,
 'taught': 16,
 'significantly': 17,
 'simultaneously': 18,
 'gathering': 19,
 'potent': 20,
 'burst': 21,
 'silent': 22,
 'simile': 23,
 'reading': 24,
 'impalpable': 25,
 'replaced': 26,
 'flows': 27,
 'totals': 28,
 'solved': 29,
 'replying': 30,
 'sometime': 31,
 'experimenting': 32,
 'Incorporated': 33,
 'afterwards': 34,
 'dull': 35,
 'Blessed': 36,
 'immeasurable': 37,
 'grabbed': 38,
 'gusto': 39,
 'diagnosis': 40,
 'Soh': 41,
 'Faithfully': 42,
 'consented': 43,
 'frontiers': 44,
 'weepy': 45,
 'fireplace': 46,
 'Northern': 47,
 'brains': 48,
 'thirsty': 49,
 'foreknowledge': 50,
 'visited': 51,
 'Paris': 52,
 'final': 53,
 'assembled': 54,
 'to': 55,
 'Saturday': 56,
 'defined': 57,
 'display': 58,
 'tissue': 59,
 'Danes': 60,
 'puss': 61,


In [9]:
# word_dictionary = word_to_number(dracula_words)

In [10]:
# def number_to_word(number):
#      return (list(word_dictionary.keys())[list(word_dictionary.values()).index(number)])

In [11]:
def sliding_window(string,interval):
    words = tokenizer.tokenize(string)
    x_train = []
    y_train = []
    for i in range(len(words)-interval):
        x_mini_list= []
        for j in range(i,i+interval):
            x_mini_list.append(word_to_number[words[j]])
        x_train.append(x_mini_list)
        y_train.append(word_to_number[words[i+interval]])
    return x_train, y_train

In [12]:
# initialise the amount of interval for sliding window
interval = 100

In [13]:
# getting the sled lists into variables
x, y = sliding_window(dracula,interval)

In [14]:
len(x)

163322

In [15]:
len(y)

163322

In [16]:
len(dracula_words)

163422

In [17]:
# splitting into train and test data
x_train,x_test,y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [18]:
# len(x_train)

In [19]:
# reshaping to required 3d format
seqIn = np.array(x_train).reshape(len(x_train), interval, 1)

In [20]:
len(x_train)

114325

In [21]:
len(y_train)

114325

In [22]:
seqIn.shape

(114325, 100, 1)

In [23]:
min(y_train)

0

In [24]:
max(y_train)

10111

In [25]:
min(y_test)

0

In [26]:
max(y_test)

10111

In [27]:
len(dracula_words)

163422

In [28]:
len(set(dracula_words))

10112

In [29]:
y_train

[5527,
 5569,
 3201,
 2427,
 2433,
 4287,
 4287,
 559,
 7540,
 4030,
 8560,
 1685,
 1394,
 2440,
 3136,
 2626,
 2507,
 235,
 7475,
 7733,
 2843,
 1231,
 6283,
 3546,
 4089,
 6242,
 6699,
 4910,
 1476,
 4268,
 4201,
 1798,
 2546,
 3482,
 8906,
 7189,
 4427,
 7648,
 9092,
 1685,
 5432,
 5060,
 9835,
 9765,
 5065,
 3546,
 4287,
 1059,
 5570,
 2374,
 230,
 4287,
 7714,
 2233,
 10026,
 915,
 9680,
 969,
 1952,
 9667,
 9054,
 2358,
 1918,
 4506,
 9765,
 1766,
 9361,
 9765,
 3099,
 3324,
 3624,
 5111,
 9361,
 6475,
 1685,
 4646,
 9765,
 4031,
 5638,
 1952,
 4089,
 7274,
 9765,
 1919,
 2466,
 396,
 4287,
 9765,
 6897,
 2292,
 2626,
 6210,
 4251,
 369,
 2451,
 4251,
 1420,
 7997,
 2148,
 5060,
 9659,
 369,
 1065,
 8560,
 811,
 244,
 7540,
 7327,
 235,
 4251,
 4646,
 9124,
 369,
 2358,
 654,
 3842,
 230,
 9867,
 6354,
 4287,
 396,
 7540,
 4287,
 235,
 8522,
 1557,
 7643,
 876,
 7480,
 4414,
 7655,
 6283,
 7495,
 9765,
 9765,
 817,
 1685,
 9459,
 5117,
 4287,
 1557,
 6301,
 7997,
 4252,
 9170,
 9

In [30]:
num_classes = len(word_to_number)

In [31]:
len(y_train)

114325

In [32]:
# categorising the output
seqOut = np_utils.to_categorical(y_train, num_classes= num_classes)
y_test = np_utils.to_categorical(y_test, num_classes=num_classes)
# seqOut = keras.utils.to_categorical(y_train, num_classes=max(y_train)+1)

In [44]:
x_test = np.array(x_test).reshape(len(x_test), interval, 1)

In [39]:
seqOut

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [37]:
seqOut.shape

(114325, 10112)

In [38]:
seqIn.shape

(114325, 100, 1)

In [40]:
epochs = 1
batch_size = 128

In [45]:
# Modelling the data
model = Sequential()
model.add(LSTM(128,activation = 'relu', input_shape=(seqIn.shape[1],seqIn.shape[2],))
model.add(LSTM(64, input_shape=(seqIn.shape[1],seqIn.shape[2],)))
#model.add(Dropout(0.2))
#model.add(Dense(600, activation='relu', input_shape=(3,)))
model.add(Dense(500, activation='relu'))
model.add(Dense(seqOut.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='RMSprop',metrics = ['accuracy'])
print(model.summary())
model.fit(seqIn, seqOut, epochs=epochs, batch_size=batch_size, verbose=1,validation_data= (x_test, y_test),validation_split = .2)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_5 (LSTM)                (None, 64)                16896     
_________________________________________________________________
dense_9 (Dense)              (None, 500)               32500     
_________________________________________________________________
dense_10 (Dense)             (None, 10112)             5066112   
Total params: 5,115,508
Trainable params: 5,115,508
Non-trainable params: 0
_________________________________________________________________
None
Train on 114325 samples, validate on 48997 samples
Epoch 1/1


<keras.callbacks.History at 0x7f115305a3c8>

In [57]:
test_string= """Buda-Pesth seems a wonderful place, from the glimpse which I\ngot of it from the train and the little I could walk through the\nstreets. I feared to go very far from the station, as we had arrived\nlate and would start as near the correct time as possible. The\nimpression I had was that we were leaving the West and entering the\nEast; the most western of splendid bridges over the Danube, which is\nhere of noble width and depth, took us among the traditions of Turkish\nrule.\n\nWe left in pretty good time, and came after nightfall to Klausenburgh.\nHere I stopped for the night at the Hotel Royale. I had for dinner, or\nrather supper, a chicken done up some way with red pepper, which was\nvery good but thirsty. (_Mem._, get recipe for Mina.) I asked the\nwaiter, and he said it was called "paprika hendl," and that, as it was a\nnational dish, I should be able to get it anywhere along the\nCarpathians. I found my smattering of German very useful here; indeed, I\ndon\'t know how I should be able to get on without it.\n\nHaving had some time at my disposal when in London, I had visited the\nBritish Museum, and made search among the books and maps in the library\nregarding Transylvania; it had struck me that some foreknowledge of the\ncountry could hardly fail to have some importance in dealing with a\nnobleman of that country. I find that the district he named is in the\nextreme east of the country, just on the borders of three states,\nTransylvania, Moldavia and Bukovina, in the midst of the Carpathian\nmountains; one of the wildest and least known portions of Europe. I was\nnot able to light on any map or work giving the exact locality of the\nCastle Dracula, as there are no maps of this country as yet to compare\nwith our own Ordnance Survey maps; but I found that Bistritz, the post\ntown named by Count Dracula, is a fairly well-known place. I shall enter\nhere some of my notes, as they may refresh my memory when I talk over my\ntravels with Mina.\n\nIn the population of Transylvania there are four distinct nationalities:\nSaxons in the South, and mixed with them the Wallachs, who are the\ndescendants of the Dacians; Magyars in the West, and Szekelys in the\nEast and North. I am going among the latter, who claim to be descended\nfrom Attila and the Huns. This may be so, for when the Magyars conquered\nthe country in the eleventh century they found the Huns settled in it. I\nread that every known superstition in the world is gathered into the\nhorseshoe of the Carpathians, as if it were the centre of some sort of\nimaginative whirlpool; if so my stay may be very interesting. (_Mem._, I\nmust ask the Count all about them.)\n\nI did not sleep well, though my bed was comfortable enough, for I had\nall sorts of queer dreams. There was a dog howling all night under my\nwindow, which may have had something to do with it; or it may have been\nthe paprika, for I had to drink up all the water in my carafe, and was\nstill thirsty. Towards morning I slept and was wakened by the continuous\nknocking at my door, so I guess I must have been sleeping soundly then.\nI had for breakfast more paprika, and a sort of porridge of maize flour\nwhich they said was "mamaliga," and egg-plant stuffed with forcemeat, a\nvery excellent dish, which they call "impletata." (_Mem._, get recipe\nfor this also.) I had to hurry breakfast, for the train started a little\nbefore eight, or rather it ought to have done so, for after rushing to\nthe station at 7:30 I had to sit in the carriage for more than an hour\nbefore we began to move. It seems to me that the further east you go the\nmore unpunctual are the trains. What ought they to be in China?\n\nAll day long we seemed to dawdle through a country which was full of\nbeauty of every kind. Sometimes we saw little towns or castles on the\ntop of steep hills such as we see in old missals; sometimes we ran by\nrivers and streams which seemed from the wide stony margin on each side\nof them to be subject to great floods."""

In [69]:
test_x, test_y = sliding_window(test_string, interval)
test_x = np.array(test_x).reshape(len(test_x), interval, 1)

In [70]:
actual = []
for i in test_y:
    actual.append(int_to_word[i])

In [74]:
# actual

In [73]:
# word_to_number

In [75]:
y = model.predict(test_x)

In [76]:
# int_to_word

In [78]:
predicted = []
for i in y:
    temp = np.argmax(i)
    predicted.append(int_to_word[temp])

In [79]:
predicted

['the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'and',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'and',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
