In [1]:
import os
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from keras.utils import to_categorical
from keras.utils.data_utils import get_file
from keras.models import Sequential, load_model
from keras.layers import Embedding, LSTM, Dense
from keras.callbacks import EarlyStopping, ModelCheckpoint

In [2]:
# Fetch the Nietzsche corpus through Keras' get_file helper and read it
# into memory as UTF-8 text.
path = get_file(
    'nietzsche.txt',
    origin='https://s3.amazonaws.com/text-datasets/nietzsche.txt',
)
with open(path, encoding='utf-8') as corpus_file:
    raw_text = corpus_file.read()

# Quick look at the size of the corpus and its first 150 characters.
print('corpus length:', len(raw_text))
print('example text:', raw_text[:150])

corpus length: 600893
example text: PREFACE


SUPPOSING that Truth is a woman--what then? Is there not ground
for suspecting that all philosophers, in so far as they have been
dogmatists


In [3]:
# Translation table that deletes every ASCII punctuation character.
table = str.maketrans('', '', string.punctuation)

# Treat the double-dash used in the text as a word separator, then split
# on whitespace.
tokens = raw_text.replace('--', ' ').split()

# Strip punctuation from each token and keep only the purely alphabetic
# results, lower-cased.
stripped_tokens = (token.translate(table) for token in tokens)
cleaned_tokens = [token.lower() for token in stripped_tokens if token.isalpha()]

In [4]:
# Words seen fewer than `min_count` times are left out of the vocabulary.
min_count = 2
unknown_token = '<unk>'

# Index 0 is reserved for the unknown token; index2word is the inverse
# lookup of word2index.
index2word = [unknown_token]
word2index = {unknown_token: 0}

counter = Counter(cleaned_tokens)
filtered_words = 0
for word, count in counter.items():
    if count < min_count:
        # Too rare: count it as filtered and move on.
        filtered_words += 1
        continue
    # The next free index equals the current vocabulary size.
    word2index[word] = len(word2index)
    index2word.append(word)

num_classes = len(word2index)
print('vocabulary size: ',num_classes)
print('filtered words: ',filtered_words)

vocabulary size:  5090
filtered words:  5097


In [5]:
# Each sample is a window of `maxlen` consecutive words; the target is the
# word that follows it. Windows start every `step` words.
maxlen = 40
step = 3

# Encode the whole corpus once; words missing from the vocabulary map to 0.
encoded_tokens = [word2index.get(token, 0) for token in cleaned_tokens]
window_starts = range(0, len(encoded_tokens) - maxlen, step)

X = np.array([encoded_tokens[start:start + maxlen] for start in window_starts])
y = [encoded_tokens[start + maxlen] for start in window_starts]

# One-hot encode the integer targets over the full vocabulary.
Y = to_categorical(y, num_classes)

print('sequence dimension: ',X.shape)
print('target dimension: ',Y.shape)
print('example sequence:\n',X[0])

sequence dimension:  (33342, 40)
target dimension:  (33342, 5090)
example sequence:
 [ 1  2  3  4  5  6  7  8  9  5 10 11 12 13  0  3 14 15 16 17 18 19 20 21
 22 23 21 24 25 26 27  3 28 29 30 31 32  0 33 34]


In [6]:
embedding_size=50
lstm_size=256

# Next-word model: word indices -> embeddings -> single LSTM layer ->
# softmax over the vocabulary.
model1=Sequential()
model1.add(Embedding(num_classes,embedding_size,input_length=maxlen))
model1.add(LSTM(lstm_size))
model1.add(Dense(num_classes,activation='softmax'))
# Targets are one-hot vectors, hence categorical cross-entropy.
model1.compile(loss='categorical_crossentropy',optimizer='adam')

# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the cell output.
model1.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 40, 50)            254500    
                                                                 
 lstm (LSTM)                 (None, 256)               314368    
                                                                 
 dense (Dense)               (None, 5090)              1308130   
                                                                 
Total params: 1,876,998
Trainable params: 1,876,998
Non-trainable params: 0
_________________________________________________________________
None


In [7]:
epochs=40
batch_size=32
validation_split=0.2
address1='lstm_weights1.hdf5'
print('model checkpoint address: ',address1)

# The checkpoint path was announced above but never handed to fit(), so no
# file was ever written. Save the model to `address1` whenever the
# validation loss improves. The in-memory model1 is unaffected: it still
# holds the weights from the final epoch.
# NOTE(review): the '.hdf5' suffix is accepted by the Keras 2.x API this
# notebook imports from; newer Keras releases may require a different
# extension -- confirm before upgrading.
checkpoint1=ModelCheckpoint(address1, monitor='val_loss',
                            save_best_only=True, verbose=0)

history=model1.fit(X,Y,batch_size=batch_size,
                            epochs=epochs, verbose=1,
                            validation_split=validation_split,
                            callbacks=[checkpoint1])

# Keep the training history and the trained model together for later cells.
model_info={'history': history,'model':model1}



model checkpoint address:  lstm_weights1.hdf5
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


In [9]:
def check_prediction(model, num_predict, sequences=None, targets=None, vocabulary=None):
    """Print actual vs. predicted next words for the first `num_predict` samples.

    Parameters
    ----------
    model : object exposing ``predict(batch, verbose=0)`` that returns one
        row of class scores per input row.
    num_predict : int
        Number of leading samples to check. Values below zero are treated
        as zero; values beyond the available data are clipped.
    sequences : array-like of int sequences, optional
        Encoded input windows. Defaults to the notebook-level ``X``.
    targets : sequence of int, optional
        Index of the true next word per sample. Defaults to the
        notebook-level ``y``.
    vocabulary : sequence of str, optional
        Index-to-word lookup. Defaults to the notebook-level ``index2word``.

    Returns
    -------
    None. Two lines are printed: the actual words and the predicted words,
    each word followed by a single space.
    """
    # Fall back to the notebook-level data when no explicit inputs are given.
    sequences = X if sequences is None else sequences
    targets = y if targets is None else targets
    vocabulary = index2word if vocabulary is None else vocabulary

    # Never ask for more samples than exist, and never for a negative number.
    count = max(0, min(num_predict, len(sequences), len(targets)))

    if count:
        # One batched call instead of `count` separate predict() calls.
        batch = np.asarray(sequences[:count])
        scores = model.predict(batch, verbose=0)
        predicted = np.argmax(scores, axis=1)
    else:
        predicted = []

    true_print_out = 'Actual words: ' + ''.join(
        vocabulary[targets[i]] + ' ' for i in range(count))
    pred_print_out = 'Predicted words: ' + ''.join(
        vocabulary[int(index)] + ' ' for index in predicted)

    print(true_print_out)
    print(pred_print_out)


In [10]:
# Spot-check the trained network: check_prediction compares the true next
# word with the model's top prediction for the first `num_predict` rows of X.
model = model_info['model']
num_predict = 10
check_prediction(model, num_predict)

Actual words: they paid to been unseemly <unk> certainly never to and 
Predicted words: they paid to been unseemly <unk> certainly never to and 
