In [4]:
import spacy

`python -m spacy download en`


In [10]:
nlp = spacy.load('en',disable=['parser', 'tagger','ner'])

In [12]:
nlp.max_length = 1198623

In [13]:
def separate_punc(doc_text):
    return [token.text.lower() for token in nlp(doc_text) if token.text not in '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n ']

In [14]:
def read_file(filepath):
    with open(filepath) as f:
        str_text = f.read()
    return str_text

In [15]:
d = read_file('./data/moby_dick_chapters(1-4).txt')

In [16]:
tokens = separate_punc(d)

In [17]:
len(tokens)

11394

In [18]:
sequence_len = 25
train_len = sequence_len + 1
text_sequences =[]
for i in range(train_len,len(tokens)):
    seq = tokens[i-train_len:i]
    text_sequences.append(seq)

In [19]:
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


In [20]:
# integer encode sequences of words
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)
sequences = tokenizer.texts_to_sequences(text_sequences)

In [21]:
for i in sequences[0]:
    print(f'{i} : {tokenizer.index_word[i]}')

964 : call
14 : me
265 : ishmael
51 : some
263 : years
416 : ago
87 : never
222 : mind
129 : how
111 : long
962 : precisely
262 : having
50 : little
43 : or
37 : no
321 : money
7 : in
23 : my
555 : purse
3 : and
150 : nothing
261 : particular
6 : to
2704 : interest
14 : me
24 : on


In [22]:
vocabulary_size = len(tokenizer.word_counts)
vocabulary_size

2709

In [23]:
import numpy as np

sequences = np.array(sequences)
sequences

array([[ 964,   14,  265, ..., 2704,   14,   24],
       [  14,  265,   51, ...,   14,   24,  965],
       [ 265,   51,  263, ...,   24,  965,    5],
       ...,
       [ 960,   12,  168, ...,  264,   53,    2],
       [  12,  168, 2703, ...,   53,    2, 2709],
       [ 168, 2703,    3, ...,    2, 2709,   26]])

In [24]:
import keras
from keras.models import Sequential
from keras.layers import Dense,LSTM,Embedding

def create_model(vocabulary_size, seq_len):
    model = Sequential()
    model.add(Embedding(vocabulary_size, 25, input_length=seq_len))
    model.add(LSTM(150, return_sequences=True))
    model.add(LSTM(150))
    model.add(Dense(150, activation='relu'))

    model.add(Dense(vocabulary_size, activation='softmax'))
    
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
   
    model.summary()
    
    return model

In [25]:
from keras.utils import to_categorical

In [26]:
X = sequences[:,:-1] 
y = sequences[:,-1]

In [27]:
y = to_categorical(y, num_classes=vocabulary_size+1)

In [28]:
seq_len = X.shape[1]
seq_len

25

In [36]:
model = create_model(vocabulary_size+1, seq_len)

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 25, 25)            67750     
_________________________________________________________________
lstm_1 (LSTM)                (None, 25, 150)           105600    
_________________________________________________________________
lstm_2 (LSTM)                (None, 150)               180600    
_________________________________________________________________
dense_1 (Dense)              (None, 150)               22650     
_________________________________________________________________
dense_2 (Dense)              (None, 2710)              409210    
Total params: 785,810
Trainable params: 785,810
Non-trainable params: 0
_________________________________________________________________


In [37]:
model.fit(X, y, batch_size=128, epochs=300,verbose=1)

265 - acc: 0.3052
Epoch 107/300
Epoch 108/300
Epoch 109/300
Epoch 110/300
Epoch 111/300
Epoch 112/300
Epoch 113/300
Epoch 114/300
Epoch 115/300
Epoch 116/300
Epoch 117/300
Epoch 118/300
Epoch 119/300
Epoch 120/300
Epoch 121/300
Epoch 122/300
Epoch 123/300
Epoch 124/300
Epoch 125/300
Epoch 126/300
Epoch 127/300
Epoch 128/300
Epoch 129/300
Epoch 130/300
Epoch 131/300
Epoch 132/300
Epoch 133/300
Epoch 134/300
Epoch 135/300
Epoch 136/300
Epoch 137/300
Epoch 138/300
Epoch 139/300
Epoch 140/300
Epoch 141/300
Epoch 142/300
Epoch 143/300
Epoch 144/300
Epoch 145/300
Epoch 146/300
Epoch 147/300
Epoch 148/300
Epoch 149/300
Epoch 150/300
Epoch 151/300
Epoch 152/300
Epoch 153/300
Epoch 154/300
Epoch 155/300
Epoch 156/300
Epoch 157/300
Epoch 158/300
Epoch 159/300
Epoch 160/300
Epoch 161/300
Epoch 162/300
Epoch 163/300
Epoch 164/300
Epoch 165/300
Epoch 166/300
Epoch 167/300
Epoch 168/300
Epoch 169/300
Epoch 170/300
Epoch 171/300
Epoch 172/300
Epoch 173/300
Epoch 174/300
Epoch 175/300
Epoch 176/300
Ep

<keras.callbacks.History at 0x21908be2908>

In [47]:
from keras.preprocessing.sequence import pad_sequences

def generate_text(model,tokenizer,seq_len,seed_text,num_gen_words):
    output_text = []
    input_text = seed_text
    for i in range(num_gen_words):
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]
        pad_encoded = pad_sequences([encoded_text],maxlen=seq_len,truncating='pre')
        pred_word_ind = model.predict_classes(pad_encoded,verbose=0)[0]
        pred_word = tokenizer.index_word[pred_word_ind]
        input_text += ' '+pred_word
        output_text.append(pred_word)
    return ' '.join(output_text)

In [50]:
seed_text = 'This is a story of a journey'
generate_text(model,tokenizer,seq_len,seed_text=seed_text,num_gen_words=50)

'head i have in the scales of the window and his dark bag into bed but was getting late and what directly thought i heard in me with the floor in one corner it was it may be nothing but this was exactly that ere double duty with the best'

In [49]:
import random
random_pick = random.randint(0,len(text_sequences))
random_seed_text = text_sequences[random_pick]
seed_text = ' '.join(random_seed_text)

generate_text(model,tokenizer,seq_len,seed_text=seed_text,num_gen_words=50)

"that his very large oilpainting so thoroughly besmoked and frozen entry and at last in the unequal crosslights in him they raised a cry of bulkington bulkington of n't make and comfortable would make and when right length my head which you entered a strong say over in the holy"