In [1]:
def read_file(filepath):
    with open(filepath) as f:
        str_text = f.read()
    
    return str_text

In [2]:
# read_file('moby_dick_four_chapters.txt')

In [7]:
import spacy

In [10]:
nlp = spacy.load('en_core_web_sm', disable=['parser', 'tagger', 'ner'])

In [11]:
nlp.max_length = 1198623

In [12]:
def separate_punc(doc_text):
    return [token.text.lower() for token in nlp(doc_text) if token.text not in '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n ']

In [13]:
d = read_file('moby_dick_four_chapters.txt')

In [14]:
tokens = separate_punc(d)



In [15]:
len(tokens)

11338

In [16]:
# 25 words --> network predict #26

In [18]:
train_len = 25 + 1

text_sequences = []

for i in range(train_len, len(tokens)):
    seq = tokens[i-train_len:i]
    
    text_sequences.append(seq)

In [20]:
from keras.preprocessing.text import Tokenizer

In [21]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)

In [23]:
sequences = tokenizer.texts_to_sequences(text_sequences)

In [25]:
for i in sequences[0]:
    print(f"{i}:{tokenizer.index_word[i]}")

956:call
14:me
263:ishmael
51:some
261:years
408:ago
87:never
219:mind
129:how
111:long
954:precisely
260:having
50:little
43:or
38:no
314:money
7:in
23:my
546:purse
3:and
150:nothing
259:particular
6:to
2713:interest
14:me
24:on


In [27]:
# tokenizer.word_counts

In [28]:
vocabulary_size = len(tokenizer.word_counts)

In [29]:
vocabulary_size

2718

In [30]:
import numpy as np

In [31]:
sequences = np.array(sequences)

In [32]:
sequences

array([[ 956,   14,  263, ..., 2713,   14,   24],
       [  14,  263,   51, ...,   14,   24,  957],
       [ 263,   51,  261, ...,   24,  957,    5],
       ...,
       [ 952,   12,  166, ...,  262,   53,    2],
       [  12,  166, 2712, ...,   53,    2, 2718],
       [ 166, 2712,    3, ...,    2, 2718,   26]])

In [33]:
from keras.utils import to_categorical

In [37]:
X = sequences[:, :-1]

In [38]:
y = sequences[:, -1]

In [39]:
y = to_categorical(y, num_classes=vocabulary_size+1)

In [40]:
seq_len = X.shape[1]

In [41]:
X.shape

(11312, 25)

In [42]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding

In [43]:
def create_model(vocabulary_size, seq_len):
    model = Sequential()
    model.add(Embedding(vocabulary_size, seq_len, input_length=seq_len))
    model.add(LSTM(seq_len*2, return_sequences=True))
    model.add(LSTM(seq_len*2))
    model.add(Dense(seq_len*2, activation='relu'))
    
    model.add(Dense(vocabulary_size, activation='softmax'))
    
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    model.summary()
    
    return model

In [45]:
model = create_model(vocabulary_size + 1, seq_len)

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 25, 25)            67975     
                                                                 
 lstm_2 (LSTM)               (None, 25, 50)            15200     
                                                                 
 lstm_3 (LSTM)               (None, 50)                20200     
                                                                 
 dense_2 (Dense)             (None, 50)                2550      
                                                                 
 dense_3 (Dense)             (None, 2719)              138669    
                                                                 
Total params: 244,594
Trainable params: 244,594
Non-trainable params: 0
_________________________________________________________________


In [46]:
from pickle import dump, load

In [47]:
model.fit(X, y, batch_size=128, epochs=2, verbose=1)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fef2cf0c880>

In [48]:
model.save('my_mobydick_model.h5')

In [49]:
dump(tokenizer, open('my_simpletokenizer', 'wb'))

In [51]:
from keras_preprocessing.sequence import pad_sequences

In [58]:
def generate_text(model, tokenizer, seq_len, seed_text, num_gen_words):
    output_text = []
    
    input_text = seed_text
    
    for i in range(num_gen_words):
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]
        
        pad_encoded = pad_sequences([encoded_text],maxlen=seq_len, truncating='pre')
        
        pred_word_ind = np.argmax(model.predict(pad_encoded, verbose=0),axis=1)[0]
        
        pred_word = tokenizer.index_word[pred_word_ind]
        
        input_text += " " + pred_word
        
        output_text.append(pred_word)
    
    
    return ' '.join(output_text)

In [59]:
text_sequences[0]

['call',
 'me',
 'ishmael',
 'some',
 'years',
 'ago',
 'never',
 'mind',
 'how',
 'long',
 'precisely',
 'having',
 'little',
 'or',
 'no',
 'money',
 'in',
 'my',
 'purse',
 'and',
 'nothing',
 'particular',
 'to',
 'interest',
 'me',
 'on']

In [60]:
seed_text = ' '.join(text_sequences[0])

In [61]:
seed_text

'call me ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me on'

In [62]:
generate_text(model, tokenizer, seq_len, seed_text, 25)

'the the the the the the the the the the the the the the the the the the the the the the the the the'

In [63]:
from keras.models import load_model

In [64]:
model = load_model('epochBIG.h5')

In [65]:
tokenizer = load(open('epochBIG', 'rb'))

In [66]:
generate_text(model, tokenizer, seq_len, seed_text, 25)

"board and is anchor and break yell accursed long it said the old man 's delirium should be used on that strange when much during"