In [94]:
def read_file(filepath):
    with open(filepath) as f:
        str_text = f.read()
    
    return str_text

In [95]:
# read_file('moby_dick_four_chapters.txt')

In [96]:
import spacy

In [97]:
nlp = spacy.load('en', disable=['parser', 'tagger', 'ner'])

In [98]:
nlp.max_length = 1198623

In [99]:
def separate_punc(doc_text):
    return [token.text.lower() for token in nlp(doc_text) if token.text not in '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n ']

In [100]:
d = read_file('moby_dick_four_chapters.txt')

In [101]:
tokens = separate_punc(d)

In [102]:
len(tokens) # words here

11338

In [103]:
#25 words --> network predicts word #26

In [104]:
train_len = 25 + 1

text_sequences = []

for i in range(train_len, len(tokens)):
    seq = tokens[i-train_len:i]

    text_sequences.append(seq)

In [105]:
type(text_sequences)

list

In [106]:
# text_sequences[0]
# text_sequences[1] # sequences, each sequence is one token over (+1 beginign and end)

' '.join(text_sequences[0])

'call me ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me on'

In [107]:
' '.join(text_sequences[1])

'me ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me on shore'

In [108]:
' '.join(text_sequences[2])

'ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me on shore i'

In [109]:
# format text sequence into a numerical sequence for keras
from keras.preprocessing.text import Tokenizer

In [110]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)

In [111]:
sequences = tokenizer.texts_to_sequences(text_sequences)

In [112]:
# now we have our converted numeric sequences
# sequences[0] 

In [113]:
# each number is actually a word (token)
# tokenizer.index_word

In [114]:
for i in sequences[0]:
    print(f"{i} : {tokenizer.index_word[i]}")

956 : call
14 : me
263 : ishmael
51 : some
261 : years
408 : ago
87 : never
219 : mind
129 : how
111 : long
954 : precisely
260 : having
50 : little
43 : or
38 : no
315 : money
7 : in
23 : my
546 : purse
3 : and
150 : nothing
259 : particular
6 : to
2712 : interest
14 : me
24 : on


In [115]:
# see how many times words appears in our sequence
# tokenizer.word_counts

In [116]:
vocabulary_size = len(tokenizer.word_counts) # number of unique words in all this 4 chapters document that we got

In [117]:
type(sequences)

list

In [118]:
# turn the sequence into a numpy matrix format, to create a train split with it
import numpy as np

In [119]:
sequences = np.array(sequences)

In [120]:
sequences

array([[ 956,   14,  263, ..., 2712,   14,   24],
       [  14,  263,   51, ...,   14,   24,  957],
       [ 263,   51,  261, ...,   24,  957,    5],
       ...,
       [ 952,   12,  166, ...,  262,   53,    2],
       [  12,  166, 2711, ...,   53,    2, 2717],
       [ 166, 2711,    3, ...,    2, 2717,   26]])

In [121]:
# part 2: Create our data (X, y)
from keras.utils import to_categorical

In [122]:
sequences[:, :-1] # each row of words (features)

array([[ 956,   14,  263, ...,    6, 2712,   14],
       [  14,  263,   51, ..., 2712,   14,   24],
       [ 263,   51,  261, ...,   14,   24,  957],
       ...,
       [ 952,   12,  166, ...,   11,  262,   53],
       [  12,  166, 2711, ...,  262,   53,    2],
       [ 166, 2711,    3, ...,   53,    2, 2717]])

In [123]:
sequences[:,-1] # last word of each row (label)

array([  24,  957,    5, ...,    2, 2717,   26])

In [124]:
X = sequences[:, :-1] # features

In [125]:
y = sequences[:,-1] # label

In [126]:
y = to_categorical(y, num_classes=vocabulary_size+1)

In [127]:
X.shape # (sequences, words per sequence)

(11312, 25)

In [128]:
seq_len = X.shape[1]

In [129]:
# Create our model with the data
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding # LSTM: For the sequences, Embedding: for the words

In [130]:
# function to create models
def create_model(vocabulary_size, seq_len):

    model = Sequential()

    model.add(Embedding(vocabulary_size, seq_len, input_length=seq_len))
    # LSTM layer chooses amount of neurons, recommended to have as amount a multiple of seq_len
    model.add(LSTM(seq_len*2, return_sequences=True)) # first param should be multiple of seq_len (Recomended)
    model.add(LSTM(seq_len*2))

    model.add(Dense(50, activation='relu'))
    model.add(Dense(vocabulary_size, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    model.summary()

    return model

In [131]:
model = create_model(vocabulary_size+1, seq_len) # creating our model

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 25, 25)            67950     
_________________________________________________________________
lstm_4 (LSTM)                (None, 25, 50)            15200     
_________________________________________________________________
lstm_5 (LSTM)                (None, 50)                20200     
_________________________________________________________________
dense_4 (Dense)              (None, 50)                2550      
_________________________________________________________________
dense_5 (Dense)              (None, 2718)              138618    
Total params: 244,518
Trainable params: 244,518
Non-trainable params: 0
_________________________________________________________________


In [132]:
# training our model

In [133]:
# to save files for later and load them
from pickle import dump, load

In [134]:
model.fit(
    X, y,
    batch_size=128, # how many sequences do you want to pass at a time
    epochs=2, # number of tranings in the execution
    verbose=1 # output report
)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x6718347d0>

In [135]:
# saving our model
model.save('my_mobydick_model.h5')

In [137]:
# tokenizer from Keras with all the vocabsulary of the documents
dump(tokenizer,open('my_simpletokenizer','wb')) # saving our token