## Read files

In [1]:
def read_file(file):
    with open(file) as f:
        text = f.read()
    return text    

In [2]:
doc = read_file('moby_dick_four_chapters.txt')
len(doc)

61614

## Tokenize, clean text

In [3]:
import spacy 

nlp = spacy.load('en_core_web_md', disable=['parser', 'tagger', 'ner'])
nlp.max_length = 1198623

In [4]:
def tokenize_text(doc):
    return [token.text.lower() for token in nlp(doc) if token.text not in '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n ']

text = tokenize_text(doc)
len(text)



11338

In [227]:
#25 words to generate -> 26th word using NN

In [6]:
text_seq = []
train_len = 25+1

for i in range(train_len, len(text)):
    seq = text[i-train_len:i]
    text_seq.append(seq)

text_seq[0]

['call',
 'me',
 'ishmael',
 'some',
 'years',
 'ago',
 'never',
 'mind',
 'how',
 'long',
 'precisely',
 'having',
 'little',
 'or',
 'no',
 'money',
 'in',
 'my',
 'purse',
 'and',
 'nothing',
 'particular',
 'to',
 'interest',
 'me',
 'on']

In [7]:
len(text_seq)

11312

In [8]:
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_seq)

sequences = tokenizer.texts_to_sequences(text_seq)

sequences[0]

[956,
 14,
 263,
 51,
 261,
 408,
 87,
 219,
 129,
 111,
 954,
 260,
 50,
 43,
 38,
 314,
 7,
 23,
 546,
 3,
 150,
 259,
 6,
 2713,
 14,
 24]

In [232]:
tokenizer.index_word[956]


'call'

In [10]:
vocabulary_size = len(tokenizer.word_counts) # or len(tokenizer.index_word). Anything
vocabulary_size

2718

In [11]:
import numpy as np

sequences = np.array(sequences)
sequences

array([[ 956,   14,  263, ..., 2713,   14,   24],
       [  14,  263,   51, ...,   14,   24,  957],
       [ 263,   51,  261, ...,   24,  957,    5],
       ...,
       [ 952,   12,  166, ...,  262,   53,    2],
       [  12,  166, 2712, ...,   53,    2, 2718],
       [ 166, 2712,    3, ...,    2, 2718,   26]])

In [12]:
# from keras.utils import to_categorical 
from keras.utils import np_utils
# .to_categorical(y_train, num_classes)
# sequences[:,:-1]

In [13]:
X = sequences[:,:-1]
y = sequences[:,-1]

y = np_utils.to_categorical(y, num_classes=vocabulary_size+1)

In [14]:
seq_len = X.shape[1]
seq_len

25

In [15]:
import numpy as np

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding
# import keras
# from keras.models import Sequential
# from keras.layers import Dense,LSTM,Embedding

In [25]:
def create_model(vocabulary_size, seq_len):
    model = Sequential()
    model.add(Embedding(vocabulary_size, 25, input_length=seq_len))
    model.add(LSTM(100, return_sequences=True))
    model.add(LSTM(100))
    model.add(Dense(100, activation='relu'))

    model.add(Dense(vocabulary_size, activation='softmax'))
    
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
   
    model.summary()
    
    return model

In [26]:
model = create_model(vocabulary_size+1, seq_len)
model

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 25, 25)            67975     
                                                                 
 lstm_4 (LSTM)               (None, 25, 100)           50400     
                                                                 
 lstm_5 (LSTM)               (None, 100)               80400     
                                                                 
 dense_4 (Dense)             (None, 100)               10100     
                                                                 
 dense_5 (Dense)             (None, 2719)              274619    
                                                                 
Total params: 483,494
Trainable params: 483,494
Non-trainable params: 0
_________________________________________________________________


<keras.engine.sequential.Sequential at 0x148d97a1070>

In [27]:
from pickle import dump, load

In [28]:
model.fit(X,y,batch_size=256,epochs=300, verbose=1)

Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78

Epoch 161/300
Epoch 162/300
Epoch 163/300
Epoch 164/300
Epoch 165/300
Epoch 166/300
Epoch 167/300
Epoch 168/300
Epoch 169/300
Epoch 170/300
Epoch 171/300
Epoch 172/300
Epoch 173/300
Epoch 174/300
Epoch 175/300
Epoch 176/300
Epoch 177/300
Epoch 178/300
Epoch 179/300
Epoch 180/300
Epoch 181/300
Epoch 182/300
Epoch 183/300
Epoch 184/300
Epoch 185/300
Epoch 186/300
Epoch 187/300
Epoch 188/300
Epoch 189/300
Epoch 190/300
Epoch 191/300
Epoch 192/300
Epoch 193/300
Epoch 194/300
Epoch 195/300
Epoch 196/300
Epoch 197/300
Epoch 198/300
Epoch 199/300
Epoch 200/300
Epoch 201/300
Epoch 202/300
Epoch 203/300
Epoch 204/300
Epoch 205/300
Epoch 206/300
Epoch 207/300
Epoch 208/300
Epoch 209/300
Epoch 210/300
Epoch 211/300
Epoch 212/300
Epoch 213/300
Epoch 214/300
Epoch 215/300
Epoch 216/300
Epoch 217/300
Epoch 218/300
Epoch 219/300
Epoch 220/300
Epoch 221/300
Epoch 222/300
Epoch 223/300
Epoch 224/300
Epoch 225/300
Epoch 226/300
Epoch 227/300
Epoch 228/300
Epoch 229/300
Epoch 230/300
Epoch 231/300
Epoch 

<keras.callbacks.History at 0x148d992f550>

In [29]:
model.save('text_gen_model.h5')

In [30]:
dump(tokenizer, open('simple_tokenizer', 'wb'))

In [83]:
from keras.preprocessing.sequence import pad_sequences

def generate_text(model, tokenizer, seq_len, seed_text, num_gen_words):
    
    output_text = [] 
    input_text = seed_text
    
    for i in range(num_gen_words):
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]
        pad_encoding = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')
        pred_word_ind = np.argmax(model.predict(pad_encoding), axis=-1)[0] #model.predict_classes(pad_encoding, verbose=0)[0]
        pred_word = tokenizer.index_word[pred_word_ind]
        input_text = input_text+ ' ' + pred_word
        output_text.append(pred_word)
        final_text = input_text
    return ' '.join(output_text)

In [162]:
import random
random.seed()
random_pick = random.randint(0,len(text_seq))
random_pick
random_seed_text = text_seq[random_pick]
seed_text = ' '.join(random_seed_text)
seed_text
# seed_text = 'mean to have it inferred that i ever go to sea as a passenger for to go as a passenger you must needs have a'

'hugged me tightly as though naught but death should part us twain i now strove to rouse him--"queequeg!"--but his only answer was a snore i then'

In [163]:
generate_text(model, tokenizer, seq_len, seed_text=seed_text, num_gen_words=10)

"did n't stop said a thirty years ' war and"