## Functions for Processing Text

### Reading in files as a string text

In [None]:
!python -m spacy download en_core_web_sm



In [38]:

def read_file(filepath):
    
    with open(filepath) as f:
        
        str_text = f.read()
    
    return str_text

In [None]:
read_file('moby_dick_four_chapters.txt')

### Tokenize and Clean Text

In [40]:
import spacy
nlp = spacy.load('en_core_web_sm',disable=['parser', 'tagger','ner'])



nlp.max_length = 1198623

In [41]:
def separate_punc(doc_text):
    return [token.text.lower() for token in nlp(doc_text) if token.text not in '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n ']

In [42]:
d = read_file('melville-moby_dick.txt')
tokens = separate_punc(d)



### sequences of tokens

In [46]:
# organize into sequences of tokens
train_len = 25+1 # 25 training words , then one target word

# Empty list of sequences
text_sequences = []

for i in range(train_len, len(tokens)):
    
    # Grab train_len# amount of characters
    seq = tokens[i-train_len:i]
    
    # Add to list of sequences
    text_sequences.append(seq)

In [47]:
' '.join(text_sequences[0])

'chapter 1 loomings call me ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to'

In [48]:
' '.join(text_sequences[1])

'1 loomings call me ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest'

In [49]:
' '.join(text_sequences[2])

'loomings call me ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me'

In [50]:
len(text_sequences)

214682

### Keras Tokenization

In [None]:
!pip install tensorflow
import tensorflow as tf


In [55]:
from tensorflow.keras.preprocessing.text import Tokenizer


In [56]:
# integer encode sequences of words
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)
sequences = tokenizer.texts_to_sequences(text_sequences)

In [57]:
sequences[0]

[158,
 9443,
 17526,
 402,
 42,
 1043,
 43,
 247,
 659,
 140,
 296,
 116,
 82,
 787,
 347,
 113,
 36,
 50,
 1788,
 6,
 49,
 3028,
 3,
 218,
 442,
 5]

In [None]:
tokenizer.index_word

In [None]:
for i in sequences[0]:
    print(f'{i} : {tokenizer.index_word[i]}')

In [None]:
tokenizer.word_counts

In [61]:
vocabulary_size = len(tokenizer.word_counts)

### Convert to Numpy Matrix

In [62]:
import numpy as np

In [63]:
sequences = np.array(sequences)

In [64]:
sequences

array([[  158,  9443, 17526, ...,   218,   442,     5],
       [ 9443, 17526,   402, ...,   442,     5,  1165],
       [17526,   402,    42, ...,     5,  1165,    42],
       ...,
       [  240,   938,   351, ...,  1419,  1313,    74],
       [  938,   351,  1418, ...,  1313,    74,   219],
       [  351,  1418,     3, ...,    74,   219,   222]])

# Creating an LSTM based model

In [65]:
import keras
from keras.models import Sequential
from keras.layers import Dense,LSTM,Embedding

In [66]:
def create_model(vocabulary_size, seq_len):
    model = Sequential()
    model.add(Embedding(vocabulary_size, 25, input_length=seq_len))
    model.add(LSTM(64, return_sequences=True))
    model.add(LSTM(64))
    model.add(Dense(64, activation='relu'))

    model.add(Dense(vocabulary_size, activation='softmax'))
    
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
   
    model.summary()
    
    return model

### Train / Test Split

In [67]:
from keras.utils import to_categorical

In [68]:
sequences

array([[  158,  9443, 17526, ...,   218,   442,     5],
       [ 9443, 17526,   402, ...,   442,     5,  1165],
       [17526,   402,    42, ...,     5,  1165,    42],
       ...,
       [  240,   938,   351, ...,  1419,  1313,    74],
       [  938,   351,  1418, ...,  1313,    74,   219],
       [  351,  1418,     3, ...,    74,   219,   222]])

In [69]:
# First 25 words
sequences[:,:-1]

array([[  158,  9443, 17526, ...,     3,   218,   442],
       [ 9443, 17526,   402, ...,   218,   442,     5],
       [17526,   402,    42, ...,   442,     5,  1165],
       ...,
       [  240,   938,   351, ...,    84,  1419,  1313],
       [  938,   351,  1418, ...,  1419,  1313,    74],
       [  351,  1418,     3, ...,  1313,    74,   219]])

In [70]:
# last Word
sequences[:,-1]

array([   5, 1165,   42, ...,   74,  219,  222])

In [71]:
X = sequences[:,:-1]

In [72]:
y = sequences[:,-1]

In [None]:
y = to_categorical(y, num_classes=vocabulary_size+1)

In [75]:
seq_len = X.shape[1]

In [76]:
seq_len

25

### Training the Model

In [77]:
# define model
model = create_model(vocabulary_size+1, seq_len)



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 25, 25)            438175    
                                                                 
 lstm (LSTM)                 (None, 25, 150)           105600    
                                                                 
 lstm_1 (LSTM)               (None, 150)               180600    
                                                                 
 dense (Dense)               (None, 150)               22650     
                                                                 
 dense_1 (Dense)             (None, 17527)             2646577   
                                                                 
Total params: 3393602 (12.95 MB)
Trainable params: 3393602 (12.95 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


---

----

In [78]:
from pickle import dump,load

In [80]:
# fit model
model.fit(X, y, batch_size=128, epochs=300,verbose=1)

In [None]:
# save the model to file
model.save('epochBIG.h5')
# save the tokenizer
dump(tokenizer, open('epochBIG', 'wb'))