### Recurrent neural networks (RNN)
- Especially equiped for "sequences": time series data (data with constant stream)
    - or sentence (sequence of words)
```
Lets imagine sequence [1, 2, 3, 4]
```
Could you guess the next item in the sequence?

- Recurrent neuron sends output to itself.

### Long short term memory units (LSTM)
- An RNN will begin to forget begining inputs
- We need long term memory, balancing the short and long term memory of the network
- Most commonly used for text generation.
- Steps:
    1. Process text
    2. Clean text
    3. Tokenise text and create sequences with Keras

In [1]:
def read_file(path):
    with open(path) as f:
        str_text = f.read()
        
    return str_text

In [2]:
import spacy

In [4]:
nlp = spacy.load('en_core_web_sm', disable=['parser', 'tagger', 'ner']) # Named entity recognition.

In [5]:
nlp.max_length = 1198623 # Larger number of max length to work with

In [7]:
# Do not overfit to punctuation points
def separate_punc(doc_text):
    return [token.text.lower() for token in nlp(doc_text) if token.text not in '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n \n\n \n\n\n']

In [8]:
data = read_file('moby_dick_four_chapters.txt')

In [9]:
tokens = separate_punc(data)



In [10]:
len(tokens)

11429

In [11]:
# Create sequence of tokens. Put 25 words into a sequence and have network predict the 26th

In [12]:
train_len = 25 + 1

In [13]:
text_sequences = []

for i in range(train_len, len(tokens)):
    sequence = tokens[i - train_len: i]
    text_sequences.append(sequence)

In [14]:
text_sequences

[['call',
  'me',
  'ishmael',
  'some',
  'years',
  'ago',
  '--',
  'never',
  'mind',
  'how',
  'long',
  'precisely',
  '--',
  'having',
  'little',
  'or',
  'no',
  'money',
  'in',
  'my',
  'purse',
  'and',
  'nothing',
  'particular',
  'to',
  'interest'],
 ['me',
  'ishmael',
  'some',
  'years',
  'ago',
  '--',
  'never',
  'mind',
  'how',
  'long',
  'precisely',
  '--',
  'having',
  'little',
  'or',
  'no',
  'money',
  'in',
  'my',
  'purse',
  'and',
  'nothing',
  'particular',
  'to',
  'interest',
  'me'],
 ['ishmael',
  'some',
  'years',
  'ago',
  '--',
  'never',
  'mind',
  'how',
  'long',
  'precisely',
  '--',
  'having',
  'little',
  'or',
  'no',
  'money',
  'in',
  'my',
  'purse',
  'and',
  'nothing',
  'particular',
  'to',
  'interest',
  'me',
  'on'],
 ['some',
  'years',
  'ago',
  '--',
  'never',
  'mind',
  'how',
  'long',
  'precisely',
  '--',
  'having',
  'little',
  'or',
  'no',
  'money',
  'in',
  'my',
  'purse',
  'and',
  '

In [15]:
# Format into numerical system
from keras.preprocessing.text import Tokenizer

In [16]:
tokenizer = Tokenizer()

In [18]:
tokenizer.fit_on_texts(text_sequences)

In [21]:
sequences = tokenizer.texts_to_sequences(text_sequences)

In [22]:
sequences[0]

[957,
 14,
 264,
 52,
 262,
 409,
 17,
 88,
 220,
 130,
 112,
 955,
 17,
 261,
 51,
 44,
 39,
 315,
 7,
 24,
 547,
 3,
 151,
 260,
 6,
 958]

In [23]:
sequences[1]

[14,
 264,
 52,
 262,
 409,
 17,
 88,
 220,
 130,
 112,
 955,
 17,
 261,
 51,
 44,
 39,
 315,
 7,
 24,
 547,
 3,
 151,
 260,
 6,
 958,
 14]

In [24]:
# We have replaced original text sequences with index positions for word in a vocab!
tokenizer.index_word[1]

'the'

In [25]:
tokenizer.index_word[2]

'a'

In [27]:
tokenizer.word_counts

OrderedDict([('call', 27),
             ('me', 2472),
             ('ishmael', 133),
             ('some', 758),
             ('years', 135),
             ('ago', 84),
             ('--', 2334),
             ('never', 450),
             ('mind', 165),
             ('how', 322),
             ('long', 375),
             ('precisely', 38),
             ('having', 144),
             ('little', 769),
             ('or', 952),
             ('no', 1005),
             ('money', 122),
             ('in', 5649),
             ('my', 1788),
             ('purse', 73),
             ('and', 9648),
             ('nothing', 283),
             ('particular', 154),
             ('to', 6499),
             ('interest', 26),
             ('on', 1716),
             ('shore', 26),
             ('i', 7150),
             ('thought', 676),
             ('would', 702),
             ('sail', 104),
             ('about', 1014),
             ('a', 10377),
             ('see', 416),
             ('the', 15540),
    

In [28]:
vocab_size = len(tokenizer.word_counts)

In [29]:
vocab_size

2719

In [30]:
import numpy as np

In [31]:
sequences = np.array(sequences)

In [33]:
sequences # Formats it nicely...

array([[ 957,   14,  264, ...,  260,    6,  958],
       [  14,  264,   52, ...,    6,  958,   14],
       [ 264,   52,  262, ...,  958,   14,   25],
       ...,
       [ 953,   12,  167, ...,  263,   54,    2],
       [  12,  167, 2714, ...,   54,    2, 2719],
       [ 167, 2714,    3, ...,    2, 2719,   27]])

In [34]:
# LSTM based model.
from keras.utils import to_categorical

In [37]:
X = sequences[:, :-1] # For every row get each column except the last

In [38]:
y = sequences[:, -1]

In [39]:
y = to_categorical(y, num_classes=vocab_size+1)

In [40]:
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [41]:
sequence_length = X.shape[1]

In [42]:
X.shape

(11403, 25)

In [43]:
from keras.models import Sequential

In [44]:
from keras.layers import Dense, LSTM, Embedding

In [46]:
def create_model(vocabulary_size, seq_len):
    model = Sequential()
    num_neurons = 50
    # Define an input dimentsion and output dimenstion for the sequence.
    model.add(Embedding(vocabulary_size, seq_len, input_length=seq_len))
    model.add(LSTM(num_neurons, return_sequences=True))
    model.add(LSTM(num_neurons))
    model.add(Dense(50, activation='relu'))
    
    model.add(Dense(vocabulary_size, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics='accuracy')
    model.summary()
    return model

In [48]:
model = create_model(vocab_size+1, sequence_length)

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 25, 25)            68000     
                                                                 
 lstm_2 (LSTM)               (None, 25, 50)            15200     
                                                                 
 lstm_3 (LSTM)               (None, 50)                20200     
                                                                 
 dense_2 (Dense)             (None, 50)                2550      
                                                                 
 dense_3 (Dense)             (None, 2720)              138720    
                                                                 
Total params: 244670 (955.74 KB)
Trainable params: 244670 (955.74 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [49]:
from pickle import dump, load

In [51]:
# Only sent in 128 at a time.
model.fit(X, y, batch_size=128, epochs=2, verbose=1) # Run for at least 200 epochs 

Epoch 1/2
Epoch 2/2


<keras.src.callbacks.History at 0x2a66b0650>

In [53]:
model.save('my_mobydick_model.keras')

In [54]:
dump(tokenizer, open('my_simple_tokenizer', 'wb'))

In [55]:
# Model succesfully trained and saved!

In [70]:
from keras.preprocessing.sequence import pad_sequences

In [75]:
def predict_classes(model, data):
    return np.argmax(model.predict(data),axis=1)

# Generate new text.
def generate_text(
    # The neural net model we built.
    model, 
    # The tokenizer which has knowledge of the vocabulary and what index matches what word.
    tokenizer, 
    seq_len, 
    seed_text, 
    # How many words we want to generate.
    words_to_get_len
):
    output_text = []
    input_text = seed_text
    
    for i in range(words_to_get_len):
        # Remember that text_to_sequences just return the numerical values that represent the words in the tokeniser vocab.
        # So what would be returned here would be something like [0, 1, 4, 5] where 0 = the and 1 = cat and 4 = jumped e.t.c
        encoded = tokenizer.texts_to_sequences([input_text])[0]
        
        # this makes sure if seed was too long or short that it is exactly the sequence length we need.
        # Cut off the first word as we keep adding to the end.
        pad_encoded = pad_sequences([encoded], maxlen=seq_len, truncating='pre')
        
        # For every word in the encoded text, get the most likely next word.
        pred_word_index_pos = predict_classes(model, pad_encoded)[0]
        pred_word = tokenizer.index_word[pred_word_index_pos]
        
        # Add tp the end of the sentence.
        input_text += ' ' + pred_word
        output_text.append(pred_word)
    
    
    return ' '.join(output_text)

In [65]:
import random
random.seed(101)
rand_pick = random.randint(0, len(text_sequences))

In [66]:
seed_text = ' '.join(text_sequences[rand_pick])

In [67]:
seed_text

'at his pipe and sitting up in bed you gettee in he added motioning to me with his tomahawk and throwing the clothes to one side'

In [76]:
generate_text(model, tokenizer, sequence_length, seed_text=seed_text, words_to_get_len=25)



'the the the the the the the the the the the the the the the the the the the the the the the the the'

In [77]:
# Trained for 300 epochs...
from keras.models import load_model

In [78]:
model = load_model('epochBIG.h5')

In [80]:
tokenizer = load(open('epochBIG', 'rb'))

ModuleNotFoundError: No module named 'keras_preprocessing'