In [110]:
import numpy as np
import re
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout, BatchNormalization
from keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

### Reead data and split it per line

In [114]:
def read_prepare_data(file_path):
    with open(file_path, 'r') as f:
        data = f.read().strip()
        data = re.sub("[^a-zA-Z0-9 ,.:? \n]", "", data)
        data = data.lower().split('\n')
        data = [line for line in data if line]
    return data

file_path = "/kaggle/input/shekspier/Shekspier.txt"
data = read_prepare_data(file_path)
print(data[0:4])

['first citizen:', 'before we proceed any further, hear me speak.', 'all:', 'speak, speak.']


### Tokenizer and Padding

In [116]:
def init_tokenizer():
    # initialize the tokenizer
    tkn = Tokenizer()
    tkn.fit_on_texts(data)
    # +1 to handle the words the tokenizer has not seen before
    nmu_unique_words = len(tkn.word_index) + 1
    return (tkn, nmu_unique_words)
tokenizer, nmu_unique_words = init_tokenizer()

def tokenize_data_padding(raw_data):
    seq = tkn.texts_to_sequences(raw_data)
    seq = [sample for sample in seq if len(sample) >= 2]
    max_length = max([len(sample) for sample in seq])
    data = pad_sequences(seq, padding = 'pre', maxlen= max_length)
    return data
data = tokenize_data_padding(data)

### Separate X and y and one-hot encoding

In [119]:
X = data[:,:-1]
y = data[:,-1]

input_size = X.shape[1]
y = to_categorical(y, nmu_unique_words)

### Creat model and train it

In [121]:
def lstm_model(input_dim, input_size, output_dim=100):
    model = Sequential()
    model.add(Embedding(input_dim, output_dim))
    model.add(BatchNormalization())
    model.add(LSTM(128))
    model.add(Dropout(0.1))
    model.add(Dense(input_dim, activation="softmax"))
    model.build(input_shape=(1, input_size))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=["accuracy"])
    return model

model = lstm_model(nmu_unique_words, input_size)
model.summary()

In [70]:
history=model2.fit(X, y, epochs=70, batch_size=128)

Epoch 1/70
[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 16ms/step - accuracy: 0.8721 - loss: 0.4753
Epoch 2/70
[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - accuracy: 0.8662 - loss: 0.4811
Epoch 3/70
[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - accuracy: 0.8680 - loss: 0.4839
Epoch 4/70
[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 16ms/step - accuracy: 0.8712 - loss: 0.4664
Epoch 5/70
[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 16ms/step - accuracy: 0.8666 - loss: 0.4837
Epoch 6/70
[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - accuracy: 0.8635 - loss: 0.4874
Epoch 7/70
[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - accuracy: 0.8722 - loss: 0.4614
Epoch 8/70
[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - accuracy: 0.8722 - loss: 0.4710
Epoch 9/70
[1m214/214[0m [32m

### Test the model

In [109]:
def word_generator(input_text_arrays, num_pred_words):
    for i in range(num_pred_words):
        tokenized_my_samples = tkn.texts_to_sequences(input_text_arrays)
        tokenized_my_samples = pad_sequences(tokenized_my_samples, padding = 'pre', maxlen= max_length)

        prediction = model2.predict(tokenized_my_samples)
        prediction = np.argmax(prediction, axis=1)[0]

        for k in tkn.word_index.keys():
            if tkn.word_index[k] == prediction:
                input_text_arrays[0] = input_text_arrays[0] + ' ' + k
    
    return input_text_arrays
    

my_sample = ["I like the"]
word_generator(my_sample, 5)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step


['I like the people give up enemies sir']