In [12]:
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku 
import numpy as np

In [89]:
# Inline sample corpus (a short nursery-rhyme excerpt), one verse line per text line.
# NOTE: a later cell reassigns `data` from the contents of a file on disk.
data = """The cat and her kittens
They put on their mittens,
To eat a Christmas pie.
The poor little kittens
They lost their mittens,
And then they began to cry.
O mother dear, we sadly fear
We cannot go to-day,
For we have lost our mittens."
"If it be so, ye shall not go,
For ye are naughty kittens."""

# print(data)

In [94]:
import os
import pandas as pd

# Directory holding the preprocessed text corpora.
DATA_DIR = "./data"

# Corpus used for training. Alternative input kept by the author:
#   os.path.join(DATA_DIR, "europarl-v7.en.preprocessed.txt")
input_file = os.path.join(DATA_DIR, "dummy_preprocessed.txt")

# Decode explicitly as UTF-8: without `encoding`, open() falls back to the
# platform's locale encoding, so the same file could tokenize differently
# (or fail to decode) depending on the machine the notebook runs on.
with open(input_file, 'r', encoding="utf-8") as file:
    data = file.read()

In [95]:
# Shared module-level tokenizer: fitted inside dataset_preparation() and read
# again by generate_text() to encode seed text and decode predicted indices.
tokenizer = Tokenizer()
def dataset_preparation(data):
    """Turn raw text into next-word-prediction training arrays.

    The text is split on newlines; each line is tokenized with the
    module-level ``tokenizer`` and expanded into every prefix of length >= 2
    (``[w1, w2]``, ``[w1, w2, w3]``, ...). Prefixes are left-padded to a common
    length; the last token of each row becomes the label and the rest the
    predictors.

    Args:
        data: The whole corpus as a single string, one sentence per line.

    Returns:
        A tuple ``(predictors, label, max_sequence_len, total_words)`` where
        ``predictors`` is the padded sequences without their last column,
        ``label`` is the one-hot encoding of the last column,
        ``max_sequence_len`` is the padded sequence length (predictors are one
        shorter) and ``total_words`` is ``len(tokenizer.word_index) + 1``.

    Raises:
        ValueError: If no line of ``data`` contains at least two known tokens,
            so no training pair can be built.
    """
    corpus = data.split("\n")

    # NOTE(review): the shared tokenizer is fitted in place and never reset
    # here, so calling this function again presumably adds to the existing
    # vocabulary -- re-create `tokenizer` before re-running on different data.
    tokenizer.fit_on_texts(corpus)
    # +1 leaves room for index 0, which pad_sequences uses as the padding value.
    total_words = len(tokenizer.word_index) + 1

    # Every prefix with at least two tokens is one (context -> next word) sample.
    input_sequences = []
    for token_list in tokenizer.texts_to_sequences(corpus):
        for end in range(2, len(token_list) + 1):
            input_sequences.append(token_list[:end])

    if not input_sequences:
        # Previously this surfaced as "max() arg is an empty sequence".
        raise ValueError(
            "dataset_preparation: no line contains at least two tokens, "
            "so no training sequences can be built"
        )

    max_sequence_len = max(len(sequence) for sequence in input_sequences)
    # 'pre' padding keeps the label (the last token) in the final column.
    input_sequences = pad_sequences(input_sequences,
                                    maxlen=max_sequence_len, padding='pre')

    predictors, label = input_sequences[:, :-1], input_sequences[:, -1]
    label = ku.to_categorical(label, num_classes=total_words)

    return predictors, label, max_sequence_len, total_words

In [96]:
import pickle

# Directory that create_model() writes the pickled, trained model into.
MODEL_DIR = "./model"

def create_model(predictors, label, max_sequence_len, total_words):
    """Build, train and persist the next-word LSTM model.

    The network is Embedding(total_words, 10) -> LSTM(150) -> Dropout(0.1) ->
    Dense(total_words, softmax), compiled with categorical cross-entropy and
    Adam, and fitted for 100 epochs. The fitted model is then pickled to
    ``MODEL_DIR/finalized_model.sav``.

    Args:
        predictors: Padded input sequences, each of length
            ``max_sequence_len - 1``.
        label: One-hot encoded next-word targets with ``total_words`` classes.
        max_sequence_len: Padded length of the full sequences (inputs + label).
        total_words: Vocabulary size used for the embedding and output layers.

    Returns:
        The trained Keras ``Sequential`` model.
    """
    # The last token of every sequence is the label, so inputs are one shorter.
    input_len = max_sequence_len - 1

    model = Sequential()
    model.add(Embedding(total_words, 10, input_length=input_len))
    model.add(LSTM(150))
    model.add(Dropout(0.1))
    model.add(Dense(total_words, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    model.fit(predictors, label, epochs=100, verbose=1)

    # Save the model to disk. Create the target directory first so a fresh
    # checkout does not lose a finished training run to FileNotFoundError.
    os.makedirs(MODEL_DIR, exist_ok=True)
    filename = os.path.join(MODEL_DIR, "finalized_model.sav")
    # NOTE(review): Keras documents model.save() as its persistence API;
    # pickling is kept here so existing pickle-based loaders keep working --
    # confirm it is supported by the installed Keras version.
    with open(filename, 'wb') as model_file:  # closes the handle even on error
        pickle.dump(model, model_file)

    return model

In [97]:
def generate_text(seed_text, next_words, max_sequence_len, model):
    """Extend ``seed_text`` by repeatedly predicting the next word.

    On every step the current text is tokenized with the module-level
    ``tokenizer``, left-padded (or truncated) to ``max_sequence_len - 1``
    tokens, and fed to ``model``; the highest-probability class is mapped back
    to a word and appended after a single space.

    Args:
        seed_text: Starting text.
        next_words: Number of words to append.
        max_sequence_len: Padded sequence length used at training time; the
            model input is one token shorter.
        model: Trained model whose ``predict`` returns one row of class
            probabilities per input row.

    Returns:
        ``seed_text`` followed by ``next_words`` predicted words. If a
        predicted index has no word (e.g. the padding index 0), an empty
        string is appended for that step.
    """
    # Build the index -> word lookup once instead of scanning the whole
    # vocabulary for every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')

        # Sequential.predict_classes() no longer exists in current Keras;
        # argmax over predict() is the equivalent and works on old versions too.
        probabilities = model.predict(token_list, verbose=0)
        predicted_index = int(np.argmax(probabilities, axis=-1)[0])

        seed_text += " " + index_to_word.get(predicted_index, "")
    return seed_text

In [None]:
# Build the training arrays from the corpus, then fit and persist the model.
X, Y, max_len, total_words = dataset_preparation(data)
model = create_model(
    predictors=X,
    label=Y,
    max_sequence_len=max_len,
    total_words=total_words,
)

In [69]:
# Append three predicted words to each seed phrase and print the result.
# Sample outputs noted by the author:
#   "cat and her lost kittens"
#   "we naughty lost to day"
for seed_phrase in ("cat and", "we naughty"):
    text = generate_text(seed_phrase, 3, max_len, model)
    print(text)

cat and then her to
we naughty we kittens ye
