In [1]:
# keras module for building LSTM
from keras.utils import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku

# set seeds for reproducibility (TODO: no seed is actually set anywhere in this notebook yet)
import tensorflow as tf

import pandas as pd
import numpy as np
import string, os

import warnings
import matplotlib.pyplot as plt
from keras.preprocessing.text import tokenizer_from_json
warnings.filterwarnings("ignore")
warnings.simplefilter(action="ignore", category=FutureWarning)

from dataset import load_tokenized_sentences

# Sentences shorter than the minimum are discarded; sentences longer than the
# maximum are cut into consecutive chunks of at most `sentence_len_max_limit` words.
sentence_len_min_limit = 2
sentence_len_max_limit = 25

# Load both corpora; the printed samples show each sentence as a list of word tokens.
dataset_temp = load_tokenized_sentences("../datasets/pickled/books_clear.pickle")
print(len(dataset_temp), dataset_temp[:5])
dataset = load_tokenized_sentences("../datasets/pickled/bajki_clear.pickle")
print(len(dataset), dataset[:5])
dataset = dataset_temp + dataset

# Re-join tokens into plain strings, splitting over-long sentences into chunks.
dataset_join = []
for sentence in dataset:
    if len(sentence) > sentence_len_max_limit:
        # The stop value is `len - min + 1` so that a trailing chunk of exactly
        # `sentence_len_min_limit` words is kept (consistent with the `elif`
        # below, which keeps whole sentences of that length); shorter tails are
        # still dropped.
        for i in range(0, len(sentence) - sentence_len_min_limit + 1, sentence_len_max_limit):
            dataset_join.append(" ".join(sentence[i:i + sentence_len_max_limit]))
    elif sentence_len_min_limit <= len(sentence) <= sentence_len_max_limit:
        dataset_join.append(" ".join(sentence))
dataset = dataset_join

# Disabled experiment: glue punctuation back onto the preceding word.
# for i in range(len(dataset)):
#     for _char in ".,?!:;":
#         dataset[i] = dataset[i].replace(' ' + _char, _char)

print(len(dataset), dataset[:5])

542601 [['jules', 'verne', 'mil', 'podmorskiej', 'żeglugitłum'], ['tłumacz', 'nieznanyisbn', 'skała', 'uciekającanie', 'zapomniano', 'zapewne', 'dotąd', 'wypadku', 'dziwnego', 'niepojętego', 'i', 'trudnego', 'do', 'objaśnienia', 'zjawiska', 'jakim', 'się', 'odznaczył', 'rok'], ['nie', 'mówiąc', 'już', 'o', 'pogłoskach', 'niepokojących', 'ludność', 'portów', 'i', 'zajmujących', 'ogół', 'na', 'wszystkich', 'lądach', 'dodać', 'wypada', 'że', 'marynarze', 'byli', 'najmocniej', 'zaniepokojeni'], ['kupcy', 'armatorzy', 'dowódcy', 'okrętów', 'szyprowie', 'i', 'sternicy', 'statków', 'europejskich', 'i', 'amerykańskich', 'oficerowie', 'marynarki', 'wojennej', 'wszystkich', 'krajów', 'a', 'nawet', 'rządy', 'różnych', 'państw', 'obu', 'lądów', 'do', 'najwyższego', 'stopnia', 'zajęci', 'byli', 'tym', 'wydarzeniem'], ['od', 'niejakiego', 'czasu', 'okręty', 'napotykały', 'na', 'morzu', 'jakąś', 'rzecz', 'ogromną', 'przedmiot', 'długi', 'kształtu', 'wrzecionowatego', 'niekiedy', 'świecący', 'nieskońc

In [2]:
from typing import List


def get_sequence_of_tokens(dataset: List[str], tokenizer: Tokenizer = None):
    """Encode sentences as integers and expand each one into growing prefixes.

    Args:
        dataset: Sentences as plain strings.
        tokenizer: Tokenizer to encode with. When ``None`` a new ``Tokenizer``
            is created and fitted on ``dataset``.

    Returns:
        A tuple ``(word2int_sequences, total_words, tokenizer)`` where
        ``word2int_sequences`` holds, for every encoded sentence, each prefix of
        length ``sentence_len_min_limit + 1`` up to the full sentence length,
        ``total_words`` is ``len(tokenizer.word_index) + 1`` and ``tokenizer``
        is the tokenizer that was used.

    Note:
        Reads the module-level ``sentence_len_min_limit``.
    """
    if tokenizer is None:
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(dataset)

    vocab_size = len(tokenizer.word_index) + 1

    prefixes = []
    for text in dataset:
        encoded = tokenizer.texts_to_sequences([text])[0]
        # Prefix lengths run from (min limit + 1) to the whole encoded sentence.
        prefixes.extend(
            encoded[:end]
            for end in range(sentence_len_min_limit + 1, len(encoded) + 1)
        )
    return prefixes, vocab_size, tokenizer


def generate_padded_sequences(word2int_sequences: List[List[int]]):
    """Pad all sequences to a common length and split off the last token.

    Args:
        word2int_sequences: Integer-encoded sequences of varying length.

    Returns:
        A tuple ``(predictors, label, max_sequence_len)``: ``predictors`` is
        every column of the padded matrix except the last, ``label`` is the
        last column, and ``max_sequence_len`` is the length of the longest
        input sequence (the width of the padded matrix).
    """
    max_sequence_len = max(len(seq) for seq in word2int_sequences)
    # Padding goes in front ("pre") so the final column is always a real token.
    padded = pad_sequences(word2int_sequences, maxlen=max_sequence_len, padding="pre")
    matrix = np.array(padded)
    predictors = matrix[:, :-1]
    label = matrix[:, -1]
    return predictors, label, max_sequence_len


def dataset_generator(predictors, label, batch_size=256):
    """Yield shuffled ``(predictors, label)`` mini-batches forever.

    Every pass draws a fresh random permutation of the sample indices and
    yields consecutive full batches from it. Samples left over after the last
    full batch are skipped for that pass.

    Args:
        predictors: Indexable array of inputs; ``len(predictors)`` is the
            number of samples.
        label: Array of targets, indexed with the same indices as
            ``predictors``.
        batch_size: Number of samples per yielded batch.

    Yields:
        ``(predictors[idx], label[idx])`` for each batch of shuffled indices.

    Raises:
        ValueError: On first iteration, if ``batch_size`` is smaller than 1 or
            there are fewer than ``batch_size`` samples. Without this check the
            loop would never produce a batch and would spin forever.
    """
    n_samples = len(predictors)
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    if n_samples < batch_size:
        raise ValueError(
            f"need at least batch_size={batch_size} samples, got {n_samples}"
        )

    while True:
        p = np.random.permutation(n_samples)
        # Stop at the last index from which a full batch can still be taken.
        for i in range(0, n_samples - batch_size + 1, batch_size):
            indexes = p[i : i + batch_size]
            yield predictors[indexes], label[indexes]

import json as pjson

# `tokenizer_from_json` expects the JSON *string*, not a file path, so the file
# has to be read first.
tokenizer_path = '../lstm_models/tokenizer.json'
if os.path.exists(tokenizer_path):
    with open(tokenizer_path, encoding='utf-8') as f:
        tokenizer_payload = f.read()
    # The cell at the end of this notebook writes `pjson.dumps(tokenizer.to_json())`,
    # i.e. a JSON document whose top-level value is itself a JSON string.
    # Unwrap that outer layer when present; a plain (single-encoded) file is
    # passed through unchanged.
    decoded_payload = pjson.loads(tokenizer_payload)
    if isinstance(decoded_payload, str):
        tokenizer_payload = decoded_payload
    tokenizer = tokenizer_from_json(tokenizer_payload)
else:
    # No saved tokenizer yet: `get_sequence_of_tokens` fits a new one on `dataset`.
    tokenizer = None
    print(f"{tokenizer_path} not found; fitting a new tokenizer on the dataset")

word2int_sequences, total_words, tokenizer = get_sequence_of_tokens(dataset, tokenizer)
print(word2int_sequences[:10])

predictors, label, max_sequence_len = generate_padded_sequences(word2int_sequences)
print(predictors.shape, label.shape)


# Pull one batch to sanity-check the generator's output shapes.
batch_size = 256
gen = dataset_generator(predictors, label, batch_size)
a, b = next(gen)
print(a.shape, b.shape)
# Shapes printed by a previous run:
# (6218251, 24) (6218251,)
# (256, 24) (256,)

[[62859, 103749, 1245], [62859, 103749, 1245, 33519], [62859, 103749, 1245, 33519, 171004], [21370, 88416, 9660], [21370, 88416, 9660, 171005], [21370, 88416, 9660, 171005, 13608], [21370, 88416, 9660, 171005, 13608, 532], [21370, 88416, 9660, 171005, 13608, 532, 539], [21370, 88416, 9660, 171005, 13608, 532, 539, 1731], [21370, 88416, 9660, 171005, 13608, 532, 539, 1731, 2222]]
(6218251, 24) (6218251,)
(256, 24) (256,)


In [4]:
import random


class PredictCallback(tf.keras.callbacks.Callback):
    """Print a sample of generated text at the start of every training epoch.

    Starting from ``seed_text``, the callback repeatedly encodes the text so
    far, asks the model being trained for the next-word scores and appends the
    chosen word, ``next_words`` times in total.

    The module-level ``tokenizer`` is used for encoding and for mapping the
    predicted index back to a word.

    Args:
        seed_text: Text the generation starts from.
        next_words: Number of words to append.
        max_sequence_len: Sequence length used for training; the model input is
            padded/truncated to ``max_sequence_len - 1`` tokens.
        randomize: When true, pick uniformly at random among the 5
            highest-scoring indices; otherwise take the single best index.
    """

    def __init__(self, seed_text, next_words, max_sequence_len, randomize=True):
        # Let the Keras base class set up its own attributes first.
        super().__init__()
        self.seed_text = seed_text
        self.next_words = next_words
        self.max_sequence_len = max_sequence_len
        self.randomize = randomize

    def on_epoch_begin(self, epoch, logs=None):
        """Generate ``next_words`` words after the seed text and print the result."""
        seed_text: str = self.seed_text
        for _ in range(self.next_words):
            token_list = tokenizer.texts_to_sequences([seed_text])[0]
            token_list = pad_sequences(
                [token_list], maxlen=self.max_sequence_len - 1, padding="pre"
            )
            predicted = self.model.predict_on_batch(token_list)[0]
            if self.randomize:
                # Indices of the 5 largest scores (in no particular order).
                indc = np.argpartition(predicted, -5)[-5:]
                predicted = random.choice(indc)
            else:
                predicted = predicted.argmax()

            # Direct index -> word lookup instead of scanning the whole
            # vocabulary; an index with no word yields an empty string.
            output_word = tokenizer.index_word.get(int(predicted), "")
            seed_text += " " + output_word
        print(f"Start epoch {epoch} of training; Generated text:", seed_text)


def create_model(max_sequence_len, total_words):
    """Build and compile the stacked-LSTM next-word model.

    Args:
        max_sequence_len: Length of the padded training sequences; the model
            input length is ``max_sequence_len - 1``.
        total_words: Size of the embedding input and of the softmax output.

    Returns:
        A compiled ``Sequential`` model (sparse categorical cross-entropy loss,
        Adam optimizer).
    """
    layers = [
        Embedding(total_words, 50, input_length=max_sequence_len - 1),
        LSTM(128, return_sequences=True),
        Dropout(0.2),
        LSTM(256, return_sequences=True),
        # The last recurrent layer returns only its final output.
        LSTM(512),
        Dense(total_words, activation="softmax"),
    ]
    network = Sequential(layers)
    network.compile(loss="sparse_categorical_crossentropy", optimizer="adam")
    return network

# Show the two sizes the model is built from, then build it and print its layers.
print(total_words, max_sequence_len)
model = create_model(max_sequence_len=max_sequence_len, total_words=total_words)
model.summary()



331307 25
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 24, 50)            16565350  
                                                                 
 lstm (LSTM)                 (None, 24, 128)           91648     
                                                                 
 dropout (Dropout)           (None, 24, 128)           0         
                                                                 
 lstm_1 (LSTM)               (None, 24, 256)           394240    
                                                                 
 lstm_2 (LSTM)               (None, 512)               1574912   
                                                                 
 dense (Dense)               (None, 331307)            169960491 
                                                                 
Total params: 188,586,641
Trainable params: 18

In [1]:
# Both preview callbacks start from the same seed: one samples among the top
# candidates, the other always takes the best-scoring word.
seed_sentence = "dawno temu czerwony kapturek poszedł do lasu"
training_callbacks = [
    PredictCallback(seed_sentence, 25, max_sequence_len, True),
    PredictCallback(seed_sentence, 25, max_sequence_len, False),
    # Keep only the weights of the epoch with the lowest training loss.
    tf.keras.callbacks.ModelCheckpoint(
        '../lstm_models/model_best_2.h5',
        monitor='loss',
        save_best_only=True,
        save_weights_only=True,
    ),
]

# One epoch consumes every full batch the generator can produce in a pass.
model.fit(
    gen,
    steps_per_epoch=len(predictors) // batch_size,
    epochs=100,
    verbose=1,
    callbacks=training_callbacks,
)

NameError: name 'model' is not defined

In [4]:
import json as pjson

# `to_json()` already returns a JSON string. It is stored in a descriptively
# named variable rather than `json`, so the stdlib module name is not shadowed
# in the kernel.
tokenizer_json = tokenizer.to_json()

# The string is JSON-encoded once more on write, so the file holds a single
# JSON string value; read it back with `tokenizer_from_json(pjson.load(f))`.
with open('../lstm_models/tokenizer.json', 'w', encoding='utf-8') as f:
    f.write(pjson.dumps(tokenizer_json, ensure_ascii=False))

