In [1]:
import os

# Set TensorFlow's native (C++) minimum log level before TensorFlow is imported
# in the next cell. NOTE(review): "3" is presumably the most restrictive level;
# confirm against the TensorFlow docs.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

In [2]:
import tensorflow as tf
# Sanity check: display the devices TensorFlow can see in this kernel.
# The value of this expression is rendered as the cell output.
tf.config.get_visible_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [3]:
import json
from keras import Model, Input, layers, losses, optimizers
# NOTE(review): TokenAndPositionEmbedding and TransformerEncoder imported on the
# next line are rebound by the keras_nlp import at the bottom of this cell.
from transformer import TokenAndPositionEmbedding, TransformerEncoder, TransformerBlock, TextGenerator, SaveModel
import re
import warnings
from keras.preprocessing.text import tokenizer_from_json

# Install a blanket "ignore" filter for Python warnings. The FutureWarning
# filter that follows is already covered by the blanket filter.
warnings.filterwarnings("ignore")
warnings.simplefilter(action="ignore", category=FutureWarning)

from typing import List
from glob import glob
from dataset import dataset_generator2
# These keras_nlp classes shadow the same-named imports from `transformer`
# above, so create_model builds keras_nlp layers.
from keras_nlp.layers import TransformerEncoder, TransformerDecoder, TokenAndPositionEmbedding

def create_model(
    max_sequence_len: int,
    total_words: int,
    embedding_dim: int = 256,
    num_heads: int = 4,
    transformer_layers: int = 1
) -> Model:
    """Build and compile a two-input encoder/decoder transformer over token ids.

    The model takes two integer sequences of length ``max_sequence_len``. The
    first model input feeds the decoder stack, the second feeds the encoder
    stack. Each is embedded with its own ``TokenAndPositionEmbedding`` and the
    decoder output is projected to ``total_words`` logits per position.

    Args:
        max_sequence_len: Length of both input sequences.
        total_words: Vocabulary size; sizes the token embeddings and the
            output projection.
        embedding_dim: Embedding width. Also passed as the first positional
            argument (the feed-forward ``intermediate_dim`` in keras_nlp) of
            every encoder/decoder layer.
        num_heads: Number of attention heads per transformer layer.
        transformer_layers: Number of encoder/decoder layer pairs.

    Returns:
        A compiled ``Model`` with inputs ``[decoder_ids, encoder_ids]``, using
        sparse categorical cross-entropy on logits and Adam(1e-3).
    """
    decoder_ids = Input(shape=(max_sequence_len,))
    encoder_ids = Input(shape=(max_sequence_len,))

    # Keyword arguments on purpose: keras_nlp's signature is
    # (vocabulary_size, sequence_length, embedding_dim). Passing
    # (max_sequence_len, total_words) positionally would size the token table
    # by the sequence length and the position table by the vocabulary.
    # The encoder embedding is created first to keep the layer naming order.
    y = TokenAndPositionEmbedding(
        vocabulary_size=total_words,
        sequence_length=max_sequence_len,
        embedding_dim=embedding_dim,
        mask_zero=True,
    )(encoder_ids)
    x = TokenAndPositionEmbedding(
        vocabulary_size=total_words,
        sequence_length=max_sequence_len,
        embedding_dim=embedding_dim,
        mask_zero=True,
    )(decoder_ids)

    # NOTE(review): with transformer_layers > 1 each decoder layer attends to
    # the encoder output of the same depth, not to the final encoder output.
    # Confirm this is intended.
    for _ in range(transformer_layers):
        y = TransformerEncoder(embedding_dim, num_heads, 0.1, layer_norm_epsilon=1e-5)(y)
        x = TransformerDecoder(embedding_dim, num_heads, 0.1, layer_norm_epsilon=1e-5)(x, y)

    # Raw logits (no softmax); the loss below is configured with from_logits=True.
    outputs = layers.Dense(total_words)(x)
    model = Model(inputs=[decoder_ids, encoder_ids], outputs=outputs)
    model.compile(
        loss=losses.SparseCategoricalCrossentropy(from_logits=True),
        optimizer=optimizers.Adam(1e-3),
    )
    return model


def preprocess(content: str) -> str:
    """Normalise raw text: lowercase it and keep only whitelisted characters.

    Steps, in order:
      1. Lowercase the text.
      2. Drop every character outside the whitelist (space, ASCII letters,
         lowercase Polish diacritics, ``- . , ? ! : ; ( )`` and newline).
      3. Collapse each run of newlines into a single newline surrounded by
         spaces.
      4. Collapse each run of spaces into one space.

    Args:
        content: Raw text.

    Returns:
        The cleaned, lowercased text.
    """
    to_left: str = r" A-Za-ząćęłńóśźż\-.,?!:;()\n"
    # Lowercase BEFORE filtering: the whitelist lists only lowercase Polish
    # diacritics, so filtering first would delete uppercase letters such as
    # "Ł" or "Ż" instead of converting them.
    content = re.sub(f"[^{to_left}]+", "", content.lower())
    content = re.sub("\n+", " \n ", content)
    content = re.sub(" +", " ", content)
    return content


def read_files(filenames: List[str]) -> List[str]:
    """Load every file as UTF-8 text and return the cleaned contents.

    Each file's text is lowercased and then passed through ``preprocess``.
    The result list follows the order of ``filenames``.
    """

    def _load(path: str) -> str:
        # Read the whole file, lowercase it, then apply the shared cleaning step.
        with open(path, "r", encoding="utf8") as handle:
            return preprocess(handle.read().lower())

    return [_load(path) for path in filenames]


# --- Configuration -----------------------------------------------------------
VOCAB_SIZE = 100_000   # passed to create_model as total_words
LEN_MAX_LIMIT = 50     # passed to create_model as max_sequence_len
LEN_MIN_LIMIT = 10
PADDING = "post"
batch_size = 128

# --- Tokenizer ---------------------------------------------------------------
# The file holds the tokenizer's JSON config; keep it under its own name so
# that `data` only ever refers to the text corpus.
with open("../datasets/words/books-bajki-raw-tokenizer_100000.json", "r") as f:
    tokenizer_json = json.load(f)
tokenizer = tokenizer_from_json(tokenizer_json)

# --- Corpus ------------------------------------------------------------------
# glob() returns paths in arbitrary order; sort each group so the corpus order
# is the same on every run and machine.
fnames = sorted(glob("../texts/bajki-extend/*")) + sorted(glob("../texts/books-raw/*"))
data = read_files(fnames)
ds = dataset_generator2(
    data,
    tokenizer,
    LEN_MIN_LIMIT,
    LEN_MAX_LIMIT,
    3,  # NOTE(review): meaning is defined by dataset_generator2 — name this constant
    for_transformer=True,
    padding=PADDING,  # single source of truth, shared with the TextGenerator callback
    batch_size=batch_size,
)

print(
    {
        "tokenizer": tokenizer.num_words,
    }
)

Using TensorFlow backend
{'tokenizer': 100000}


In [4]:
steps_per_epoch = 500
epochs = 25

model = create_model(LEN_MAX_LIMIT, VOCAB_SIZE, embedding_dim=128)
model.summary()

# Keep the History object returned by fit() so the loss curve can be inspected
# later, and so its repr is not emitted as the cell output.
history = model.fit(
    ds,
    verbose=1,
    epochs=epochs,
    steps_per_epoch=steps_per_epoch,
    callbacks=[
        # Project callbacks from the `transformer` module.
        # NOTE(review): presumably TextGenerator samples text from the seed
        # prompt during training and SaveModel writes checkpoints to the given
        # path; confirm the meaning of the numeric arguments (60, 20) there.
        TextGenerator("dawno temu czerwony kapturek poszedł do lasu i gdy szedł obok rzeki", 60, LEN_MAX_LIMIT, tokenizer, 20, padding=PADDING),
        SaveModel('../transformer_models/model_best_2.tf'),
    ],
)


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 50)]         0           []                               
                                                                                                  
 input_1 (InputLayer)           [(None, 50)]         0           []                               
                                                                                                  
 token_and_position_embedding (  (None, 50, 128)     12806400    ['input_2[0][0]']                
 TokenAndPositionEmbedding)                                                                       
                                                                                                  
 token_and_position_embedding_1  (None, 50, 128)     12806400    ['input_1[0][0]']            