In [3]:
import tensorflow as tf
# Show the devices TensorFlow currently exposes to the runtime (CPU/GPU);
# as the last expression of the cell its value is rendered as the cell output.
tf.config.get_visible_devices()

2023-09-10 15:29:19.924449: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [5]:
import json
import tensorflow as tf

import warnings
from keras.preprocessing.text import tokenizer_from_json
warnings.filterwarnings("ignore")
warnings.simplefilter(action="ignore", category=FutureWarning)
from dataset import *

# Corpus loaded through `load_tokenized_sentences`, which is expected to come
# from the star-import of the local `dataset` module (TODO confirm).
dataset = load_tokenized_sentences('../datasets/words/books-bajki-raw.pickle')

# Read the serialized tokenizer. The encoding is stated explicitly so the
# result does not depend on the platform's default locale encoding.
with open('../datasets/words/books-bajki-raw-tokenizer_100000.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
# The file handle is no longer needed once the JSON payload is in memory.
tokenizer = tokenizer_from_json(data)

# Quick sanity check: corpus size, tokenizer vocabulary cap and a text sample.
# (The original dict listed the "tokenizer" key twice; one entry is enough.)
print({
    "size": len(dataset),
    "tokenizer": tokenizer.num_words,
    "str": dataset[1][:500],
})

{'size': 145, 'tokenizer': 100000, 'str': 'louisa may alcott\n\nmałe kobietki\ntłum. zofia grabowska\n\nisbn ----\n\n\n\n\nrozdział i. pielgrzymki\n\n co mi za boże narodzenie bez podarków!  mruknęła ludka, leżąc na dywanie przed kominkiem.\n\n to straszne być ubogą!  westchnęła małgosia, spoglądając na swą starą suknię.\n\n bardzo jest nieładnie, że niektóre dziewczęta mają mnóstwo pięknych rzeczy, a inne nie mają nic  dodała amelka z gniewną minką.\n\n mamy przecież ojca, mamę i siebie nawzajem  z zadowoleniem odezwała się eliza ze swego kącika.\n\nich cz'}


In [6]:
from keras.layers import (
    Layer,
    MultiHeadAttention,
    Dense,
    LayerNormalization,
    Dropout,
    Embedding,
    Input,
    Add
)
from keras import Model, losses, Sequential, callbacks, activations, optimizers
from keras.preprocessing.text import Tokenizer
import tensorflow as tf
from typing import Literal
import numpy as np

class MaskedSparseCategoricalCrossentropy(losses.Loss):
    """Sparse categorical cross-entropy averaged over non-padding targets only.

    Positions whose target equals ``pad_value`` contribute nothing to the
    loss; the summed loss is divided by the number of remaining positions.

    Args:
        from_logits: Forwarded to ``SparseCategoricalCrossentropy``.
        pad_value: Target id that marks padding and is excluded from the mean.
        **kwargs: Forwarded to ``losses.Loss`` (e.g. ``name``, ``reduction``).
    """

    def __init__(self, from_logits: bool = True, pad_value: int = 0, **kwargs):
        super().__init__(**kwargs)
        self.from_logits = from_logits
        self.pad_value = pad_value
        # Per-element losses are required so the padding mask can be applied
        # before any reduction happens.
        self.loss = losses.SparseCategoricalCrossentropy(
            from_logits=from_logits, reduction="none"
        )

    def call(self, y_true: tf.Tensor, y_pred: tf.Tensor) -> tf.Tensor:
        """Return the scalar mean loss over all non-padding positions."""
        loss = self.loss(y_true, y_pred)
        mask = tf.cast(tf.not_equal(y_true, self.pad_value), dtype=loss.dtype)
        # divide_no_nan yields 0 instead of NaN when a batch contains only
        # padding, so a degenerate batch cannot poison the weights.
        return tf.math.divide_no_nan(tf.reduce_sum(loss * mask), tf.reduce_sum(mask))

    def get_config(self) -> dict:
        """Return the base loss config extended with this loss's own arguments."""
        config = super().get_config()
        config.update(
            {
                "from_logits": self.from_logits,
                "pad_value": self.pad_value,
            }
        )
        return config


class TransformerBlock(Layer):
    """Causal self-attention block followed by a position-wise feed-forward net.

    Both sub-blocks apply dropout, add the residual and then layer-normalize.

    Args:
        embed_dim: Output width of the feed-forward net; also passed to
            ``MultiHeadAttention`` as its ``key_dim`` (second positional arg).
        num_heads: Number of attention heads.
        ff_dim: Units of the hidden (ReLU) feed-forward layer.
        rate: Dropout rate used after attention and after the feed-forward net.
        **kwargs: Forwarded to ``Layer`` (e.g. ``name``, ``dtype``).
    """

    def __init__(
        self, embed_dim: int, num_heads: int, ff_dim: int, rate: float = 0.1, **kwargs
    ):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        # NOTE: the creation order of the sub-layers is kept exactly as it was,
        # since changing it could alter the order in which weights are stored.
        self.dropout1 = Dropout(self.rate)
        self.dropout2 = Dropout(self.rate)
        self.layernorm1 = LayerNormalization(epsilon=1e-6, center=True, scale=True)
        self.layernorm2 = LayerNormalization(epsilon=1e-6, center=True, scale=True)

        self.mha = MultiHeadAttention(self.num_heads, self.embed_dim)
        self.ffn = Sequential(
            [
                Dense(self.ff_dim, activation="relu"),
                Dense(self.embed_dim),
            ]
        )

    def attention_mask(self, batch_size: int, n_dest: int, n_src: int, dtype: tf.DType) -> tf.Tensor:
        """Build a causal mask of shape ``(batch_size, n_dest, n_src)``.

        Entry ``[b, i, j]`` is set when ``i >= j - n_src + n_dest``, i.e. a
        destination position may only look at source positions that are not
        ahead of it.
        """
        i = tf.expand_dims(tf.range(n_dest), axis=-1)
        j = tf.range(n_src)
        mask = tf.cast(i >= j - n_src + n_dest, dtype)
        mask = tf.reshape(mask, [1, n_dest, n_src])
        # Repeat the single mask along the batch axis only.
        mult = tf.concat(
            [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)], 0
        )
        return tf.tile(mask, mult)

    def call(self, inputs: tf.Tensor) -> tf.Tensor:
        """Apply masked self-attention and the feed-forward net to ``inputs``.

        Axis 0 of ``inputs`` is used as the batch size and axis 1 as the
        sequence length (assumes a ``(batch, seq, features)`` tensor).
        """
        batch_size, seq_len = tf.shape(inputs)[0], tf.shape(inputs)[1]
        mask = self.attention_mask(batch_size, seq_len, seq_len, tf.bool)
        attention_output = self.mha(inputs, inputs, attention_mask=mask)
        attention_output = self.dropout1(attention_output)
        out = self.layernorm1(inputs + attention_output)
        ffn_out = self.ffn(out)
        ffn_out = self.dropout2(ffn_out)
        norm = self.layernorm2(out + ffn_out)
        return norm

    def get_config(self) -> dict:
        """Return the base ``Layer`` config plus this block's constructor args.

        Including the base config keeps ``name``/``dtype``/``trainable`` when
        the layer is serialized; ``__init__`` accepts them through ``**kwargs``.
        """
        config = super().get_config()
        config.update(
            {
                "embed_dim": self.embed_dim,
                "ff_dim": self.ff_dim,
                "num_heads": self.num_heads,
                "rate": self.rate,
            }
        )
        return config


class TokenAndPositionEmbedding(Layer):
    """Sum of a token embedding and a learned position embedding.

    Args:
        max_len: Number of rows of the position embedding table; inputs must
            not be longer than this.
        vocab_size: Number of rows of the token embedding table.
        embed_dim: Width of both embeddings.
        **kwargs: Forwarded to ``Layer`` (e.g. ``name``, ``dtype``), which
            makes the layer re-creatable from a config that includes the base
            layer settings.
    """

    def __init__(self, max_len: int, vocab_size: int, embed_dim: int, **kwargs):
        super().__init__(**kwargs)
        self.max_len = max_len
        self.embed_dim = embed_dim
        self.vocab_size = vocab_size

        # Kept for backward compatibility with code that reads this attribute;
        # `call` derives the positions from the actual input length instead.
        self.positions = tf.range(start=0, limit=self.max_len, delta=1)
        self.embedding_token = Embedding(input_dim=self.vocab_size, output_dim=self.embed_dim, mask_zero=True)
        self.embedding_position = Embedding(input_dim=self.max_len, output_dim=self.embed_dim)

    def call(self, x: tf.Tensor) -> tf.Tensor:
        """Embed token ids ``x`` and add the embedding of each position.

        The last axis of ``x`` is treated as the sequence axis. Positions are
        generated for the actual sequence length, so inputs shorter than
        ``max_len`` work as well; for inputs of exactly ``max_len`` tokens the
        result is the same as with the fixed position range.
        """
        seq_len = tf.shape(x)[-1]
        positions = self.embedding_position(tf.range(start=0, limit=seq_len, delta=1))
        tokens = self.embedding_token(x)
        # `positions` has shape (seq_len, embed_dim) and broadcasts over the batch.
        return tokens + positions

    def get_config(self) -> dict:
        """Return the base ``Layer`` config plus this layer's constructor args."""
        config = super().get_config()
        config.update(
            {
                "max_len": self.max_len,
                "embed_dim": self.embed_dim,
                "vocab_size": self.vocab_size,
            }
        )
        return config


class TextGenerator(callbacks.Callback):
    """Callback (also usable stand-alone) that samples text from the model.

    Args:
        seed_text: Prompt that is tokenized and extended.
        next_words: Number of tokens to generate.
        max_sequence_len: Length every model input is padded/truncated to.
        tokenizer: Tokenizer used to encode the seed and decode the samples.
        top_k: Sample only among the ``top_k`` highest logits. A value ``<= 0``
            or larger than the vocabulary means "no truncation".
        print_every: Generate text on epochs where
            ``(epoch + 1) % print_every == 0``.
        model: Optional model for stand-alone use; during ``fit`` Keras
            assigns the model to the callback itself.
        padding: Side on which inputs are padded (``"pre"`` or ``"post"``).
    """

    def __init__(
        self,
        seed_text: str,
        next_words: int,
        max_sequence_len: int,
        tokenizer: Tokenizer,
        top_k=10,
        print_every=1,
        model=None,
        padding: Literal["pre", "post"] = "pre",
    ):
        # Let the base callback set up its own state before ours.
        super().__init__()
        self.seed_text = seed_text
        self.next_words = next_words
        self.max_sequence_len = max_sequence_len
        self.tokenizer = tokenizer
        if model is not None:
            self.model: Model = model
        self.print_every = print_every
        self.k = top_k
        self.padding = padding

    def sample_from(self, logits: np.ndarray) -> np.ndarray:
        """Draw one token id from the softmax over the top-k entries of ``logits``."""
        logits = np.asarray(logits)
        vocab = logits.shape[-1]
        if 0 < self.k < vocab:
            indices = np.argpartition(logits, -self.k)[-self.k:].astype("int32")
        else:
            # k <= 0 previously selected every index via the `[-0:]` slice;
            # keep that meaning explicitly and also accept k >= vocabulary size.
            indices = np.arange(vocab, dtype="int32")
        # Softmax in float64 with an explicit renormalization: float32
        # probabilities may miss the sum-to-one tolerance of np.random.choice.
        top = logits[indices].astype("float64")
        top -= top.max()
        probs = np.exp(top)
        probs /= probs.sum()
        return np.random.choice(indices, p=probs)

    def generate_text(self) -> str:
        """Return the seed text followed by ``next_words`` sampled words.

        ``pad_sequences`` must be available in the notebook namespace
        (presumably provided by the star-import of ``dataset`` - TODO confirm).
        """
        start_tokens = self.tokenizer.texts_to_sequences([self.seed_text])[0]
        tokens_generated = []
        # Strict comparison: exactly `next_words` tokens (the previous `<=`
        # produced one extra token).
        while len(tokens_generated) < self.next_words:
            x = pad_sequences(
                [start_tokens], maxlen=self.max_sequence_len, padding=self.padding
            )

            y = self.model.predict_on_batch(x)[0]

            # With "pre" padding the newest token sits in the last position;
            # with "post" padding it sits right after the real tokens.
            idx = -1
            if self.padding == "post":
                # max(0, ...) guards an empty seed, which would otherwise give -1.
                idx = max(0, min(len(start_tokens), self.max_sequence_len) - 1)

            sample_token = int(self.sample_from(y[idx]))
            tokens_generated.append(sample_token)
            start_tokens.append(sample_token)

        # Ids without a vocabulary entry (e.g. the padding id) become "".
        index_word = self.tokenizer.index_word
        token_to_word = [index_word.get(tok, "") for tok in tokens_generated]
        txt = self.seed_text + " " + " ".join(token_to_word)
        return txt

    def on_epoch_begin(self, epoch: int, logs=None):
        """Print a generated sample at the start of every ``print_every``-th epoch."""
        if (epoch + 1) % self.print_every != 0:
            return
        txt = self.generate_text()
        print(f"Epoch: {epoch}; Generated text:\n{txt}\n")



In [None]:
# Handed to calculate_dataset_size / dataset_generator and, in later cells,
# used as the model's and the text generator's sequence length.
LEN_MAX_LIMIT = 50
# Third argument of dataset_generator - presumably the minimum sample length;
# TODO confirm against dataset.py.
LEN_MIN_LIMIT = 10
# Passed to calculate_dataset_size and dataset_generator - presumably the
# stride between consecutive samples; TODO confirm against dataset.py.
SKIP = 5
# Padding side shared by dataset_generator and TextGenerator.
padding = 'post'

def create_model(
    max_sequence_len: int,
    total_words: int,
    embed_dim: int = 128,
    num_heads: int = 2,
    ff_dim: int = 256,
    learning_rate: float = 1e-4,
) -> Model:
    """Build and compile a single-block transformer language model.

    Args:
        max_sequence_len: Length of the token-id sequences fed to the model.
        total_words: Vocabulary size; sets the embedding rows and the number
            of output logits per position.
        embed_dim: Embedding width (default keeps the previous hard-coded 128).
        num_heads: Attention heads (default keeps the previous 2).
        ff_dim: Hidden units of the feed-forward net (default keeps 256).
        learning_rate: Adam learning rate (default keeps the previous 1e-4).

    Returns:
        A compiled model that outputs unnormalized logits for every position.
    """
    inputs = Input(shape=(max_sequence_len,))
    x = TokenAndPositionEmbedding(max_sequence_len, total_words, embed_dim)(inputs)
    x = TransformerBlock(embed_dim, num_heads, ff_dim)(x)
    # No activation: the loss is configured with from_logits=True.
    outputs = Dense(total_words)(x)
    model = Model(inputs=inputs, outputs=outputs)
    model.compile(
        loss=MaskedSparseCategoricalCrossentropy(from_logits=True),
        optimizer=optimizers.Adam(learning_rate),
    )
    return model



# Batch size handed to dataset_generator.
batch_size = 128
# Value reported by the helper for the current limits; only printed below.
dataset_size = calculate_dataset_size(dataset, tokenizer, LEN_MAX_LIMIT, SKIP)
# Batch generator later consumed by model.fit. The helper is defined outside
# this notebook (presumably in dataset.py), so the meaning of the positional
# arguments should be checked there.
gen = dataset_generator(
    dataset,
    tokenizer,
    LEN_MIN_LIMIT,
    LEN_MAX_LIMIT,
    SKIP,
    batch_size,
    for_transformer=True,
    padding=padding
)
# Pull one batch to inspect it; note that this advances the generator by one
# step before training starts.
a, b = next(gen)
print(a.shape, b.shape, dataset_size)

In [None]:
# Show the first element of each half of the batch drawn above
# (presumably the model input and its target - TODO confirm).
print(a[0])
print(b[0])

In [None]:
from keras.utils import plot_model
# Training configuration for this run.
LEN_MAX_LIMIT = 50
VOCAB_SIZE = 50_000
steps_per_epoch = 1500
epochs = 25

# One extra embedding row / output logit on top of the tokenizer's word cap.
model = create_model(LEN_MAX_LIMIT, tokenizer.num_words + 1)
plot_model(model, show_shapes=True)
model.summary()

model.fit(
    gen,
    verbose=1,
    epochs=epochs,
    steps_per_epoch=steps_per_epoch,
    callbacks=[
        # Prints a sample continuation of the seed text while training.
        TextGenerator(
            seed_text="dawno temu czerwony kapturek poszedł do lasu i gdy szedł obok rzeki",
            next_words=60,
            max_sequence_len=LEN_MAX_LIMIT,
            tokenizer=tokenizer,
            top_k=0,
            padding=padding,
        ),
        # Keeps the full model (not only the weights) with the lowest training loss.
        tf.keras.callbacks.ModelCheckpoint(
            "../transformer_models/model_best_2.h5",
            monitor="loss",
            save_best_only=True,
            save_weights_only=False,
        ),
    ],
)


In [8]:
from keras.models import load_model

# Restore a saved model. Every custom class the file refers to has to be
# listed: the two custom layers and the custom loss the model was compiled
# with (without it the compile state cannot be rebuilt).
# NOTE(review): the recorded run of this cell stopped with a weight shape
# mismatch inside the transformer block; presumably the checkpoint was written
# by a different revision of the layer code - verify before relying on it.
model = load_model(
    "../transformer_models/model_best_1.h5",
    custom_objects={
        "TokenAndPositionEmbedding": TokenAndPositionEmbedding,
        "TransformerBlock": TransformerBlock,
        "MaskedSparseCategoricalCrossentropy": MaskedSparseCategoricalCrossentropy,
    },
)

# Generate one continuation per top-k setting to compare sampling sharpness.
for k in [1, 2, 4, 8]:
    txt = TextGenerator(
        "dawno temu czerwony kapturek poszedł do lasu i gdy szedł obok rzeki",
        60,
        LEN_MAX_LIMIT,
        tokenizer,
        k,
        model=model,
        padding=padding,
    ).generate_text()
    print(txt)

2023-09-10 15:32:58.595003: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1635] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 5722 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3070, pci bus id: 0000:06:00.0, compute capability: 8.6


ValueError: Cannot assign value to variable ' transformer_block/layer_normalization/gamma:0': Shape mismatch.The variable shape (128,), and the assigned value shape (128, 2, 128) are incompatible.