### Nauka i konwersja modelu językowego do formatu TensorFlowJS

Model powstał na podstawie poradnika: https://keras.io/examples/generative/text_generation_with_miniature_gpt/

In [1]:
import os

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

import tensorflow as tf
import keras
from keras import layers, ops
import numpy as np
import os
import string
import random


def causal_attention_mask(batch_size, n_dest, n_src, dtype):
    i = ops.arange(n_dest)[:, None]
    j = ops.arange(n_src)
    m = i >= j - n_src + n_dest
    mask = ops.cast(m, dtype)
    mask = ops.reshape(mask, [1, n_dest, n_src])
    mult = ops.concatenate(
        [ops.expand_dims(batch_size, -1), ops.convert_to_tensor([1, 1])], 0
    )
    return ops.tile(mask, mult)


class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim: int, num_heads: int, ff_dim: int, rate: float = 0.1):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

    def get_config(self):
        config = super().get_config().copy()
        config.update(
            {
                "embed_dim": self.embed_dim,
                "num_heads": self.num_heads,
                "ff_dim": self.ff_dim,
                "rate": self.rate,
            }
        )
        return config

    def build(self, input_shape):
        self.att = layers.MultiHeadAttention(self.num_heads, self.embed_dim)
        self.ffn = keras.Sequential(
            [
                layers.Dense(self.ff_dim, activation="relu"),
                layers.Dense(self.embed_dim),
            ]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(self.rate)
        self.dropout2 = layers.Dropout(self.rate)

    def call(self, inputs):
        input_shape = ops.shape(inputs)
        batch_size = input_shape[0]
        seq_len = input_shape[1]
        causal_mask = causal_attention_mask(batch_size, seq_len, seq_len, "bool")
        attention_output = self.att(inputs, inputs, attention_mask=causal_mask)
        attention_output = self.dropout1(attention_output)
        out1 = self.layernorm1(inputs + attention_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output)
        return self.layernorm2(out1 + ffn_output)


class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen: int, vocab_size: int, embed_dim: int):
        super().__init__()
        self.maxlen = maxlen
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

    def get_config(self):
        config = super().get_config().copy()
        config.update(
            {
                "maxlen": self.maxlen,
                "vocab_size": self.vocab_size,
                "embed_dim": self.embed_dim,
            }
        )
        return config

    def build(self, input_shape):
        self.token_emb = layers.Embedding(
            input_dim=self.vocab_size, output_dim=self.embed_dim
        )
        self.pos_emb = layers.Embedding(
            input_dim=self.maxlen, output_dim=self.embed_dim
        )

    def call(self, x):
        maxlen = ops.shape(x)[-1]
        positions = ops.arange(0, maxlen, 1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions


vocab_size = 20_000  # Only consider the top 21k words
maxlen = 80  # Max sequence size
embed_dim = 512  # Embedding size for each token
num_heads = 2  # Number of attention heads
feed_forward_dim = 512  # Hidden layer size in feed forward network inside transformer


def create_model():
    inputs = layers.Input(shape=(maxlen,), dtype="int32")
    x = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)(inputs)
    x = TransformerBlock(embed_dim, num_heads, feed_forward_dim)(x)
    outputs = layers.Dense(vocab_size)(x)
    model = keras.Model(inputs=inputs, outputs=[outputs, x])
    loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    model.compile(
        "adam",
        loss=[loss_fn, None],
    )
    return model


E0000 00:00:1733470584.640212  110375 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1733470584.657070  110375 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


### Przygotowanie danych

In [2]:
![ -f aclImdb_v1.tar.gz ] || curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
![ -d aclImdb ] || tar -xf aclImdb_v1.tar.gz


In [3]:
batch_size = 128

filenames = []
directories = [
    "aclImdb/train/pos",
    "aclImdb/train/neg",
    "aclImdb/test/pos",
    "aclImdb/test/neg",
]
for dir in directories:
    for f in os.listdir(dir):
        filenames.append(os.path.join(dir, f))

print(f"{len(filenames)} files")

random.shuffle(filenames)
text_ds = tf.data.TextLineDataset(filenames)
text_ds = text_ds.shuffle(buffer_size=256)
text_ds = text_ds.batch(batch_size)


def custom_standardization(input_string):
    lowercased = tf.strings.lower(input_string)
    stripped_html = tf.strings.regex_replace(lowercased, "<br />", " ")
    return tf.strings.regex_replace(stripped_html, f"([{string.punctuation}])", r" \1")


vectorize_layer = layers.TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size - 1,
    output_mode="int",
    output_sequence_length=maxlen + 1,
)
vectorize_layer.adapt(text_ds)
vocab = vectorize_layer.get_vocabulary()


def prepare_lm_inputs_labels(text):
    text = tf.expand_dims(text, -1)
    tokenized_sentences = vectorize_layer(text)
    x = tokenized_sentences[:, :-1]
    y = tokenized_sentences[:, 1:]
    return x, y


text_ds = text_ds.map(prepare_lm_inputs_labels, num_parallel_calls=tf.data.AUTOTUNE)
text_ds = text_ds.prefetch(tf.data.AUTOTUNE)


50000 files


I0000 00:00:1733470588.233421  110375 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 21769 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3090, pci bus id: 0000:06:00.0, compute capability: 8.6


### Nauka modelu

In [4]:
class TextGenerator(keras.callbacks.Callback):
    """A callback to generate text from a trained model.
    1. Feed some starting prompt to the model
    2. Predict probabilities for the next token
    3. Sample the next token and add it to the next input

    Arguments:
        max_tokens: Integer, the number of tokens to be generated after prompt.
        start_tokens: List of integers, the token indices for the starting prompt.
        index_to_word: List of strings, obtained from the TextVectorization layer.
        top_k: Integer, sample from the `top_k` token predictions.
        print_every: Integer, print after this many epochs.
    """

    def __init__(
        self, max_tokens, start_tokens, index_to_word, top_k=10, print_every=1
    ):
        self.max_tokens = max_tokens
        self.start_tokens = start_tokens
        self.index_to_word = index_to_word
        self.print_every = print_every
        self.k = top_k

    def sample_from(self, logits):
        logits, indices = ops.top_k(logits, k=self.k, sorted=True)
        indices = np.asarray(indices).astype("int32")
        preds = keras.activations.softmax(ops.expand_dims(logits, 0))[0]
        preds = np.asarray(preds).astype("float32")
        return np.random.choice(indices, p=preds)

    def detokenize(self, number):
        return self.index_to_word[number]

    def on_epoch_end(self, epoch, logs=None):
        start_tokens = [_ for _ in self.start_tokens]
        if (epoch + 1) % self.print_every != 0:
            return
        num_tokens_generated = 0
        tokens_generated = []
        while num_tokens_generated <= self.max_tokens:
            pad_len = maxlen - len(start_tokens)
            sample_index = len(start_tokens) - 1
            if pad_len < 0:
                x = start_tokens[:maxlen]
                sample_index = maxlen - 1
            elif pad_len > 0:
                x = start_tokens + [0] * pad_len
            else:
                x = start_tokens
            x = np.array([x])
            y, _ = self.model.predict(x, verbose=0)
            sample_token = self.sample_from(y[0][sample_index])
            tokens_generated.append(sample_token)
            start_tokens.append(sample_token)
            num_tokens_generated = len(tokens_generated)
        txt = " ".join(
            [self.detokenize(_) for _ in self.start_tokens + tokens_generated]
        )
        print(f"generated text:\n{txt}\n")


# Tokenize starting prompt
word_to_index = {}
for index, word in enumerate(vocab):
    word_to_index[word] = index

start_prompt = "this movie is"
start_tokens = [word_to_index.get(_, 1) for _ in start_prompt.split()]
num_tokens_generated = 40
text_gen_callback = TextGenerator(num_tokens_generated, start_tokens, vocab)

model = create_model()
model.summary()
model.fit(text_ds, verbose=1, epochs=25, callbacks=[text_gen_callback])


Epoch 1/25


I0000 00:00:1733470604.488302  110491 service.cc:148] XLA service 0x7f5d18014e10 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1733470604.488534  110491 service.cc:156]   StreamExecutor device (0): NVIDIA GeForce RTX 3090, Compute Capability 8.6
W0000 00:00:1733470604.641034  110491 assert_op.cc:38] Ignoring Assert operator compile_loss/sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert
I0000 00:00:1733470604.764242  110491 cuda_dnn.cc:529] Loaded cuDNN version 90300
I0000 00:00:1733470617.157228  110491 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


    390/Unknown [1m40s[0m 66ms/step - loss: 5.9383

W0000 00:00:1733470643.122127  110491 assert_op.cc:38] Ignoring Assert operator compile_loss/sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert


    391/Unknown [1m51s[0m 93ms/step - loss: 5.9367



generated text:
this movie is just about the characters and the main character in her role ) she 's so he was not the best actor who had an old fashioned [UNK] . she was [UNK] and she did . [UNK] a very good actor who

[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 107ms/step - loss: 5.9351
Epoch 2/25
[1m390/391[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 67ms/step - loss: 4.6499generated text:
this movie is just terrible . it is not a terrible movie . it 's not the movie that is just terrible . i don 't know what i have ever seen . but i don 't think it 's good . . the

[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 71ms/step - loss: 4.6494
Epoch 3/25
[1m390/391[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 67ms/step - loss: 4.3414generated text:
this movie is one of my favorites ! i was very disappointed with his acting . i had just finished watching this flick and found that this movie was a little slow and the acting was a

<keras.src.callbacks.history.History at 0x7f5eb87e3e50>

### Konwersja modelu

In [5]:
import tensorflowjs as tfjs
import tempfile
import os
import json

# Ścieżka do zapisu modelu w formacie TFJS
tfjs_save_path = "language_model.tfjs"
# Maksymalny rozmiar częsci wag modelu w bajtach
weight_shard_size_bytes = 1024 * 1024 * 15

# Zapis modelu do formatu TFJS
# Tworzenie tymczasowego katalogu dla modelu TF
with tempfile.TemporaryDirectory(suffix=".tf") as tf_path:
    # Zapis modelu w formacie TF
    tf.saved_model.save(model, tf_path)

    # Konwersja modelu do formatu TFJS
    tfjs.converters.convert_tf_saved_model(
        saved_model_dir=tf_path,
        output_dir=tfjs_save_path,
        weight_shard_size_bytes=weight_shard_size_bytes,
    )

    # Zapisanie słownika do JSON
    # Będzie niezbędny do konwersji indeksów słów na słowa w aplikacji webowej
    vocab = vectorize_layer.get_vocabulary()
    vocab_dict = {i: str(word) for i, word in enumerate(vocab)}
    with open(os.path.join(tfjs_save_path, "vocab.json"), "w") as f:
        json.dump(vocab_dict, f)


INFO:tensorflow:Assets written to: /tmp/tmpvorfbjmc.tf/assets


INFO:tensorflow:Assets written to: /tmp/tmpvorfbjmc.tf/assets
I0000 00:00:1733471349.388812  110375 devices.cc:67] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 1
I0000 00:00:1733471349.388954  110375 single_machine.cc:361] Starting new session
I0000 00:00:1733471349.389565  110375 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 21769 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3090, pci bus id: 0000:06:00.0, compute capability: 8.6
