In [1]:
import tensorflow as tf
# Sanity check before training: list the physical devices TensorFlow can see
# (the captured output below shows one CPU and one GPU).
tf.config.get_visible_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [1]:
import json
import tensorflow as tf

import warnings
from keras.preprocessing.text import tokenizer_from_json
warnings.filterwarnings("ignore")
warnings.simplefilter(action="ignore", category=FutureWarning)
from dataset import *

# Load the corpus through the project helper imported by `from dataset import *`.
# NOTE(review): the file is a pickle - only load it from a trusted source.
dataset = load_tokenized_sentences('../datasets/words/books-bajki-raw.pickle')

# Restore the fitted Keras tokenizer from its JSON dump. The file only needs to
# stay open while it is being parsed, so the tokenizer is built afterwards.
with open('../datasets/words/books-bajki-raw-tokenizer_100000.json', 'r') as f:
    data = json.load(f)
tokenizer = tokenizer_from_json(data)

# Quick sanity check: number of entries in the corpus, the tokenizer's word
# limit and the first 500 characters of the second entry.
print({
    "size": len(dataset),
    "tokenizer": tokenizer.num_words,
    "str": dataset[1][:500],
})

{'size': 41, 'tokenizer': 100000, 'str': 'była raz mała słodka dzieweczka, którą kochał każdy, kto ją tylko ujrzał, a najwięcej kochała ją babcia  nie wiedziała wprost, co jej dać. pewnego razu podarowała jej kapturek z czerwonego aksamitu, a dziewczynce tak się ten kapturek podobał, że nie chciała nosić żadnego innego, toteż nazwano ją czerwonym kapturkiem.\npewnego razu rzekła matka do czerwonego kapturka:\n oto masz, dziecko, w koszyku placek i flaszkę wina, zanieś to babci, która jest chora i słaba, i ucieszy się bardzo tym podarunkie'}


In [2]:
from keras.layers import (
    Layer,
    MultiHeadAttention,
    Dense,
    LayerNormalization,
    Dropout,
    Embedding,
    Input
)
from keras import Model, losses, Sequential, callbacks, activations
from keras.preprocessing.text import Tokenizer

class TransformerBlock(Layer):
    """Single causal (decoder-style) Transformer block.

    Runs masked multi-head self-attention followed by a two-layer feed-forward
    network. Each sub-layer is followed by dropout, a residual connection and
    layer normalization.

    Args:
        embed_dim: Width of the last input axis. It is also used as the
            per-head key dimension of the attention layer and as the output
            width of the feed-forward network, so the residual sums line up.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward network.
        rate: Dropout rate used after both sub-layers.
        **kwargs: Standard Keras ``Layer`` arguments (``name``, ``dtype``,
            ``trainable``, ...). Accepting them lets the layer be rebuilt from
            the dictionary returned by ``get_config``.
    """

    def __init__(self, embed_dim: int, num_heads: int, ff_dim: int, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        self.att = MultiHeadAttention(self.num_heads, self.embed_dim)
        self.ffn = Sequential(
            [
                Dense(self.ff_dim, activation="relu"),
                Dense(self.embed_dim),
            ]
        )
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(self.rate)
        self.dropout2 = Dropout(self.rate)

    def causal_attention_mask(self, batch_size: int, n_dest: int, n_src: int, dtype: tf.DType):
        """Build a ``[batch_size, n_dest, n_src]`` causal attention mask.

        Entry ``(i, j)`` is true when ``i >= j - n_src + n_dest``. For
        ``n_dest == n_src`` this is the lower triangle, so a position may only
        attend to itself and to earlier positions.
        """
        i = tf.range(n_dest)[:, None]
        j = tf.range(n_src)
        m = i >= j - n_src + n_dest
        mask = tf.cast(m, dtype)
        mask = tf.reshape(mask, [1, n_dest, n_src])
        # Repeat the single mask along the (dynamic) batch axis.
        mult = tf.concat(
            [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)], 0
        )
        return tf.tile(mask, mult)

    def call(self, inputs: tf.Tensor):
        """Apply causal self-attention and the feed-forward network to ``inputs``.

        The batch size and sequence length are read from the first two axes of
        ``inputs``; the output has the same shape as the input.
        """
        input_shape = tf.shape(inputs)
        batch_size = input_shape[0]
        seq_len = input_shape[1]
        causal_mask = self.causal_attention_mask(batch_size, seq_len, seq_len, tf.bool)
        attention_output = self.att(inputs, inputs, attention_mask=causal_mask)
        attention_output = self.dropout1(attention_output)
        out1 = self.layernorm1(inputs + attention_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the layer configuration used when the model is serialized.

        Starts from the base ``Layer`` config so that ``name``, ``dtype`` and
        ``trainable`` are preserved, then adds the constructor arguments.
        """
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "ff_dim": self.ff_dim,
            "num_heads": self.num_heads,
            "rate": self.rate,
        })
        return config
    
class TokenAndPositionEmbedding(Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        max_len: Number of rows in the position embedding table. Sequences fed
            to the layer should not be longer than this, because positions are
            looked up by index.
        vocab_size: Number of rows in the token embedding table.
        embed_dim: Width of both embeddings.
        **kwargs: Standard Keras ``Layer`` arguments (``name``, ``dtype``,
            ``trainable``, ...). Accepting them lets the layer be rebuilt from
            the dictionary returned by ``get_config``.
    """

    def __init__(self, max_len: int, vocab_size: int, embed_dim: int, **kwargs):
        super().__init__(**kwargs)
        self.max_len = max_len
        self.embed_dim = embed_dim
        self.vocab_size = vocab_size

        self.token_emb = Embedding(input_dim=self.vocab_size, output_dim=self.embed_dim)
        self.pos_emb = Embedding(input_dim=self.max_len, output_dim=self.embed_dim)

    def call(self, x: tf.Tensor):
        """Embed token ids ``x`` and add the embedding of each position.

        Positions ``0 .. L-1`` are generated from the size ``L`` of the last
        axis of ``x``; their embeddings are broadcast over the leading axes
        when added to the token embeddings.
        """
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

    def get_config(self):
        """Return the layer configuration used when the model is serialized.

        Starts from the base ``Layer`` config so that ``name``, ``dtype`` and
        ``trainable`` are preserved, then adds the constructor arguments.
        """
        config = super().get_config()
        config.update({
            "max_len": self.max_len,
            "embed_dim": self.embed_dim,
            "vocab_size": self.vocab_size,
        })
        return config

def create_model(
    max_sequence_len: int,
    total_words: int,
    embed_dim: int = 128,
    num_heads: int = 2,
    ff_dim: int = 256,
):
    """Build and compile a one-block causal Transformer language model.

    The network is: token + position embedding -> one ``TransformerBlock`` ->
    ``Dense(total_words)``. It outputs unnormalized logits for every position
    of the input sequence and is compiled with sparse categorical
    cross-entropy (``from_logits=True``) and the Adam optimizer.

    Args:
        max_sequence_len: Fixed length of the input token sequences.
        total_words: Vocabulary size; used for the token embedding table and
            for the width of the output layer.
        embed_dim: Embedding width (default matches the previous hard-coded 128).
        num_heads: Number of attention heads (default 2).
        ff_dim: Hidden width of the feed-forward network (default 256).

    Returns:
        The compiled Keras ``Model``.
    """
    inputs = Input(shape=(max_sequence_len,))
    x = TokenAndPositionEmbedding(max_sequence_len, total_words, embed_dim)(inputs)
    x = TransformerBlock(embed_dim, num_heads, ff_dim)(x)
    # No softmax here: the loss below is configured with from_logits=True.
    outputs = Dense(total_words)(x)
    model = Model(inputs=inputs, outputs=outputs)
    model.compile(loss=losses.SparseCategoricalCrossentropy(from_logits=True), optimizer="adam")
    return model

class TextGenerator(callbacks.Callback):
    """Keras callback that samples text from the model with top-k sampling.

    It can be passed to ``model.fit`` (Keras then attaches the model), or used
    stand-alone by supplying ``model`` and calling ``generate_text`` directly.

    Args:
        seed_text: Prompt the generated text starts from.
        next_words: Number of tokens to generate after the prompt.
        max_sequence_len: Length the token window is padded/truncated to
            before it is fed to the model.
        tokenizer: Fitted Keras ``Tokenizer`` used to encode the prompt and to
            decode sampled token ids.
        temperature: Divisor applied to the top-k logits before the softmax.
            Values ``<= 0`` (including the default ``0.0``) disable scaling,
            so the logits are used unchanged.
        top_k: Number of highest-scoring candidates to sample from.
        print_every: Generate text every ``print_every`` epochs.
        model: Optional model to attach immediately (for stand-alone use).
        padding: Side on which ``pad_sequences`` pads the token window.
    """

    def __init__(
        self,
        seed_text: str,
        next_words: int,
        max_sequence_len: int,
        tokenizer: Tokenizer,
        temperature=0.0,
        top_k=10, 
        print_every=1,
        model=None,
        padding: Literal['pre', 'post'] = 'pre'
    ):
        # Let the Keras base class initialise its own bookkeeping first.
        super().__init__()
        self.seed_text = seed_text
        self.next_words = next_words
        self.max_sequence_len = max_sequence_len
        self.temperature = temperature
        self.tokenizer = tokenizer
        if model is not None:
            self.model: Model = model
        self.print_every = print_every
        self.k = top_k
        self.padding = padding

    def sample_from(self, logits: tf.Tensor):
        """Draw one token id from the ``top_k`` most likely candidates.

        Args:
            logits: 1-D vector of unnormalized scores over the vocabulary.

        Returns:
            The sampled token id (a NumPy ``int32`` scalar).
        """
        logits, indices = tf.math.top_k(logits, k=self.k, sorted=True)
        indices = np.asarray(indices).astype("int32")
        # A non-positive temperature means "no scaling"; this also avoids a
        # division by zero with the default value.
        if self.temperature > 0:
            logits = logits / self.temperature
        preds = activations.softmax(tf.expand_dims(logits, 0))[0]
        preds = np.asarray(preds).astype("float32")
        return np.random.choice(indices, p=preds)

    def generate_text(self):
        """Generate ``next_words`` tokens after the seed and return the text.

        Returns:
            The seed text followed by the generated words, joined by spaces.
            Token ids missing from ``tokenizer.index_word`` decode to an empty
            string.
        """
        start_tokens = self.tokenizer.texts_to_sequences([self.seed_text])[0]
        tokens_generated = []
        while len(tokens_generated) < self.next_words:
            x = pad_sequences(
                [start_tokens], maxlen=self.max_sequence_len, padding=self.padding
            )
            y = self.model.predict_on_batch(x)[0]
            # The next-token prediction lives at the position of the last real
            # token. With 'pre' padding that is always the final position; with
            # 'post' padding it is len(tokens) - 1 until the window is full
            # (pad_sequences then keeps the most recent tokens).
            if self.padding == 'post':
                sample_index = max(min(len(start_tokens), self.max_sequence_len) - 1, 0)
            else:
                sample_index = -1
            sample_token = self.sample_from(y[sample_index])
            tokens_generated.append(sample_token)
            start_tokens.append(sample_token)

        # Map ids back to words; unknown ids become empty strings.
        token_to_word = [
            self.tokenizer.index_word.get(int(tok), "") for tok in tokens_generated
        ]
        txt = self.seed_text + " " + " ".join(token_to_word)
        return txt
    
    def on_epoch_begin(self, epoch: int, logs=None):
        """Print a text sample at the start of every ``print_every``-th epoch."""
        if (epoch + 1) % self.print_every != 0:
            return
        txt = self.generate_text()
        print(f"Epoch: {epoch}; Generated text:\n{txt}\n")


In [3]:
# Windowing parameters handed to the dataset helpers below.
# NOTE(review): the helpers come from `from dataset import *`, so their exact
# semantics are not visible here - presumably LEN_MIN_LIMIT / LEN_MAX_LIMIT
# bound the sample length in tokens and SKIP is the window stride; confirm
# against dataset.py.
LEN_MAX_LIMIT = 50
LEN_MIN_LIMIT = 5
SKIP = 5
# Padding side; also passed to the TextGenerator callback in the training cell.
padding = 'post'

batch_size = 128
# Total sample count; the training cell divides it by batch_size to obtain
# steps_per_epoch.
dataset_size = calculate_dataset_size(dataset, tokenizer, LEN_MAX_LIMIT, SKIP)
gen = dataset_generator(
    dataset,
    tokenizer,
    LEN_MIN_LIMIT,
    LEN_MAX_LIMIT,
    SKIP,
    batch_size,
    for_transformer=True,
    padding=padding
)
# Pull one batch to check the shapes (the captured output shows (128, 50) for
# both arrays). This consumes the first batch of the generator.
a, b = next(gen)
print(a.shape, b.shape, dataset_size)

(128, 50) (128, 50) 1355395


In [4]:
from keras.utils import plot_model
LEN_MAX_LIMIT = 50
VOCAB_SIZE = 100_000

# Build the network. The vocabulary is one larger than the tokenizer's word
# limit (presumably to leave room for the padding id 0 - confirm).
model = create_model(LEN_MAX_LIMIT, tokenizer.num_words + 1)
plot_model(model, show_shapes=True)
model.summary()

# One epoch is a full pass over `dataset_size` samples.
steps_per_epoch = dataset_size // batch_size

# Prints a text sample from a fixed prompt while training.
text_preview = TextGenerator(
    "dawno temu czerwony kapturek poszedł do lasu",
    60,
    LEN_MAX_LIMIT,
    tokenizer,
    0,
    padding=padding,
)

# Keeps the full model (not just the weights) with the lowest training loss.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    '../transformer_models/model_best_2.h5',
    monitor='loss',
    save_best_only=True,
    save_weights_only=False,
)

model.fit(
    gen,
    verbose=1,
    epochs=4,
    steps_per_epoch=steps_per_epoch,
    callbacks=[text_preview, checkpoint],
)


Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 50)]              0         
                                                                 
 token_and_position_embeddin  (None, 50, 128)          12806528  
 g (TokenAndPositionEmbeddin                                     
 g)                                                              
                                                                 
 transformer_block (Transfor  (None, 50, 128)          198400    
 merBlock)                                                       
                                                                 
 dense_2 (Dense)             (None, 50, 100001)        12900129  
                                                                 
Total params: 25,905,057
Trainable params: 25,905,057
Non-trainable params: 0
_________________________________________________

<keras.callbacks.History at 0x1dd9c8351e0>

In [6]:
# Generate one 250-token sample per top-k setting and store each in its own file.
seed = "dawno temu czerwony kapturek poszedł do lasu"
for top_k in [1, 2, 4, 8, 10, 20]:
    txt = TextGenerator(seed, 250, LEN_MAX_LIMIT, tokenizer, 0, top_k=top_k, model=model).generate_text()
    # Write UTF-8 explicitly: the text contains Polish characters and the
    # platform default encoding may not be able to represent all of them.
    with open(f'../generated_texts/transformer/text_{top_k}', 'w', encoding='utf-8') as f:
        f.write(f'Seed: {seed}\n')
        f.write(txt)