## Enabling and testing the TPU / GPU

In [None]:
import datetime
import functools
import math
import os

import tensorflow as tf
import tensorflow_datasets as tfds

%load_ext tensorboard

# log the TensorFlow version used for this run
print("Tensorflow version " + tf.__version__)

Tensorflow version 2.15.0


In [None]:
# disable the logging of the device placement
tf.debugging.set_log_device_placement(False)

# accelerators visible to this runtime (empty lists when absent)
GPU = tf.config.list_logical_devices('GPU')
TPU = tf.config.list_logical_devices('TPU')

if TPU:
    # connect to the TPU cluster and initialize it before building the strategy
    RESOLVER = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(RESOLVER)
    tf.tpu.experimental.initialize_tpu_system(RESOLVER)
    DISTRIBUTION_STRATEGY = tf.distribute.TPUStrategy(RESOLVER)
elif GPU:
    # replicate the model on the visible GPUs
    DISTRIBUTION_STRATEGY = tf.distribute.MirroredStrategy(GPU)
else:
    # no accelerator: fall back to the default strategy so that the
    # `DISTRIBUTION_STRATEGY.scope()` blocks below still work on CPU
    DISTRIBUTION_STRATEGY = tf.distribute.get_strategy()

print(DISTRIBUTION_STRATEGY)

[LogicalDevice(name='/device:GPU:0', device_type='GPU')]


## Defining The Metadata

In [None]:
# META ########################################################################

# optional layers inside each tokenization / detokenization block
ATTENTION = True
NORMALIZATION = True

# model dimensions, the letters are the shorthands used in the shape comments
N_DEPTH = 3 # D, number of tokenization blocks in the encoder (and detokenization blocks in the decoder)
N_TOKEN_DIM = 4 # G, number of units merged by each tokenization block
N_ENCODING_DIM = 256 # U, depth of the one-hot encoding of each byte
N_EMBEDDING_DIM = N_ENCODING_DIM # E
N_LATENT_DIM = N_EMBEDDING_DIM # L

# training duration and shape of the learning rate schedule (see `learning_rate_hokusai`)
N_EPOCHS = 8
N_EPOCHS_RAMPUP = 0
N_EPOCHS_SUSTAIN = 0

N_BATCH = 128 # number of samples per batch
N_SAMPLE = 128 # number of characters per sample (=> 4 * N_SAMPLE bytes per sample, because of the UTF-32 encoding)

# learning rate: lower bound, upper bound and decay factor per epoch
R_MIN = 0.00001
R_MAX = 0.0001
R_EXP = .8

## Loading The Data

In [None]:
# DATA ########################################################################

# languages of the MLQA dataset used for training and testing
LANG = ['ar', 'de', 'en', 'es', 'hi', 'vi', 'zh']

# loading options shared by all the languages and splits
__load = functools.partial(tfds.load, as_supervised=False, shuffle_files=True, data_dir='~/.cache/tensorflow/', batch_size=N_BATCH)

# the larger "test" split is used for training and the "validation" split for testing
TRAIN = {__l: __load('mlqa/' + __l, split='test') for __l in LANG}
TEST = {__l: __load('mlqa/' + __l, split='validation') for __l in LANG}

Downloading and preparing dataset 72.21 MiB (download: 72.21 MiB, generated: 9.27 MiB, total: 81.49 MiB) to /root/.cache/tensorflow/mlqa/ar/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/2 [00:00<?, ? splits/s]

Generating test examples...:   0%|          | 0/5335 [00:00<?, ? examples/s]

Shuffling /root/.cache/tensorflow/mlqa/ar/1.0.0.incompleteY0EMVQ/mlqa-test.tfrecord*...:   0%|          | 0/53…

Generating validation examples...:   0%|          | 0/517 [00:00<?, ? examples/s]

Shuffling /root/.cache/tensorflow/mlqa/ar/1.0.0.incompleteY0EMVQ/mlqa-validation.tfrecord*...:   0%|          …

Dataset mlqa downloaded and prepared to /root/.cache/tensorflow/mlqa/ar/1.0.0. Subsequent calls will reuse this data.
Downloading and preparing dataset 72.21 MiB (download: 72.21 MiB, generated: 5.06 MiB, total: 77.28 MiB) to /root/.cache/tensorflow/mlqa/de/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/2 [00:00<?, ? splits/s]

Generating test examples...:   0%|          | 0/4517 [00:00<?, ? examples/s]

Shuffling /root/.cache/tensorflow/mlqa/de/1.0.0.incomplete1W6BFQ/mlqa-test.tfrecord*...:   0%|          | 0/45…

Generating validation examples...:   0%|          | 0/512 [00:00<?, ? examples/s]

Shuffling /root/.cache/tensorflow/mlqa/de/1.0.0.incomplete1W6BFQ/mlqa-validation.tfrecord*...:   0%|          …

Dataset mlqa downloaded and prepared to /root/.cache/tensorflow/mlqa/de/1.0.0. Subsequent calls will reuse this data.
Downloading and preparing dataset 72.21 MiB (download: 72.21 MiB, generated: 15.72 MiB, total: 87.94 MiB) to /root/.cache/tensorflow/mlqa/en/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/2 [00:00<?, ? splits/s]

Generating test examples...:   0%|          | 0/11590 [00:00<?, ? examples/s]

Shuffling /root/.cache/tensorflow/mlqa/en/1.0.0.incompleteWEKT27/mlqa-test.tfrecord*...:   0%|          | 0/11…

Generating validation examples...:   0%|          | 0/1148 [00:00<?, ? examples/s]

Shuffling /root/.cache/tensorflow/mlqa/en/1.0.0.incompleteWEKT27/mlqa-validation.tfrecord*...:   0%|          …

Dataset mlqa downloaded and prepared to /root/.cache/tensorflow/mlqa/en/1.0.0. Subsequent calls will reuse this data.
Downloading and preparing dataset 72.21 MiB (download: 72.21 MiB, generated: 5.09 MiB, total: 77.30 MiB) to /root/.cache/tensorflow/mlqa/es/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/2 [00:00<?, ? splits/s]

Generating test examples...:   0%|          | 0/5253 [00:00<?, ? examples/s]

Shuffling /root/.cache/tensorflow/mlqa/es/1.0.0.incompleteHCNC41/mlqa-test.tfrecord*...:   0%|          | 0/52…

Generating validation examples...:   0%|          | 0/500 [00:00<?, ? examples/s]

Shuffling /root/.cache/tensorflow/mlqa/es/1.0.0.incompleteHCNC41/mlqa-validation.tfrecord*...:   0%|          …

Dataset mlqa downloaded and prepared to /root/.cache/tensorflow/mlqa/es/1.0.0. Subsequent calls will reuse this data.
Downloading and preparing dataset 72.21 MiB (download: 72.21 MiB, generated: 12.83 MiB, total: 85.04 MiB) to /root/.cache/tensorflow/mlqa/hi/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/2 [00:00<?, ? splits/s]

Generating test examples...:   0%|          | 0/4918 [00:00<?, ? examples/s]

Shuffling /root/.cache/tensorflow/mlqa/hi/1.0.0.incomplete0R8HTZ/mlqa-test.tfrecord*...:   0%|          | 0/49…

Generating validation examples...:   0%|          | 0/507 [00:00<?, ? examples/s]

Shuffling /root/.cache/tensorflow/mlqa/hi/1.0.0.incomplete0R8HTZ/mlqa-validation.tfrecord*...:   0%|          …

Dataset mlqa downloaded and prepared to /root/.cache/tensorflow/mlqa/hi/1.0.0. Subsequent calls will reuse this data.
Downloading and preparing dataset 72.21 MiB (download: 72.21 MiB, generated: 8.77 MiB, total: 80.98 MiB) to /root/.cache/tensorflow/mlqa/vi/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/2 [00:00<?, ? splits/s]

Generating test examples...:   0%|          | 0/5495 [00:00<?, ? examples/s]

Shuffling /root/.cache/tensorflow/mlqa/vi/1.0.0.incompleteC443IM/mlqa-test.tfrecord*...:   0%|          | 0/54…

Generating validation examples...:   0%|          | 0/511 [00:00<?, ? examples/s]

Shuffling /root/.cache/tensorflow/mlqa/vi/1.0.0.incompleteC443IM/mlqa-validation.tfrecord*...:   0%|          …

Dataset mlqa downloaded and prepared to /root/.cache/tensorflow/mlqa/vi/1.0.0. Subsequent calls will reuse this data.
Downloading and preparing dataset 72.21 MiB (download: 72.21 MiB, generated: 5.13 MiB, total: 77.34 MiB) to /root/.cache/tensorflow/mlqa/zh/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/2 [00:00<?, ? splits/s]

Generating test examples...:   0%|          | 0/5137 [00:00<?, ? examples/s]

Shuffling /root/.cache/tensorflow/mlqa/zh/1.0.0.incomplete1Q4NJQ/mlqa-test.tfrecord*...:   0%|          | 0/51…

Generating validation examples...:   0%|          | 0/504 [00:00<?, ? examples/s]

Shuffling /root/.cache/tensorflow/mlqa/zh/1.0.0.incomplete1Q4NJQ/mlqa-validation.tfrecord*...:   0%|          …

Dataset mlqa downloaded and prepared to /root/.cache/tensorflow/mlqa/zh/1.0.0. Subsequent calls will reuse this data.


## Layers

In [None]:
# EMBEDDING ###################################################################

class PositionalEmbedding(tf.keras.layers.Layer):
    """Add a trainable bias that depends on the position in the sequence.

    The kernel has the size of the inputs on the sequence and embedding axes,
    and size 1 on every other axis, so it is broadcast over them.

    Args:
        input_axis: axis of the sequence.
        output_axis: axis of the embedding.
    """
    def __init__(
        self,
        input_axis: int=1, # axis of the sequence
        output_axis: int=-1, # axis of the embedding
        **kwargs
    ):
        super().__init__(**kwargs)
        self._input_axis = input_axis
        self._output_axis = output_axis
        self._kernel = None

    def build(self, input_shape: tuple):
        """Create the kernel, with a shape derived from the input shape."""
        __dims = list(input_shape)
        # positive indexes of the two axes covered by the kernel
        __axes = [self._input_axis % len(__dims), self._output_axis % len(__dims)]
        # all the other axes are broadcast
        __shape = [(__d if __i in __axes else 1) for __i, __d in enumerate(__dims)]
        # register the weights
        self._kernel = self.add_weight(name="kernel", shape=__shape, initializer=tf.keras.initializers.GlorotNormal())
        # notify the model
        self.built = True

    def call(self, inputs: tf.Tensor):
        """Return the inputs shifted by the positional kernel."""
        return inputs + self._kernel # each index in the sequence axis has a dedicated bias (different from dense bias)

    def get_config(self) -> dict:
        """Return the constructor arguments, so that the layer can be rebuilt from its config."""
        __config = super().get_config()
        __config.update({'input_axis': self._input_axis, 'output_axis': self._output_axis})
        return __config

# RESHAPING ###################################################################

def _normalize_shape(shape: list) -> list:
    return [-1 if __d is None else __d for __d in shape]

def _normalize_dim(dim: int) -> int:
    return -1 if (dim is None or dim < 0) else dim

def _multiply_dim(dim_l: int, dim_r: int) -> int:
    return -1 if (dim_l == -1 or dim_r == -1) else dim_l * dim_r

def _divide_dim(dim_l: int, dim_r: int) -> int:
    return -1 if (dim_l == -1 or dim_r == -1) else dim_l // dim_r

class Divide(tf.keras.layers.Layer):
    """Reshape the inputs by moving a factor from one axis to another.

    The input axis is divided by `factor` and the output axis is multiplied by
    `factor`. With `insert=True`, the output axis is a new axis of size 1
    inserted before the multiplication, so it ends up with size `factor`.

    Args:
        input_axis: axis that is divided, relative to the NEW shape / rank.
        output_axis: axis that is multiplied, relative to the NEW shape / rank.
        factor: number of units moved from the input axis to the output axis.
        insert: whether to create the output axis instead of using an existing one.
    """
    def __init__(
        self,
        input_axis: int, # relative to the NEW shape / rank
        output_axis: int, # same
        factor: int,
        insert: bool=False,
        **kwargs
    ) -> None:
        super().__init__(**kwargs)
        self._input_axis = input_axis
        self._output_axis = output_axis
        self._factor = factor
        self._insert = insert

    def call(self, inputs: tf.Tensor) -> tf.Tensor:
        """Return the inputs reshaped according to the layer configuration."""
        # static shape, with -1 on the symbolic axes so that the reshape infers them
        __shape = _normalize_shape(list(inputs.shape))
        # rank, according to the new shape
        __rank = len(__shape) + int(self._insert)
        # axes, taken from the new shape
        __axis0 = self._input_axis % __rank
        __axis1 = self._output_axis % __rank
        # option to group the data on a new axis, rather than an existing one
        if self._insert: __shape.insert(__axis1, 1)
        # move data from axis 0 to axis 1
        __shape[__axis0] = _divide_dim(__shape[__axis0], self._factor)
        __shape[__axis1] = _multiply_dim(__shape[__axis1], self._factor)
        return tf.reshape(tensor=inputs, shape=__shape)

    def get_config(self) -> dict:
        """Return the constructor arguments, so that the layer can be rebuilt from its config."""
        __config = super().get_config()
        __config.update({
            'input_axis': self._input_axis,
            'output_axis': self._output_axis,
            'factor': self._factor,
            'insert': self._insert,})
        return __config

class Merge(tf.keras.layers.Layer):
    """Reshape the inputs by merging two axes into a single one.

    Args:
        left_axis: first axis to merge.
        right_axis: second axis to merge.
        left: keep the merged dimension on the left axis when True, on the right axis otherwise; the other axis is removed.
    """
    def __init__(
        self,
        left_axis: int=-2,
        right_axis: int=-1,
        left: bool=True,
        **kwargs
    ) -> None:
        super().__init__(**kwargs)
        self._left_axis = left_axis
        self._right_axis = right_axis
        self._left = left

    def call(self, inputs: tf.Tensor) -> tf.Tensor:
        """Return the inputs with the two target axes merged."""
        # static shape, with -1 on the symbolic axes so that the reshape infers them
        __shape = _normalize_shape(list(inputs.shape))
        __rank = len(__shape)
        # target axes
        __axis_l = self._left_axis % __rank
        __axis_r = self._right_axis % __rank
        # new axis
        __dim = _multiply_dim(__shape[__axis_l], __shape[__axis_r])
        __axis_k = __axis_l if self._left else __axis_r # kept axis
        __axis_d = __axis_r if self._left else __axis_l # deleted axis
        # new shape
        __shape[__axis_k] = __dim
        __shape.pop(__axis_d)
        # actually merge the two axes
        return tf.reshape(tensor=inputs, shape=__shape)

    def get_config(self) -> dict:
        """Return the constructor arguments, so that the layer can be rebuilt from its config."""
        __config = super().get_config()
        __config.update({
            'left_axis': self._left_axis,
            'right_axis': self._right_axis,
            'left': self._left,})
        return __config

## Blocks

In [None]:
# ENCODING BLOCKS #############################################################

class TokenizeBlock(tf.keras.layers.Layer):
    """Compress each group of `token_dim` consecutive rows into a single row of size `latent_dim`.

    The rows are grouped on a new axis, given a positional bias, optionally
    passed through self-attention, then flattened and projected by a dense layer.
    Layer normalization is optionally applied on the inputs.
    """
    def __init__(self, left_axis: int=-2, right_axis: int=-1, token_dim: int=4, latent_dim: int=256, attention: bool=False, normalization: bool=False, **kwargs) -> None:
        super(TokenizeBlock, self).__init__(**kwargs)
        # normalize each token unit independently
        if normalization:
            self._normalization = tf.keras.layers.LayerNormalization(
                axis=-1,
                epsilon=0.001,
                center=True,
                scale=True,
                name='normalization')
        else:
            self._normalization = None
        # (B * G, E) => (B, G, E)
        self._divide = Divide(input_axis=0, output_axis=1, factor=token_dim, insert=True, name='group')
        # (B, G, E) + (1, G, E)
        self._embedding = PositionalEmbedding(input_axis=left_axis, output_axis=right_axis, name='position')
        # (B, G, E) + (B, G, E) * (B, E, G) * (B, G, E)
        if attention:
            self._attention = tf.keras.layers.Attention(
                use_scale=False,
                score_mode='dot',
                dropout=0.,
                seed=None,
                name='attention')
        else:
            self._attention = None
        # (B, G, E) => (B, G * E)
        self._merge = Merge(left_axis=left_axis, right_axis=right_axis, left=True, name='merging')
        # (B, G * E) => (B, L), typically L = E
        self._dense = tf.keras.layers.Dense(
            units=latent_dim,
            activation='relu',
            use_bias=True,
            kernel_initializer='glorot_uniform',
            bias_initializer='zeros',
            name='compression')

    def call(self, inputs: tf.Tensor) -> tf.Tensor:
        """Apply the layers in sequence, skipping the disabled ones."""
        __outputs = inputs
        if self._normalization:
            __outputs = self._normalization(__outputs)
        __outputs = self._divide(__outputs)
        __outputs = self._embedding(__outputs)
        if self._attention:
            __outputs = self._attention([__outputs, __outputs, __outputs], return_attention_scores=False, use_causal_mask=False)
        __outputs = self._merge(__outputs)
        return self._dense(__outputs)

# DECODING BLOCKS #############################################################

class DetokenizeBlock(tf.keras.layers.Layer):
    """Expand each input row into `token_dim` rows of size `embedding_dim`.

    The rows are projected by a dense layer, split on a new axis, given a
    positional bias, optionally passed through self-attention, then the new
    axis is merged into the first one. Layer normalization is optionally
    applied on the outputs.
    """
    def __init__(self, token_dim: int=4, embedding_dim: int=256, attention: bool=False, normalization: bool=False, **kwargs) -> None:
        super(DetokenizeBlock, self).__init__(**kwargs)
        # (B, L) => (B, G * E), typically L = E
        self._dense = tf.keras.layers.Dense(
            units=token_dim * embedding_dim,
            activation='relu',
            use_bias=True,
            kernel_initializer='glorot_uniform',
            bias_initializer='zeros',
            name='decompression')
        # (B, G * E) => (B, G, E)
        self._divide = Divide(input_axis=-2, output_axis=-1, insert=True, factor=embedding_dim, name='split')
        # (B, G, E) + (1, G, E)
        self._embedding = PositionalEmbedding(input_axis=-2, output_axis=-1, name='position')
        # (B, G, E) + (B, G, E) * (B, E, G) * (B, G, E)
        if attention:
            self._attention = tf.keras.layers.Attention(
                use_scale=False,
                score_mode='dot',
                dropout=0.,
                seed=None,
                name='attention')
        else:
            self._attention = None
        # (B, G, E) => (B * G, E)
        self._merge = Merge(left_axis=0, right_axis=1, left=True)
        # normalize each token unit independently
        if normalization:
            self._normalization = tf.keras.layers.LayerNormalization(
                axis=-1,
                epsilon=0.001,
                center=True,
                scale=True,
                name='normalization')
        else:
            self._normalization = None

    def call(self, inputs: tf.Tensor) -> tf.Tensor:
        """Apply the layers in sequence, skipping the disabled ones."""
        __outputs = self._dense(inputs)
        __outputs = self._divide(__outputs)
        __outputs = self._embedding(__outputs)
        if self._attention:
            __outputs = self._attention([__outputs, __outputs, __outputs], return_attention_scores=False, use_causal_mask=False)
        __outputs = self._merge(__outputs)
        if self._normalization:
            return self._normalization(__outputs)
        return __outputs

# HEAD BLOCK ##################################################################

class HeadBlock(tf.keras.layers.Layer):
    """Project the inputs to `encoding_dim` units and turn them into probabilities with a softmax."""
    def __init__(self, encoding_dim: int=256, **kwargs) -> None:
        super(HeadBlock, self).__init__(**kwargs)
        # (..., G, E) => (..., G, U), typically U = E
        self._dense = tf.keras.layers.Dense(
            units=encoding_dim,
            activation=None,
            use_bias=True,
            kernel_initializer='glorot_uniform',
            bias_initializer='zeros',
            name='project-head')
        # (..., G, U)
        self._softmax = tf.keras.layers.Softmax(axis=-1, name='softmax')

    def call(self, inputs: tf.Tensor) -> tf.Tensor:
        """Return the softmax of the linear projection of the inputs, on the last axis."""
        __logits = self._dense(inputs)
        return self._softmax(__logits)


## Model

In [None]:
# ENCODER #####################################################################

class Encoder(tf.keras.models.Model):
    """Stack of a linear embedding followed by `depth` tokenization blocks.

    Each tokenization block merges `token_dim` consecutive rows into one.

    Args:
        depth: number of tokenization blocks.
        token_dim: number of rows merged by each block.
        encoding_dim: size of the input rows.
        embedding_dim: size of the rows after the first embedding.
        latent_dim: size of the rows produced by each tokenization block.
        batch_dim: optional static batch size of the inputs.
        attention: whether the blocks use self-attention.
        normalization: whether the blocks use layer normalization.
    """
    def __init__(self, depth: int, token_dim: int, encoding_dim: int, embedding_dim: int, latent_dim: int, batch_dim: int=None, attention: bool=False, normalization: bool=False, **kwargs) -> None:
        super(Encoder, self).__init__(**kwargs)
        # the suffix is repeated once per nesting level: 'tokenize-4', 'tokenize-4-4', etc when token_dim is 4
        __suffix = '-' + str(token_dim)
        self._encoder = tf.keras.Sequential([
            tf.keras.Input(shape=(encoding_dim,), batch_size=batch_dim, name='input'), # (B * G ^ D, U)
            tf.keras.layers.Dense(units=embedding_dim, activation=None, use_bias=False, kernel_initializer='glorot_uniform', bias_initializer=None, name='embed-1'),] # (B * G ^ D, U) => (B * G ^ D, E)
            + [TokenizeBlock(left_axis=-2, right_axis=-1, token_dim=token_dim, latent_dim=latent_dim, attention=attention, normalization=normalization, name='tokenize' + (__i + 1) * __suffix) for __i in range(depth)]) # (B * G ^ i, E) => (B * G ^ (i-1), E)

    def call(self, x: tf.Tensor) -> tf.Tensor:
        """Return the outputs of the last tokenization block."""
        return self._encoder(x)

# DECODER #####################################################################

class Decoder(tf.keras.models.Model):
    """Stack of `depth` detokenization blocks followed by a softmax head.

    Each detokenization block expands every row into `token_dim` rows.

    Args:
        depth: number of detokenization blocks.
        token_dim: number of rows produced from each row by a block.
        encoding_dim: size of the rows produced by the head.
        embedding_dim: size of the rows produced by each detokenization block.
        latent_dim: size of the input rows.
        batch_dim: optional static batch size of the inputs.
        attention: whether the blocks use self-attention.
        normalization: whether the blocks use layer normalization.
    """
    def __init__(self, depth: int, token_dim: int, encoding_dim: int, embedding_dim: int, latent_dim: int, batch_dim: int=None, attention: bool=False, normalization: bool=False, **kwargs) -> None:
        super(Decoder, self).__init__(**kwargs)
        # the suffix is repeated once per nesting level: 'detokenize-4-4', 'detokenize-4', etc when token_dim is 4
        __suffix = '-' + str(token_dim)
        self._decoder = tf.keras.Sequential(
            [tf.keras.Input(shape=(latent_dim,), batch_size=batch_dim, name='input')] # (B, E)
            + [DetokenizeBlock(token_dim=token_dim, embedding_dim=embedding_dim, attention=attention, normalization=normalization, name='detokenize' + (depth - __i) * __suffix) for __i in range(depth)] # (B * G ^ i, E) => (B * G ^ (i+1), E)
            + [HeadBlock(encoding_dim=encoding_dim, name='project-head')]) # (B * G ^ D, E) => (B * G ^ D, U)

    def call(self, x: tf.Tensor) -> tf.Tensor:
        """Return the probabilities produced by the head."""
        return self._decoder(x)

# AUTOENCODER #################################################################

class AutoEncoder(tf.keras.models.Model):
    """Chain an `Encoder` and a `Decoder` built with the same hyper-parameters."""
    def __init__(self, depth: int, token_dim: int, encoding_dim: int, embedding_dim: int, latent_dim: int, batch_dim: int=None, attention: bool=False, normalization: bool=False, **kwargs) -> None:
        super(AutoEncoder, self).__init__(**kwargs)
        # both halves of the model receive exactly the same arguments
        __args = {
            'depth': depth,
            'token_dim': token_dim,
            'encoding_dim': encoding_dim,
            'embedding_dim': embedding_dim,
            'latent_dim': latent_dim,
            'batch_dim': batch_dim,
            'attention': attention,
            'normalization': normalization,}
        self._encoder = Encoder(**__args)
        self._decoder = Decoder(**__args)

    def call(self, x: tf.Tensor) -> tf.Tensor:
        """Encode the inputs, then decode the result."""
        __latent = self._encoder(x)
        return self._decoder(__latent)

In [None]:
# build and compile the model inside the scope of the distribution strategy
with DISTRIBUTION_STRATEGY.scope():
    MODEL = AutoEncoder(depth=N_DEPTH, token_dim=N_TOKEN_DIM, encoding_dim=N_ENCODING_DIM, embedding_dim=N_EMBEDDING_DIM, latent_dim=N_LATENT_DIM, batch_dim=None, attention=ATTENTION, normalization=NORMALIZATION)
    MODEL.compile(
        # the initial learning rate is R_MAX; the training cell also registers a per-epoch scheduler
        optimizer=tf.keras.optimizers.Adam(learning_rate=R_MAX),
        # the head of the decoder ends with a softmax, hence from_logits=False
        loss=tf.keras.losses.CategoricalCrossentropy(from_logits=False, label_smoothing=0., axis=-1, reduction=tf.keras.losses.Reduction.SUM_OVER_BATCH_SIZE, name='loss'),
        metrics=['accuracy'])


## Train

In [None]:
# CONTROL #####################################################################

def learning_rate_hokusai(epoch: int, lr_min: float, lr_max: float, lr_exp: float, rampup: int, sustain: int) -> float:
    """Compute the learning rate of a given epoch, in three phases.

    The rate grows linearly from `lr_min` to `lr_max` during the first
    `rampup` epochs, stays at `lr_max` for the next `sustain` epochs, then
    decays towards `lr_min` by a factor `lr_exp` per epoch.
    """
    __span = lr_max - lr_min
    # linear ramp-up
    if epoch < rampup:
        return lr_min + (epoch * __span / rampup)
    # plateau
    if epoch < rampup + sustain:
        return lr_max
    # exponential decay
    return lr_min + __span * lr_exp ** (epoch - rampup - sustain)

In [None]:
# ENCODE ######################################################################

def shape(layer_count: int, group_size: int, flatten: bool=False) -> list:
    """Return the target shape of the tokenized data.

    The first axis is left free (-1). It is the only axis when `flatten` is
    set, otherwise it is followed by `layer_count` axes of size `group_size`.
    """
    if flatten:
        return [-1]
    return [-1] + layer_count * [group_size]

def _tokenize_scalar(text: str, layer_count: int=1, group_size: int=4, flatten: bool=False) -> tf.Tensor:
    """Encode a Python string as a tensor of UTF-32-BE bytes.

    The bytes are padded with zeros up to a multiple of
    `group_size ** layer_count`, then reshaped according to `shape`.
    """
    __codes = list(text.encode('utf-32-be'))
    # pad so that the bytes fill whole tokens
    __block = group_size ** layer_count
    __codes = __codes + (-len(__codes) % __block) * [0]
    # uint8 is not allowed
    __ints = tf.convert_to_tensor(value=__codes, dtype=tf.dtypes.int32)
    return tf.reshape(
        tensor=__ints,
        shape=shape(layer_count=layer_count, group_size=group_size, flatten=flatten))

def tokenize(data: tf.Tensor, layer_count: int=1, group_size: int=4, sample_size: int=64, flatten: bool=False) -> tf.Tensor:
    """Encode a batch of UTF-8 strings as UTF-32-BE bytes, grouped into tokens.

    Each sample is decoded with a fixed length: 4 bytes per character for
    `sample_size` characters, rounded up to a multiple of
    `group_size ** layer_count`. The result is reshaped according to `shape`.
    """
    # number of bytes per token
    __token_dim = group_size ** layer_count
    # factor 4 because of the UTF-32 encoding
    __length = math.ceil(4 * sample_size / __token_dim) * __token_dim
    # re-encode the strings from UTF-8 to UTF-32-BE => (B,)
    __utf32 = tf.strings.unicode_transcode(input=data, input_encoding='UTF-8', output_encoding='UTF-32-BE')
    # split the byte strings into arrays of integers => (B, 4 * S)
    __codes = tf.io.decode_raw(__utf32, out_type=tf.uint8, fixed_length=__length)
    # group the characters into tokens, for example (-1, G, G, G): the first dimension is not B
    return tf.reshape(
        tensor=__codes,
        shape=shape(layer_count=layer_count, group_size=group_size, flatten=flatten))

In [None]:
# AUGMENT #####################################################################

def _offset(ticks: int=1, layer: int=1, unit: int=4) -> int:
    return math.ceil(ticks * (unit ** (layer - 1)))

def offset(data: tf.Tensor, ticks: int=1, layer: int=1, unit: int=4) -> tf.Tensor:
    """Prepend null bytes to the strings in `data`; the count is given by `_offset`."""
    __count = _offset(ticks=ticks, layer=layer, unit=unit)
    # a single-element tensor holding the prefix
    __prefix = tf.convert_to_tensor([b'\x00' * __count])
    return __prefix + data

In [None]:
# END-TO-END ##################################################################

def process(dataset: tf.data.Dataset, pipeline: list, replace: bool=True, feature: str=None) -> tf.data.Dataset:
    """Apply a sequence of operations to a dataset.

    Args:
        dataset: the source dataset.
        pipeline: the operations, each of them is mapped on the dataset in order.
        replace: whether the result of an operation replaces the current dataset (True)
            or is appended to it (False); either a single flag for all the
            operations or one flag per operation.
        feature: optional key of the feature to extract from each element first.

    Returns:
        The dataset obtained after the last operation.
    """
    # fetch the target feature in the dataset
    __dataset = dataset.map(lambda x: x[feature]) if feature else dataset
    # materialize the operations once: a one-shot iterator would be exhausted by the length computation
    __pipeline = list(pipeline)
    # specify how to combine each operation result with the original dataset
    __replace = len(__pipeline) * [replace] if isinstance(replace, bool) else list(replace)
    # apply the operation successively
    for __fn, __repl in zip(__pipeline, __replace):
        __new = __dataset.map(__fn)
        __dataset = __new if __repl else __dataset.concatenate(__new)
    return __dataset

In [None]:
# PREPROCESS ##################################################################

# B = 128, T = 4, S = 128, E = 256
# each entry is (operation, replace): the result of the operation either
# replaces the dataset (True) or is appended to it (False), see `process`
PIPELINE = [
    # offset by 1 to 15 characters, by prepending null bytes to each sample
    (functools.partial(offset, ticks=1, layer=1, unit=N_TOKEN_DIM), False), # double the dataset volume: (all samples with offset 0) + (offset 1)
    (functools.partial(offset, ticks=2, layer=1, unit=N_TOKEN_DIM), False), # (offsets 0 and 1) + (offsets 2 and 3)
    (functools.partial(offset, ticks=4, layer=1, unit=N_TOKEN_DIM), False), # (offsets 0, 1, 2, 3) + (offsets 4, 5, 6, 7)
    (functools.partial(offset, ticks=8, layer=1, unit=N_TOKEN_DIM), False), # (offsets 0, 1, 2, 3, 4, 5, 6, 7) + (offsets 8, 9, 10, 11, 12, 13, 14, 15)
    # tokenize => flat tensor of bytes, since flatten=True
    (functools.partial(tokenize, layer_count=N_DEPTH, group_size=N_TOKEN_DIM, sample_size=N_SAMPLE, flatten=True), True),
    # one-hot encoding => a new last axis of size N_ENCODING_DIM
    (functools.partial(tf.one_hot, depth=N_ENCODING_DIM, axis=-1), True),
    # replace sample inputs with (inputs, target) for supervised learning
    ((lambda x: (x, x)), True)]

# split the pairs into the list of operations and the list of flags
OPERATIONS, REPLACE = zip(*PIPELINE)

# NOTE: TRAIN and TEST are rebound to the processed datasets, so re-running
# this cell without reloading the data applies the pipeline a second time
TRAIN = {__l: process(dataset=__d, feature='context', pipeline=OPERATIONS, replace=REPLACE) for __l, __d in TRAIN.items()}
TEST = {__l: process(dataset=__d, feature='context', pipeline=OPERATIONS, replace=REPLACE) for __l, __d in TEST.items()}

In [None]:
# LOG #########################################################################

# name of the run: built from the token dimension, the depth and the enabled options
# (multiplying a string by a boolean keeps it when True and empties it when False)
VERSION = 'tokun-' + str(N_TOKEN_DIM ** (N_DEPTH - 1)) + '-keras' + ATTENTION * '-attention' + NORMALIZATION * '-normalization'

# one timestamped log directory per run, under the directory watched by TensorBoard
LOGPATH = os.path.join('.logs/', VERSION, datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
SUMMARY = tf.summary.create_file_writer(LOGPATH)

In [None]:
# TRAIN #######################################################################

# update the learning rate at the start of each epoch, following `learning_rate_hokusai`
lr_callback = tf.keras.callbacks.LearningRateScheduler(functools.partial(learning_rate_hokusai, lr_min=R_MIN, lr_max=R_MAX, lr_exp=R_EXP, rampup=N_EPOCHS_RAMPUP, sustain=N_EPOCHS_SUSTAIN), verbose=True)
# log the training to the directory of the run
tb_callback = tf.keras.callbacks.TensorBoard(log_dir=LOGPATH)

with DISTRIBUTION_STRATEGY.scope():
    # `batch_size` and `validation_split` are not specified: the inputs are
    # datasets, which are already batched when they are loaded
    TRAINING_HISTORY = MODEL.fit(
        x=TRAIN['ar'].concatenate(TRAIN['en']).concatenate(TRAIN['es']).concatenate(TRAIN['de']).concatenate(TRAIN['hi']).concatenate(TRAIN['vi']).concatenate(TRAIN['zh']),
        epochs=N_EPOCHS,
        validation_data=TEST['vi'], # full of glyphs
        validation_freq=list(range(1, N_EPOCHS + 1, 1)),
        verbose=2,
        callbacks=[lr_callback, tb_callback])


Epoch 1: LearningRateScheduler setting learning rate to 0.0001.
5344/5344 - 1152s - loss: 0.1370 - accuracy: 0.9713 - val_loss: 0.1542 - val_accuracy: 0.9673 - lr: 1.0000e-04 - 1152s/epoch - 216ms/step


In [None]:
# print the layers of the model and their parameter counts
MODEL.summary()

Model: "auto_encoder"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 encoder (Encoder)           multiple                  857344    
                                                                 
 decoder (Decoder)           multiple                  859904    
                                                                 
Total params: 1717248 (6.55 MB)
Trainable params: 1717248 (6.55 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


## Dataviz

In [None]:
# GENERIC #####################################################################

def _label(c: str) -> str:
    return '#{}'.format(c.encode('utf-32-be').hex())

def label(token: str) -> str:
    """Return the labels of the characters of a token, separated by spaces."""
    return ' '.join(map(_label, token))

def compare(left: str, right: str) -> float:
    """Return the share of positions of `left` where `right` holds the same element.

    The elements are compared pairwise up to the shortest length, and the
    count of matches is divided by the length of `left` (or 1 when empty).
    """
    __matches = 0
    for __l, __r in zip(left, right):
        if __l == __r:
            __matches += 1
    return __matches / max(1, len(left))

def chunk(seq: list, size: int, repeats: bool=True) -> list:
    """Split a sequence into consecutive chunks of `size` elements.

    The last chunk is shorter when the length is not a multiple of `size`.
    With `repeats=False`, the duplicates are dropped and the chunks are
    returned in order of first occurrence, so the result is the same on every
    run (the chunks have to be hashable).
    """
    __chunks = (seq[__i:__i+size] for __i in range(0, len(seq), size))
    # a dict keeps the insertion order, unlike a set
    return list(__chunks) if repeats else list(dict.fromkeys(__chunks))

In [None]:
# POSTPROCESS #################################################################

def interpret(output: tf.Tensor) -> tf.Tensor:
    """Return the index of the largest value on the last axis, as int32."""
    # uint8 is not allowed
    __indices = tf.argmax(input=output, axis=-1, output_type=tf.dtypes.int32)
    return __indices

def detokenize(tokens: tf.Tensor) -> str:
    """Flatten a tensor of byte values and decode it as UTF-32-BE text.

    The bytes that cannot be decoded are replaced rather than raising an error.
    """
    __flat = tf.reshape(tensor=tokens, shape=(-1,))
    __raw = bytes(__flat.numpy().tolist())
    return __raw.decode(encoding='utf-32-be', errors='replace')

def postprocess(output: tf.Tensor) -> str:
    """Turn one-hot (or probability) vectors back into text.

    Args:
        output: a tensor whose last axis holds the scores of each byte value.

    Returns:
        The decoded string, see `detokenize`.
    """
    # from one-hot to indices
    __output = interpret(output=output)
    # from indices (byte values) to text
    return detokenize(tokens=__output)

In [None]:
# SAVE ########################################################################

def write(data: any, path: str, tsv: bool=True) -> None:
    """Write the rows of `data` to a text file, one row per line.

    Args:
        data: an iterable of rows.
        path: the path of the output file, which is overwritten.
        tsv: when True each row is an iterable whose values are joined with
            tabs, otherwise each row is written as a single value.
    """
    # the tokens can hold characters from any script: always encode as UTF-8
    # rather than rely on the default encoding of the platform
    with open(path, 'w', encoding='utf-8') as __f:
        for __row in data:
            __line = '\t'.join(str(__v) for __v in __row) if tsv else str(__row)
            __f.write(__line + '\n')

In [None]:
# SAMPLES #####################################################################

# (inputs, outputs) of the model on one batch, for each language
SAMPLES = {}
# unique tokens and their embeddings, indexed by token size (in characters) then by language
TOKENS = {(N_TOKEN_DIM ** __i): {} for __i in range(N_DEPTH)}
EMBEDDINGS = {(N_TOKEN_DIM ** __i): {} for __i in range(N_DEPTH)}

for __lang in TEST:
    # compute predictions
    __batch = iter(TEST[__lang]) # iterate over batches of samples
    __input = next(__batch)[0] # take input only
    __output = MODEL(__input)
    # sample predictions (inputs, outputs)
    SAMPLES[__lang] = (__input, __output)
    # decode the inputs once, the text is the same for all the token sizes
    __text = postprocess(__input)
    # unique (G ^ i)-tokens
    for __size in TOKENS:
        TOKENS[__size][__lang] = chunk(seq=__text, size=__size, repeats=False)

# unique tokens, for all languages, in order of first occurrence
for __size in TOKENS:
    TOKENS[__size]['all'] = list(dict.fromkeys(__t for __s in TOKENS[__size].values() for __t in __s))

In [None]:
# EMBEDDINGS ##################################################################

# layers of the encoder: the embedding first, then the tokenization blocks
__layers = MODEL._encoder._encoder.layers

for __size in TOKENS:
    # embedding depth / nesting: the sizes are exact powers of the token dim,
    # round the logarithm rather than truncate it to be safe from float errors
    __depth = round(math.log(__size, N_TOKEN_DIM))
    for __lang, __tokens in TOKENS[__size].items():
        # re-encode without token repeats
        __input = tf.one_hot(indices=_tokenize_scalar(text=''.join(__tokens), layer_count=N_DEPTH, group_size=N_TOKEN_DIM, flatten=True), depth=N_ENCODING_DIM, axis=-1)
        # UTF-32 embedding
        __embedding = __layers[0](__input)
        # iterative tokenization: depth + 1 blocks, since the first block merges the bytes of each character
        for __i in range(__depth + 1):
            __embedding = __layers[__i + 1](__embedding)
        # remove the (tokenized) padding
        EMBEDDINGS[__size][__lang] = __embedding[:len(__tokens)]

In [None]:
# SAVE ########################################################################

# export the tokens of all the languages and their embeddings, one pair of files per token size
for __size in TOKENS:
    # one line per token: the token itself followed by the hexadecimal labels of its characters
    write(data=[__c + ' ' + label(__c) for __c in TOKENS[__size]['all']], path='./metadata.' + str(__size) + '.tsv', tsv=False)
    # one line per token: the tab-separated values of its embedding, in the same order as the metadata
    write(data=EMBEDDINGS[__size]['all'].numpy(), path='./embeddings.' + str(__size) + '.tsv', tsv=True)

In [None]:
# save the trained model in the Keras format, in the working directory
MODEL.save('model.keras', save_format='keras')

In [None]:
# TEST ########################################################################

__s = """class Encoder(tf.keras.models.Model):\n    def __init__(self, depth: int, token_dim: int, encoding_dim: int, embedding_dim: int, latent_dim: int, batch_dim: int=None, attention: bool=False, **kwargs) -> None:\n        super(Encoder, self).__init__(**kwargs)\n        self._encoder = tf.keras.Sequential([\n            tf.keras.Input(shape=(encoding_dim,), batch_size=batch_dim, name='input'), # (B * G ^ D, U)\n            tf.keras.layers.Dense(units=embedding_dim, activation=None, use_bias=False, kernel_initializer='glorot_uniform', bias_initializer=None, name='embed-1'),] # (B * G ^ D, U) => (B * G ^ D, E)\n            + [_mmtl.TokenizeBlock(left_axis=-2, right_axis=-1, token_dim=token_dim, latent_dim=latent_dim, attention=attention, name='tokenize' + (__i + 1) * '-4') for __i in range(depth)]) # (B * G ^ i, E) => (B * G ^ (i-1), E)\n\n    def call(self, x: tf.Tensor) -> tf.Tensor:\n        return self._encoder(x)\n"""

# encode the sample with the same dimensions as the model, taken from the metadata
__x = tf.one_hot(indices=_tokenize_scalar(text=__s, layer_count=N_DEPTH, group_size=N_TOKEN_DIM, flatten=True), depth=N_ENCODING_DIM, axis=-1)
# latent embeddings of the sample
__e = MODEL._encoder(__x)
# reconstruction of the sample by the whole model
__p = MODEL(__x)
__y = postprocess(__p)

# original text, reconstructed text and share of matching characters
print(__s)
print(__y)
print(compare(__s, __y))

class Encoder(tf.keras.models.Model):
    def __init__(self, depth: int, token_dim: int, encoding_dim: int, embedding_dim: int, latent_dim: int, batch_dim: int=None, attention: bool=False, **kwargs) -> None:
        super(Encoder, self).__init__(**kwargs)
        self._encoder = tf.keras.Sequential([
            tf.keras.Input(shape=(encoding_dim,), batch_size=batch_dim, name='input'), # (B * G ^ D, U)
            tf.keras.layers.Dense(units=embedding_dim, activation=None, use_bias=False, kernel_initializer='glorot_uniform', bias_initializer=None, name='embed-1'),] # (B * G ^ D, U) => (B * G ^ D, E)
            + [_mmtl.TokenizeBlock(left_axis=-2, right_axis=-1, token_dim=token_dim, latent_dim=latent_dim, attention=attention, name='tokenize' + (__i + 1) * '-4') for __i in range(depth)]) # (B * G ^ i, E) => (B * G ^ (i-1), E)

    def call(self, x: tf.Tensor) -> tf.Tensor:
        return self._encoder(x)

class Encoder(tf.keras.models.Model):     def   init  (self, depth: int, token dim

In [None]:
# decode the inputs and the predictions of the German sample batch
__l = postprocess(SAMPLES['de'][0])
__r = postprocess(SAMPLES['de'][1])

# original text, reconstructed text and share of matching characters
print(__l)
print(__r)
print(compare(__l, __r))

0.9912109375


In [None]:
%tensorboard --logdir .logs