## Import deps

In [None]:
!pip install mlable tokun

In [None]:
import datetime
import functools
import itertools
import math
import os
import random
import urllib.request

import huggingface_hub
import tensorflow as tf
import tensorflow_datasets as tfds

import mlable.data
import mlable.layers.reshaping
import mlable.layers.transformer
import mlable.optimizers

import tokun.data
import tokun.evaluation
import tokun.meta
import tokun.pipeline

%load_ext tensorboard

In [None]:
# report the Tensorflow release the notebook is running against
print("Tensorflow version " + tf.__version__)

## Setup the GPU / TPU

In [None]:
# keep the per-operation device placement logs quiet
tf.debugging.set_log_device_placement(False)

# logical devices visible to Tensorflow, by type
CPU, GPU, TPU = (tf.config.list_logical_devices(__kind) for __kind in ('CPU', 'GPU', 'TPU'))

# pick the distribution strategy, by order of preference: TPU, then GPU, then CPU
if not (TPU or GPU):
    DISTRIBUTION_STRATEGY = tf.distribute.MirroredStrategy(CPU)
elif not TPU:
    DISTRIBUTION_STRATEGY = tf.distribute.MirroredStrategy(GPU)
else:
    # the TPU cluster has to be connected and initialized before the strategy is created
    RESOLVER = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(RESOLVER)
    tf.tpu.experimental.initialize_tpu_system(RESOLVER)
    DISTRIBUTION_STRATEGY = tf.distribute.TPUStrategy(RESOLVER)

print(DISTRIBUTION_STRATEGY)

## Mode

In [None]:
# TOGGLE ######################################################################

IMPORT = False # download a pretrained model and load it, instead of creating a new one
TRAINING = True # run the training loop
RANDOM = True # train on the random dataset rather than on the MLQA datasets

## Defining The Metadata

In [None]:
# PARAMETERS ##################################################################

# options forwarded to the model and to all its tokenization / detokenization blocks
ACTIVATION = 'silu'
GATE = False
NORMALIZATION = True

# axes of the tensors on which the blocks operate
SEQUENCE_AXIS = 1
FEATURE_AXIS = -1

N_TOKEN_DIM = [4, 4] # G, number of units merged by each successive block
N_ENCODING_DIM = 256 # U, depth of the one-hot encoding of the inputs
N_EMBEDDING_DIM = N_ENCODING_DIM # E
N_HIDDEN_DIM = 4 * N_EMBEDDING_DIM # H
N_LATENT_DIM = N_EMBEDDING_DIM # L

# training schedule, in epochs
N_EPOCHS = 8
N_EPOCHS_RAMPUP = 0
N_EPOCHS_SUSTAIN = 0

N_BATCH = 128 # number of samples per batch
N_SAMPLE = 256 # number of characters per sample (=> N_TOKEN_DIM * N_SAMPLE integers per sample)

# minimum, maximum and exponent of the learning rate, later given to the optimizer and the scheduler
R_MIN, R_MAX, R_EXP = tokun.meta.rates(pretrained=IMPORT, normalization=NORMALIZATION, base=0.001)

In [None]:
# DERIVED #####################################################################

# cumulated size of the tokens after each successive block
TOKEN_SIZES = list(itertools.accumulate(N_TOKEN_DIM, lambda x, y: x * y)) # in bytes
# powers of 2 below the length of the largest token, counted in units of 4 bytes
# math.log2 is used because math.log(x, 2) divides two rounded logarithms: the result can land
# just below the exact exponent and would then be truncated one step too low by int
OFFSET_TICKS = [2 ** __i for __i in range(int(math.log2(TOKEN_SIZES[-1] // 4)))] # in characters

# identifiers of the current configuration and of the current run, used to build paths
VERSION = tokun.meta.version(groups=N_TOKEN_DIM, activation=ACTIVATION, gate=GATE, normalization=NORMALIZATION)
DATETIME = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

In [None]:
# IMPORT ######################################################################

LABEL = '3.1' # name of the pretrained model file, without extension
# the remote path is made of the components of VERSION, followed by LABEL
URL_IMPORT = 'https://github.com/apehex/tokun/raw/main/models/{}/{}/{}/{}/{}.keras'.format(*VERSION, LABEL)
PATH_IMPORT = 'model.keras' # local destination of the download

# the download only happens when a pretrained model is requested
if IMPORT:
    urllib.request.urlretrieve(URL_IMPORT, PATH_IMPORT)

In [None]:
# EXPORT ######################################################################

# one log directory per configuration and per run
PATH_LOG = os.path.join('.logs/', *VERSION, DATETIME)
# target of the checkpoints; NOTE this is the same file as PATH_IMPORT
PATH_EXPORT = 'model.keras'

## Loading The Data

In [None]:
# MLQA DATASET ################################################################

LANG = ['ar', 'de', 'en', 'es', 'hi', 'vi', 'zh']

# loading options shared by all the languages and both splits
__load = functools.partial(tfds.load, as_supervised=False, shuffle_files=True, data_dir='~/.cache/tensorflow/', batch_size=None)

# NOTE(review): the training data comes from the 'test' split and the evaluation data from 'validation'
# presumably because MLQA has no 'train' split, to be confirmed
MLQA_TRAIN = {__l: __load('mlqa/' + __l, split='test') for __l in LANG}
MLQA_TEST = {__l: __load('mlqa/' + __l, split='validation') for __l in LANG}

In [None]:
# RANDOM DATASET ##############################################################

# options shared by the training and testing datasets: only the number of samples differs
__random = functools.partial(tokun.data.random_dataset, sample_size=N_SAMPLE, lower_plane=0, upper_plane=0x40000)

RANDOM_TRAIN = __random(size=N_BATCH * 2**10)
RANDOM_TEST = __random(size=N_BATCH * 2**8)

## Blocks

In [None]:
# ENCODING BLOCKS #############################################################

@tf.keras.saving.register_keras_serializable(package='blocks')
class TokenizeBlock(tf.keras.layers.Layer):
    """Merge groups of `token_dim` successive embeddings into single embeddings.

    The inputs are optionally normalized, then split into groups along the
    sequence axis, optionally passed through a feed forward gate, and finally
    each group is merged on the feature axis and compressed by a dense layer.

    Args:
        sequence_axis: axis of the inputs that is divided into groups.
        feature_axis: axis holding the features of each embedding.
        token_dim: number of embeddings in each group (G).
        embedding_dim: input size of the optional gate (E).
        hidden_dim: hidden size of the optional gate (H).
        latent_dim: number of units of the final dense layer (L).
        normalization: whether to normalize the inputs.
        gate: whether to process the groups with a feed forward gate.
        activation: activation of the final dense layer.
    """

    def __init__(
        self,
        sequence_axis: int=0,
        feature_axis: int=-1,
        token_dim: int=4,
        embedding_dim: int=256,
        hidden_dim: int=1024,
        latent_dim: int=256,
        normalization: bool=False,
        gate: bool=False,
        activation: str='silu',
        **kwargs
    ) -> None:
        super().__init__(**kwargs)
        # init arguments, kept for serialization
        self._config = dict(
            sequence_axis=sequence_axis,
            feature_axis=feature_axis,
            token_dim=token_dim,
            embedding_dim=embedding_dim,
            hidden_dim=hidden_dim,
            latent_dim=latent_dim,
            normalization=normalization,
            gate=gate,
            activation=activation)
        # the groups live on a temporary axis, inserted right after the sequence axis and merged afterwards
        __group_axis = sequence_axis + 1
        # normalize each token unit independently
        self._normalization = (
            tf.keras.layers.LayerNormalization(axis=feature_axis, epsilon=0.001, center=True, scale=True, name='normalization')
            if normalization else None)
        # (B * G, E) => (B, G, E)
        self._divide = mlable.layers.reshaping.Divide(input_axis=sequence_axis, output_axis=__group_axis, factor=token_dim, insert=True, name='group')
        # (B, G, E) => (B, G, H) => (B, G, E)
        self._gate = (
            mlable.layers.transformer.FeedForwardGate(input_dim=embedding_dim, hidden_dim=hidden_dim, name='gate')
            if gate else None)
        # (B, G, E) => (B, G * E)
        self._merge = mlable.layers.reshaping.Merge(left_axis=__group_axis, right_axis=feature_axis, left=False, name='merging')
        # (B, G * E) => (B, L), typically L = E
        self._dense = tf.keras.layers.Dense(units=latent_dim, activation=activation, use_bias=True, kernel_initializer='glorot_uniform', bias_initializer='zeros', name='compression')

    def call(self, inputs: tf.Tensor) -> tf.Tensor:
        """Group the inputs along the sequence axis and compress each group."""
        __outputs = inputs
        # optional normalization of the raw inputs
        if self._normalization:
            __outputs = self._normalization(__outputs)
        # group the embeddings on the temporary axis
        __outputs = self._divide(__outputs)
        # optional gating of the grouped embeddings
        if self._gate:
            __outputs = self._gate(__outputs)
        # flatten each group on the feature axis and compress it
        __outputs = self._merge(__outputs)
        return self._dense(__outputs)

    def get_config(self) -> dict:
        """Return the base layer configuration, completed with the init arguments."""
        return {**super().get_config(), **self._config}

    @classmethod
    def from_config(cls, config) -> tf.keras.layers.Layer:
        """Create a block from the dictionary produced by `get_config`."""
        return cls(**config)

# DECODING BLOCKS #############################################################

@tf.keras.saving.register_keras_serializable(package='blocks')
class DetokenizeBlock(tf.keras.layers.Layer):
    """Expand each embedding into `token_dim` embeddings along the sequence axis.

    A dense layer first widens the features to `token_dim * embedding_dim`, the
    result is split on a temporary axis, optionally passed through a feed
    forward gate, merged back into the sequence axis and optionally normalized.

    Args:
        sequence_axis: axis of the inputs that receives the new embeddings.
        feature_axis: axis holding the features of each embedding.
        token_dim: number of embeddings produced from each input (G).
        embedding_dim: size of each output embedding (E).
        hidden_dim: hidden size of the optional gate (H).
        normalization: whether to normalize the outputs.
        gate: whether to process the groups with a feed forward gate.
        activation: activation of the initial dense layer.
    """

    def __init__(
        self,
        sequence_axis: int=0,
        feature_axis: int=-1,
        token_dim: int=4,
        embedding_dim: int=256,
        hidden_dim: int=1024,
        normalization: bool=False,
        gate: bool=False,
        activation: str='silu',
        **kwargs
    ) -> None:
        super().__init__(**kwargs)
        # init arguments, kept for serialization
        self._config = dict(
            sequence_axis=sequence_axis,
            feature_axis=feature_axis,
            token_dim=token_dim,
            embedding_dim=embedding_dim,
            hidden_dim=hidden_dim,
            normalization=normalization,
            gate=gate,
            activation=activation)
        # the groups live on a temporary axis, inserted right after the sequence axis and merged afterwards
        __group_axis = sequence_axis + 1
        # (B, L) => (B, G * E), typically L = E
        self._dense = tf.keras.layers.Dense(units=token_dim * embedding_dim, activation=activation, use_bias=True, kernel_initializer='glorot_uniform', bias_initializer='zeros', name='decompression')
        # (B, G * E) => (B, G, E)
        self._divide = mlable.layers.reshaping.Divide(input_axis=feature_axis, output_axis=__group_axis, insert=True, factor=token_dim, name='split')
        # (B, G, E) => (B, G, H) => (B, G, E)
        self._gate = (
            mlable.layers.transformer.FeedForwardGate(input_dim=embedding_dim, hidden_dim=hidden_dim, name='gate')
            if gate else None)
        # (B, G, E) => (B * G, E)
        self._merge = mlable.layers.reshaping.Merge(left_axis=sequence_axis, right_axis=__group_axis, left=True)
        # normalize each token unit independently
        self._normalization = (
            tf.keras.layers.LayerNormalization(axis=feature_axis, epsilon=0.001, center=True, scale=True, name='normalization')
            if normalization else None)

    def call(self, inputs: tf.Tensor) -> tf.Tensor:
        """Widen the features of the inputs and unfold them along the sequence axis."""
        # widen the features, then move the groups to the temporary axis
        __outputs = self._dense(inputs)
        __outputs = self._divide(__outputs)
        # optional gating of the grouped embeddings
        if self._gate:
            __outputs = self._gate(__outputs)
        # unfold the groups into the sequence axis
        __outputs = self._merge(__outputs)
        # optional normalization of the outputs
        if self._normalization:
            __outputs = self._normalization(__outputs)
        return __outputs

    def get_config(self) -> dict:
        """Return the base layer configuration, completed with the init arguments."""
        return {**super().get_config(), **self._config}

    @classmethod
    def from_config(cls, config) -> tf.keras.layers.Layer:
        """Create a block from the dictionary produced by `get_config`."""
        return cls(**config)

# HEAD ########################################################################

@tf.keras.saving.register_keras_serializable(package='blocks')
class HeadBlock(tf.keras.layers.Layer):
    """Project the embeddings to `encoding_dim` features and apply a softmax.

    Args:
        feature_axis: axis on which the softmax is computed.
        encoding_dim: number of units of the projection (U).
    """

    def __init__(
        self,
        feature_axis: int=-1,
        encoding_dim: int=256,
        **kwargs
    ) -> None:
        super().__init__(**kwargs)
        # init arguments, kept for serialization
        self._config = dict(feature_axis=feature_axis, encoding_dim=encoding_dim)
        # (..., G, E) => (..., G, U), typically U = E
        self._dense = tf.keras.layers.Dense(units=encoding_dim, activation=None, use_bias=True, kernel_initializer='glorot_uniform', bias_initializer='zeros', name='projection')
        # (..., G, U)
        self._softmax = tf.keras.layers.Softmax(axis=feature_axis, name='softmax')

    def call(self, inputs: tf.Tensor) -> tf.Tensor:
        """Return the softmax of the linear projection of the inputs."""
        __logits = self._dense(inputs)
        return self._softmax(__logits)

    def get_config(self) -> dict:
        """Return the base layer configuration, completed with the init arguments."""
        return {**super().get_config(), **self._config}

    @classmethod
    def from_config(cls, config) -> tf.keras.layers.Layer:
        """Create a block from the dictionary produced by `get_config`."""
        return cls(**config)

## Model

In [None]:
# ENCODER #####################################################################

@tf.keras.saving.register_keras_serializable(package='models')
class Encoder(tf.keras.models.Model):
    """Embed the inputs and merge them with a stack of tokenization blocks.

    The model is a sequence made of a linear embedding layer, without bias,
    followed by one `TokenizeBlock` for each item in `token_dim`.

    Args:
        token_dim: merging factor of each block, either an int or a list of ints.
        encoding_dim: only recorded in the configuration by this class.
        embedding_dim: number of units of the embedding layer (E).
        hidden_dim: hidden size forwarded to the blocks (H).
        latent_dim: output size forwarded to the blocks (L).
        batch_dim: only recorded in the configuration by this class.
        normalization: forwarded to the blocks.
        gate: forwarded to the blocks.
        activation: forwarded to the blocks.
        sequence_axis: forwarded to the blocks.
        feature_axis: forwarded to the blocks.
    """

    def __init__(
        self,
        token_dim: list,
        encoding_dim: int,
        embedding_dim: int,
        hidden_dim: int,
        latent_dim: int,
        batch_dim: int=None,
        normalization: bool=True,
        gate: bool=True,
        activation: str='silu',
        sequence_axis: int=0,
        feature_axis: int=-1,
        **kwargs
    ) -> None:
        super().__init__(**kwargs)
        # init arguments, kept for serialization
        self._config = dict(
            token_dim=token_dim,
            encoding_dim=encoding_dim,
            embedding_dim=embedding_dim,
            hidden_dim=hidden_dim,
            latent_dim=latent_dim,
            batch_dim=batch_dim,
            normalization=normalization,
            gate=gate,
            activation=activation,
            sequence_axis=sequence_axis,
            feature_axis=feature_axis)
        # a single factor is handled like a list with one item
        __factors = [token_dim] if isinstance(token_dim, int) else token_dim
        # (B * G ^ D, U) => (B * G ^ D, E)
        __embed = tf.keras.layers.Dense(
            units=embedding_dim,
            activation='linear',
            use_bias=False,
            kernel_initializer='glorot_uniform',
            name='embed-1')
        # (B * G ^ i, E) => (B * G ^ (i-1), E)
        __blocks = [
            TokenizeBlock(
                sequence_axis=sequence_axis,
                feature_axis=feature_axis,
                token_dim=__g,
                embedding_dim=embedding_dim,
                hidden_dim=hidden_dim,
                latent_dim=latent_dim,
                normalization=normalization,
                gate=gate,
                activation=activation,
                name='tokenize-{}_{}'.format(__g, __i))
            for __i, __g in enumerate(__factors)]
        # chain the embedding and the blocks
        self._encoder = tf.keras.Sequential([__embed] + __blocks)

    def call(self, x: tf.Tensor) -> tf.Tensor:
        """Apply the embedding layer and all the tokenization blocks to the inputs."""
        return self._encoder(x)

    def get_config(self) -> dict:
        """Return the base model configuration, completed with the init arguments."""
        return {**super().get_config(), **self._config}

    @classmethod
    def from_config(cls, config) -> tf.keras.layers.Layer:
        """Create an encoder from the dictionary produced by `get_config`."""
        return cls(**config)

# DECODER #####################################################################

@tf.keras.saving.register_keras_serializable(package='models')
class Decoder(tf.keras.models.Model):
    """Expand the embeddings with a stack of detokenization blocks and a head.

    The model is a sequence made of one `DetokenizeBlock` for each item in
    `token_dim`, taken in the given order, followed by a `HeadBlock`.

    Args:
        token_dim: dividing factor of each block, either an int or a list of ints.
        encoding_dim: output size of the head (U).
        embedding_dim: embedding size forwarded to the blocks (E).
        hidden_dim: hidden size forwarded to the blocks (H).
        latent_dim: only recorded in the configuration by this class.
        batch_dim: only recorded in the configuration by this class.
        normalization: forwarded to the blocks.
        gate: forwarded to the blocks.
        activation: forwarded to the blocks.
        sequence_axis: forwarded to the blocks.
        feature_axis: forwarded to the blocks and to the head.
    """

    def __init__(
        self,
        token_dim: list,
        encoding_dim: int,
        embedding_dim: int,
        hidden_dim: int,
        latent_dim: int,
        batch_dim: int=None,
        normalization: bool=True,
        gate: bool=True,
        activation: str='silu',
        sequence_axis: int=0,
        feature_axis: int=-1,
        **kwargs
    ) -> None:
        super().__init__(**kwargs)
        # init arguments, kept for serialization
        self._config = dict(
            token_dim=token_dim,
            encoding_dim=encoding_dim,
            embedding_dim=embedding_dim,
            hidden_dim=hidden_dim,
            latent_dim=latent_dim,
            batch_dim=batch_dim,
            normalization=normalization,
            gate=gate,
            activation=activation,
            sequence_axis=sequence_axis,
            feature_axis=feature_axis)
        # a single factor is handled like a list with one item
        __factors = [token_dim] if isinstance(token_dim, int) else token_dim
        # (B * G ^ i, E) => (B * G ^ (i+1), E)
        __blocks = [
            DetokenizeBlock(
                sequence_axis=sequence_axis,
                feature_axis=feature_axis,
                token_dim=__g,
                embedding_dim=embedding_dim,
                hidden_dim=hidden_dim,
                normalization=normalization,
                gate=gate,
                activation=activation,
                name='detokenize-{}_{}'.format(__g, __i))
            for __i, __g in enumerate(__factors)]
        # (B * G ^ D, E) => (B * G ^ D, U)
        __head = HeadBlock(feature_axis=feature_axis, encoding_dim=encoding_dim, name='project-head')
        # chain the blocks and the head
        self._decoder = tf.keras.Sequential(__blocks + [__head])

    def call(self, x: tf.Tensor) -> tf.Tensor:
        """Apply all the detokenization blocks and the head to the inputs."""
        return self._decoder(x)

    def get_config(self) -> dict:
        """Return the base model configuration, completed with the init arguments."""
        return {**super().get_config(), **self._config}

    @classmethod
    def from_config(cls, config) -> tf.keras.layers.Layer:
        """Create a decoder from the dictionary produced by `get_config`."""
        return cls(**config)

# AUTOENCODER #################################################################

@tf.keras.saving.register_keras_serializable(package='models')
class AutoEncoder(tf.keras.models.Model):
    """Chain an `Encoder` and a `Decoder` built from the same arguments.

    Args:
        token_dim: merging / dividing factors, either an int or a list of ints.
        encoding_dim: forwarded to the encoder and the decoder (U).
        embedding_dim: forwarded to the encoder and the decoder (E).
        hidden_dim: forwarded to the encoder and the decoder (H).
        latent_dim: forwarded to the encoder and the decoder (L).
        batch_dim: forwarded to the encoder and the decoder.
        normalization: forwarded to the encoder and the decoder.
        gate: forwarded to the encoder and the decoder.
        activation: forwarded to the encoder and the decoder.
        sequence_axis: forwarded to the encoder and the decoder.
        feature_axis: forwarded to the encoder and the decoder.
    """

    def __init__(
        self,
        token_dim: list,
        encoding_dim: int,
        embedding_dim: int,
        hidden_dim: int,
        latent_dim: int,
        batch_dim: int=None,
        normalization: bool=True,
        gate: bool=True,
        activation: str='silu',
        sequence_axis: int=0,
        feature_axis: int=-1,
        **kwargs
    ) -> None:
        # init
        super(AutoEncoder, self).__init__(**kwargs)
        # config: the model keeps its own copy of the init arguments, like the encoder and the decoder
        # the configuration of the encoder cannot be reused as is: depending on the Keras version, it also
        # holds the base arguments of the encoder (name, dtype, etc) which would replace those of this model
        self._config = {
            'token_dim': token_dim,
            'encoding_dim': encoding_dim,
            'embedding_dim': embedding_dim,
            'hidden_dim': hidden_dim,
            'latent_dim': latent_dim,
            'batch_dim': batch_dim,
            'normalization': normalization,
            'gate': gate,
            'activation': activation,
            'sequence_axis': sequence_axis,
            'feature_axis': feature_axis,}
        # layers
        self._encoder = Encoder(token_dim=token_dim, encoding_dim=encoding_dim, embedding_dim=embedding_dim, hidden_dim=hidden_dim, latent_dim=latent_dim, batch_dim=batch_dim, gate=gate, normalization=normalization, activation=activation, sequence_axis=sequence_axis, feature_axis=feature_axis)
        self._decoder = Decoder(token_dim=token_dim, encoding_dim=encoding_dim, embedding_dim=embedding_dim, hidden_dim=hidden_dim, latent_dim=latent_dim, batch_dim=batch_dim, gate=gate, normalization=normalization, activation=activation, sequence_axis=sequence_axis, feature_axis=feature_axis)

    def call(self, x: tf.Tensor) -> tf.Tensor:
        """Encode the inputs and decode the resulting embeddings."""
        return self._decoder(self._encoder(x))

    def get_config(self) -> dict:
        """Return the base model configuration, completed with the init arguments of this model."""
        __config = super(AutoEncoder, self).get_config()
        __config.update(self._config)
        return __config

    @classmethod
    def from_config(cls, config) -> tf.keras.layers.Layer:
        """Create an autoencoder from the dictionary produced by `get_config`."""
        return cls(**config)

In [None]:
class TokunModel(AutoEncoder, huggingface_hub.KerasModelHubMixin):
    """Autoencoder combined with the Hugging Face Hub mixin for Keras models.

    The class adds no behavior of its own; presumably the mixin provides the
    helpers to exchange the model with the Hub (to be confirmed).
    """
    pass

In [None]:
# INIT ########################################################################

with DISTRIBUTION_STRATEGY.scope():
    # create a new model, unless a pretrained model was requested and is available locally
    if not (IMPORT and os.path.isfile(PATH_IMPORT)):
        MODEL = TokunModel(
            sequence_axis=SEQUENCE_AXIS,
            feature_axis=FEATURE_AXIS,
            token_dim=N_TOKEN_DIM,
            encoding_dim=N_ENCODING_DIM,
            embedding_dim=N_EMBEDDING_DIM,
            hidden_dim=N_HIDDEN_DIM,
            latent_dim=N_LATENT_DIM,
            batch_dim=None,
            gate=GATE,
            normalization=NORMALIZATION,
            activation=ACTIVATION)
    else:
        MODEL = tf.keras.models.load_model(PATH_IMPORT)
    # the model is compiled in both cases, with the same optimizer, loss and metrics
    MODEL.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=R_MAX),
        loss=tf.keras.losses.CategoricalCrossentropy(
            from_logits=False,
            label_smoothing=0.,
            axis=-1,
            reduction=tf.keras.losses.Reduction.SUM_OVER_BATCH_SIZE,
            name='loss'),
        metrics=['accuracy'])


## Preprocess

In [None]:
# MLQA ########################################################################

# each step is a pair (operation, replace)
# NOTE(review): the flag presumably tells whether the output replaces the samples or is added to them, confirm in mlable
PIPELINE = [
    # offset the text by each of the ticks, in characters => (B, 1) bytes
    (functools.partial(tokun.pipeline.offset, ticks=__t), False)
    for __t in OFFSET_TICKS] + [
    # encode => (B, G * S,) int
    (functools.partial(tokun.pipeline.encode, token_size=TOKEN_SIZES[-1], sample_size=N_SAMPLE), True),
    # reshape => (B * G * S,) int
    (functools.partial(tokun.pipeline.reshape, groups=N_TOKEN_DIM, expand=[], flatten=True), True),
    # one-hot encoding => (B * G * S, E) int (bool)
    (functools.partial(tf.one_hot, depth=N_ENCODING_DIM, axis=-1), True),
    # replace sample inputs with (input, target) for supervised learning
    ((lambda x: (x, x)), True)]

# split the pairs into the list of operations and the list of flags
OPERATIONS, REPLACE = zip(*PIPELINE)

# the same processing is applied to the 'context' feature of all the datasets
__process = functools.partial(mlable.data.process, feature='context', pipeline=OPERATIONS, replace=REPLACE)

MLQA_TRAIN = {__l: __process(dataset=__d) for __l, __d in MLQA_TRAIN.items()}
MLQA_TEST = {__l: __process(dataset=__d) for __l, __d in MLQA_TEST.items()}

In [None]:
# RANDOM ######################################################################

# each step is a pair (operation, replace), like for the MLQA datasets but without offsetting nor encoding
PIPELINE = [
    # reshape => (B * G * S,) int
    (functools.partial(tokun.pipeline.reshape, groups=N_TOKEN_DIM, expand=[], flatten=True), True),
    # one-hot encoding => (B * G * S, E) int (bool)
    (functools.partial(tf.one_hot, depth=N_ENCODING_DIM, axis=-1), True),
    # replace sample inputs with (input, target) for supervised learning
    ((lambda x: (x, x)), True)]

# split the pairs into the list of operations and the list of flags
OPERATIONS, REPLACE = zip(*PIPELINE)

# the random datasets are processed with an empty feature name
__process = functools.partial(mlable.data.process, feature='', pipeline=OPERATIONS, replace=REPLACE)

RANDOM_TRAIN = __process(dataset=RANDOM_TRAIN)
RANDOM_TEST = __process(dataset=RANDOM_TEST)

In [None]:
# TOGGLE ######################################################################

# languages, in the order in which their datasets are chained
__ORDER = ('ar', 'en', 'es', 'de', 'hi', 'vi', 'zh')

def __chain(datasets: dict):
    """Concatenate the datasets of all the languages, following `__ORDER`."""
    return functools.reduce(lambda __l, __r: __l.concatenate(__r), (datasets[__k] for __k in __ORDER))

# the MLQA datasets are only chained for training when the random dataset is not selected
DATASET_TRAIN = RANDOM_TRAIN if RANDOM else __chain(MLQA_TRAIN)
# NOTE(review): the training cell validates on RANDOM_TEST and does not read DATASET_TEST, confirm which one is intended
DATASET_TEST = __chain(MLQA_TEST)

## Train

In [None]:
# TRAIN #######################################################################

# learning rate schedule, computed for each epoch
lr_callback = tf.keras.callbacks.LearningRateScheduler(
    functools.partial(
        mlable.optimizers.learning_rate_hokusai,
        lr_min=R_MIN,
        lr_max=R_MAX,
        lr_exp=R_EXP,
        rampup=N_EPOCHS_RAMPUP,
        sustain=N_EPOCHS_SUSTAIN),
    verbose=True)
# save the whole model at the end of an epoch, when the validation loss improves
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    PATH_EXPORT,
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    save_weights_only=False,
    mode='auto',
    save_freq='epoch')
# logs for Tensorboard
tb_callback = tf.keras.callbacks.TensorBoard(log_dir=PATH_LOG)

if TRAINING:
    with DISTRIBUTION_STRATEGY.scope():
        # the datasets are batched here, hence no batch size is given to fit
        # NOTE(review): the validation runs on the random dataset, whatever the training dataset is
        TRAINING_HISTORY = MODEL.fit(
            x=DATASET_TRAIN.batch(N_BATCH).prefetch(tf.data.AUTOTUNE),
            batch_size=None,
            epochs=N_EPOCHS,
            validation_split=None,
            validation_data=RANDOM_TEST.batch(N_BATCH).prefetch(tf.data.AUTOTUNE),
            validation_freq=list(range(1, N_EPOCHS + 1)), # every epoch
            verbose=2,
            callbacks=[lr_callback, cp_callback, tb_callback])

In [None]:
# overview of the layers and of the number of parameters of the model
MODEL.summary()

## Export

In [None]:
# SAMPLES #####################################################################

# (inputs, outputs) of the model, for each language
IO = {}
# tokens and their embeddings, for each token length in bytes and then for each language
TOKENS = {__s: {} for __s in TOKEN_SIZES}
EMBEDDINGS = {__s: {} for __s in TOKEN_SIZES}

for __lang, __dataset in MLQA_TEST.items():
    # the samples are (input, target) pairs: keep the inputs of the first batch only
    __input = next(iter(__dataset.batch(N_BATCH)))[0]
    # compute predictions
    __output = MODEL(__input)
    # sample predictions (inputs, outputs)
    IO[__lang] = (__input, __output)

In [None]:
# RANDOM ######################################################################

# the samples are (input, target) pairs: keep the inputs of the first batch only
__input = next(iter(RANDOM_TEST.batch(N_BATCH)))[0]
# predictions
__output = MODEL(__input)
# sample predictions (inputs, outputs), saved under the key 'rd'
IO['rd'] = (__input, __output)

In [None]:
# TOKENS ######################################################################

# the loops go over TOKEN_SIZES rather than over the keys of TOKENS: another cell adds
# the key 'local' to TOKENS, on which `__size // 4` fails when this cell is run again

# unique (G ^ i)-tokens
for __lang, __sample in IO.items():
    # decode the inputs once for each language, the text does not depend on the token size
    __text = tokun.pipeline.postprocess(__sample[0])
    for __size in TOKEN_SIZES:
        # the sizes are in bytes, the chunks are measured in units of 4 bytes
        TOKENS[__size][__lang] = tokun.pipeline.chunk(seq=__text, size=__size // 4, repeats=False)

# unique tokens, for all languages
for __size in TOKEN_SIZES:
    TOKENS[__size]['all'] = list(set(__t for __s in TOKENS[__size].values() for __t in __s))

In [None]:
# EMBEDDINGS ##################################################################

for __depth, __size in enumerate(TOKEN_SIZES):
    for __lang, __tokens in TOKENS[__size].items():
        # re-encode without token repeats
        __input = tokun.pipeline.preprocess(text=''.join(__tokens), token_size=math.prod(N_TOKEN_DIM), expand=SEQUENCE_AXIS * [1])
        # UTF-32 embedding
        __embedding = MODEL._encoder._encoder.layers[0](__input)
        # iterative CNN tokenization, up to the depth matching the current token size
        for __i in range(__depth + 1):
            __embedding = MODEL._encoder._encoder.layers[__i + 1](__embedding)
        # flatten all the leading axes => (N, E)
        # squeezing without specifying the axes would also remove the token axis when a single token is left
        # and the slicing below would then cut the features instead of the tokens
        __embedding = tf.reshape(__embedding, shape=(-1, __embedding.shape[-1]))
        # remove the (tokenized) padding
        EMBEDDINGS[__size][__lang] = __embedding[:len(__tokens)]

In [None]:
# NEIGHBORHOODS ###############################################################

# the neighborhoods are computed for the largest tokens only
__unit = TOKEN_SIZES[-1]
# number of neighbors requested around each token
__count = 256

# the results are saved under a dedicated 'local' key, next to the token sizes
TOKENS['local'] = {'all': []}
EMBEDDINGS['local'] = {'all': []}

# NOTE the iteration includes the 'all' entry, when it has been computed
for __lang, __tokens in TOKENS[__unit].items():
    # stats on the embeddings for the current language
    __std = tf.math.reduce_std(EMBEDDINGS[__unit][__lang], axis=0, keepdims=True)
    # the radius is the mean deviation scaled by a factor that halves each time the token size is multiplied by 4
    __radius = 2. ** (3 - math.log(__unit, 4)) * tf.reduce_mean(__std).numpy()
    # choose a single token
    __t = tokun.pipeline.preprocess(text=random.choice(__tokens), token_size=math.prod(N_TOKEN_DIM), expand=SEQUENCE_AXIS * [1])
    # encode it
    __e = MODEL._encoder(__t)
    # add noise to generate random neighbors
    __n = tokun.evaluation.neighbors(point=__e, radius=__radius, count=__count)
    # decode the noisy embeddings
    __d = MODEL._decoder(__n)
    # postprocess: the decoded text is split back into tokens, keeping the repeats
    __m = tokun.pipeline.chunk(seq=tokun.pipeline.postprocess(__d), size=__unit // 4, repeats=True)
    # save
    TOKENS['local']['all'].extend(__m)
    # NOTE(review): assumes the squeezed neighbors have the shape (count, features) — TODO confirm
    EMBEDDINGS['local']['all'].append(tf.squeeze(__n))

# merge all the embedding tensors
EMBEDDINGS['local']['all'] = tf.concat(values=EMBEDDINGS['local']['all'], axis=0)

In [None]:
# SAVE ########################################################################

N_EXPORT = 8192 # maximum number of rows written in each file

# one set of files for each key of TOKENS: the token sizes and the 'local' neighborhoods
for __size in TOKENS:
    # truncate first, so that the labels are only computed for the tokens that are exported
    __exported = TOKENS[__size]['all'][:N_EXPORT]
    # tokens followed by their label
    mlable.data.write(data=[__c + ' ' + mlable.data.label(__c) for __c in __exported], path='./metadata.' + str(__size) + '.label.tsv', tsv=False)
    # tokens alone
    mlable.data.write(data=__exported, path='./metadata.' + str(__size) + '.tsv', tsv=False)
    # matching embeddings, in the same order
    mlable.data.write(data=EMBEDDINGS[__size]['all'].numpy()[:N_EXPORT], path='./embeddings.' + str(__size) + '.tsv', tsv=True)

## Dataviz

In [None]:
# DATA ########################################################################

# texts used to evaluate the model by hand: prose in Korean and three snippets of Python code
SAMPLES = [
    # Korean prose
    """위키백과, 우리 모두의 백과사전.\nt-분포 확률적 임베딩(t-SNE)은 데이터의 차원 축소에 사용되는 기계 학습 알고리즘 중 하나로, 2002년 샘 로이스Sam Rowise와 제프리 힌튼에 의해 개발되었다.[1] t-SNE는 비선형 차원 축소 기법으로, 고차원 데이터를 특히 2, 3차원 등으로 줄여 가시화하는데에 유용하게 사용된다. 구체적으로 t-SNE는 비슷한 데이터는 근접한 2, 3차원의 지점으로, 다른 데이터는 멀리 떨어진 지점으로 맵핑한다.""",
    # Python code, indented with 4 spaces
    """class Encoder(tf.keras.models.Model):\n    def __init__(self, depth: int, token_dim: int, encoding_dim: int, embedding_dim: int, latent_dim: int, batch_dim: int=None, attention: bool=False, **kwargs) -> None:\n        super(Encoder, self).__init__(**kwargs)\n        self._encoder = tf.keras.Sequential([\n            tf.keras.Input(shape=(encoding_dim,), batch_size=batch_dim, name='input'), # (B * G ^ D, U)\n            tf.keras.layers.Dense(units=embedding_dim, activation=None, use_bias=False, kernel_initializer='glorot_uniform', bias_initializer=None, name='embed-1'),] # (B * G ^ D, U) => (B * G ^ D, E)\n            + [tokun.layers.TokenizeBlock(left_axis=-2, right_axis=-1, token_dim=token_dim, latent_dim=latent_dim, attention=attention, name='tokenize' + (__i + 1) * '-4') for __i in range(depth)]) # (B * G ^ i, E) => (B * G ^ (i-1), E)\n\n    def call(self, x: tf.Tensor) -> tf.Tensor:\n        return self._encoder(x)\n""",
    # Python code, indented with 4 spaces
    """class AutoEncoder(tf.keras.models.Model):\n    def __init__(self, token_dim: int, encoding_dim: int, embedding_dim: int, latent_dim: int, batch_dim: int=None, **kwargs) -> None:\n        super(AutoEncoder, self).__init__(**kwargs)\n        self._encoder = Encoder(token_dim=token_dim, encoding_dim=encoding_dim, embedding_dim=embedding_dim, latent_dim=latent_dim, batch_dim=batch_dim)\n        self._decoder = Decoder(token_dim=token_dim, encoding_dim=encoding_dim, embedding_dim=embedding_dim, latent_dim=latent_dim, batch_dim=batch_dim)\n\n    def call(self, x: tf.Tensor) -> tf.Tensor:\n        return self._decoder(self._encoder(x))""",
    # same code as the previous sample, indented with 2 spaces
    """class AutoEncoder(tf.keras.models.Model):\n  def __init__(self, token_dim: int, encoding_dim: int, embedding_dim: int, latent_dim: int, batch_dim: int=None, **kwargs) -> None:\n    super(AutoEncoder, self).__init__(**kwargs)\n    self._encoder = Encoder(token_dim=token_dim, encoding_dim=encoding_dim, embedding_dim=embedding_dim, latent_dim=latent_dim, batch_dim=batch_dim)\n    self._decoder = Decoder(token_dim=token_dim, encoding_dim=encoding_dim, embedding_dim=embedding_dim, latent_dim=latent_dim, batch_dim=batch_dim)\n\n  def call(self, x: tf.Tensor) -> tf.Tensor:\n    return self._decoder(self._encoder(x))"""]

In [None]:
# COMPUTE ######################################################################

# index of the sample that is evaluated
__i = 0
# run the model on the selected sample; `__y` is the text printed as the output of the model in the next cell
# NOTE(review): `__x`, `__e`, `__p` are presumably the inputs, embeddings and predictions — confirm in tokun.pipeline.sample
__x, __e, __p, __y = tokun.pipeline.sample(model=MODEL, text=SAMPLES[__i], groups=N_TOKEN_DIM, expand=SEQUENCE_AXIS * [1], flatten=True)

In [None]:
# show the original text, the output of the model and the result of their comparison
print('# INPUT ################################################################\n\n' + SAMPLES[__i])
print('\n# OUTPUT ###############################################################\n\n' + __y)
print('\n# SCORE ################################################################\n\n' + str(tokun.evaluation.compare(SAMPLES[__i], __y)))

In [None]:
# FROM DATASET ################################################################

# compute: decode the inputs (left) and the predictions (right) saved for the german samples
__l = tokun.pipeline.postprocess(IO['de'][0])
__r = tokun.pipeline.postprocess(IO['de'][1])

# print both texts, followed by the result of their comparison
print(__l)
print(__r)
print(tokun.evaluation.compare(__l, __r))

In [None]:
# browse the training logs; '.logs' is the root of PATH_LOG
%tensorboard --logdir .logs

In [None]:
# mean and standard deviation of the embeddings of the largest english tokens, computed along the first axis
print(tf.math.reduce_mean(EMBEDDINGS[TOKEN_SIZES[-1]]['en'], axis=0))
print(tf.math.reduce_std(EMBEDDINGS[TOKEN_SIZES[-1]]['en'], axis=0))

In [None]:
# standard deviation of the embeddings of the largest english tokens, computed along the first axis
__std = tf.math.reduce_std(EMBEDDINGS[TOKEN_SIZES[-1]]['en'], axis=0)
# gaussian noise scaled by the mean deviation
# the shape follows the embeddings rather than a hard-coded 256, so that the sums below still work with another latent dimension
__noise = tf.random.normal(shape=__std.shape, mean=0., stddev=tf.math.reduce_mean(__std).numpy())

# embed a short text
__x = tokun.pipeline.preprocess(text='tokun to can tokens', token_size=math.prod(N_TOKEN_DIM), expand=SEQUENCE_AXIS * [1])
__e = MODEL._encoder(__x)

# decode the exact embeddings, then the embeddings shifted by a fraction of the deviation, then the embeddings with random noise
print(tokun.pipeline.postprocess(MODEL._decoder(__e)))
print(tokun.pipeline.postprocess(MODEL._decoder(__e + 0.4 * __std)))
print(tokun.pipeline.postprocess(MODEL._decoder(__e + 0.2 * __noise)))

In [None]:
# display the embeddings computed by the encoder for the inputs saved for the english samples
__x = IO['en'][0]
MODEL._encoder(__x)