## Import deps

In [None]:
# !pip install -qq tensorflow==2.18.0
# !pip install -qq tensorflow-tpu==2.18.0 --find-links=https://storage.googleapis.com/libtpu-tf-releases/index.html

In [None]:
!pip install -U datasets mlable tokun

In [None]:
# !pip install -qq --no-index -f '/content/libs/' tokun

In [None]:
import datetime
import functools
import itertools
import math
import os
import random
import urllib.request

import datasets as hd
import tensorflow as tf

import mlable.data
import mlable.maths.ops
import mlable.metrics
import mlable.sampling
import mlable.shaping.axes
import mlable.shaping.hilbert
import mlable.text

import tokun.data
import tokun.eval
import tokun.models.klvae
import tokun.models.vqvae
import tokun.pipeline.flat.preprocess
import tokun.pipeline.flat.postprocess
import tokun.pipeline.hilbert.preprocess
import tokun.pipeline.hilbert.postprocess
import tokun.pipeline.square.preprocess
import tokun.pipeline.square.postprocess

In [None]:
# report the version of the TF runtime actually loaded by the kernel
print("Tensorflow version ", tf.__version__, sep="")

## Setup the GPU / TPU

In [None]:
# DEBUGGING ####################################################################

# keep the full stack traces (Keras internal frames included) when an error is raised
tf.keras.config.disable_traceback_filtering()

In [None]:
# MIXED PRECISION ##############################################################

# global Keras dtype policy, applied to all the layers created after this call
tf.keras.mixed_precision.set_global_policy('mixed_bfloat16')

In [None]:
# DEVICES ######################################################################

# do not log the device on which each op is placed
tf.debugging.set_log_device_placement(False)

# logical devices visible to TF, one list per kind
CPU, GPU, TPU = (tf.config.list_logical_devices(__kind) for __kind in ('CPU', 'GPU', 'TPU'))

if TPU:
    # connect to the local TPU and initialize it before creating the strategy
    RESOLVER = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='local')
    tf.config.experimental_connect_to_cluster(RESOLVER)
    tf.tpu.experimental.initialize_tpu_system(RESOLVER)
    DISTRIBUTION_STRATEGY = tf.distribute.TPUStrategy(RESOLVER)
else:
    # mirror the variables on the GPUs when there are any, on the CPUs otherwise
    DISTRIBUTION_STRATEGY = tf.distribute.MirroredStrategy(GPU if GPU else CPU)

# summary of the hardware and of the selected strategy
print('CPU: ', CPU)
print('GPU: ', GPU)
print('TPU: ', TPU)
print('DS: ', DISTRIBUTION_STRATEGY)

## Mode

In [None]:
# TOGGLE #######################################################################

IMPORT = False # load the model saved at IO_CONFIG['path'] when the file exists (also divides the learning rate by 10)
DOWNLOAD = False # together with IMPORT: fetch the model file from IO_CONFIG['url'] first
TRAINING = True # run the training cell
RANDOM = False # NOTE(review): not read by any of the visible cells

DATA = 'flat' # 'flat' / 'hilbert' / 'square' => key of the preprocessing / postprocessing pipelines
ARCH = 'vqvae' # 'klvae' / 'vqvae' => key of the model class and of its configuration

## Defining The Metadata

In [None]:
# COMMON PARAMETERS ############################################################

# hyper-parameters shared by the model, pipeline and training configurations below
BASE_CONFIG = dict(
    batch_dim=128,
    token_dim=3, # T
    drop_dim=1, # number of leading bytes dropped from the encoding
    input_dim=256, # U_i (bytes)
    height_dim=64,
    width_dim=4 * 64,
    sample_dim=1024,
    order_num=5,
    rank_num=2,
    epochs=8,
    steps=1 << 13,
    epsilon=1e-6,
    dropout=0.01,
    trainable=True,
    bigendian=True,
    encoding='UTF-32-BE')

In [None]:
# MODEL PARAMETERS #############################################################

# model class for each architecture label
MODEL_FACTORY = dict(
    klvae=tokun.models.klvae.KlAutoEncoder,
    vqvae=tokun.models.vqvae.QuantizedAutoEncoder)

# constructor arguments for each architecture label
MODEL_CONFIG = dict(
    vqvae=dict(
        token_dim=BASE_CONFIG['token_dim'],
        input_dim=BASE_CONFIG['input_dim'],
        embed_dim=64,
        binary_dim=8,
        trainable=BASE_CONFIG['trainable']),
    klvae=dict(
        channel_dim=[128, 128],
        group_dim=32,
        head_dim=128,
        embed_dim=64,
        output_dim=8 * BASE_CONFIG['token_dim'],
        input_dim=BASE_CONFIG['input_dim'],
        layer_num=2,
        step_min=0,
        step_max=BASE_CONFIG['steps'],
        beta_min=0.0,
        beta_max=0.01,
        dropout_rate=BASE_CONFIG['dropout'],
        epsilon_rate=BASE_CONFIG['epsilon'],
        trainable=BASE_CONFIG['trainable']))

In [None]:
# DERIVED MODEL PARAMETERS #####################################################

# the version reads "<token_dim>x<embed_dim>" and the label is the architecture name
META_CONFIG = dict(
    version='{}x{}'.format(BASE_CONFIG['token_dim'], MODEL_CONFIG[ARCH]['embed_dim']),
    label=ARCH)

# remote location of the published model and local path of the model file
IO_CONFIG = dict(
    url='https://github.com/apehex/tokun/raw/main/models/{}.{}.keras'.format(*(META_CONFIG[__k] for __k in ('label', 'version'))),
    path='tokun.keras')

In [None]:
# PREPROCESSING ################################################################

# matches the ANSI escape sequences: ESC [ + digits / semicolons + one letter among mGKHF
ANSI_REGEX = r'\x1b\[[0-9;]*[mGKHF]'

# arguments of `tf.data.Dataset.batch`
BATCH_CONFIG = {
    'batch_size': BASE_CONFIG['batch_dim'],
    'drop_remainder': True,
    'num_parallel_calls': tf.data.AUTOTUNE,}

# factory of the preprocessing function, for each data layout (see the DATA toggle)
PIPELINE_FACTORY = {
    'flat': tokun.pipeline.flat.preprocess.factory,
    'hilbert': tokun.pipeline.hilbert.preprocess.factory,
    'square': tokun.pipeline.square.preprocess.factory,}

# arguments of the factories above, the `features` are added per dataset later on
PIPELINE_CONFIG = {
    'flat': {
        'batch_dim': BATCH_CONFIG['batch_size'],
        'token_dim': BASE_CONFIG['token_dim'],
        'drop_dim': BASE_CONFIG['drop_dim'],
        # the sample length is counted before the dropped bytes are removed
        'sample_dim': (BASE_CONFIG['token_dim'] + BASE_CONFIG['drop_dim']) * BASE_CONFIG['sample_dim'],
        'pattern': ANSI_REGEX,
        'rewrite': '',
        'separator': '\u001d',
        'encoding': BASE_CONFIG['encoding'],
        'bigendian': BASE_CONFIG['bigendian'],
        'targets': True,},
    'hilbert': {
        'batch_dim': BATCH_CONFIG['batch_size'],
        'token_dim': BASE_CONFIG['token_dim'],
        'order_num': BASE_CONFIG['order_num'],
        'rank_num': BASE_CONFIG['rank_num'],
        'pattern': ANSI_REGEX,
        'rewrite': '',
        'separator': '\u001d',
        'encoding': BASE_CONFIG['encoding'],
        'bigendian': BASE_CONFIG['bigendian'],
        'targets': True,},
    'square': {
        'batch_dim': BATCH_CONFIG['batch_size'],
        'token_dim': BASE_CONFIG['token_dim'],
        'drop_dim': BASE_CONFIG['drop_dim'],
        'height_dim': BASE_CONFIG['height_dim'],
        'width_dim': BASE_CONFIG['width_dim'],
        'pattern': ANSI_REGEX,
        'rewrite': '',
        'separator': '\u001d',
        'encoding': BASE_CONFIG['encoding'],
        'bigendian': BASE_CONFIG['bigendian'],
        'targets': True,},}

In [None]:
# POSTPROCESSING ###############################################################

# factory of the postprocessing function, for each data layout (see the DATA toggle)
POSTPROCESSING_FACTORY = {
    'flat': tokun.pipeline.flat.postprocess.factory,
    'hilbert': tokun.pipeline.hilbert.postprocess.factory,
    'square': tokun.pipeline.square.postprocess.factory,}

# arguments of the factories above, mirroring the preprocessing settings of the same layout
POSTPROCESSING_CONFIG = {
    'flat': {
        'drop_dim': PIPELINE_CONFIG['flat']['drop_dim'],
        'encoding': PIPELINE_CONFIG['flat']['encoding'],
        'bigendian': PIPELINE_CONFIG['flat']['bigendian'],
        'threshold': 0.0,
        'errors': 'replace',},
    'hilbert': {
        'order_num': PIPELINE_CONFIG['hilbert']['order_num'],
        'rank_num': PIPELINE_CONFIG['hilbert']['rank_num'],
        'encoding': PIPELINE_CONFIG['hilbert']['encoding'],
        'bigendian': PIPELINE_CONFIG['hilbert']['bigendian'],
        'threshold': 0.0,
        'errors': 'replace',},
    'square': {
        'drop_dim': PIPELINE_CONFIG['square']['drop_dim'],
        'encoding': PIPELINE_CONFIG['square']['encoding'],
        'bigendian': PIPELINE_CONFIG['square']['bigendian'],
        'threshold': 0.0,
        'errors': 'replace',},}

In [None]:
# RANDOM DATASET ###############################################################

# enough random samples to fill `steps` batches, each sample holding `sample_dim` tokens of `token_dim` bytes
RANDOM_CONFIG = dict(
    sample_count=BASE_CONFIG['steps'] * BATCH_CONFIG['batch_size'],
    sample_size=BASE_CONFIG['sample_dim'] * BASE_CONFIG['token_dim'])

In [None]:
# TRAINING PARAMETERS ##########################################################

# arguments of `MODEL.fit`
TRAINING_CONFIG = {
    'epochs': BASE_CONFIG['epochs'],
    'batch_size': None, # the datasets are batched beforehand
    'validation_split': None,
    # validate at the end of every epoch: derived from the epoch count so that both stay in sync
    'validation_freq': list(range(1, BASE_CONFIG['epochs'] + 1)),
    # 'class_weight': {__c: 1. if __c == 0 else 1. for __c in range(256)}, # there are 3 times more 0s than other bytes
    'verbose': 1,}

# arguments of the AdamW optimizer; the learning rate is divided by 10 when resuming from a saved model
OPTIMIZER_CONFIG = {
    'learning_rate': 0.001 * (0.1 if IMPORT else 1.0),
    'weight_decay': 0.00001,
    'beta_1': 0.9,
    'beta_2': 0.999,
    'epsilon': 1e-6,
    'clipnorm': 0.1,
    'amsgrad': False,
    'use_ema': False,
    'ema_momentum': 0.99,
    'ema_overwrite_frequency': 1024,}
    # 'gradient_accumulation_steps': 2,

# arguments of the cosine decay schedule, spanning the whole training (epochs * steps)
SCHEDULER_CONFIG = {
    'initial_learning_rate': OPTIMIZER_CONFIG['learning_rate'],
    'decay_steps': TRAINING_CONFIG['epochs'] * BASE_CONFIG['steps'],
    'alpha': 0.01,
    'name': 'cosine_lr',
    'warmup_target': None,
    'warmup_steps': 0,}

# arguments of the binary cross-entropy loss, computed on the raw logits
LOSS_CONFIG = {
    'from_logits': True,
    'label_smoothing': 0.0,
    'axis': -1,
    'reduction': 'sum_over_batch_size',
    'name': 'ce_loss',}

# arguments common to the accuracy metrics
METRICS_CONFIG = {
    'depth': 8,
    'from_logits': True,}

# arguments of the checkpoint callback: the whole model is saved after each epoch
CHECKPOINT_CONFIG = {
    'filepath': IO_CONFIG['path'],
    'monitor': 'val_loss',
    'mode': 'auto',
    'save_freq': 'epoch',
    'save_best_only': False,
    'save_weights_only': False,
    'verbose': 1,}

# arguments of the TensorBoard callback: one log directory per model version and per run (timestamp)
TENSORBOARD_CONFIG = {
    'log_dir': os.path.join('.logs/', META_CONFIG['version'], datetime.datetime.now().strftime("%Y%m%d-%H%M%S")),
    'histogram_freq': 1,
    'embeddings_freq': 1,
    'profile_batch': (0, 4),
    'write_graph': True,
    'write_images': True,}

In [None]:
# DATASETS #####################################################################

# Hugging Face datasets used for training, indexed by an arbitrary label:
# - `path`, `name` and `split` are forwarded to `datasets.load_dataset`
# - `features` lists the text columns given to the preprocessing pipeline
# comment / uncomment the entries to change the training mix
DATASETS_CONFIG = {
    # 'pt-fineweb-edu': {
    #     'path': 'HuggingFaceFW/fineweb-edu',
    #     'name': 'sample-10BT',
    #     'split': 'train',
    #     'features': ['text'],},
    # 'pt-fineweb-kor': {
    #     'path': 'HuggingFaceFW/fineweb-2',
    #     'name': 'kor_Hang',
    #     'split': 'train',
    #     'features': ['text'],},
    # 'pt-fineweb-fin': {
    #     'path': 'HuggingFaceFW/fineweb-2',
    #     'name': 'fin_Latn',
    #     'split': 'train',
    #     'features': ['text'],},
    # 'pt-wikipedia': {
    #     'path': 'wikimedia/wikipedia',
    #     'name': '20231101.en',
    #     'split': 'train',
    #     'features': ['text'],},
    # 'tp-wikipedia-1': {
    #     'path': 'wikimedia/wikipedia',
    #     'name': '20231101.en',
    #     'split': 'train',
    #     'features': ['text'],},
    # 'tp-wikipedia-2': {
    #     'path': 'wikimedia/wikipedia',
    #     'name': '20231101.en',
    #     'split': 'train',
    #     'features': ['text'],},
    # 'ft-retro-ascii-art': {
    #     'path': 'jdpressman/retro-ascii-art-v1',
    #     'name': None,
    #     'train': 'train',
    #     'split': 'train',
    #     'features': ['prompt', 'art_aic'],},
    # 'ft-stack-exchange': {
    #     'path': 'Alignment-Lab-AI/Stack-Exchange-April',
    #     'name': None,
    #     'split': 'train',
    #     'features': ['question', 'answer'],},
    # 'ft-math': {
    #     'path': 'HuggingFaceTB/finemath',
    #     'name': 'finemath-3plus',
    #     'split': 'train',
    #     'features': ['text'],},
    # 'cot-text-dolphin': {
    #     'path': 'cognitivecomputations/dolphin-r1',
    #     'name': 'reasoning-deepseek',
    #     'split': 'train',
    #     'features': ['reasoning', 'answer'],},
    # 'cot-text-openthoughts': {
    #     'path': 'open-thoughts/OpenThoughts-114k',
    #     'name': 'default',
    #     'split': 'train',
    #     'features': ['problem', 'solution'],},
    # 'ft-asciiart-asciiart': {
    #     'path': 'apehex/ascii-art',
    #     'name': 'asciiart',
    #     'split': 'train',
    #     'features': ['content'],},
    'ft-asciiart-copypasta': {
        'path': 'apehex/ascii-art',
        'name': 'copypasta',
        'split': 'train',
        'features': ['content'],},
    # 'ft-asciiart-graffiti': {
    #     'path': 'apehex/ascii-art',
    #     'name': 'graffiti',
    #     'split': 'train',
    #     'features': ['content'],},
    'ft-asciiart-images': {
        'path': 'apehex/ascii-art',
        'name': 'images',
        'split': 'train',
        'features': ['content'],},
    # 'ft-asciiart-datacompdr': {
    #     'path': 'apehex/ascii-art-datacompdr-12m',
    #     'name': 'default',
    #     'split': 'fixed',
    #     'features': ['content'],},
    # 'cot-math-numi': {
    #     'path': 'AI-MO/NuminaMath-CoT',
    #     'name': None,
    #     'split': 'train',
    #     'features': ['problem', 'solution'],},
}

## Downloading The Model Weights

In [None]:
# IMPORT #######################################################################

# fetch the published model file and write it at the local path, only when both toggles are on
if IMPORT and DOWNLOAD:
    urllib.request.urlretrieve(IO_CONFIG['url'], IO_CONFIG['path'])

## Downloading The Data

In [None]:
# DOWNLOAD #####################################################################

# load each configured dataset and expose it as a shuffled, unbatched `tf.data.Dataset`
DATASETS = {
    __name: hd.load_dataset(**{__k: __args[__k] for __k in ('path', 'name', 'split')}).to_tf_dataset(batch_size=None, shuffle=True)
    for __name, __args in DATASETS_CONFIG.items()}

In [None]:
# RANDOM #######################################################################

# training data: random bytes, sized by RANDOM_CONFIG
RANDOM_TRAIN = tokun.data.random_dataset_of_bytes(**RANDOM_CONFIG)
# testing data: same settings, except for the "count" argument(s) which are set to 128
RANDOM_TEST = tokun.data.random_dataset_of_bytes(**dict((__k, 128 if 'count' in __k else __v) for __k, __v in RANDOM_CONFIG.items()))

In [None]:
# STATS ########################################################################

# statistics on the text features of each dataset, computed with `count=2048`
STATS = {
    __n: mlable.data.stats(dataset=__d, features=DATASETS_CONFIG[__n]['features'], count=2048)
    for __n, __d in DATASETS.items()}

print(STATS)

In [None]:
# VIZ ##########################################################################

# __i = iter(DATASETS['ft-asciiart-datacompdr'])

In [None]:
# __s = next(__i)
# print(__s['caption'].numpy().decode('utf-8'), __s['labels'].numpy().decode('utf-8'), len(__s['content'].numpy().decode('utf-8')))
# print(__s['content'].numpy().decode('utf-8'))

## Preprocess

In [None]:
# ITERATE ######################################################################

# NB: the values of DATASETS are overwritten in place
# => the raw samples are no longer reachable from DATASETS and running this cell twice batches the data twice
for __name in list(DATASETS.keys()):
    # the preprocessing fn is specific to each dataset, because of the list of features
    __preprocess = PIPELINE_FACTORY[DATA](**PIPELINE_CONFIG[DATA], features=DATASETS_CONFIG[__name]['features'])
    # group the samples first, the preprocessing operates on whole batches
    DATASETS[__name] = DATASETS[__name].batch(**BATCH_CONFIG)
    DATASETS[__name] = DATASETS[__name].map(__preprocess, num_parallel_calls=tf.data.AUTOTUNE)

In [None]:
# POSTPROCESS ##################################################################

# default settings of the current data layout
__postprocess_greedy = POSTPROCESSING_FACTORY[DATA](**POSTPROCESSING_CONFIG[DATA])
# same settings, with the sampling arguments on top
__postprocess_sampler = POSTPROCESSING_FACTORY[DATA](**POSTPROCESSING_CONFIG[DATA], temp=1.0, topp=0.9, topk=4)
# same settings, with the threshold moved to 0.5
__postprocess_probs = POSTPROCESSING_FACTORY[DATA](**{**POSTPROCESSING_CONFIG[DATA], 'threshold': 0.5})

In [None]:
# RANDOM #######################################################################

def preprocess_bytes(inputs: tf.Tensor) -> tuple:
    """Turn a batch of raw bytes into a pair (inputs, targets) of float32 tensors.

    The inputs are the bytes with their last axis divided by `token_dim`.
    The targets are these same bytes expanded in base 2 with a depth of 8,
    the expansion being merged on the last axis.

    NOTE(review): the exact output shapes depend on the `mlable` helpers,
    which are not visible from this notebook.
    """
    # group the bytes token by token
    __tokens = mlable.shaping.axes.divide(inputs, axis=-1, factor=BASE_CONFIG['token_dim'], insert=True, right=True)
    # binary decomposition of each byte
    __bits = mlable.maths.ops.expand_base(__tokens, base=2, depth=8, bigendian=BASE_CONFIG['bigendian'])
    # merge the binary axis back
    __bits = mlable.shaping.axes.merge(__bits, axis=-1, right=False)
    return (tf.cast(__tokens, tf.float32), tf.cast(__bits, tf.float32))

# batch then preprocess the random bytes
# NB: the names are rebound to the processed datasets => running this cell twice batches the data twice
RANDOM_TRAIN = RANDOM_TRAIN.batch(**BATCH_CONFIG).map(preprocess_bytes, num_parallel_calls=tf.data.AUTOTUNE)
RANDOM_TEST = RANDOM_TEST.batch(**BATCH_CONFIG).map(preprocess_bytes, num_parallel_calls=tf.data.AUTOTUNE)

In [None]:
# CONCATENATE ##################################################################

DATASET_KEYS = set(DATASETS.keys()) - {'random'}

# chain the datasets in the insertion order of DATASETS, which is deterministic:
# iterating directly on the set of labels would shuffle the order from one kernel to the next
# (the hashes of strings are randomized) and change the batches selected by `take` / `skip` below
DATASET_ALL = functools.reduce(
    lambda __l, __r: __l.concatenate(__r),
    [DATASETS[__n] for __n in DATASETS if __n in DATASET_KEYS],
    RANDOM_TEST.concatenate(RANDOM_TRAIN)) # the random data always comes first
DATASET_DIM = DATASET_ALL.cardinality().numpy()

# the first 128 elements are held out for validation, the next `steps` elements are used for training
DATASET_TEST = DATASET_ALL.take(128)
DATASET_TRAIN = DATASET_ALL.skip(128).take(BASE_CONFIG['steps'])

In [None]:
# INSPECT ######################################################################

# first training batch, kept to build the model later on
__X, __T = next(iter(DATASET_TRAIN.take(1)))

# structure of the elements, for the final and the random datasets
print(DATASET_TRAIN.element_spec, DATASET_TEST.element_spec, sep='\n')
print(RANDOM_TRAIN.element_spec, RANDOM_TEST.element_spec, sep='\n')

# number of elements in each split
print('train: {:,}'.format(DATASET_TRAIN.cardinality().numpy()))
print('test:  {:,}'.format(DATASET_TEST.cardinality().numpy()))

In [None]:
# VIZ ##########################################################################

# __x, __y = next(iter(DATASETS['ft-asciiart-datacompdr']))
# __o = __postprocess_probs(__y).numpy().tolist()

In [None]:
# __i = 19
# print(b'\n'.join(__o[__i]).decode('utf-8'))

## Init The Model

In [None]:
# COMPILE ######################################################################

with DISTRIBUTION_STRATEGY.scope():
    # metrics, with groups of 1 byte and of `token_dim` bytes respectively
    byte_accuracy = mlable.metrics.BinaryGroupAccuracy(group=1, name='byte_accuracy', **METRICS_CONFIG)
    token_accuracy = mlable.metrics.BinaryGroupAccuracy(group=BASE_CONFIG['token_dim'], name='token_accuracy', **METRICS_CONFIG)
    # cosine LR: the constant rate is replaced by the schedule that starts from it
    OPTIMIZER_CONFIG['learning_rate'] = tf.keras.optimizers.schedules.CosineDecay(**SCHEDULER_CONFIG)
    # weights: restore the saved model when asked and available, start from scratch otherwise
    # (the fresh model is only instantiated when needed, instead of being built and thrown away)
    if IMPORT and os.path.isfile(IO_CONFIG['path']):
        MODEL = tf.keras.models.load_model(IO_CONFIG['path'], compile=False)
    else:
        MODEL = MODEL_FACTORY[ARCH](**MODEL_CONFIG[ARCH])
    # compile
    MODEL.compile(
        optimizer=tf.keras.optimizers.AdamW(**OPTIMIZER_CONFIG),
        loss=tf.keras.losses.BinaryCrossentropy(**LOSS_CONFIG),
        weighted_metrics=[byte_accuracy, token_accuracy])
    # build: run the model, the metrics and the loss once on a sample batch
    # (the targets stand in for the predictions in the last two calls)
    MODEL(__X, training=False)
    MODEL.compute_metrics(__X, __T, __T)
    MODEL.compute_loss(__X, __T, __T)

In [None]:
# INSPECT ######################################################################

# overview of the model built in the previous cell
MODEL.summary()

In [None]:
# sanity check on the sample batch: the loss reported by the model,
# next to the plain binary cross-entropy (on logits) averaged over all the positions
print(MODEL.compute_loss(__X, __T, MODEL(__X)))
tf.reduce_mean(tf.keras.losses.binary_crossentropy(__T, MODEL(__X), from_logits=True, axis=-1))

## Train

In [None]:
# TRAIN ########################################################################

if TRAINING:
    with DISTRIBUTION_STRATEGY.scope():
        # callbacks: model checkpoints, tensorboard logs and interruption on NaN losses
        cp_callback, tb_callback, tn_callback = (
            tf.keras.callbacks.ModelCheckpoint(**CHECKPOINT_CONFIG),
            tf.keras.callbacks.TensorBoard(**TENSORBOARD_CONFIG),
            tf.keras.callbacks.TerminateOnNaN())
        # fit the model, the data is prefetched while the previous batch is processed
        TRAINING_HISTORY = MODEL.fit(
            DATASET_TRAIN.prefetch(buffer_size=tf.data.AUTOTUNE),
            validation_data=DATASET_TEST.prefetch(buffer_size=tf.data.AUTOTUNE),
            callbacks=[cp_callback, tb_callback, tn_callback],
            **TRAINING_CONFIG)

## Dataviz 1D

In [None]:
# 1D SAMPLES ###################################################################

SAMPLES_1D = [
    """위키백과, 우리 모두의 백과사전.\nt-분포 확률적 임베딩(t-SNE)은 데이터의 차원 축소에 사용되는 기계 학습 알고리즘 중 하나로, 2002년 샘 로이스Sam Rowise와 제프리 힌튼에 의해 개발되었다.[1] t-SNE는 비선형 차원 축소 기법으로, 고차원 데이터를 특히 2, 3차원 등으로 줄여 가시화하는데에 유용하게 사용된다. 구체적으로 t-SNE는 비슷한 데이터는 근접한 2, 3차원의 지점으로, 다른 데이터는 멀리 떨어진 지점으로 맵핑한다.""",
    """class Encoder(tf.keras.models.Model):\n    def __init__(self, depth: int, token_dim: int, encoding_dim: int, embedding_dim: int, batch_dim: int=None, attention: bool=False, **kwargs) -> None:\n        super(Encoder, self).__init__(**kwargs)\n        self._encoder = tf.keras.Sequential([\n            tf.keras.Input(shape=(encoding_dim,), batch_size=batch_dim, name='input'), # (B * G ^ D, U)\n            tf.keras.layers.Dense(units=embedding_dim, activation=None, use_bias=False, kernel_initializer='glorot_uniform', bias_initializer=None, name='embed-1'),] # (B * G ^ D, U) => (B * G ^ D, E)\n            + [tokun.layers.TokenizeBlock(left_axis=-2, right_axis=-1, token_dim=token_dim, attention=attention, name='tokenize' + (__i + 1) * '-4') for __i in range(depth)]) # (B * G ^ i, E) => (B * G ^ (i-1), E)\n\n    def call(self, x: tf.Tensor) -> tf.Tensor:\n        return self._encoder(x)\n""",
    """Hilbert curve\n\nThe Hilbert curve (also known as the Hilbert space-filling curve) is a continuous fractal space-filling curve first described by the German mathematician David Hilbert in 1891,[1] as a variant of the space-filling Peano curves discovered by Giuseppe Peano in 1890.[2]\n\nBecause it is space-filling, its Hausdorff dimension is 2 (precisely, its image is the unit square, whose dimension is 2 in any definition of dimension; its graph is a compact set homeomorphic to the closed unit interval, with Hausdorff dimension 1).\n\nThe Hilbert curve is constructed as a limit of piecewise linear curves. The length of the {\\displaystyle n}th curve is {\\displaystyle \\textstyle 2^{n}-{1 \\over 2^{n}}}, i.e., the length grows exponentially with {\\displaystyle n}, even though each curve is contained in a square with area {\\displaystyle 1}.\n\nImages\n\nFirst six iterations of the Hilbert curve\n\nHilbert curve, first order\n\nHilbert curves, first and second orders\n\nHilbert curves, first to third orders\n\nProduction rules\n\nHilbert curve, construction color-coded\n\nA 3-D Hilbert curve with color showing progression\n\nVariant, first three iterations[3]\n\nApplications and mapping algorithms\n\nBoth the true Hilbert curve and its discrete approximations are useful because they give a mapping between 1D and 2D space that preserves locality fairly well.[4] This means that two data points which are close to each other in one-dimensional space are also close to each other after folding. The converse cannot always be true.\n\nBecause of this locality property, the Hilbert curve is widely used in computer science. For example, the range of IP addresses used by computers can be mapped into a picture using the Hilbert curve. 
Code to generate the image would map from 2D to 1D to find the color of each pixel, and the Hilbert curve is sometimes used because it keeps nearby IP addresses close to each other in the picture.[5] The locality property of the Hilbert curve has also been used to design algorithms for exploring regions with mobile robots[6][7] and indexing geospatial location data.[8]\n\nIn an algorithm called Riemersma dithering, grayscale photographs can be converted to a dithered black-and-white image using thresholding, with the leftover amount from each pixel added to the next pixel along the Hilbert curve. Code to do this would map from 1D to 2D, and the Hilbert curve is sometimes used because it does not create the distracting patterns that would be visible to the eye if the order were simply left to right across each row of pixels.[9] Hilbert curves in higher dimensions are an instance of a generalization of Gray codes, and are sometimes used for similar purposes, for similar reasons. For multidimensional databases, Hilbert order has been proposed to be used instead of Z order because it has better locality-preserving behavior. For example, Hilbert curves have been used to compress and accelerate R-tree indexes[10] (see Hilbert R-tree). 
They have also been used to help compress data warehouses.[11][12]\n\nThe linear distance of any point along the curve can be converted to coordinates in n dimensions for a given n, and vice versa, using any of several standard mathematical techniques such as Skilling\'s method.[13][14]\n\nIt is possible to implement Hilbert curves efficiently even when the data space does not form a square.[15] Moreover, there are several possible generalizations of Hilbert curves to higher dimensions.[16][17]\n\nRepresentation as Lindenmayer system\n\nThe Hilbert Curve can be expressed by a rewrite system (L-system).\n\nDuration: 52 seconds.0:52\nHilbert curve at its sixth iteration\nAlphabet : A, B\nConstants : F + −\nAxiom : A\nProduction rules:\nA → +BF−AFA−FB+\nB → −AF+BFB+FA−\nHere, "F" means "draw forward", "+" means "turn left 90°", "-" means "turn right 90°" (see turtle graphics), and "A" and "B" are ignored during drawing.\n\nOther implementations\n\nGraphics Gems II[18][promotion?] discusses Hilbert curve coherency, and provides implementation.\n\nThe Hilbert Curve is commonly used among rendering images or videos. Common programs such as Blender and Cinema 4D use the Hilbert Curve to trace the objects, and render the scene.[citation needed]\n\nThe slicer software used to convert 3D models into toolpaths for a 3D printer typically has the Hilbert curve as an option for an infill pattern.\n""",
    """Vícerozměrná náhodná proměnná nebo náhodný vektor je v teorii pravděpodobnosti a statistice seznam matematických proměnných, jehož žádná hodnota není známa, buď protože zatím nebyla pozorována, nebo protože její hodnotu neznáme přesně. Jednotlivé proměnné jsou sdružené v náhodném vektoru, protože tvoří části jednoho matematického systému – často reprezentují různé vlastnosti určité statistické jednotky. Pokud například chceme zachytit, že každá osoba má určitý věk, výšku a hmotnost, lze tyto vlastnosti blíže neurčené osoby z určité skupiny reprezentovat náhodným vektorem. Prvky náhodných vektorů jsou obvykle reálná čísla.""",]

In [None]:
# PREPROCESS ###################################################################

# arguments shared by all the 1D pipelines: batches of a single sample, with the targets
__flat_factory = functools.partial(PIPELINE_FACTORY['flat'], batch_dim=1, targets=True)

# UTF-8 with tokens of 1, 3 and 4 bytes
__preprocess_flat_utf8_1 = __flat_factory(token_dim=1, drop_dim=0, sample_dim=1024, features=[], encoding='UTF-8')
__preprocess_flat_utf8_3 = __flat_factory(token_dim=3, drop_dim=0, sample_dim=3 * 1024, features=[], encoding='UTF-8')
__preprocess_flat_utf8_4 = __flat_factory(token_dim=4, drop_dim=0, sample_dim=4 * 1024, features=[], encoding='UTF-8')

# UTF-32-BE with tokens of 3 bytes (1 byte dropped) and 4 bytes
__preprocess_flat_utf32_3 = __flat_factory(token_dim=3, drop_dim=1, sample_dim=4 * 1024, features=[], encoding='UTF-32-BE')
__preprocess_flat_utf32_4 = __flat_factory(token_dim=4, drop_dim=0, sample_dim=4 * 1024, features=[], encoding='UTF-32-BE')

In [None]:
# POSTPROCESS ##################################################################

# one partial factory per encoding scheme, holding the arguments that never change
__factory_utf8 = functools.partial(POSTPROCESSING_FACTORY['flat'], drop_dim=0, temp=1.0, encoding='UTF-8', bigendian=True, errors='replace')
__factory_utf32_3 = functools.partial(POSTPROCESSING_FACTORY['flat'], drop_dim=1, temp=1.0, encoding='UTF-32-BE', bigendian=True, errors='replace')
__factory_utf32_4 = functools.partial(POSTPROCESSING_FACTORY['flat'], drop_dim=0, temp=1.0, encoding='UTF-32-BE', bigendian=True, errors='replace')

# for each scheme: threshold at 0.5 ("probs"), threshold at 0 ("greedy") and sampling ("sampler")
__postprocess_probs_utf8 = __factory_utf8(threshold=0.5, topp=-1.0, topk=-1)
__postprocess_greedy_utf8 = __factory_utf8(threshold=0.0, topp=-1.0, topk=-1)
__postprocess_sampler_utf8 = __factory_utf8(threshold=0.0, topp=0.9, topk=4)

__postprocess_probs_utf32_3 = __factory_utf32_3(threshold=0.5, topp=-1.0, topk=-1)
__postprocess_greedy_utf32_3 = __factory_utf32_3(threshold=0.0, topp=-1.0, topk=-1)
__postprocess_sampler_utf32_3 = __factory_utf32_3(threshold=0.0, topp=0.9, topk=4)

__postprocess_probs_utf32_4 = __factory_utf32_4(threshold=0.5, topp=-1.0, topk=-1)
__postprocess_greedy_utf32_4 = __factory_utf32_4(threshold=0.0, topp=-1.0, topk=-1)
__postprocess_sampler_utf32_4 = __factory_utf32_4(threshold=0.0, topp=0.9, topk=4)

In [None]:
# COMPUTE ######################################################################

# index of the sample in SAMPLES_1D
__i = 0
# NOTE(review): the trailing comma wraps the tensor in a 1-tuple -- confirm that the pipeline expects it
__s = tf.cast([SAMPLES_1D[__i]], dtype=tf.string),
# inputs and targets of the model
__x, __t = __preprocess_flat_utf32_3(__s)
# predictions, requested as logits
__y = MODEL(__x, logits=True)

In [None]:
# decode the targets (threshold at 0.5) and the predictions (sampling) back to text
__o_t = mlable.text.unpack(__postprocess_probs_utf32_3(__t))
__o_p = mlable.text.unpack(__postprocess_sampler_utf32_3(__y))

In [None]:
# INPUT: the original sample
print('# INPUT #################################################################\n\n' + SAMPLES_1D[__i])
# OUTPUT: the text decoded from the predictions of the model
# (the decoded targets `__o_t` are only a round trip of the input through the pipeline)
print('\n# OUTPUT ################################################################\n\n' + __o_p[0])
# SCORE: comparison of the decoded targets with the decoded predictions
print('\n# SCORE #################################################################\n\n' + str(tokun.eval.compare(__o_t[0], __o_p[0])))

## Dataviz 2D

In [None]:
# 2D SAMPLES ###################################################################

SAMPLES_2D = [
    '''⠄⠄⠄⠄⠄⠄⠄⠄⠄⠄⣠⣤⣄⠄⠄⠄⠄⠄⠄⠄⠄⠄⠄⠄⠄⠄⠄⠄⠄⠄\r\n⠄⠄⠄⠄⠄⠄⠄⠄⠄⢸⣿⣿⡿⠄⠄⠄⠄⠄⢠⣾⣿⣿⡆⠄⠄⠄⠄⠄⠄⠄\r\n⠄⠄⠄⠄⡏⠒⢄⡀⠄⠄⢹⣄⣇⣀⣸⣷⣄⣦⠈⡿⣿⠟⠄⠄⠄⠄⠄⠄⠄⠄\r\n⠄⠄⠄⢠⠁⠄⠄⣙⡦⠒⠹⠄⠇⠴⢿⣿⣿⣿⣿⢷⣇⠄⠄⠄⠄⠄⠄⠄⠄⠄\r\n⠄⠄⠄⠘⣀⣠⡞⠁⠄⠄⠄⠄⠄⠄⠞⠛⠉⠉⠋⠄⠿⣧⠄⠄⢀⣀⣀⣀⣀⣀\r\n⠄⠄⣴⠟⠉⣿⣿⣷⡄⠄⠄⠄⠄⠄⠄⠄⠄⠄⠄⠄⠄⢨⠿⠉⠁⢀⣀⠴⠄⡸\r\n⠄⢰⣷⣶⣾⣿⣿⣿⢸⠄⠄⢀⣴⠞⢻⣿⣷⣦⡀⠄⠄⠄⠄⢠⠚⠁⠄⠄⡰⠁\r\n⠄⠄⡻⣿⣿⣿⠿⡡⠃⠄⠄⣼⣇⣰⣾⣿⣿⣿⢻⠄⠄⠄⢀⣀⣀⣀⡠⠊⠄⠄\r\n⢀⠎⠄⠄⠉⠉⠁⠄⠄⠄⠄⠸⣿⣿⣿⣿⡿⢋⠟⢠⣿⡇⠄⡼⠄⠄⠄⠄⠄⠄\r\n⡜⠄⠤⡀⠄⠄⠄⠄⠄⠄⠄⠄⠈⠛⠯⠭⠚⠁⠄⠄⠛⣣⣦⡇⠄⠄⠄⠄⠄⠄\r\n⢇⠄⠄⠋⠄⠄⠄⠄⢀⠄⠢⠄⠄⠄⠄⠄⣠⣾⣿⣧⠸⣿⣿⠃⠄⠄⠄⠄⠄⠄\r\n⠸⡀⠄⠄⠄⠄⠄⠄⠁⠄⠄⠄⠄⠄⠄⠄⠻⠿⣿⣯⡟⣾⣿⡇⠄⠄⠄⠄⠄⠄\r\n⠄⠙⢄⡀⠄⠄⠄⠄⠄⠄⠄⠄⢀⣠⠔⠒⠒⠋⣽⣿⣇⣿⣿⡇⠄⠄⠄⠄⠄⠄\r\n⠄⠄⠄⠉⠒⠤⢄⣀⣀⣠⠤⠔⠊⢹⠄⠄⠄⠄⠈⠙⠉⣿⣿⡷⠄⠄⠄⠄⠄⠄\r\n⠄⠄⠄⠄⠄⠄⠄⠄⠄⠄⠄⠄⠄⠸⠄⠄⠄⢰⣶⡄⢰⣿⣿⣷⠄⠄⠄⠄⠄⠄''',
    '''⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⣠⣴⠶⠶⣶⠶⠶⠶⠶⠶⠶⠶⠶⠶⢶⠶⠶⠶⠤⠤⠤⠤⣄⣀⣀⡀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀\r\n⠀⠀⠀⠀⠀⠀⠀⠀⣠⡾⠋⠀⠀⠊⠀⠀⠀⠀⠀⠀⠀⠀⠒⠒⠒⠀⠀⠀⠀⠤⢤⣤⣄⠉⠉⠛⠛⠷⣦⣀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀\r\n⠀⠀⠀⠀⠀⠀⠀⣰⠟⠀⠀⠀⠀⠀⠐⠋⢑⣤⣶⣶⣤⡢⡀⠀⠀⠀⠀⠀⠀⠀⠀⢀⣠⣄⡂⠀⠀⠶⢄⠙⢷⣤⠀⠀⠀⠀⠀⠀⠀⠀\r\n⠀⠀⠀⠀⠀⠀⣸⡿⠚⠉⡀⠀⠀⠀⠀⢰⣿⣿⣿⣿⣿⣿⡄⠀⠀⠀⢢⠀⠀⡀⣰⣿⣿⣿⣿⣦⡀⠀⠀⠡⡀⢹⡆⠀⠀⠀⠀⠀⠀⠀\r\n⠀⠀⠀⠀⢀⣴⠏⠀⣀⣀⣀⡤⢤⣄⣠⣿⣿⣿⣿⣻⣿⣿⣷⠀⢋⣾⠈⠙⣶⠒⢿⣿⣿⣿⣿⡿⠟⠃⠀⡀⠡⠼⣧⡀⠀⠀⠀⠀⠀⠀\r\n⠀⠀⢀⣴⣿⢃⡴⢊⢽⣶⣤⣀⠀⠊⠉⠉⡛⢿⣿⣿⣿⠿⠋⢀⡀⠁⠀⠀⢸⣁⣀⣉⣉⣉⡉⠀⠩⡡⠀⣩⣦⠀⠈⠻⣦⡀⠀⠀⠀⠀\r\n⠀⢠⡟⢡⠇⡞⢀⠆⠀⢻⣿⣿⣷⣄⠀⢀⠈⠂⠈⢁⡤⠚⡟⠉⠀⣀⣀⠀⠈⠳⣍⠓⢆⢀⡠⢀⣨⣴⣿⣿⡏⢀⡆⠀⢸⡇⠀⠀⠀⠀\r\n⠀⣾⠁⢸⠀⠀⢸⠀⠀⠀⠹⣿⣿⣿⣿⣶⣬⣦⣤⡈⠀⠀⠇⠀⠛⠉⣩⣤⣤⣤⣿⣤⣤⣴⣾⣿⣿⣿⣿⣿⣧⠞⠀⠀⢸⡇⠀⠀⠀⠀\r\n⠀⢹⣆⠸⠀⠀⢸⠀⠀⠀⠀⠘⢿⣿⣿⣿⣿⣿⣿⣟⣛⠛⠛⣛⡛⠛⠛⣛⣋⡉⠉⣡⠶⢾⣿⣿⣿⣿⣿⣿⡇⠀⠀⢀⣾⠃⠀⠀⠀⠀\r\n⠀⠀⠻⣆⡀⠀⠈⢂⠀⠀⠀⠠⡈⢻⣿⣿⣿⣿⡟⠁⠈⢧⡼⠉⠙⣆⡞⠁⠈⢹⣴⠃⠀⢸⣿⣿⣿⣿⣿⣿⠃⠀⡆⣾⠃⠀⠀⠀⠀⠀\r\n⠀⠀⠀⠈⢻⣇⠀⠀⠀⠀⠀⠀⢡⠀⠹⣿⣿⣿⣷⡀⠀⣸⡇⠀⠀⣿⠁⠀⠀⠘⣿⠀⠀⠘⣿⣿⣿⣿⣿⣿⠀⠀⣿⡇⠀⠀⠀⠀⠀⠀\r\n⠀⠀⠀⠀⠀⠹⣇⠀⠠⠀⠀⠀⠀⠡⠐⢬⡻⣿⣿⣿⣿⣿⣷⣶⣶⣿⣦⣤⣤⣤⣿⣦⣶⣿⣿⣿⣿⣿⣿⣿⠀⠀⣿⡇⠀⠀⠀⠀⠀⠀\r\n⠀⠀⠀⠀⠀⠀⠹⣧⡀⠡⡀⠀⠀⠀⠑⠄⠙⢎⠻⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣦⠀⢿⡇⠀⠀⠀⠀⠀⠀\r\n⠀⠀⠀⠀⠀⠀⠀⠈⠳⣤⡐⡄⠀⠀⠀⠈⠂⠀⠱⣌⠻⣿⣿⣿⣿⣿⣿⣿⠿⣿⠟⢻⡏⢻⣿⣿⣿⣿⣿⣿⣿⠀⢸⡇⠀⠀⠀⠀⠀⠀\r\n⠀⠀⠀⠀⠀⠀⠀⠀⠀⠈⠻⢮⣦⡀⠂⠀⢀⠀⠀⠈⠳⣈⠻⣿⣿⣿⡇⠘⡄⢸⠀⠀⣇⠀⣻⣿⣿⣿⣿⣿⡏⠀⠸⡇⠀⠀⠀⠀⠀⠀\r\n⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠉⠛⢶⣤⣄⡑⠄⠀⠀⠈⠑⠢⠙⠻⢷⣶⣵⣞⣑⣒⣋⣉⣁⣻⣿⠿⠟⠱⠃⡸⠀⣧⠀⠀⠀⠀⠀⠀\r\n⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠉⠛⠻⣷⣄⡀⠐⠢⣄⣀⡀⠀⠉⠉⠉⠉⠛⠙⠭⠭⠄⠒⠈⠀⠐⠁⢀⣿⠀⠀⠀⠀⠀⠀\r\n⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠉⠛⠷⢦⣤⣤⣀⣀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⣒⡠⠄⣠⡾⠃⠀⠀⠀⠀⠀⠀\r\n⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠉⠙⠛⠷⠶⣦⣤⣭⣤⣬⣭⣭⣴⠶⠛⠉⠀⠀⠀⠀⠀⠀⠀⠀''',
    '''⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⣀⣀⡀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀\r\n⠀⠀⠀⠀⠀⠀⠀⡠⠦⣄⠀⣠⠖⠋⠉⠉⠉⠛⢦⣀⡤⠒⠲⡀⠀⠀⠀⠀⠀⠀\r\n⠀⠀⠀⠀⠀⢠⠎⠀⠀⠈⣿⡅⢉⣦⣀⣀⡀⡂⢰⣿⡄⠀⠀⠙⢦⡀⠀⠀⠀⠀\r\n⠀⠀⣀⡠⠞⠁⠀⠀⠀⠀⢷⠘⠛⠃⠙⠛⠃⠙⠓⢁⡇⠀⠀⡀⠀⠈⠑⠦⢄⠀\r\n⠀⠛⠤⠤⣔⢁⡴⠧⠚⠉⠙⢦⡀⠀⠀⠀⠀⠀⣀⠞⠉⠁⠑⠚⠢⠤⠕⠚⠁⠀\r\n⠀⠀⠀⠀⠈⠁⠀⠀⠀⠀⠀⠀  ⠙⠓⠒⠒⠚⠛⠁⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀''',
    '''⣀⣠⣤⣤⣤⣤⢤⣤⣄⣀⣀⣀⣀⡀⡀⠄⠄⠄⠄⠄⠄⠄⠄⠄⠄⠄\r\n⠄⠉⠹⣾⣿⣛⣿⣿⣞⣿⣛⣺⣻⢾⣾⣿⣿⣿⣶⣶⣶⣄⡀⠄⠄⠄\r\n⠄⠄⠠⣿⣷⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣯⣿⣿⣿⣿⣿⣿⣆⠄⠄\r\n⠄⠄⠘⠛⠛⠛⠛⠋⠿⣷⣿⣿⡿⣿⢿⠟⠟⠟⠻⠻⣿⣿⣿⣿⡀⠄\r\n⠄⢀⠄⠄⠄⠄⠄⠄⠄⠄⢛⣿⣁⠄⠄⠒⠂⠄⠄⣀⣰⣿⣿⣿⣿⡀\r\n⠄⠉⠛⠺⢶⣷⡶⠃⠄⠄⠨⣿⣿⡇⠄⡺⣾⣾⣾⣿⣿⣿⣿⣽⣿⣿\r\n⠄⠄⠄⠄⠄⠛⠁⠄⠄⠄⢀⣿⣿⣧⡀⠄⠹⣿⣿⣿⣿⣿⡿⣿⣻⣿\r\n⠄⠄⠄⠄⠄⠄⠄⠄⠄⠄⠄⠉⠛⠟⠇⢀⢰⣿⣿⣿⣏⠉⢿⣽⢿⡏\r\n⠄⠄⠄⠄⠄⠄⠄⠄⠄⠄⠄⠠⠤⣤⣴⣾⣿⣿⣾⣿⣿⣦⠄⢹⡿⠄\r\n⠄⠄⠄⠄⠄⠄⠄⠄⠒⣳⣶⣤⣤⣄⣀⣀⡈⣀⢁⢁⢁⣈⣄⢐⠃⠄\r\n⠄⠄⠄⠄⠄⠄⠄⠄⠄⣰⣿⣛⣻⡿⣿⣿⣿⣿⣿⣿⣿⣿⣿⡯⠄⠄\r\n⠄⠄⠄⠄⠄⠄⠄⠄⠄⣬⣽⣿⣻⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠁⠄⠄\r\n⠄⠄⠄⠄⠄⠄⠄⠄⠄⢘⣿⣿⣻⣛⣿⡿⣟⣻⣿⣿⣿⣿⡟⠄⠄⠄\r\n⠄⠄⠄⠄⠄⠄⠄⠄⠄⠄⠄⠛⢛⢿⣿⣿⣿⣿⣿⣿⣷⡿⠁⠄⠄⠄\r\n⠄⠄⠄⠄⠄⠄⠄⠄⠄⠄⠄⠄⠄⠄⠄⠉⠉⠉⠉⠈⠄⠄⠄⠄⠄⠄''',
    '''⣿⣿⣿⣿⣿⣿⣿⣿⣿⠿⠟⠛⠻⠿⣿⣿⣿⣿⣿⠿⠿⠿⢿⣿⣿⣿⣿⣿⣿⣿\r\n⣿⣿⣿⣿⣿⣿⠟⠉⠄⠄⠄⠄⠄⠄⠄⠉⢟⠉⠄⠄⠄⠄⠄⠈⢻⣿⣿⣿⣿⣿\r\n⣿⣿⣿⣿⡿⠃⠄⠄⠤⠐⠉⠉⠉⠉⠉⠒⠬⡣⠤⠤⠄⠄⠄⠤⠤⠿⣿⣿⣿⣿\r\n⣿⣿⣿⣿⠁⠄⠄⠄⠄⠄⠄⠠⢀⡒⠤⠭⠅⠚⣓⡆⡆⣔⡙⠓⠚⠛⠄⣹⠿⣿\r\n⣿⠟⠁⡌⠄⠄⠄⢀⠤⠬⠐⣈⠠⡤⠤⠤⣤⠤⢄⡉⢁⣀⣠⣤⣤⣀⣐⡖⢦⣽\r\n⠏⠄⠄⠄⠄⠄⠄⠄⠐⠄⡿⠛⠯⠍⠭⣉⣉⠉⠍⢀⢀⡀⠉⠉⠉⠒⠒⠂⠄⣻\r\n⠄⠄⠄⠄⠄⠄⠄⠄⠄⠄⠄⠩⠵⠒⠒⠲⢒⡢⡉⠁⢐⡀⠬⠍⠁⢉⣉⣴⣿⣿\r\n⠄⠄⠄⠄⠄⠄⠄⠄⠄⠄⠄⠄⠉⢉⣒⡉⠁⠁⠄⠄⠉⠂⠙⣉⣁⣀⣙⡿⣿⣿\r\n⠄⠄⠄⠄⠄⠄⠄⠄⢠⠄⡖⢉⠥⢤⠐⢲⠒⢲⠒⢲⠒⠲⡒⠒⡖⢲⠂⠄⢀⣿\r\n⠄⠄⠄⠄⠄⠄⠄⠄⠈⢆⡑⢄⠳⢾⠒⢺⠒⢺⠒⠚⡖⠄⡏⠉⣞⠞⠁⣠⣾⣿\r\n⠄⠄⠄⠄⠄⠄⢆⠄⠄⠄⠈⠢⠉⠢⠍⣘⣒⣚⣒⣚⣒⣒⣉⠡⠤⣔⣾⣿⣿⣿\r\n⠷⣤⠄⣀⠄⠄⠄⠈⠁⠄⠄⠄⠄⠄⠄⠄⠄⠄⠄⠄⠄⠄⢀⣤⣾⣿⣿⣿⣿⣿\r\n⠄⠄⠉⠐⠢⠭⠄⢀⣒⣒⡒⠄⠄⠄⠄⠄⠄⣀⡠⠶⢶⣿⣿⣿⣿⣿⣿⣿⣿⣿\r\n⠄⠄⠄⠄⠄⠄⠄⠄⠄⠄⠄⠄⠈⠁⠈⠄⠄⠄⠄⠄⠄⠈⠻⣿⣿⣿⣿⣿⣿⣿''',]

In [None]:
# UNPACK #######################################################################

def unpack(data: tf.Tensor) -> list:
    """Decode a tensor of byte strings into a list of texts, one per item on the first axis.

    Each item is a sequence of byte strings (the rows): they are joined with
    newlines and the result is decoded as UTF-8.
    """
    __texts = []
    for __rows in data.numpy().tolist():
        # one line of text per row
        __texts.append(b'\n'.join(__rows).decode('utf-8'))
    return __texts

In [None]:
# PREPROCESS ###################################################################

# arguments shared by all the 2D pipelines: batches of a single sample with 64 rows, with the targets
__square_factory = functools.partial(PIPELINE_FACTORY['square'], batch_dim=1, height_dim=64, targets=True)

# UTF-8, byte by byte
__preprocess_square_utf8 = __square_factory(token_dim=1, drop_dim=0, width_dim=64, features=[], encoding='UTF-8')

# UTF-32-BE with tokens of 3 bytes (1 byte dropped) and 4 bytes
__preprocess_square_utf32_3 = __square_factory(token_dim=3, drop_dim=1, width_dim=4 * 64, features=[], encoding='UTF-32-BE')
__preprocess_square_utf32_4 = __square_factory(token_dim=4, drop_dim=0, width_dim=4 * 64, features=[], encoding='UTF-32-BE')

In [None]:
# COMPUTE ######################################################################

# index of the sample in SAMPLES_2D
__i = 0
# NOTE(review): the trailing comma wraps the tensor in a 1-tuple -- confirm that the pipeline expects it
__s = tf.cast([SAMPLES_2D[__i]], dtype=tf.string),
# inputs and targets of the model
__x, __t = __preprocess_square_utf32_3(__s)
# predictions, requested as logits
__y = MODEL(__x, logits=True)

In [None]:
# decode the targets (threshold at 0.5) and the predictions (sampling), then join the rows of each sample
__o_t = unpack(__postprocess_probs_utf32_3(__t))
__o_p = unpack(__postprocess_sampler_utf32_3(__y))

In [None]:
# INPUT: the original sample
print('# INPUT #################################################################\n\n' + SAMPLES_2D[__i])
# OUTPUT: the text decoded from the predictions of the model
# (the decoded targets `__o_t` are only a round trip of the input through the pipeline)
print('\n# OUTPUT ################################################################\n\n' + __o_p[0])
# SCORE: comparison of the decoded targets with the decoded predictions
print('\n# SCORE #################################################################\n\n' + str(tokun.eval.compare(__o_t[0], __o_p[0])))

## Dataviz Embeddings

In [None]:
# SAMPLE #######################################################################

__i = 0

# 2D samples and pipeline for the square layout, 1D samples and pipeline otherwise
__SAMPLES, __PREPROCESS = (SAMPLES_2D, __preprocess_square_utf32_3) if 'square' in DATA else (SAMPLES_1D, __preprocess_flat_utf32_3)

# NB: the trailing comma is kept from the original code, the string tensor is wrapped in a 1-tuple
__S = tf.cast([__SAMPLES[__i]], dtype=tf.string),
__X, __T = __PREPROCESS(__S)

In [None]:
# EMBEDDINGS ###################################################################

if 'vq' in ARCH:
    # latent codes and outputs of the model
    __Z = MODEL.encode(__X)
    __Y = MODEL(__X)
    # mean then standard deviation of the embeddings of the first encoder layer, along the first axis
    for __stat in (tf.math.reduce_mean, tf.math.reduce_std):
        print(__stat(MODEL._encoder._layers[0].embeddings, axis=0))
elif 'kl' in ARCH:
    # the encoder returns a pair of tensors
    __M, __V = MODEL.encode(__X)
    __Y = MODEL(__X)
    # average of each tensor and of the KL term computed from them
    print(*(tf.math.reduce_mean(__t) for __t in (__M, __V, MODEL.compute_kl(__M, __V))), sep='\n')

In [None]:
# load the notebook extension that provides the %tensorboard magic
%load_ext tensorboard

In [None]:
# browse the logs written under `.logs`, the root of the TensorBoard log directories configured above
%tensorboard --logdir .logs

In [None]:
# save the whole model at the configured path, the same file used by the checkpoints and by the import
MODEL.save(IO_CONFIG['path'], overwrite=True, zipped=True)

In [None]:
# the values of DATASETS were replaced by batches of (inputs, targets) tensors in the preprocessing section
# => load the raw dataset again (the files are already on disk) to iterate over the original text samples
__I = iter(hd.load_dataset(
    path=DATASETS_CONFIG['ft-asciiart-copypasta']['path'],
    name=DATASETS_CONFIG['ft-asciiart-copypasta']['name'],
    split=DATASETS_CONFIG['ft-asciiart-copypasta']['split']).to_tf_dataset(shuffle=True, batch_size=None))

In [None]:
# show 4 samples: first with the escape sequences visible (repr), then rendered
# the iterator is expected to yield unbatched samples, with a scalar string under the key 'content'
for _ in range(4):
    __S = next(__I)
    __O = __S['content'].numpy().decode('utf-8')
    print(repr(__O))
    print(__O)

In [None]:
# render the last 2D sample as-is
print(SAMPLES_2D[-1])