## Import deps

In [None]:
!pip install -U mlable tokun

Collecting mlable
  Downloading mlable-0.7.4-py3-none-any.whl.metadata (4.7 kB)
Collecting tokun
  Downloading tokun-0.10.10-py3-none-any.whl.metadata (7.7 kB)
Downloading mlable-0.7.4-py3-none-any.whl (20 kB)
Downloading tokun-0.10.10-py3-none-any.whl (11 kB)
Installing collected packages: mlable, tokun
Successfully installed mlable-0.7.4 tokun-0.10.10


In [None]:
import datetime
import functools
import itertools
import math
import os
import random
import urllib.request

import tensorflow as tf
import tensorflow_datasets as tfds

import mlable.data
import mlable.metrics
import mlable.ops

import tokun.data
import tokun.evaluation
import tokun.meta
import tokun.model
import tokun.pipeline

In [None]:
# report the version of Tensorflow available in the runtime
print("Tensorflow version ", tf.__version__, sep="")

Tensorflow version 2.15.0


## Setup the GPU / TPU

In [None]:
# MIXED PRECISION #############################################################

# global Keras dtype policy, set before the model is loaded
# the cells below cast the model outputs from bfloat16 back to float32 before using them
tf.keras.mixed_precision.set_global_policy('mixed_bfloat16')

In [None]:
# DEVICES #####################################################################

# do not log the device on which each operation is placed
tf.debugging.set_log_device_placement(False)

# logical devices available to the runtime, for each type of hardware
CPU, GPU, TPU = (tf.config.list_logical_devices(__kind) for __kind in ('CPU', 'GPU', 'TPU'))

# pick the strategy by order of preference: TPU, then GPU and finally CPU
if not (TPU or GPU):
    DISTRIBUTION_STRATEGY = tf.distribute.MirroredStrategy(CPU)
elif not TPU:
    DISTRIBUTION_STRATEGY = tf.distribute.MirroredStrategy(GPU)
else:
    # connect to the TPU cluster and initialize it before building the strategy
    RESOLVER = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(RESOLVER)
    tf.tpu.experimental.initialize_tpu_system(RESOLVER)
    DISTRIBUTION_STRATEGY = tf.distribute.TPUStrategy(RESOLVER)

print(DISTRIBUTION_STRATEGY)

<tensorflow.python.distribute.tpu_strategy.TPUStrategyV2 object at 0x7dae3da95ed0>


## Mode

In [None]:
# TOGGLE ######################################################################

# output mode of the model: binary (True) or categorical (False)
# it drives the output dimension, the encoding of the targets, the loss and the accuracy metrics below
BINARY = True

## Defining The Metadata

In [None]:
# MODEL PARAMETERS ############################################################

# N_SEQUENCE_AXIS is given to `tokun.meta.version` and sets the length of the `expand` argument of the preprocessing
N_SEQUENCE_AXIS = 1
# not referenced by the other cells of this notebook
N_FEATURE_AXIS = -1

N_TOKEN_DIM = [4, 4, 4] # G, for each block
N_INPUT_DIM = 256 # U_i (bytes)
N_OUTPUT_DIM = 8 if BINARY else 256 # U_o: 8 (bits) in binary mode, 256 otherwise
N_EMBEDDING_DIM = 256 # E

# name of the output mode, not referenced by the other cells of this notebook
OUTPUT = 'binary' if BINARY else 'categorical'

In [None]:
# TRAINING PARAMETERS #########################################################

N_BATCH_DIM = 128 # number of samples per batch
# the random datasets are built with N_SAMPLE_DIM // 4 elements per sample, ie 4 bytes for each of the 512 characters
N_SAMPLE_DIM = 4 * 512 # number of bytes per sample

In [None]:
# DERIVED #####################################################################

# number of characters in a full token, with 4 bytes per character
N_TOKEN_FACTOR = math.prod(N_TOKEN_DIM) // 4
# cumulated size of the tokens after each block, in bytes: the chunking below divides these sizes by 4 to count characters
N_TOKEN_SIZES = list(itertools.accumulate(N_TOKEN_DIM, lambda x, y: x * y))
# powers of 2 strictly below the token length, in characters
# log2 is more accurate than log(x, 2), whose float quotient is then truncated by int()
N_OFFSET_TICKS = [2 ** __i for __i in range(int(math.log2(N_TOKEN_FACTOR)))]

# identifiers of the model matching the parameters above, used to build the download URL
VERSION = tokun.meta.version(token_dim=N_TOKEN_DIM, sequence_axis=N_SEQUENCE_AXIS, input_dim=N_INPUT_DIM, embed_dim=N_EMBEDDING_DIM, output_dim=N_OUTPUT_DIM)
# timestamp of the current run
DATETIME = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

In [None]:
# IMPORT ######################################################################

# local destination of the downloaded weights
PATH_IMPORT = 'model.keras'
# the remote path is made of the 3 components of VERSION followed by the label of the training run
LABEL = '6.4'
URL_IMPORT = 'https://github.com/apehex/tokun/raw/main/models/{}/{}/{}/{}.keras'.format(*VERSION, LABEL)

# download the pretrained model, overwriting any previous copy
urllib.request.urlretrieve(url=URL_IMPORT, filename=PATH_IMPORT)

('model.keras', <http.client.HTTPMessage at 0x7dad160538b0>)

## Loading The Data

In [None]:
# RANDOM DATASET ##############################################################

# both splits share the same settings and only differ by their number of samples
# NOTE(review): the planes presumably bound the range of the random codepoints, confirm against tokun.data.random_dataset
__random_dataset = functools.partial(tokun.data.random_dataset, sample_size=N_SAMPLE_DIM // 4, lower_plane=0, upper_plane=0x40000, binary=False)

RANDOM_TRAIN = __random_dataset(size=N_BATCH_DIM * 2**14)
RANDOM_TEST = __random_dataset(size=N_BATCH_DIM * 2**8)

In [None]:
# MLQA DATASET ################################################################

LANG = ['ar', 'de', 'en', 'es', 'hi', 'vi', 'zh']

def _load_mlqa(split: str) -> dict:
    """Load the given split of MLQA for each language in LANG, as a dict indexed by language code."""
    return {
        __l: tfds.load('mlqa/' + __l, split=split, as_supervised=False, shuffle_files=True, data_dir='~/.cache/tensorflow/', batch_size=None)
        for __l in LANG}

# the 'test' split is used for training and the 'validation' split for testing
MLQA_TRAIN = _load_mlqa(split='test')
MLQA_TEST = _load_mlqa(split='validation')

Downloading and preparing dataset 72.21 MiB (download: 72.21 MiB, generated: 9.27 MiB, total: 81.49 MiB) to /root/.cache/tensorflow/mlqa/ar/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/2 [00:00<?, ? splits/s]

Generating test examples...:   0%|          | 0/5335 [00:00<?, ? examples/s]

Shuffling /root/.cache/tensorflow/mlqa/ar/incomplete.7XTSWD_1.0.0/mlqa-test.tfrecord*...:   0%|          | 0/5…

Generating validation examples...:   0%|          | 0/517 [00:00<?, ? examples/s]

Shuffling /root/.cache/tensorflow/mlqa/ar/incomplete.7XTSWD_1.0.0/mlqa-validation.tfrecord*...:   0%|         …

Dataset mlqa downloaded and prepared to /root/.cache/tensorflow/mlqa/ar/1.0.0. Subsequent calls will reuse this data.
Downloading and preparing dataset 72.21 MiB (download: 72.21 MiB, generated: 5.06 MiB, total: 77.28 MiB) to /root/.cache/tensorflow/mlqa/de/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/2 [00:00<?, ? splits/s]

Generating test examples...:   0%|          | 0/4517 [00:00<?, ? examples/s]

Shuffling /root/.cache/tensorflow/mlqa/de/incomplete.3ZBLJ4_1.0.0/mlqa-test.tfrecord*...:   0%|          | 0/4…

Generating validation examples...:   0%|          | 0/512 [00:00<?, ? examples/s]

Shuffling /root/.cache/tensorflow/mlqa/de/incomplete.3ZBLJ4_1.0.0/mlqa-validation.tfrecord*...:   0%|         …

Dataset mlqa downloaded and prepared to /root/.cache/tensorflow/mlqa/de/1.0.0. Subsequent calls will reuse this data.
Downloading and preparing dataset 72.21 MiB (download: 72.21 MiB, generated: 15.72 MiB, total: 87.94 MiB) to /root/.cache/tensorflow/mlqa/en/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/2 [00:00<?, ? splits/s]

Generating test examples...:   0%|          | 0/11590 [00:00<?, ? examples/s]

Shuffling /root/.cache/tensorflow/mlqa/en/incomplete.VHPA1T_1.0.0/mlqa-test.tfrecord*...:   0%|          | 0/1…

Generating validation examples...:   0%|          | 0/1148 [00:00<?, ? examples/s]

Shuffling /root/.cache/tensorflow/mlqa/en/incomplete.VHPA1T_1.0.0/mlqa-validation.tfrecord*...:   0%|         …

Dataset mlqa downloaded and prepared to /root/.cache/tensorflow/mlqa/en/1.0.0. Subsequent calls will reuse this data.
Downloading and preparing dataset 72.21 MiB (download: 72.21 MiB, generated: 5.09 MiB, total: 77.30 MiB) to /root/.cache/tensorflow/mlqa/es/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/2 [00:00<?, ? splits/s]

Generating test examples...:   0%|          | 0/5253 [00:00<?, ? examples/s]

Shuffling /root/.cache/tensorflow/mlqa/es/incomplete.M7IYED_1.0.0/mlqa-test.tfrecord*...:   0%|          | 0/5…

Generating validation examples...:   0%|          | 0/500 [00:00<?, ? examples/s]

Shuffling /root/.cache/tensorflow/mlqa/es/incomplete.M7IYED_1.0.0/mlqa-validation.tfrecord*...:   0%|         …

Dataset mlqa downloaded and prepared to /root/.cache/tensorflow/mlqa/es/1.0.0. Subsequent calls will reuse this data.
Downloading and preparing dataset 72.21 MiB (download: 72.21 MiB, generated: 12.83 MiB, total: 85.04 MiB) to /root/.cache/tensorflow/mlqa/hi/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/2 [00:00<?, ? splits/s]

Generating test examples...:   0%|          | 0/4918 [00:00<?, ? examples/s]

Shuffling /root/.cache/tensorflow/mlqa/hi/incomplete.3RBHUV_1.0.0/mlqa-test.tfrecord*...:   0%|          | 0/4…

Generating validation examples...:   0%|          | 0/507 [00:00<?, ? examples/s]

Shuffling /root/.cache/tensorflow/mlqa/hi/incomplete.3RBHUV_1.0.0/mlqa-validation.tfrecord*...:   0%|         …

Dataset mlqa downloaded and prepared to /root/.cache/tensorflow/mlqa/hi/1.0.0. Subsequent calls will reuse this data.
Downloading and preparing dataset 72.21 MiB (download: 72.21 MiB, generated: 8.77 MiB, total: 80.98 MiB) to /root/.cache/tensorflow/mlqa/vi/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/2 [00:00<?, ? splits/s]

Generating test examples...:   0%|          | 0/5495 [00:00<?, ? examples/s]

Shuffling /root/.cache/tensorflow/mlqa/vi/incomplete.Y3LRUH_1.0.0/mlqa-test.tfrecord*...:   0%|          | 0/5…

Generating validation examples...:   0%|          | 0/511 [00:00<?, ? examples/s]

Shuffling /root/.cache/tensorflow/mlqa/vi/incomplete.Y3LRUH_1.0.0/mlqa-validation.tfrecord*...:   0%|         …

Dataset mlqa downloaded and prepared to /root/.cache/tensorflow/mlqa/vi/1.0.0. Subsequent calls will reuse this data.
Downloading and preparing dataset 72.21 MiB (download: 72.21 MiB, generated: 5.13 MiB, total: 77.34 MiB) to /root/.cache/tensorflow/mlqa/zh/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/2 [00:00<?, ? splits/s]

Generating test examples...:   0%|          | 0/5137 [00:00<?, ? examples/s]

Shuffling /root/.cache/tensorflow/mlqa/zh/incomplete.GORM02_1.0.0/mlqa-test.tfrecord*...:   0%|          | 0/5…

Generating validation examples...:   0%|          | 0/504 [00:00<?, ? examples/s]

Shuffling /root/.cache/tensorflow/mlqa/zh/incomplete.GORM02_1.0.0/mlqa-validation.tfrecord*...:   0%|         …

Dataset mlqa downloaded and prepared to /root/.cache/tensorflow/mlqa/zh/1.0.0. Subsequent calls will reuse this data.


## Preprocess

In [None]:
# OUTPUT ENCODING #############################################################

def _encode_binary(__x):
    """Expand each value in base 2 on N_OUTPUT_DIM digits, as float32."""
    return tf.cast(mlable.ops.expand_base(__x, base=2, depth=N_OUTPUT_DIM), dtype=tf.dtypes.float32)

def _encode_categorical(__x):
    """One-hot encode each value on N_OUTPUT_DIM classes, along a new last axis."""
    return tf.one_hot(__x, depth=N_OUTPUT_DIM, axis=-1)

# encoding of the targets, matching the output mode of the model
if BINARY:
    _encode_output = _encode_binary
else:
    _encode_output = _encode_categorical

In [None]:
# MLQA ########################################################################

def _join_features(__x):
    """Merge the context and the question of a sample into a single string, separated by U+001D."""
    return tf.strings.join(inputs=[__x['context'], __x['question']], separator='\u001d')

def _pair_with_targets(__x):
    """Build the (inputs, targets) pair, where the targets are the encoded inputs."""
    return (__x, _encode_output(__x))

# each step is a pair (operation, replace flag)
# NOTE(review): a False flag presumably keeps the original samples next to the transformed ones, confirm against mlable.data.process
PIPELINE = [
    # join the features
    (_join_features, True),
    # offset by 1 to 15 characters, by combining the offsets 1, 2, 4 and 8
    *[(functools.partial(tokun.pipeline.offset, ticks=__t), False) for __t in N_OFFSET_TICKS],
    # encode each sample as integer bytes, with 4 bytes per character
    (functools.partial(tokun.pipeline.encode, token_size=N_TOKEN_SIZES[-1], sample_size=N_SAMPLE_DIM, dtype=tf.dtypes.int32), True),
    # enforce the flat shape (N_SAMPLE_DIM,) on each sample
    (functools.partial(tf.reshape, shape=(N_SAMPLE_DIM,)), True),
    # the targets are the inputs, encoded according to the output mode
    (_pair_with_targets, True)]

OPERATIONS, REPLACE = zip(*PIPELINE)

# the datasets are replaced by their processed version: this cell should run only once after the loading
__process = functools.partial(mlable.data.process, pipeline=OPERATIONS, replace=REPLACE)

MLQA_TRAIN = {__l: __process(dataset=__d) for __l, __d in MLQA_TRAIN.items()}
MLQA_TEST = {__l: __process(dataset=__d) for __l, __d in MLQA_TEST.items()}

In [None]:
# RANDOM ######################################################################

# each step is a pair (operation, replace flag)
PIPELINE = [
    # flatten each sample => (N_SAMPLE_DIM,) int, ie 4 bytes per character
    (functools.partial(tf.reshape, shape=(N_SAMPLE_DIM,)), True),
    # the targets are the inputs, encoded according to the output mode
    ((lambda __x: (__x, _encode_output(__x))), True)]

OPERATIONS, REPLACE = zip(*PIPELINE)

# the datasets are replaced by their processed version: this cell should run only once after the loading
RANDOM_TRAIN, RANDOM_TEST = (
    mlable.data.process(dataset=__d, pipeline=OPERATIONS, replace=REPLACE)
    for __d in (RANDOM_TRAIN, RANDOM_TEST))

## Init The Model

In [None]:
# METRICS #####################################################################

# the accuracy and the loss have to match the encoding of the targets
if BINARY:
    _Accuracy = mlable.metrics.BinaryGroupAccuracy
    _Loss = tf.keras.losses.BinaryCrossentropy
else:
    _Accuracy = mlable.metrics.CategoricalGroupAccuracy
    _Loss = tf.keras.losses.CategoricalCrossentropy

In [None]:
# COMPILE #####################################################################

with DISTRIBUTION_STRATEGY.scope():
    # metrics: accuracy on groups of 1 byte, 4 bytes (a character) and a whole token
    byte_accuracy, character_accuracy, token_accuracy = (
        _Accuracy(group=__g, name=__n)
        for __g, __n in ((1, 'byte_accuracy'), (4, 'character_accuracy'), (N_TOKEN_SIZES[-1], 'token_accuracy')))
    # weights: load the pretrained model without its training configuration
    MODEL = tf.keras.models.load_model(PATH_IMPORT, compile=False)
    # optimizer and loss
    __optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
    __loss = _Loss(from_logits=False, label_smoothing=0., axis=-1, reduction='sum_over_batch_size', name='ce_loss')
    # compile
    MODEL.compile(optimizer=__optimizer, loss=__loss, metrics=[byte_accuracy, character_accuracy, token_accuracy])

In [None]:
# list the layers of the loaded model along with their parameter counts
MODEL.summary()

Model: "auto_encoder_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 encoder_1 (Encoder)         multiple                  854272    
                                                                 
 decoder_1 (Decoder)         multiple                  793096    
                                                                 
Total params: 1647368 (6.28 MB)
Trainable params: 1647368 (6.28 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


## Export

In [None]:
# SAMPLES #####################################################################

# (targets, predictions) for one batch of each language
IO = {}
# unique tokens and their embeddings, indexed by token length in bytes and then by language
TOKENS = {__s: {} for __s in N_TOKEN_SIZES}
EMBEDDINGS = {__s: {} for __s in N_TOKEN_SIZES}

for __lang, __dataset in MLQA_TEST.items():
    # group the samples into full batches
    __batches = __dataset.batch(N_BATCH_DIM, drop_remainder=True, num_parallel_calls=tf.data.AUTOTUNE)
    # only the first batch is evaluated
    __inputs, __targets = next(iter(__batches))
    # compute predictions
    __outputs = MODEL(__inputs)
    # sample predictions (targets, predictions)
    IO[__lang] = (__targets, __outputs)

In [None]:
# TOKENS ######################################################################

# the loops go over N_TOKEN_SIZES rather than the keys of TOKENS
# so that the cell can be run again after the 'local' entry has been added to TOKENS

# unique (G ^ i)-tokens
for __lang, __sample in IO.items():
    # decode the targets and concatenate all the samples in a batch
    # this is done once per language since the text does not depend on the token size
    __all = tokun.pipeline.postprocess(__sample[0], binary=BINARY, random=False)
    __all = tokun.pipeline.unpack(__all)
    __all = ''.join(__all)
    for __size in N_TOKEN_SIZES:
        # save all the unique chunks, the size is converted from bytes to characters
        TOKENS[__size][__lang] = tokun.pipeline.chunk(seq=__all, size=__size // 4, repeats=False)

# unique tokens, for all languages
for __size in N_TOKEN_SIZES:
    # a merged entry left by a previous run is ignored
    TOKENS[__size]['all'] = list(set(__t for __l, __s in TOKENS[__size].items() if __l != 'all' for __t in __s))

In [None]:
# EMBEDDINGS ##################################################################

# layers of the encoder: the first one embeds the bytes, the next ones merge them step by step
__layers = MODEL._encoder._encoder.layers
# length of a full token, in bytes
__token_size = math.prod(N_TOKEN_DIM)

for __depth, __size in enumerate(N_TOKEN_SIZES):
    for __lang, __tokens in TOKENS[__size].items():
        # re-encode without token repeats
        __input = tokun.pipeline.preprocess(text=''.join(__tokens), token_size=__token_size, expand=N_SEQUENCE_AXIS * [1])
        # UTF-32 embedding
        __embedding = __layers[0](__input)
        # iterative CNN tokenization: apply the first (depth + 1) tokenization layers
        for __i in range(1, __depth + 2):
            __embedding = __layers[__i](__embedding)
        # mixed precision: bfloat16 => float32
        __embedding = tf.cast(__embedding, dtype=tf.dtypes.float32)
        # remove the (tokenized) padding
        # NOTE(review): squeeze drops every axis of size 1, including the token axis when there is a single token
        EMBEDDINGS[__size][__lang] = tf.squeeze(__embedding)[:len(__tokens)] # TODO squeeze?

In [None]:
# NEIGHBORHOODS ###############################################################

# the neighborhoods are built around full tokens
__unit = N_TOKEN_SIZES[-1]
# number of neighbors generated around each token
__count = 256
# factor applied to the spread of the embeddings to get the radius of the neighborhoods
__scale = 2. ** (1 - math.log(__unit, 4))
# length of a full token, in bytes
__token_size = math.prod(N_TOKEN_DIM)

__local_tokens = []
__local_embeddings = []

# one neighborhood per entry, including the merged entry 'all'
for __lang, __tokens in TOKENS[__unit].items():
    # stats on the embeddings for the current language
    __std = tf.math.reduce_std(EMBEDDINGS[__unit][__lang], axis=1, keepdims=True)
    __radius = __scale * tf.reduce_mean(__std).numpy()
    # choose a single token
    __t = tokun.pipeline.preprocess(text=random.choice(__tokens), token_size=__token_size, expand=N_SEQUENCE_AXIS * [1])
    # encode it
    __e = tf.cast(MODEL._encoder(__t), dtype=tf.dtypes.float32)
    # add noise to generate random neighbors
    __n = tokun.evaluation.neighbors(point=__e, radius=__radius, count=__count)
    # decode the noisy embeddings
    __d = MODEL._decoder(__n)
    # postprocess
    __r = ''.join(tokun.pipeline.unpack(tokun.pipeline.postprocess(__d, binary=BINARY, random=False)))
    # chunk, keeping the repeats so that the tokens stay aligned with their embeddings
    __m = tokun.pipeline.chunk(seq=__r, size=__unit // 4, repeats=True)
    # save
    __local_tokens.extend(__m)
    __local_embeddings.append(tf.squeeze(__n))

# the neighborhoods are stored under the key 'local', next to the token sizes
TOKENS['local'] = {'all': __local_tokens}
# merge all the embedding tensors
EMBEDDINGS['local'] = {'all': tf.concat(values=__local_embeddings, axis=0)}

In [None]:
# SAVE ########################################################################

# maximum number of rows exported to each file
__limit = 8192

# one set of files per key of TOKENS: the token sizes and the 'local' neighborhoods
for __size in TOKENS:
    # truncate first, so that only the exported tokens are labeled
    __tokens = TOKENS[__size]['all'][:__limit]
    __vectors = EMBEDDINGS[__size]['all'].numpy()[:__limit]
    # tokens followed by their label
    mlable.data.write(data=[__c + ' ' + mlable.data.label(__c) for __c in __tokens], path='./metadata.' + str(__size) + '.label.tsv', tsv=False)
    # raw tokens
    mlable.data.write(data=__tokens, path='./metadata.' + str(__size) + '.tsv', tsv=False)
    # embedding vectors, in the same order as the tokens
    mlable.data.write(data=__vectors, path='./embeddings.' + str(__size) + '.tsv', tsv=True)

## Dataviz

In [None]:
# DATA ########################################################################

# texts used to check the reconstruction: Korean prose followed by Python source code
# the last two samples hold the same class and only differ by the width of their indentation
SAMPLES = [
    """위키백과, 우리 모두의 백과사전.\nt-분포 확률적 임베딩(t-SNE)은 데이터의 차원 축소에 사용되는 기계 학습 알고리즘 중 하나로, 2002년 샘 로이스Sam Rowise와 제프리 힌튼에 의해 개발되었다.[1] t-SNE는 비선형 차원 축소 기법으로, 고차원 데이터를 특히 2, 3차원 등으로 줄여 가시화하는데에 유용하게 사용된다. 구체적으로 t-SNE는 비슷한 데이터는 근접한 2, 3차원의 지점으로, 다른 데이터는 멀리 떨어진 지점으로 맵핑한다.""",
    """class Encoder(tf.keras.models.Model):\n    def __init__(self, depth: int, token_dim: int, encoding_dim: int, embedding_dim: int, batch_dim: int=None, attention: bool=False, **kwargs) -> None:\n        super(Encoder, self).__init__(**kwargs)\n        self._encoder = tf.keras.Sequential([\n            tf.keras.Input(shape=(encoding_dim,), batch_size=batch_dim, name='input'), # (B * G ^ D, U)\n            tf.keras.layers.Dense(units=embedding_dim, activation=None, use_bias=False, kernel_initializer='glorot_uniform', bias_initializer=None, name='embed-1'),] # (B * G ^ D, U) => (B * G ^ D, E)\n            + [tokun.layers.TokenizeBlock(left_axis=-2, right_axis=-1, token_dim=token_dim, attention=attention, name='tokenize' + (__i + 1) * '-4') for __i in range(depth)]) # (B * G ^ i, E) => (B * G ^ (i-1), E)\n\n    def call(self, x: tf.Tensor) -> tf.Tensor:\n        return self._encoder(x)\n""",
    """class AutoEncoder(tf.keras.models.Model):\n    def __init__(self, token_dim: int, encoding_dim: int, embedding_dim: int, batch_dim: int=None, **kwargs) -> None:\n        super(AutoEncoder, self).__init__(**kwargs)\n        self._encoder = Encoder(token_dim=token_dim, encoding_dim=encoding_dim, embedding_dim=embedding_dim, batch_dim=batch_dim)\n        self._decoder = Decoder(token_dim=token_dim, encoding_dim=encoding_dim, embedding_dim=embedding_dim, batch_dim=batch_dim)\n\n    def call(self, x: tf.Tensor) -> tf.Tensor:\n        return self._decoder(self._encoder(x))""",
    """class AutoEncoder(tf.keras.models.Model):\n  def __init__(self, token_dim: int, encoding_dim: int, embedding_dim: int, batch_dim: int=None, **kwargs) -> None:\n    super(AutoEncoder, self).__init__(**kwargs)\n    self._encoder = Encoder(token_dim=token_dim, encoding_dim=encoding_dim, embedding_dim=embedding_dim, batch_dim=batch_dim)\n    self._decoder = Decoder(token_dim=token_dim, encoding_dim=encoding_dim, embedding_dim=embedding_dim, batch_dim=batch_dim)\n\n  def call(self, x: tf.Tensor) -> tf.Tensor:\n    return self._decoder(self._encoder(x))"""]

In [None]:
# COMPUTE ######################################################################

# index of the sample to reconstruct, also read by the report cell below
__i = 0
# encode the text, with the same `expand` setting as every other call to the preprocessing
__x = tokun.pipeline.preprocess(text=SAMPLES[__i], token_size=math.prod(N_TOKEN_DIM), expand=N_SEQUENCE_AXIS * [1])
# run the whole autoencoder
__p = MODEL(__x)
# interpret the predictions and decode them back to text
__y = tokun.pipeline.postprocess(__p, binary=BINARY, random=False)
__o = tokun.pipeline.unpack(data=__y)

In [None]:
# original text and its reconstruction by the model
__source, __decoded = SAMPLES[__i], __o[0]

print('# INPUT ################################################################\n\n' + __source)
print('\n# OUTPUT ###############################################################\n\n' + __decoded)

# similarity between the two texts
__score = tokun.evaluation.compare(__source, __decoded)
print('\n# SCORE ################################################################\n\n' + str(__score))

# INPUT ################################################################

위키백과, 우리 모두의 백과사전.
t-분포 확률적 임베딩(t-SNE)은 데이터의 차원 축소에 사용되는 기계 학습 알고리즘 중 하나로, 2002년 샘 로이스Sam Rowise와 제프리 힌튼에 의해 개발되었다.[1] t-SNE는 비선형 차원 축소 기법으로, 고차원 데이터를 특히 2, 3차원 등으로 줄여 가시화하는데에 유용하게 사용된다. 구체적으로 t-SNE는 비슷한 데이터는 근접한 2, 3차원의 지점으로, 다른 데이터는 멀리 떨어진 지점으로 맵핑한다.

# OUTPUT ###############################################################

위키백과, 우리 모두의 백과사전.
t-분포 확률적 임베딩(t-SNE)은 데이터의 차원 축소에 사용되는 기계 학습 알고리즘 중 하나로, 2002년 샘 로이스Sam Rowise와 제프리 힌튼에 의해 개발되었다.[1] t-SNE는 비선형 차원 축소 기법으로, 고차원 데이터를 특히 2, 3차원 등으로 줄여 가시화하는데에 유용하게 사용된다. 구체적으로 t-SNE는 비슷한 데이터는 근접한 2, 3차원의 지점으로, 다른 데이터는 멀리 떨어진 지점으로 맵핑한다.    

# SCORE ################################################################

1.0


In [None]:
# NOISE #######################################################################

# standard deviation of the embeddings of the english tokens, along axis 1
__std = tf.math.reduce_std(EMBEDDINGS[N_TOKEN_SIZES[-1]]['en'], axis=1)
# one random offset per embedding feature, scaled by the average deviation
# the noise is added to the embeddings in the next cell, so its size has to be the embedding dimension
__noise = tf.random.normal(shape=(N_EMBEDDING_DIM,), mean=0., stddev=tf.math.reduce_mean(__std).numpy())

In [None]:
# embed a short text
__s = """Une unité lexicale ou token lexical ou plus simplement token est un couple composé d'un nom et d'une valeur optionnelle (e.g. 135677)."""
__x = tokun.pipeline.preprocess(__s, token_size=math.prod(N_TOKEN_DIM), expand=N_SEQUENCE_AXIS * [1])
__e = tf.cast(MODEL._encoder(__x), dtype=tf.dtypes.float32)

# decode the exact embeddings first and then a slightly perturbed version
__r0, __r1 = (
    tokun.pipeline.postprocess(MODEL._decoder(__v), binary=BINARY, random=False)
    for __v in (__e, __e + 0.06 * __noise))

# compare the two reconstructions
for __r in (__r0, __r1):
    print(tokun.pipeline.unpack(__r))

["Une unité lexicale ou token lexical ou plus simplement token est un couple composé d'un nom et d'une valeur optionnelle (e.g. 135677).\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"]
["Une unité lexicale ou token lexical ou plus simplement token est un couple composé d'un nom et0d'une valeur opvionnelle (e.g. 135677).\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"]


In [None]:
# LARGER SAMPLE ###############################################################

# longer text mixing prose, symbols and aligned columns: a commented "Hello World!" program
__s = """[ This program prints "Hello World!" and a newline to the screen; its
  length is 106 active command characters. [It is not the shortest.]

  This loop is an "initial comment loop", a simple way of adding a comment
  to a BF program such that you don't have to worry about any command
  characters. Any ".", ",", "+", "-", "<" and ">" characters are simply
  ignored, the "[" and "]" characters just have to be balanced. This
  loop and the commands it contains are ignored because the current cell
  defaults to a value of 0; the 0 value causes this loop to be skipped.
]
++++++++                Set Cell #0 to 8
[
    >++++               Add 4 to Cell #1; this will always set Cell #1 to 4
    [                   as the cell will be cleared by the loop
        >++             Add 2 to Cell #2
        >+++            Add 3 to Cell #3
        >+++            Add 3 to Cell #4
        >+              Add 1 to Cell #5
        <<<<-           Decrement the loop counter in Cell #1
    ]                   Loop until Cell #1 is zero; number of iterations is 4
    >+                  Add 1 to Cell #2
    >+                  Add 1 to Cell #3
    >-                  Subtract 1 from Cell #4
    >>+                 Add 1 to Cell #6
    [<]                 Move back to the first zero cell you find; this will
                        be Cell #1 which was cleared by the previous loop
    <-                  Decrement the loop Counter in Cell #0
]                       Loop until Cell #0 is zero; number of iterations is 8

The result of this is:
Cell no :   0   1   2   3   4   5   6
Contents:   0   0  72 104  88  32   8
Pointer :   ^

>>.                     Cell #2 has value 72 which is 'H'
>---.                   Subtract 3 from Cell #3 to get 101 which is 'e'
+++++++..+++.           Likewise for 'llo' from Cell #3
>>.                     Cell #5 is 32 for the space
<-.                     Subtract 1 from Cell #4 for 87 to give a 'W'
<.                      Cell #3 was set to 'o' from the end of 'Hello'
+++.------.--------.    Cell #3 for 'rl' and 'd'
>>+.                    Add 1 to Cell #5 gives us an exclamation point
>++.                    And finally a newline from Cell #6"""

# run the whole chain in a single call, the next cells read the text from `__o[0]`
# NOTE(review): the 5 values are presumably the inputs, embeddings, predictions, postprocessed outputs and decoded texts, confirm against tokun.pipeline.sample
__x, __e, __p, __y, __o = tokun.pipeline.sample(model=MODEL, text=__s, token_size=math.prod(N_TOKEN_DIM), expand=N_SEQUENCE_AXIS * [1], binary=BINARY, random=False)

In [None]:
# score the reconstruction against the original text
tokun.evaluation.compare(__s, __o[0])

1.0

In [None]:
# display the reconstructed text
print(__o[0])

[ This program prints "Hello World!" and a newline to the screen; its
  length is 106 active command characters. [It is not the shortest.]

  This loop is an "initial comment loop", a simple way of adding a comment
  to a BF program such that you don't have to worry about any command
  characters. Any ".", ",", "+", "-", "<" and ">" characters are simply
  ignored, the "[" and "]" characters just have to be balanced. This
  loop and the commands it contains are ignored because the current cell
  defaults to a value of 0; the 0 value causes this loop to be skipped.
]
++++++++                Set Cell #0 to 8
[
    >++++               Add 4 to Cell #1; this will always set Cell #1 to 4
    [                   as the cell will be cleared by the loop
        >++             Add 2 to Cell #2
        >+++            Add 3 to Cell #3
        >+++            Add 3 to Cell #4
        >+              Add 1 to Cell #5
        <<<<-           Decrement the loop counter in Cell #1
    ]             