## Import deps

In [None]:
!pip install -U mlable tokun

Collecting mlable
  Downloading mlable-0.5.1-py3-none-any.whl (19 kB)
Collecting tokun
  Downloading tokun-0.9.1-py3-none-any.whl (18 kB)
Installing collected packages: mlable, tokun
Successfully installed mlable-0.5.1 tokun-0.9.1


In [None]:
%load_ext tensorboard

In [None]:
import datetime
import functools
import itertools
import math
import os
import random
import urllib.request

import tensorflow as tf
import tensorflow_datasets as tfds

import mlable.data
import mlable.metrics

import tokun.data
import tokun.evaluation
import tokun.meta
import tokun.model
import tokun.pipeline

In [None]:
# report the TensorFlow release in use
print("Tensorflow version ", tf.__version__, sep="")

Tensorflow version 2.15.0


## Setup the GPU / TPU

In [None]:
# MIXED PRECISION #############################################################

# Set the global Keras dtype policy to 'mixed_bfloat16'.
# Later cells cast the encoder outputs back to float32 before using them.
tf.keras.mixed_precision.set_global_policy('mixed_bfloat16')

In [None]:
# DEVICES #####################################################################

# keep the device placement logs off
tf.debugging.set_log_device_placement(False)

# logical devices visible to TensorFlow, grouped by kind
CPU, GPU, TPU = (tf.config.list_logical_devices(__kind) for __kind in ('CPU', 'GPU', 'TPU'))

if TPU:
    # connect to the TPU cluster and initialize it before building the strategy
    RESOLVER = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(RESOLVER)
    tf.tpu.experimental.initialize_tpu_system(RESOLVER)
    DISTRIBUTION_STRATEGY = tf.distribute.TPUStrategy(RESOLVER)
else:
    # mirror on the GPUs when there are any, on the CPUs otherwise
    DISTRIBUTION_STRATEGY = tf.distribute.MirroredStrategy(GPU if GPU else CPU)

print(DISTRIBUTION_STRATEGY)

<tensorflow.python.distribute.tpu_strategy.TPUStrategyV2 object at 0x7ee97ec4faf0>


## Mode

In [72]:
# TOGGLE ######################################################################

IMPORT = False # download a saved model and load it instead of keeping the freshly built one
TRAINING = True # run the training cell
RANDOM = True # train on the random dataset rather than on the MLQA samples

## Defining The Metadata

In [73]:
# MODEL PARAMETERS ############################################################

N_SEQUENCE_AXIS = 1 # given to the model as `sequence_axis`
N_FEATURE_AXIS = -1 # given to the model as `feature_axis`

N_TOKEN_DIM = [4, 4, 4] # G, for each block
N_ENCODING_DIM = 256 # U, also the depth of the one-hot targets
N_EMBEDDING_DIM = N_ENCODING_DIM # E

In [74]:
# TRAINING PARAMETERS #########################################################

N_EPOCHS = 8 # number of passes over the training dataset

N_BATCH_DIM = 128 # number of samples per batch
N_SAMPLE_DIM = 256 # number of characters per sample (=> 4 * N_SAMPLE_DIM integers per sample)

# learning rate and beta_1 / beta_2 given to the Adam optimizer in the compile cell
R_0, B_1, B_2 = tokun.meta.rates(pretrained=IMPORT, normalization=True, base=0.001)

# one weight per byte value, given to `fit` as `class_weight`
CLASS_WEIGHTS = {__c: 0.3 if __c == 0 else 1. for __c in range(N_ENCODING_DIM)} # there are 3 times more 0s than other bytes

In [75]:
# DERIVED #####################################################################

# cumulative product of the block dimensions: [4, 16, 64] for N_TOKEN_DIM = [4, 4, 4]
N_TOKEN_SIZES = list(itertools.accumulate(N_TOKEN_DIM, lambda x, y: x * y)) # in bytes
# 2 ** i for every i below log2 of the largest token length, counted in characters (4 bytes each)
# math.log2 is used because math.log(x, 2) goes through a float division and may land just under the integer
N_OFFSET_TICKS = [2 ** __i for __i in range(int(math.log2(N_TOKEN_SIZES[-1] // 4)))] # in characters

# identifiers used to build the import URL and the log directory
VERSION = tokun.meta.version(units=N_TOKEN_DIM, axis=N_SEQUENCE_AXIS)
# timestamp of this run, used as the last component of the log directory
DATETIME = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

In [76]:
# IMPORT ######################################################################

LABEL = '5.3' # last placeholder of the URL: the file name, without its extension
# NOTE(review): the template has 3 placeholders, so VERSION is assumed to hold exactly 2 items — confirm
URL_IMPORT = 'https://github.com/apehex/tokun/raw/main/models/{}/{}/{}.keras'.format(*VERSION, LABEL)
PATH_IMPORT = 'model.keras' # local destination of the download

# fetch the saved model only when the import toggle is on
if IMPORT:
    urllib.request.urlretrieve(URL_IMPORT, PATH_IMPORT)

In [77]:
# EXPORT ######################################################################

# one log directory per model version and per run: .logs/<version items...>/<datetime>
PATH_LOG = os.path.join('.logs/', *VERSION, DATETIME)
# destination of the model checkpoints
# NOTE(review): same file as PATH_IMPORT, so a checkpoint overwrites the downloaded model — confirm this is intended
PATH_EXPORT = 'model.keras'

## Loading The Data

In [78]:
# RANDOM DATASET ##############################################################

# both splits share the sample length and the `lower_plane` / `upper_plane` bounds, only the size differs
__random_args = {'sample_size': N_SAMPLE_DIM, 'lower_plane': 0, 'upper_plane': 0x40000}

# 2 ** 14 batches worth of samples for training, 2 ** 8 for testing
RANDOM_TRAIN = tokun.data.random_dataset(size=N_BATCH_DIM * 2**14, **__random_args)
RANDOM_TEST = tokun.data.random_dataset(size=N_BATCH_DIM * 2**8, **__random_args)

In [79]:
# MLQA DATASET ################################################################

LANG = ['ar', 'de', 'en', 'es', 'hi', 'vi', 'zh']

# loading options common to all the languages and splits
__tfds_args = {'as_supervised': False, 'shuffle_files': True, 'data_dir': '~/.cache/tensorflow/', 'batch_size': None}

# one dataset per language
# NOTE(review): the 'test' split feeds the training set and 'validation' feeds the test set — confirm this is intended
MLQA_TRAIN = {__l: tfds.load('mlqa/' + __l, split='test', **__tfds_args) for __l in LANG}
MLQA_TEST = {__l: tfds.load('mlqa/' + __l, split='validation', **__tfds_args) for __l in LANG}

## Preprocess

In [80]:
# MLQA ########################################################################

# Each item is a pair (operation, flag): the operations are applied in order and the flags
# are forwarded to `mlable.data.process` as `replace`.
# NOTE(review): `replace` presumably tells whether the output replaces the dataset or is added to it — confirm in mlable
PIPELINE = [
    # join the context, question and answer text of each record into a single string
    ((lambda x: tf.strings.join(inputs=[x['context'], x['question'], x['answers']['text']], separator='\u001d')), True),
    # offset the text, one operation per tick in N_OFFSET_TICKS => (1,) scalar bytes
    *[(functools.partial(tokun.pipeline.offset, ticks=__t), False) for __t in N_OFFSET_TICKS], # (offsets 0, ..., (2 ^ i) - 1) + (offsets 2 ^ i, ..., 2 ^ (i+1) - 1)
    # encode => (4 * S,) int (4 UTF-32 bytes per character)
    (functools.partial(tokun.pipeline.encode, token_size=N_TOKEN_SIZES[-1], sample_size=N_SAMPLE_DIM), True),
    # reshape => (4 * S,) int, which fixes the static shape of each sample
    (functools.partial(tf.reshape, shape=(4 * N_SAMPLE_DIM,)), True),
    # pair the inputs with their one-hot targets => (4 * S,) int and (4 * S, U) float32
    ((lambda x: (x, tf.one_hot(x, depth=N_ENCODING_DIM, axis=-1))), True)]

# split the pairs into two parallel tuples: the operations and their flags
OPERATIONS, REPLACE = zip(*PIPELINE)

# apply the same pipeline to every language, for both splits
MLQA_TRAIN = {__l: mlable.data.process(dataset=__d, pipeline=OPERATIONS, replace=REPLACE) for __l, __d in MLQA_TRAIN.items()}
MLQA_TEST = {__l: mlable.data.process(dataset=__d, pipeline=OPERATIONS, replace=REPLACE) for __l, __d in MLQA_TEST.items()}

In [81]:
# RANDOM ######################################################################

PIPELINE = [
    # flatten each sample => (4 * S,) int
    (functools.partial(tf.reshape, shape=(4 * N_SAMPLE_DIM,)), True),
    # pair the inputs with their one-hot targets => (4 * S,) and (4 * S, U)
    ((lambda x: (x, tf.one_hot(x, depth=N_ENCODING_DIM, axis=-1))), True)]

# split the (operation, flag) pairs into two parallel tuples
OPERATIONS = tuple(__op for __op, _ in PIPELINE)
REPLACE = tuple(__flag for _, __flag in PIPELINE)

# the processing is the same for both splits
__process = functools.partial(mlable.data.process, pipeline=OPERATIONS, replace=REPLACE)

RANDOM_TRAIN = __process(dataset=RANDOM_TRAIN)
RANDOM_TEST = __process(dataset=RANDOM_TEST)

In [82]:
# TOGGLE ######################################################################

# languages, in the order their datasets are chained
__order = ('ar', 'en', 'es', 'de', 'hi', 'vi', 'zh')

def __chain(__datasets):
    """Concatenate the per-language datasets of a split, from left to right."""
    return functools.reduce(lambda __head, __tail: __head.concatenate(__tail), (__datasets[__k] for __k in __order))

# the training data depends on the toggle, the test data is always made of MLQA samples
DATASET_TRAIN = RANDOM_TRAIN if RANDOM else __chain(MLQA_TRAIN)
DATASET_TEST = __chain(MLQA_TEST)

In [83]:
# INSPECT #####################################################################

# structure of the elements in each dataset
for __d in (RANDOM_TRAIN, RANDOM_TEST, DATASET_TRAIN, DATASET_TEST):
    print(__d.element_spec)

# number of samples in the datasets actually used
for __template, __d in (('train: {:,}', DATASET_TRAIN), ('test:  {:,}', DATASET_TEST)):
    print(__template.format(__d.cardinality().numpy()))

(TensorSpec(shape=(1024,), dtype=tf.int32, name=None), TensorSpec(shape=(1024, 256), dtype=tf.float32, name=None))
(TensorSpec(shape=(1024,), dtype=tf.int32, name=None), TensorSpec(shape=(1024, 256), dtype=tf.float32, name=None))
(TensorSpec(shape=(1024,), dtype=tf.int32, name=None), TensorSpec(shape=(1024, 256), dtype=tf.float32, name=None))
(TensorSpec(shape=(1024,), dtype=tf.uint8, name=None), TensorSpec(shape=(1024, 256), dtype=tf.float32, name=None))
train: -2
test:  67,184


## Init The Model

In [84]:
# COMPILE ########################################################################

with DISTRIBUTION_STRATEGY.scope():
    # metrics: accuracy on groups of 1 byte, 4 bytes and a whole token
    byte_accuracy = mlable.metrics.CategoricalGroupAccuracy(group=1, name='byte_accuracy')
    character_accuracy = mlable.metrics.CategoricalGroupAccuracy(group=4, name='character_accuracy')
    token_accuracy = mlable.metrics.CategoricalGroupAccuracy(group=N_TOKEN_SIZES[-1], name='token_accuracy')
    # weights: start from a fresh model
    MODEL = tokun.model.AutoEncoder(sequence_axis=N_SEQUENCE_AXIS, feature_axis=N_FEATURE_AXIS, token_dim=N_TOKEN_DIM, encoding_dim=N_ENCODING_DIM, embedding_dim=N_EMBEDDING_DIM, activation='gelu')
    # swap it for the downloaded model when requested and available
    if IMPORT and os.path.isfile(PATH_IMPORT):
        MODEL = tf.keras.models.load_model(PATH_IMPORT, compile=False)
    # optimizer and loss
    __optimizer = tf.keras.optimizers.Adam(learning_rate=R_0, beta_1=B_1, beta_2=B_2)
    __loss = tf.keras.losses.CategoricalCrossentropy(from_logits=False, label_smoothing=0., axis=-1, reduction=tf.keras.losses.Reduction.SUM_OVER_BATCH_SIZE, name='loss')
    # compile
    MODEL.compile(optimizer=__optimizer, loss=__loss, metrics=[byte_accuracy, character_accuracy, token_accuracy])

## Train

In [85]:
# TRAIN #######################################################################

if TRAINING:
    with DISTRIBUTION_STRATEGY.scope():
        # callbacks: save the model when the validation loss improves, log for tensorboard
        cp_callback = tf.keras.callbacks.ModelCheckpoint(PATH_EXPORT, monitor='val_loss', verbose=1, save_best_only=True, save_weights_only=False, mode='auto', save_freq='epoch')
        tb_callback = tf.keras.callbacks.TensorBoard(log_dir=PATH_LOG)
        # batch and prefetch both datasets
        __train_batches = DATASET_TRAIN.batch(N_BATCH_DIM).prefetch(tf.data.AUTOTUNE)
        __test_batches = DATASET_TEST.batch(N_BATCH_DIM).prefetch(tf.data.AUTOTUNE)
        # fit the model, with a validation pass on each epoch from 1 to N_EPOCHS
        TRAINING_HISTORY = MODEL.fit(
            x=__train_batches,
            batch_size=None,
            epochs=N_EPOCHS,
            validation_split=None,
            validation_data=__test_batches,
            validation_freq=list(range(1, N_EPOCHS + 1)),
            class_weight=CLASS_WEIGHTS,
            verbose=1,
            callbacks=[cp_callback, tb_callback])

Epoch 1/8
  16384/Unknown - 1379s 83ms/step - loss: 0.7905 - byte_accuracy: 0.8123 - character_accuracy: 0.3223 - token_accuracy: 3.8266e-05
Epoch 1: val_loss improved from inf to 0.45274, saving model to model.keras
Epoch 2/8
Epoch 2: val_loss improved from 0.45274 to 0.21542, saving model to model.keras
Epoch 3/8
 3252/16384 [====>.........................] - ETA: 18:30 - loss: 0.1697 - byte_accuracy: 0.9562 - character_accuracy: 0.8284 - token_accuracy: 0.0852

KeyboardInterrupt: 

In [86]:
# print the layers of the model and their parameter counts
MODEL.summary()

Model: "auto_encoder_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 encoder_4 (Encoder)         multiple                  854272    
                                                                 
 decoder_4 (Decoder)         multiple                  856832    
                                                                 
Total params: 1711104 (6.53 MB)
Trainable params: 1711104 (6.53 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


## Export

In [107]:
# SAMPLES #####################################################################

IO = {}
TOKENS = {__size: {} for __size in N_TOKEN_SIZES} # keyed by token length, in bytes
EMBEDDINGS = {__size: {} for __size in N_TOKEN_SIZES} # same keys

for __lang, __dataset in MLQA_TEST.items():
    # only the first batch of each language is used
    __inputs, __targets = next(iter(__dataset.batch(N_BATCH_DIM)))
    # run the whole model on the inputs
    __outputs = MODEL(__inputs)
    # keep the pair (targets, predictions)
    IO[__lang] = (__targets, __outputs)

In [108]:
# TOKENS ######################################################################

# The loops go over N_TOKEN_SIZES rather than the keys of TOKENS: the neighborhood cell adds
# a 'local' key to TOKENS, and `'local' // 4` would fail if this cell were run again afterwards.

# unique (G ^ i)-tokens
for __lang, __sample in IO.items():
    # decode the targets once per language, the text is the same for every token length
    __text = tokun.pipeline.postprocess(__sample[0])
    for __size in N_TOKEN_SIZES:
        # the sizes are in bytes and the chunks in characters (4 bytes each)
        TOKENS[__size][__lang] = tokun.pipeline.chunk(seq=__text, size=__size // 4, repeats=False)

# unique tokens, for all languages
for __size in N_TOKEN_SIZES:
    # skip the 'all' entry left by a previous run of this cell
    TOKENS[__size]['all'] = list(set(__t for __l, __s in TOKENS[__size].items() if __l != 'all' for __t in __s))

In [109]:
# EMBEDDINGS ##################################################################

# For each token length, run the tokens of every entry of TOKENS ('all' included) through
# the first encoder layer and then through the first `__depth + 1` of the layers after it.
for __depth, __size in enumerate(N_TOKEN_SIZES):
    for __lang, __tokens in TOKENS[__size].items():
        # re-encode without token repeats
        __input = tokun.pipeline.preprocess(text=''.join(__tokens), token_size=math.prod(N_TOKEN_DIM), expand=N_SEQUENCE_AXIS * [1])
        # UTF-32 embedding
        __embedding = MODEL._encoder._encoder.layers[0](__input)
        # iterative CNN tokenization
        for __i in range(__depth + 1):
            __embedding = MODEL._encoder._encoder.layers[__i + 1](__embedding)
        # mixed precision: bfloat16 => float32
        __embedding = tf.cast(__embedding, dtype=tf.dtypes.float32)
        # remove the (tokenized) padding
        # keeps the first len(__tokens) rows; assumes one row per token after the squeeze — TODO confirm
        EMBEDDINGS[__size][__lang] = tf.squeeze(__embedding)[:len(__tokens)] # TODO squeeze?

In [111]:
# NEIGHBORHOODS ###############################################################

# For each entry of the largest token size, pick one token, sample noisy copies of its
# embedding and decode them, to see which tokens lie around it.

__unit = N_TOKEN_SIZES[-1] # largest token size, in bytes
__count = 256 # number of neighbors requested around each chosen token

# extra 'local' entry, next to the entries keyed by token size
TOKENS['local'] = {'all': []}
EMBEDDINGS['local'] = {'all': []}

for __lang, __tokens in TOKENS[__unit].items():
    # stats on the embeddings for the current language
    __std = tf.math.reduce_std(EMBEDDINGS[__unit][__lang], axis=0, keepdims=True)
    # the factor is halved each time the token size is multiplied by 4: 0.25 for 64 bytes
    __radius = 2. ** (1 - math.log(__unit, 4)) * tf.reduce_mean(__std).numpy()
    # choose a single token
    # NOTE(review): no seed is set in the cells visible here, so the choice changes from run to run
    __t = tokun.pipeline.preprocess(text=random.choice(__tokens), token_size=math.prod(N_TOKEN_DIM), expand=N_SEQUENCE_AXIS * [1])
    # encode it
    __e = tf.cast(MODEL._encoder(__t), dtype=tf.dtypes.float32)
    # add noise to generate random neighbors
    __n = tokun.evaluation.neighbors(point=__e, radius=__radius, count=__count)
    # decode the noisy embeddings
    __d = MODEL._decoder(__n)
    # postprocess
    __m = tokun.pipeline.chunk(seq=tokun.pipeline.postprocess(__d), size=__unit // 4, repeats=True)
    # save
    TOKENS['local']['all'].extend(__m)
    EMBEDDINGS['local']['all'].append(tf.squeeze(__n))

# merge all the embedding tensors
EMBEDDINGS['local']['all'] = tf.concat(values=EMBEDDINGS['local']['all'], axis=0)

In [None]:
# SAVE ########################################################################

__limit = 8192 # maximum number of rows written to each file

for __size in TOKENS:
    __all_tokens = TOKENS[__size]['all']
    # each token followed by its label
    __all_labels = [__c + ' ' + mlable.data.label(__c) for __c in __all_tokens]
    __metadata = './metadata.' + str(__size)
    # labeled tokens, bare tokens and embeddings
    mlable.data.write(data=__all_labels[:__limit], path=__metadata + '.label.tsv', tsv=False)
    mlable.data.write(data=__all_tokens[:__limit], path=__metadata + '.tsv', tsv=False)
    mlable.data.write(data=EMBEDDINGS[__size]['all'].numpy()[:__limit], path='./embeddings.' + str(__size) + '.tsv', tsv=True)

## Dataviz

In [90]:
# DATA ########################################################################

SAMPLES = [
    """위키백과, 우리 모두의 백과사전.\nt-분포 확률적 임베딩(t-SNE)은 데이터의 차원 축소에 사용되는 기계 학습 알고리즘 중 하나로, 2002년 샘 로이스Sam Rowise와 제프리 힌튼에 의해 개발되었다.[1] t-SNE는 비선형 차원 축소 기법으로, 고차원 데이터를 특히 2, 3차원 등으로 줄여 가시화하는데에 유용하게 사용된다. 구체적으로 t-SNE는 비슷한 데이터는 근접한 2, 3차원의 지점으로, 다른 데이터는 멀리 떨어진 지점으로 맵핑한다.""",
    """class Encoder(tf.keras.models.Model):\n    def __init__(self, depth: int, token_dim: int, encoding_dim: int, embedding_dim: int, batch_dim: int=None, attention: bool=False, **kwargs) -> None:\n        super(Encoder, self).__init__(**kwargs)\n        self._encoder = tf.keras.Sequential([\n            tf.keras.Input(shape=(encoding_dim,), batch_size=batch_dim, name='input'), # (B * G ^ D, U)\n            tf.keras.layers.Dense(units=embedding_dim, activation=None, use_bias=False, kernel_initializer='glorot_uniform', bias_initializer=None, name='embed-1'),] # (B * G ^ D, U) => (B * G ^ D, E)\n            + [tokun.layers.TokenizeBlock(left_axis=-2, right_axis=-1, token_dim=token_dim, attention=attention, name='tokenize' + (__i + 1) * '-4') for __i in range(depth)]) # (B * G ^ i, E) => (B * G ^ (i-1), E)\n\n    def call(self, x: tf.Tensor) -> tf.Tensor:\n        return self._encoder(x)\n""",
    """class AutoEncoder(tf.keras.models.Model):\n    def __init__(self, token_dim: int, encoding_dim: int, embedding_dim: int, batch_dim: int=None, **kwargs) -> None:\n        super(AutoEncoder, self).__init__(**kwargs)\n        self._encoder = Encoder(token_dim=token_dim, encoding_dim=encoding_dim, embedding_dim=embedding_dim, batch_dim=batch_dim)\n        self._decoder = Decoder(token_dim=token_dim, encoding_dim=encoding_dim, embedding_dim=embedding_dim, batch_dim=batch_dim)\n\n    def call(self, x: tf.Tensor) -> tf.Tensor:\n        return self._decoder(self._encoder(x))""",
    """class AutoEncoder(tf.keras.models.Model):\n  def __init__(self, token_dim: int, encoding_dim: int, embedding_dim: int, batch_dim: int=None, **kwargs) -> None:\n    super(AutoEncoder, self).__init__(**kwargs)\n    self._encoder = Encoder(token_dim=token_dim, encoding_dim=encoding_dim, embedding_dim=embedding_dim, batch_dim=batch_dim)\n    self._decoder = Decoder(token_dim=token_dim, encoding_dim=encoding_dim, embedding_dim=embedding_dim, batch_dim=batch_dim)\n\n  def call(self, x: tf.Tensor) -> tf.Tensor:\n    return self._decoder(self._encoder(x))"""]

In [94]:
# COMPUTE ######################################################################

__i = 0 # index of the text to process in SAMPLES
# run the sample through the model; only the last returned value, the output text, is used in the next cell
__x, __e, __p, __y = tokun.pipeline.sample(model=MODEL, text=SAMPLES[__i], token_size=math.prod(N_TOKEN_DIM), expand=N_SEQUENCE_AXIS * [1])

In [95]:
# show the original text, the text produced by the model and the score comparing the two
print('# INPUT ################################################################\n\n' + SAMPLES[__i])
print('\n# OUTPUT ###############################################################\n\n' + __y)
print('\n# SCORE ################################################################\n\n' + str(tokun.evaluation.compare(SAMPLES[__i], __y)))

# INPUT ################################################################

위키백과, 우리 모두의 백과사전.
t-분포 확률적 임베딩(t-SNE)은 데이터의 차원 축소에 사용되는 기계 학습 알고리즘 중 하나로, 2002년 샘 로이스Sam Rowise와 제프리 힌튼에 의해 개발되었다.[1] t-SNE는 비선형 차원 축소 기법으로, 고차원 데이터를 특히 2, 3차원 등으로 줄여 가시화하는데에 유용하게 사용된다. 구체적으로 t-SNE는 비슷한 데이터는 근접한 2, 3차원의 지점으로, 다른 데이터는 멀리 떨어진 지점으로 맵핑한다.

# OUTPUT ###############################################################

위키백과, 우륬 모두잆 백과사전.
-분포 확률적 임볌딩(-mNE)은 데이터잆 쳝원 축쇿에 사왩되는 기계 학싹 알고리즘 중 픞나로, 2002년 삚 로이스Sam Ro^se와 제프리 힌튁에 잆해 개발되었다.[1] t-mNE는 비선형 차원 축소 기법으로, 고차원 𻌓이터를 특히 2, )차원 등으로 줄여 겄시화픞는덣에 유왩픞게 사용된다. 구체적으로 t-ýN¬는 비슷한 덣윇터는 근졈한 2, q차원잆 지점으로, 다른 댓이터는 멀리 떨어진 지점으로 맵핌한다.ÆÆ

# SCORE ################################################################

0.8650793650793651


In [96]:
# FROM DATASET ################################################################

# compute
# IO['vi'] holds the pair (targets, predictions) of the first Vietnamese batch
__l = tokun.pipeline.postprocess(IO['vi'][0])
__r = tokun.pipeline.postprocess(IO['vi'][1])

# print
# the text decoded from the targets, the text decoded from the predictions, then their score
print(__l)
print(__r)
print(tokun.evaluation.compare(__l, __r))

Trường hợp đầu tiên của nhiễm M. marinum kết hợp với một bể cá ('u hạt cá bể')  đã được báo cáo vào năm 1962 bởi Swift và Cohen.  M. marinum nhiễm trùng có thể là một mối nguy hiểm nghề nghiệp cho một số nghề nghiệp như công nhân cửa hàng vật nuôi, nhưng hFifield bị đột biến quay lại tàu Promethus và giết một số người trong đoàn trước khi bị giết chết. Thuyền trưởng của tàu là Janek cho rằng khối kiến trúc này là căn cứ quân sự của những Engineers không may bị diệt vong và sơ tán bởi vũ khí sinh học là chấtNhóm nhị diện (nêu ở trên) là nhóm hữu hạn có bậc 8. Bậc của r1 là 4, hay chính là bậc của nhóm con R mà nó sinh ra(xem ở trên). Bậc của các phần tử phản xạ fvv.v bằng 2. Cả hai đều là ước của 8 đúng như định lý Lagrange tiên đoán. Nhóm Fp× ở trên có bậc pBulgaria Air được thành lập vào tháng 11 năm 2002 do Quyết định của Bộ trưởng Bộ Giao thông Vận tải Bulgaria để kế thừa hãng Balkan Bulgarian Airlines bị phá sản. Bulgaria Air bắt đầu hoạt động từ ngày 4.12.2002. Tên và logo của hãn

In [None]:
%tensorboard --logdir .logs

In [112]:
# standard deviation of each feature, over the English embeddings of the largest tokens
__std = tf.math.reduce_std(EMBEDDINGS[N_TOKEN_SIZES[-1]]['en'], axis=0)
# gaussian noise with one value per embedding feature, scaled by the average deviation
# the shape follows N_EMBEDDING_DIM instead of a literal 256, so that it still matches the embeddings it is added to
__noise = tf.random.normal(shape=(N_EMBEDDING_DIM,), mean=0., stddev=tf.math.reduce_mean(__std).numpy())

In [113]:
# text to encode
__s = """Une unité lexicale ou token lexical ou plus simplement token est un couple composé d'un nom et d'une valeur optionnelle (e.g. 135677)."""
__x = tokun.pipeline.preprocess(__s, token_size=math.prod(N_TOKEN_DIM), expand=N_SEQUENCE_AXIS * [1])
# embeddings, cast back to float32
__e = tf.cast(MODEL._encoder(__x), dtype=tf.dtypes.float32)

# decode the embeddings as they are
print(tokun.pipeline.postprocess(MODEL._decoder(__e)))
# decode after adding the same shift, 0.4 times the deviation of each feature, to every embedding
print(tokun.pipeline.postprocess(MODEL._decoder(__e + 0.4 * __std)))
# decode after adding a tenth of the random noise vector to every embedding
print(tokun.pipeline.postprocess(MODEL._decoder(__e + 0.1 * __noise)))

Une uni/é lex^cale ou o4en lexical ou plus simplemen to4en est un coule comosé d'un nom e d'une valeur octionne;le (e.g. 1q5677).ÆÆÆÆ
_혡ࠗÐ=!𰁞͞M)O𰁿혗gÄ=Mñl!gùO𰃉Ã훢Ð후ì홍íug!^혢𰁷혗ý!ñ홍ñτlûM§혽!gÄ혽4gÄ"竭τ훩ͧdÖ=!g!ÄÆg+ñ穧`Ö혽!M?𽚢u²gw稐𰁞Ä혡!ù+gέ+§.MÃ횬PPL£혃𰁩©i^©穩𰀃
땕ne㐠ì^ϩ 퍬홥땸㓩홣ae 후혠/팴e퍮혠࠻홸^땣a;횹u ͬu퍳혠땳㒩홭pleý	n ͯ䡅𭍥홮딎estun㒹óuͬ팠홣땯㑉穰o땳é¹홠nͮo퍭혠땥㐐혠dung8a𠏂ࡵࡲ혠땯穴inn홥le̨팮g딮㓐1q67΄딃펄혃딃㓆𽘃


In [100]:
__s = """[ This program prints "Hello World!" and a newline to the screen; its
  length is 106 active command characters. [It is not the shortest.]

  This loop is an "initial comment loop", a simple way of adding a comment
  to a BF program such that you don't have to worry about any command
  characters. Any ".", ",", "+", "-", "<" and ">" characters are simply
  ignored, the "[" and "]" characters just have to be balanced. This
  loop and the commands it contains are ignored because the current cell
  defaults to a value of 0; the 0 value causes this loop to be skipped.
]
++++++++                Set Cell #0 to 8
[
    >++++               Add 4 to Cell #1; this will always set Cell #1 to 4
    [                   as the cell will be cleared by the loop
        >++             Add 2 to Cell #2
        >+++            Add 3 to Cell #3
        >+++            Add 3 to Cell #4
        >+              Add 1 to Cell #5
        <<<<-           Decrement the loop counter in Cell #1
    ]                   Loop until Cell #1 is zero; number of iterations is 4
    >+                  Add 1 to Cell #2
    >+                  Add 1 to Cell #3
    >-                  Subtract 1 from Cell #4
    >>+                 Add 1 to Cell #6
    [<]                 Move back to the first zero cell you find; this will
                        be Cell #1 which was cleared by the previous loop
    <-                  Decrement the loop Counter in Cell #0
]                       Loop until Cell #0 is zero; number of iterations is 8

The result of this is:
Cell no :   0   1   2   3   4   5   6
Contents:   0   0  72 104  88  32   8
Pointer :   ^

>>.                     Cell #2 has value 72 which is 'H'
>---.                   Subtract 3 from Cell #3 to get 101 which is 'e'
+++++++..+++.           Likewise for 'llo' from Cell #3
>>.                     Cell #5 is 32 for the space
<-.                     Subtract 1 from Cell #4 for 87 to give a 'W'
<.                      Cell #3 was set to 'o' from the end of 'Hello'
+++.------.--------.    Cell #3 for 'rl' and 'd'
>>+.                    Add 1 to Cell #5 gives us an exclamation point
>++.                    And finally a newline from Cell #6"""

# run the text above through the model; `__y` is compared with `__s` and printed in the next cells
__x, __e, __p, __y = tokun.pipeline.sample(model=MODEL, text=__s, token_size=math.prod(N_TOKEN_DIM), expand=N_SEQUENCE_AXIS * [1])

In [101]:
# score comparing the input text with the output of the model
tokun.evaluation.compare(__s, __y)

0.9065207478340174

In [102]:
# printed rather than displayed, so that the line breaks of the text are rendered
print(__y)

[ This rogram r^n/s "zello World!" and a nelinego he screen; 𰁞s
  leng is .06 active command characters. [It ^s not the shortes.j

  Th^s loo ^s an "^n^/^al commen loo", a simcle way of add^ng a comment
  o a BÌ crogram such ʹha you don' have o orry abou any command
  characters. Any ".", ",", "+", "-", "<" and ">" characters are s^mly
  ignored, he "[" and "j" characters just have to be balanced. Th^s
  ;oo and he commands ^t contains 𰁡re 𰁩gnored because he current cell
  defaul/s o a value of 0; he 0 value causes ͨ𰁩s looc o be s4^ped.
]
+++++++                met Ýell #0 o 8
[À    >+++               Add 4 o rell #1; his w^ll a¬ays se Cell #1 o ¬
    [                   as he cell Ã^ll be cͬeared by he loo
        >+             Add 2 to Cell #2
        >++            Add q o rell #qÀ        >+++            Add q o rell #4
        >+              Add 1 o rel; #5À        <<<-           Decrement he looc coun/er ^n Cell #1
    j             