## Import deps

In [2]:
!pip install tokun

%load_ext tensorboard

Collecting tokun
  Downloading tokun-0.3.4-py3-none-any.whl (12 kB)
Collecting mlable>=0.1.3 (from tokun)
  Downloading mlable-0.1.3-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: mlable, tokun
Successfully installed mlable-0.1.3 tokun-0.3.4


In [3]:
import datetime
import functools
import itertools
import math
import os
import urllib.request

import tensorflow as tf
import tensorflow_datasets as tfds

import tokun.meta
import tokun.model
import tokun.pipeline

In [4]:
# Report the TensorFlow release this notebook is running against.
print(f"Tensorflow version {tf.__version__}")

Tensorflow version 2.15.0


## Setup the GPU / TPU

In [5]:
# Keep device-placement logging off so the cell outputs stay readable.
tf.debugging.set_log_device_placement(False)

# Logical accelerators visible to TensorFlow; an empty list means "not available".
GPU = tf.config.list_logical_devices('GPU')
TPU = tf.config.list_logical_devices('TPU')

# Pick the distribution strategy matching the hardware: TPU first, then GPU.
if TPU:
    RESOLVER = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(RESOLVER)
    tf.tpu.experimental.initialize_tpu_system(RESOLVER)
    DISTRIBUTION_STRATEGY = tf.distribute.TPUStrategy(RESOLVER)
elif GPU:
    DISTRIBUTION_STRATEGY = tf.distribute.MirroredStrategy(GPU)
else:
    # No accelerator: fall back to the default strategy, so that the name is
    # always bound and `DISTRIBUTION_STRATEGY.scope()` still works on CPU.
    DISTRIBUTION_STRATEGY = tf.distribute.get_strategy()

print(DISTRIBUTION_STRATEGY)

<tensorflow.python.distribute.tpu_strategy.TPUStrategyV2 object at 0x7c28a7af2140>


## Defining The Metadata

In [6]:
# PARAMETERS ##################################################################

# Flags forwarded to `tokun.meta.version` when building the path of the
# pretrained model to download.
ATTENTION = True
NORMALIZATION = True

# Passed as `groups=` to both `tokun.meta.version` and `tokun.pipeline.preprocess`.
N_TOKEN_DIM = [4, 4, 4] # G, for each block

In [7]:
# DERIVED #####################################################################

# Running product of the group sizes: entry i multiplies N_TOKEN_DIM[0] .. N_TOKEN_DIM[i].
TOKEN_SIZES = [math.prod(N_TOKEN_DIM[:__i + 1]) for __i in range(len(N_TOKEN_DIM))] # in bytes

In [9]:
# IMPORT ######################################################################

# Components of the model path, computed by tokun from the parameters above.
VERSION = tokun.meta.version(groups=N_TOKEN_DIM, attention=ATTENTION, normalization=NORMALIZATION)
# Base name of the `.keras` file to fetch.
# NOTE(review): presumably the accuracy reached by that checkpoint -- confirm.
LABEL = '0.99996'

# The template has four slots: the unpacked VERSION fills the leading ones, LABEL the last.
URL_IMPORT = 'https://github.com/apehex/tokun/raw/main/models/{}/{}/{}/{}.keras'.format(*VERSION, LABEL)
# Local destination, relative to the current working directory.
PATH_IMPORT = 'model.keras'

# Download the serialized model; the returned (path, headers) tuple is the cell output.
urllib.request.urlretrieve(URL_IMPORT, PATH_IMPORT)

('model.keras', <http.client.HTTPMessage at 0x7c2790cc15a0>)

In [10]:
# SAMPLES #####################################################################

# Test inputs, in order: an English paragraph, a Python class indented with
# 4 spaces, the same class indented with 2 spaces, and a Korean paragraph.
SAMPLES = [
    """A variational autoencoder is a generative model with a prior and noise distribution respectively. Usually such models are trained using the expectation-maximization meta-algorithm (e.g. probabilistic PCA, (spike & slab) sparse coding). Such a scheme optimizes a lower bound of the data likelihood, which is usually intractable, and in doing so requires the discovery of q-distributions, or variational posteriors. These q-distributions are normally parameterized for each individual data point in a separate optimization process. However, variational autoencoders use a neural network as an amortized approach to jointly optimize across data points. This neural network takes as input the data points themselves, and outputs parameters for the variational distribution. As it maps from a known input space to the low-dimensional latent space, it is called the encoder.""",
    """class AutoEncoder(tf.keras.models.Model):\n    def __init__(self, token_dim: int, encoding_dim: int, embedding_dim: int, latent_dim: int, batch_dim: int=None, **kwargs) -> None:\n        super(AutoEncoder, self).__init__(**kwargs)\n        self._encoder = Encoder(token_dim=token_dim, encoding_dim=encoding_dim, embedding_dim=embedding_dim, latent_dim=latent_dim, batch_dim=batch_dim)\n        self._decoder = Decoder(token_dim=token_dim, encoding_dim=encoding_dim, embedding_dim=embedding_dim, latent_dim=latent_dim, batch_dim=batch_dim)\n\n    def call(self, x: tf.Tensor) -> tf.Tensor:\n        return self._decoder(self._encoder(x))""",
    """class AutoEncoder(tf.keras.models.Model):\n  def __init__(self, token_dim: int, encoding_dim: int, embedding_dim: int, latent_dim: int, batch_dim: int=None, **kwargs) -> None:\n    super(AutoEncoder, self).__init__(**kwargs)\n    self._encoder = Encoder(token_dim=token_dim, encoding_dim=encoding_dim, embedding_dim=embedding_dim, latent_dim=latent_dim, batch_dim=batch_dim)\n    self._decoder = Decoder(token_dim=token_dim, encoding_dim=encoding_dim, embedding_dim=embedding_dim, latent_dim=latent_dim, batch_dim=batch_dim)\n\n  def call(self, x: tf.Tensor) -> tf.Tensor:\n    return self._decoder(self._encoder(x))""",
    """위키백과, 우리 모두의 백과사전.\nt-분포 확률적 임베딩(t-SNE)은 데이터의 차원 축소에 사용되는 기계 학습 알고리즘 중 하나로, 2002년 샘 로이스Sam Rowise와 제프리 힌튼에 의해 개발되었다.[1] t-SNE는 비선형 차원 축소 기법으로, 고차원 데이터를 특히 2, 3차원 등으로 줄여 가시화하는데에 유용하게 사용된다. 구체적으로 t-SNE는 비슷한 데이터는 근접한 2, 3차원의 지점으로, 다른 데이터는 멀리 떨어진 지점으로 맵핑한다.""",]

## Init

In [12]:
# INIT ########################################################################

# Load the downloaded model and compile it, all inside the strategy scope.
with DISTRIBUTION_STRATEGY.scope():
    MODEL = tf.keras.models.load_model(PATH_IMPORT)
    # Training configuration, built after loading and in the same order as the compile arguments.
    __optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)
    __loss = tf.keras.losses.CategoricalCrossentropy(
        from_logits=False,
        label_smoothing=0.,
        axis=-1,
        reduction=tf.keras.losses.Reduction.SUM_OVER_BATCH_SIZE,
        name='loss')
    MODEL.compile(optimizer=__optimizer, loss=__loss, metrics=['accuracy'])


In [13]:
# Print the layer overview of the loaded autoencoder along with its parameter counts.
MODEL.summary()

Model: "auto_encoder_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 encoder_1 (Encoder)         multiple                  857344    
                                                                 
 decoder_1 (Decoder)         multiple                  859904    
                                                                 
Total params: 1717248 (6.55 MB)
Trainable params: 1717248 (6.55 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


## Tokenize

In [14]:
# ENCODE ######################################################################

# Work on the first sample, the English paragraph.
__s = SAMPLES[0]
# Turn the text into the tensor fed to the encoder; its shape is printed in the next cell.
__x = tokun.pipeline.preprocess(text=__s, groups=N_TOKEN_DIM, flatten=True)
# `_encoder` is the encoder sub-model listed in the model summary.
__e = MODEL._encoder(__x) # final embedding = input for another model

In [15]:
# VIEW ########################################################################

# One value per output line, in this order:
#   - original text length, roughly S
#   - shape of the preprocessed input: 4 * S rows, with padding
#   - shape of the embeddings: 4 * S // 64 rows
print(len(__s), __x.shape, __e.shape, sep='\n')

868
(3520, 256)
(55, 256)


## Detokenize

In [16]:
# DECODE ######################################################################

# Feed the embeddings to `_decoder`, the decoder sub-model listed in the model summary.
__p = MODEL._decoder(__e)
# Convert the decoder output back to text; it is printed next to the original in the next cell.
__y = tokun.pipeline.postprocess(__p)

In [17]:
# Show the input, the round-trip output, and tokun's comparison of the two.
print(__s) # original text
print(__y) # text rebuilt from the embeddings
print(tokun.pipeline.compare(__s, __y)) # NOTE(review): presumably a similarity score between both strings -- confirm against tokun

A variational autoencoder is a generative model with a prior and noise distribution respectively. Usually such models are trained using the expectation-maximization meta-algorithm (e.g. probabilistic PCA, (spike & slab) sparse coding). Such a scheme optimizes a lower bound of the data likelihood, which is usually intractable, and in doing so requires the discovery of q-distributions, or variational posteriors. These q-distributions are normally parameterized for each individual data point in a separate optimization process. However, variational autoencoders use a neural network as an amortized approach to jointly optimize across data points. This neural network takes as input the data points themselves, and outputs parameters for the variational distribution. As it maps from a known input space to the low-dimensional latent space, it is called the encoder.
A variational autoencoder is a generative model with a prior and noise distribution respectively. Usually such models are trained u

## Robustness

In [20]:
# SAMPLE ######################################################################

# Encode a single sentence (the opening of the first sample, without its final
# period) to serve as the baseline of the noise experiments below.
# NOTE: this rebinds `__x` and `__e`, which earlier cells assigned for the full sample.
__x = tokun.pipeline.preprocess('A variational autoencoder is a generative model with a prior and noise distribution respectively', groups=N_TOKEN_DIM, flatten=True)
__e = MODEL._encoder(__x)

In [21]:
# NOISE #######################################################################

# Fixed seed pair for the stateless generator: the exact same noise is drawn
# every time this cell runs, which keeps the experiment below reproducible.
NOISE_SEED = [0, 42]

# Standard deviation of each embedding dimension, measured across the rows of `__e`.
__std = tf.math.reduce_std(__e, axis=0)
# Gaussian noise with one value per embedding dimension (shape taken from `__std`
# rather than hard-coded), scaled by the average of the per-dimension deviations.
__noise = tf.random.stateless_normal(shape=__std.shape, seed=NOISE_SEED, mean=0., stddev=tf.math.reduce_mean(__std).numpy())

In [28]:
# DETOKENIZE ##################################################################

# Decode three variants of the embeddings, one per output line:
#   1. the untouched embeddings (original)
#   2. shifted by 0.9 * the per-dimension deviation (with structured noise)
#   3. shifted by 0.6 * the sampled Gaussian vector (with random noise)
for __z in (__e, __e + 0.9 * __std, __e + 0.6 * __noise):
    print(tokun.pipeline.postprocess(MODEL._decoder(__z)))

A variational autoencoder is a generative model with a prior and noise distribution respectively
A variational autoencoder is a generative model with a prior and noise istribution res/ectively
A  ariational aìtoencoder is a generative model with a prior auu nîise distribBtion respecti elî
