## Install and import the dependencies

In [None]:
!pip install mlable tokun

%load_ext tensorboard

Collecting mlable
  Downloading mlable-0.3.13-py3-none-any.whl (16 kB)
Collecting tokun
  Downloading tokun-0.6.1-py3-none-any.whl (14 kB)
Installing collected packages: mlable, tokun
Successfully installed mlable-0.3.13 tokun-0.6.1


In [None]:
import datetime
import functools
import itertools
import math
import os
import urllib.request

import tensorflow as tf
import tensorflow_datasets as tfds

import mlable.metrics
import mlable.io

import tokun.evaluation
import tokun.meta
import tokun.model
import tokun.pipeline

In [None]:
# record the runtime version next to the results
print(f"Tensorflow version {tf.__version__}")

Tensorflow version 2.15.0


## Setup the GPU / TPU

In [None]:
# DEVICES #####################################################################

# keep the logs free from the placement of each operation
tf.debugging.set_log_device_placement(False)

# logical devices visible to the runtime, listed by kind
CPU, GPU, TPU = (tf.config.list_logical_devices(__kind) for __kind in ('CPU', 'GPU', 'TPU'))

if not TPU:
    # mirror on the GPUs when there are any, and fall back on the CPU otherwise
    DISTRIBUTION_STRATEGY = tf.distribute.MirroredStrategy(GPU if GPU else CPU)
else:
    # connect to the cluster and initialize the TPU system, then build the strategy on the same resolver
    RESOLVER = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(RESOLVER)
    tf.tpu.experimental.initialize_tpu_system(RESOLVER)
    DISTRIBUTION_STRATEGY = tf.distribute.TPUStrategy(RESOLVER)

# show which strategy was selected
print(DISTRIBUTION_STRATEGY)

<tensorflow.python.distribute.tpu_strategy.TPUStrategyV2 object at 0x7c8aa5755090>


## Defining The Metadata

In [None]:
# TOGGLE ######################################################################

# True: output dimension of 8, with the binary accuracy and cross-entropy
# False: output dimension of 256, with the categorical accuracy and cross-entropy
BINARY = True

In [None]:
# PARAMETERS ##################################################################

# N_SEQUENCE_AXIS is given to `tokun.meta.version` and sizes the `expand` argument of the preprocessing
N_SEQUENCE_AXIS = 1
N_FEATURE_AXIS = -1

# the product of the token dims (4 * 16 = 64) is the `token_size` given to the preprocessing
N_TOKEN_DIM = [4, 16] # G, for each block
N_INPUT_DIM = 256 # U_i (bytes)
N_EMBED_DIM = 256 # E
# the output dimension follows the BINARY toggle
N_OUTPUT_DIM = 8 if BINARY else 256 # U_o (8 bits)

In [None]:
# DERIVED #####################################################################

# cumulated products of the token dims: [4, 16] => [4, 64]
N_TOKEN_SIZES = [math.prod(N_TOKEN_DIM[:__i]) for __i in range(1, len(N_TOKEN_DIM) + 1)] # in bytes

In [None]:
# IMPORT ######################################################################

# label of the trained weights to fetch
LABEL = '8.6'
# identifiers of the architecture, expanded into the path components of the URL
VERSION = tokun.meta.version(
    token_dim=N_TOKEN_DIM,
    sequence_axis=N_SEQUENCE_AXIS,
    input_dim=N_INPUT_DIM,
    embed_dim=N_EMBED_DIM,
    output_dim=N_OUTPUT_DIM)

# remote file and local copy
URL_IMPORT = 'https://github.com/apehex/tokun/raw/main/models/{}/{}/{}/{}.keras'.format(*VERSION, LABEL)
PATH_IMPORT = 'model.keras'

# download the model file; the returned (path, headers) pair is displayed as the cell output
urllib.request.urlretrieve(URL_IMPORT, PATH_IMPORT)

('model.keras', <http.client.HTTPMessage at 0x7c897e52f9d0>)

In [None]:
# SAMPLES #####################################################################

# texts in several languages and formats; SAMPLES[1] is the one encoded in the cells below
SAMPLES = [
    # English prose
    """A variational autoencoder is a generative model with a prior and noise distribution respectively. Usually such models are trained using the expectation-maximization meta-algorithm (e.g. probabilistic PCA, (spike & slab) sparse coding). Such a scheme optimizes a lower bound of the data likelihood, which is usually intractable, and in doing so requires the discovery of q-distributions, or variational posteriors. These q-distributions are normally parameterized for each individual data point in a separate optimization process. However, variational autoencoders use a neural network as an amortized approach to jointly optimize across data points. This neural network takes as input the data points themselves, and outputs parameters for the variational distribution. As it maps from a known input space to the low-dimensional latent space, it is called the encoder.""",
    # French prose
    """Une unité lexicale ou token lexical ou plus simplement token est un couple composé d'un nom et d'une valeur optionnelle (e.g. 135677).""",
    # Python code, indented with 4 spaces
    """class AutoEncoder(tf.keras.models.Model):\n    def __init__(self, token_dim: int, encoding_dim: int, embedding_dim: int, batch_dim: int=None, **kwargs) -> None:\n        super(AutoEncoder, self).__init__(**kwargs)\n        self._encoder = Encoder(token_dim=token_dim, encoding_dim=encoding_dim, embedding_dim=embedding_dim, batch_dim=batch_dim)\n        self._decoder = Decoder(token_dim=token_dim, encoding_dim=encoding_dim, embedding_dim=embedding_dim, batch_dim=batch_dim)\n\n    def call(self, x: tf.Tensor) -> tf.Tensor:\n        return self._decoder(self._encoder(x))""",
    # the same Python code, indented with 2 spaces
    """class AutoEncoder(tf.keras.models.Model):\n  def __init__(self, token_dim: int, encoding_dim: int, embedding_dim: int, batch_dim: int=None, **kwargs) -> None:\n    super(AutoEncoder, self).__init__(**kwargs)\n    self._encoder = Encoder(token_dim=token_dim, encoding_dim=encoding_dim, embedding_dim=embedding_dim, batch_dim=batch_dim)\n    self._decoder = Decoder(token_dim=token_dim, encoding_dim=encoding_dim, embedding_dim=embedding_dim, batch_dim=batch_dim)\n\n  def call(self, x: tf.Tensor) -> tf.Tensor:\n    return self._decoder(self._encoder(x))""",
    # Korean prose
    """위키백과, 우리 모두의 백과사전.\nt-분포 확률적 임베딩(t-SNE)은 데이터의 차원 축소에 사용되는 기계 학습 알고리즘 중 하나로, 2002년 샘 로이스Sam Rowise와 제프리 힌튼에 의해 개발되었다.[1] t-SNE는 비선형 차원 축소 기법으로, 고차원 데이터를 특히 2, 3차원 등으로 줄여 가시화하는데에 유용하게 사용된다. 구체적으로 t-SNE는 비슷한 데이터는 근접한 2, 3차원의 지점으로, 다른 데이터는 멀리 떨어진 지점으로 맵핑한다.""",]

## Init

In [None]:
# METRICS #####################################################################

# the metric and loss classes follow the BINARY toggle
if BINARY:
    _Accuracy = mlable.metrics.BinaryGroupAccuracy
    _Loss = tf.keras.losses.BinaryCrossentropy
else:
    _Accuracy = mlable.metrics.CategoricalGroupAccuracy
    _Loss = tf.keras.losses.CategoricalCrossentropy

In [None]:
# INIT ########################################################################

with DISTRIBUTION_STRATEGY.scope():
    # metrics: same class, with groups of 1, 4 and N_TOKEN_SIZES[-1] elements
    byte_accuracy, character_accuracy, token_accuracy = (
        _Accuracy(group=__g, name=__n)
        for __g, __n in (
            (1, 'byte_accuracy'),
            (4, 'character_accuracy'),
            (N_TOKEN_SIZES[-1], 'token_accuracy')))
    # weights: loaded without the saved training configuration
    MODEL = tf.keras.models.load_model(PATH_IMPORT, compile=False)
    # compilation: attach the optimizer, loss and metrics defined in this notebook
    MODEL.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
        loss=_Loss(
            from_logits=False,
            label_smoothing=0.,
            axis=-1,
            reduction='sum_over_batch_size',
            name='ce_loss'),
        metrics=[byte_accuracy, character_accuracy, token_accuracy])


In [None]:
# overview of the layers and parameter counts of the loaded model
MODEL.summary()

Model: "auto_encoder"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 encoder (Encoder)           multiple                  1377792   
                                                                 
 decoder (Decoder)           multiple                  1382656   
                                                                 
Total params: 2760448 (10.53 MB)
Trainable params: 2760448 (10.53 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


## Tokenize

In [None]:
# ENCODE ######################################################################

# the French sample
__s = SAMPLES[1]
# text => model inputs, with tokens of prod(N_TOKEN_DIM) = 64 bytes
__x = tokun.pipeline.preprocess(
    text=__s,
    token_size=math.prod(N_TOKEN_DIM),
    expand=[1] * N_SEQUENCE_AXIS)
# final embedding = input for another model
__e = MODEL._encoder(__x)

In [None]:
# VIEW ########################################################################

# one value per line: the text length, then the shapes before and after encoding
print(
    len(__s), # original text length, roughly S
    __x.shape, # 4 * S, with padding
    __e.shape, # 4 * S // 64
    sep='\n')

134
(1, 576, 256)
(1, 9, 256)


## Detokenize

In [None]:
# DECODE ######################################################################

# embeddings => raw predictions of the decoder
__p = MODEL._decoder(__e)
# predictions => decoded data; `random=False` presumably disables sampling (TODO confirm in tokun)
__y = tokun.pipeline.postprocess(__p, binary=BINARY, random=False)
# decoded data => texts, indexed by sample: `__o[0]` is the reconstruction of `__s`
__o = tokun.pipeline.unpack(__y)

In [None]:
# the input, its reconstruction and the score returned by `tokun.evaluation.compare`
print('# INPUT ################################################################\n\n', __s, sep='')
print('\n# OUTPUT ###############################################################\n\n', __o[0], sep='')
print('\n# SCORE ################################################################\n\n', tokun.evaluation.compare(__s, __o[0]), sep='')

# INPUT ################################################################

Une unité lexicale ou token lexical ou plus simplement token est un couple composé d'un nom et d'une valeur optionnelle (e.g. 135677).

# OUTPUT ###############################################################

Une unité lexicale ou token lexical ou plus simplement token est un couple composé d'un nom et d'une valeur optionnelle (e.g. 135677).

# SCORE ################################################################

1.0


## Robustness

In [None]:
# SAMPLE ######################################################################

# the French sample, referenced from SAMPLES rather than pasted a second time
__x = tokun.pipeline.preprocess(SAMPLES[1], token_size=math.prod(N_TOKEN_DIM), expand=N_SEQUENCE_AXIS * [1])
# embeddings that are perturbed in the next cells
__e = MODEL._encoder(__x)

In [None]:
# NOISE #######################################################################

# standard deviation of each embedding feature, along the sequence axis
__std = tf.math.reduce_std(__e, axis=N_SEQUENCE_AXIS)
# gaussian noise scaled by the average deviation, with one value per embedding feature;
# sized from N_EMBED_DIM rather than a literal, and given an operation seed so that
# restarting the kernel and running all the cells draws the same noise
__noise = tf.random.normal(shape=(N_EMBED_DIM,), mean=0., stddev=tf.math.reduce_mean(__std).numpy(), seed=1337)

In [None]:
# DETOKENIZE ##################################################################

def _detokenize(embedding: tf.Tensor) -> str:
    """Decode embeddings with the model and return the text of the first sample.

    Follows the same steps as the "Detokenize" section: decoder, `postprocess`
    and `unpack`, so that the plain text is printed rather than the raw
    postprocessing output.
    """
    __y = tokun.pipeline.postprocess(MODEL._decoder(embedding), binary=BINARY, random=False)
    return tokun.pipeline.unpack(__y)[0]

print(_detokenize(__e)) # original
print(_detokenize(__e + 0.8 * __std)) # with structured noise
print(_detokenize(__e + 1. * __noise)) # with random noise

Une unité lexicale ou token lexical ou plus simplement token est un couple composé d'un nom et d'une valeur optionnelle (e.g. 135677).
Une unité lexicale ou token lexical ou plus simplement token est un couple composé d'un nom et d'une valeur optionnelle (e.g. 135677).
Une unité lexicale ou token lexical ou plus simplement token est un couple composé d'un nom et d'une valeur optionnelle (e.g. 135677).


## Text Similarity

In [None]:
# SAMPLES #####################################################################

# sentence against which all the candidates are scored
REFERENCE = 'A couple is enjoying fresh sushis at home.'

# variations of the reference: different words, different order, different length
CANDIDATES = [
    'A couple is cooking fresh sushis at home.',
    'Ana and Bob are enjoying sushis while watching TV.',
    'A couple is enjoying california rolls.',
    'You can order fresh sushis from home.',
    'Enjoying fresh sushis at home a couple is.',
    'Sushis are the best!',
]

In [None]:
# METRIC ######################################################################

# Keras loss: it returns the negative of the cosine similarity, so the closer to -1 the more similar
__f = tf.keras.losses.CosineSimilarity(axis=-1, reduction=tf.keras.losses.Reduction.SUM_OVER_BATCH_SIZE)

# REFERENCE ###################################################################

# unpacked into 5 values with the same decoding options as the candidates in the distance loop
# (presumably inputs, embeddings, predictions, decoded data and texts; confirm against tokun)
__xr, __er, __pr, __yr, __or = tokun.pipeline.sample(model=MODEL, text=REFERENCE, token_size=math.prod(N_TOKEN_DIM), expand=N_SEQUENCE_AXIS * [1], binary=BINARY, random=False)

# SHAPE #######################################################################

# the candidate embeddings are sliced down to this shape before the comparison
__sr = list(__er.shape) # shape
__rr = len(__sr) # rank

In [None]:
# DISTANCES ###################################################################

# token size scaled by the sequence length of the reference embeddings: presumably pads
# each candidate so that its embeddings are at least as long as the reference (TODO confirm)
__size = math.prod(__sr[1:2] + N_TOKEN_DIM)

# score of each candidate against the reference, indexed by the candidate text
DISTANCES = {}

for __c in CANDIDATES:
    __xl, __el, __pl, __yl, __ol = tokun.pipeline.sample(
        model=MODEL,
        text=__c,
        token_size=__size,
        expand=N_SEQUENCE_AXIS * [1],
        binary=BINARY,
        random=False)
    # match reference shape
    __el = tf.slice(__el, begin=__rr * [0], size=__sr)
    # scalar value of the metric for this pair
    DISTANCES[__c] = __f(y_true=__er, y_pred=__el).numpy()

In [None]:
# SORT ########################################################################

# ascending order of the scores: the most negative values come first
DISTANCES = {__k: DISTANCES[__k] for __k in sorted(DISTANCES, key=DISTANCES.get)}

In [None]:
# candidates and their scores, in sorted order
DISTANCES

{'A couple is enjoying california rolls.': -0.9971239,
 'A couple is cooking fresh sushis at home.': -0.9952307,
 'Sushis are the best!': -0.9944132,
 'Enjoying fresh sushis at home a couple is.': -0.9937381,
 'You can order fresh sushis from home.': -0.993692,
 'Ana and Bob are enjoying sushis while watching TV.': -0.9927745}