## Import deps

In [None]:
!pip install -U datasets mlable

In [None]:
!pip install -U --no-index -f '/content/libs/' tokun

In [None]:
import datetime
import functools
import itertools
import math
import os
import random
import urllib.request

import datasets as hd
import tensorflow as tf

import mlable.data
import mlable.metrics
import mlable.ops

import tokun.data
import tokun.models.meta
import tokun.models.mlp
import tokun.pipeline.evaluate
import tokun.pipeline.preprocess
import tokun.pipeline.text

In [None]:
# report the version of the Tensorflow runtime in use
print(f"Tensorflow version {tf.__version__}")

## Setup the GPU / TPU

In [None]:
# MIXED PRECISION ##############################################################

# Set the global Keras dtype policy to 'mixed_bfloat16'.
# The policy is set before the devices are listed and applies whatever hardware is found below.
tf.keras.mixed_precision.set_global_policy('mixed_bfloat16')

In [None]:
# DEVICES ######################################################################

# do not log the device on which each op is placed
tf.debugging.set_log_device_placement(False)

# logical devices visible to the runtime, by kind
CPU, GPU, TPU = (tf.config.list_logical_devices(__k) for __k in ('CPU', 'GPU', 'TPU'))

# pick the distribution strategy, by order of preference: TPU, then GPU, then CPU
if not (TPU or GPU):
    # fallback: mirror on the CPU devices
    DISTRIBUTION_STRATEGY = tf.distribute.MirroredStrategy(CPU)
elif not TPU:
    # no TPU, but at least one GPU
    DISTRIBUTION_STRATEGY = tf.distribute.MirroredStrategy(GPU)
else:
    # connect to the TPU cluster and initialize it before creating the strategy
    RESOLVER = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(RESOLVER)
    tf.tpu.experimental.initialize_tpu_system(RESOLVER)
    DISTRIBUTION_STRATEGY = tf.distribute.TPUStrategy(RESOLVER)

print(DISTRIBUTION_STRATEGY)

## Mode

In [None]:
# TOGGLE #######################################################################

IMPORT = True
DOWNLOAD = False
TRAINING = True
RANDOM = True

## Defining The Metadata

In [None]:
# MODEL PARAMETERS #############################################################

# hyper-parameters of the autoencoder, forwarded as keyword arguments to its constructor
TOKUN_CONFIG = dict(
    sequence_axis=1,
    feature_axis=-1,
    token_dim=[4, 2, 2, 2], # G, for each block
    latent_dim=[16, 32, 64, 128], # L, for each block
    input_dim=256, # U_i (bytes)
    embed_dim=8, # E
    output_dim=8, # U_o (8 bits)
    activation='gelu')

In [None]:
# DERIVED MODEL PARAMETERS #####################################################

# arguments of the version helper, all read from the model config
# NOTE(review): 'embed_dim' is filled with the *latent* dimensions -- looks deliberate, confirm
VERSION_CONFIG = dict(
    token_dim=TOKUN_CONFIG['token_dim'],
    input_dim=TOKUN_CONFIG['input_dim'],
    embed_dim=TOKUN_CONFIG['latent_dim'],
    output_dim=TOKUN_CONFIG['output_dim'],
    sequence_axis=TOKUN_CONFIG['sequence_axis'])

# identifiers of the model: the computed version and a manual label
META_CONFIG = dict(
    version=tokun.models.meta.version(**VERSION_CONFIG),
    label='6.1')

# remote location of the saved model (built from the version and label) and local filename
IO_CONFIG = dict(
    url='https://github.com/apehex/tokun/raw/main/models/{}/{}/{}/{}.keras'.format(*META_CONFIG['version'], META_CONFIG['label']),
    path='tokun.keras')

In [None]:
# TRAINING PARAMETERS ##########################################################

# keyword arguments forwarded to `MODEL.fit`
TRAINING_CONFIG = {
    'epochs': 8,
    # the batch size and validation split are left unset: `fit` is fed datasets that are already batched and split
    'batch_size': None,
    'validation_split': None,
    # 'class_weight': {__c: 1. if __c == 0 else 1. for __c in range(256)}, # there are 3 times more 0s than other bytes
    'verbose': 1,}

# validate at the end of every epoch: the list is derived from 'epochs' so that the two settings cannot drift apart
TRAINING_CONFIG['validation_freq'] = list(range(1, TRAINING_CONFIG['epochs'] + 1))

# keyword arguments of the AdamW optimizer
OPTIMIZER_CONFIG = dict(
    # the base rate is divided by 10 when the weights are imported
    learning_rate=0.001 * (0.1 if IMPORT else 1.0),
    weight_decay=0.001,
    beta_1=0.9,
    beta_2=0.95,
    epsilon=1e-6,
    clipnorm=0.1,
    amsgrad=False,
    use_ema=False,
    ema_momentum=0.99,
    ema_overwrite_frequency=1024)
    # gradient_accumulation_steps=2,

# settings for a cosine learning rate schedule, starting from the optimizer rate
# NOTE(review): not referenced by the visible cells -- the optimizer receives a constant rate
SCHEDULER_CONFIG = dict(
    initial_learning_rate=OPTIMIZER_CONFIG['learning_rate'],
    decay_steps=TRAINING_CONFIG['epochs'] * 1024,
    alpha=0.01,
    name='cosine_lr',
    warmup_target=None,
    warmup_steps=0)

# keyword arguments of the binary cross-entropy loss, computed on logits
LOSS_CONFIG = dict(
    from_logits=True,
    label_smoothing=0.0,
    axis=-1,
    reduction='sum_over_batch_size',
    name='ce_loss')

# settings for the metrics
# NOTE(review): not referenced by the visible cells
METRICS_CONFIG = dict(
    # factor=256,
    depth=-1)

# keyword arguments of the checkpoint callback: the whole model is saved to the IO path at every epoch
CHECKPOINT_CONFIG = dict(
    filepath=IO_CONFIG['path'],
    monitor='val_loss',
    mode='auto',
    save_freq='epoch',
    save_best_only=False,
    save_weights_only=False,
    verbose=1)

# keyword arguments of the tensorboard callback
TENSORBOARD_CONFIG = dict(
    # one directory per run: .logs/<version parts>/<timestamp>
    log_dir=os.path.join(
        '.logs/',
        *META_CONFIG['version'],
        datetime.datetime.now().strftime("%Y%m%d-%H%M%S")),
    histogram_freq=1,
    embeddings_freq=1,
    profile_batch=(0, 4),
    write_graph=True,
    write_images=True)

In [None]:
# PREPROCESSING ################################################################

# keyword arguments of `tf.data.Dataset.batch`
BATCH_CONFIG = dict(
    batch_size=256,
    drop_remainder=True,
    num_parallel_calls=tf.data.AUTOTUNE)

# keyword arguments of the preprocessing factory
PIPELINE_CONFIG = dict(
    batch_dim=BATCH_CONFIG['batch_size'],
    sample_dim=4 * 512,
    # product of the token dimensions of all the blocks
    token_dim=math.prod(TOKUN_CONFIG['token_dim']),
    separator='\u001d')

In [None]:
# DATASETS #####################################################################

# Arguments used to load each dataset: the hub path, the config name, the list of split slices
# and the features (columns) to read. Only the entries that are not commented out are loaded.
DATASETS_CONFIG = {
    # 'pt-fineweb-edu': {
    #     'path': 'HuggingFaceFW/fineweb-edu',
    #     'name': 'sample-10BT',
    #     'splits': [f'train[{__p}%:{__p + 10}%]' for __p in range(0, 100, 10)],
    #     'features': ['text'],},
    # 'pt-fineweb-kor': {
    #     'path': 'HuggingFaceFW/fineweb-2',
    #     'name': 'kor_Hang',
    #     'splits': [f'train[{__p}%:{__p + 10}%]' for __p in range(0, 100, 10)],
    #     'features': ['text'],},
    # 'pt-fineweb-fin': {
    #     'path': 'HuggingFaceFW/fineweb-2',
    #     'name': 'fin_Latn',
    #     'splits': [f'train[{__p}%:{__p + 10}%]' for __p in range(0, 100, 10)],
    #     'features': ['text'],},
    'pt-wikipedia': {
        'path': 'wikimedia/wikipedia',
        'name': '20231101.en',
        # 10 slices of 8% covering train[0%:80%]: the width of each slice matches the step of the
        # range, so that consecutive slices do not overlap; in particular the last slice, which is
        # held out as the test set, shares no sample with the training slices
        'splits': [f'train[{__p}%:{__p + 8}%]' for __p in range(0, 80, 8)],
        'features': ['text'],},
    # 'tp-wikipedia-1': {
    #     'path': 'wikimedia/wikipedia',
    #     'name': '20231101.en',
    #     'splits': [f'train[{__p}%:{__p + 1}%]' for __p in range(80, 90, 1)],
    #     'features': ['text'],},
    # 'tp-wikipedia-2': {
    #     'path': 'wikimedia/wikipedia',
    #     'name': '20231101.en',
    #     'splits': [f'train[{__p}%:{__p + 1}%]' for __p in range(90, 100, 1)],
    #     'features': ['text'],},
    # 'ft-retro-ascii-art': {
    #     'path': 'jdpressman/retro-ascii-art-v1',
    #     'name': None,
    #     'train': 'train',
    #     'splits': [f'train[{__p}%:{__p + 10}%]+validation[{__p}%:{__p + 10}%]' for __p in range(0, 100, 10)],
    #     'features': ['prompt', 'art_aic'],},
    # 'ft-stack-exchange': {
    #     'path': 'Alignment-Lab-AI/Stack-Exchange-April',
    #     'name': None,
    #     'splits': [f'train[{__p}%:{__p + 10}%]' for __p in range(0, 100, 10)],
    #     'features': ['question', 'answer'],},
    # 'ft-math': {
    #     'path': 'HuggingFaceTB/finemath',
    #     'name': 'finemath-3plus',
    #     'splits': [f'train[{__p}%:{__p + 10}%]' for __p in range(0, 100, 10)],
    #     'features': ['text'],},
    # 'cot-text-dolphin': {
    #     'path': 'cognitivecomputations/dolphin-r1',
    #     'name': 'reasoning-deepseek',
    #     'splits': [f'train[{__p}%:{__p + 10}%]' for __p in range(0, 100, 10)],
    #     'features': ['reasoning', 'answer'],},
    # 'cot-text-openthoughts': {
    #     'path': 'open-thoughts/OpenThoughts-114k',
    #     'name': 'default',
    #     'splits': [f'train[{__p}%:{__p + 10}%]' for __p in range(0, 100, 10)],
    #     'features': ['problem', 'solution'],},
    # 'cot-math-numi': {
    #     'path': 'AI-MO/NuminaMath-CoT',
    #     'name': None,
    #     'splits': [f'train[{__p}%:{__p + 10}%]' for __p in range(0, 100, 10)],
    #     'features': ['problem', 'solution'],},
}

## Downloading The Model Weights

In [None]:
# IMPORT #######################################################################

# fetch the saved model from the remote URL, only when both toggles are set
if IMPORT:
    if DOWNLOAD:
        urllib.request.urlretrieve(url=IO_CONFIG['url'], filename=IO_CONFIG['path'])

## Downloading The Data

In [None]:
# DOWNLOAD #####################################################################

# for each configured source: the list of its splits, each loaded and converted with `to_tf_dataset`
DATASETS = {
    __name: [
        hd.load_dataset(
            path=__args['path'],
            name=__args['name'],
            split=__split).to_tf_dataset(shuffle=True, batch_size=None)
        for __split in __args['splits']]
    for (__name, __args) in DATASETS_CONFIG.items()}

In [None]:
# STATS #######################################################################

# statistics on the first split of each dataset, computed by `mlable.data.stats` with count=2048
STATS = {
    __n: mlable.data.stats(
        dataset=__splits[0],
        features=DATASETS_CONFIG[__n]['features'],
        count=2048)
    for __n, __splits in DATASETS.items()}

print(STATS)

## Preprocess

In [None]:
# ITERATE #####################################################################

# NOTE(review): the splits are replaced in place inside DATASETS, so running this cell a second
# time would batch and map the already processed datasets again
for __name, __splits in DATASETS.items():
    # preprocessing function specialized for the features of this dataset
    __preprocess = tokun.pipeline.preprocess.factory(
        features=DATASETS_CONFIG[__name]['features'],
        **PIPELINE_CONFIG)
    # batch, then preprocess each split
    for __idx, __split in enumerate(__splits):
        __splits[__idx] = __split.batch(**BATCH_CONFIG).map(__preprocess, num_parallel_calls=tf.data.AUTOTUNE)

In [None]:
# CONCATENATE #################################################################

# names of the datasets that are merged: all of them except the ascii art
DATASET_KEYS = set(DATASETS.keys()) - {'ft-retro-ascii-art'}

# Walk the selected names in the insertion order of DATASETS rather than iterating on the set:
# the iteration order of a set of strings depends on the hash randomization of the interpreter,
# so the order of the concatenation could change from one kernel session to the next.
__KEYS = [__n for __n in DATASETS if __n in DATASET_KEYS]

# FINE_TRAIN = functools.reduce(lambda __l, __r: __l.concatenate(__r), DATASETS['pt-fineweb-edu'][:-1])
# FINE_TEST = DATASETS['pt-fineweb-edu'][-1]

# training data: all the splits of each dataset, except the last one
DATASET_TRAIN = functools.reduce(lambda __l, __r: __l.concatenate(__r), [__d for __n in __KEYS for __d in DATASETS[__n][:-1]])
# testing data: the last split of each dataset
DATASET_TEST = functools.reduce(lambda __l, __r: __l.concatenate(__r), [DATASETS[__n][-1] for __n in __KEYS])

In [None]:
# INSPECT #####################################################################

# first element of the training dataset, unpacked as a pair; reused later to build the model
__X, __T = next(iter(DATASET_TRAIN.take(1)))

# print(FINE_TRAIN.element_spec)
# print(FINE_TEST.element_spec)

# structure of the elements
for __dataset in (DATASET_TRAIN, DATASET_TEST):
    print(__dataset.element_spec)

# number of elements
for __label, __dataset in (('train: {:,}', DATASET_TRAIN), ('test:  {:,}', DATASET_TEST)):
    print(__label.format(__dataset.cardinality().numpy()))

## Init The Model

In [None]:
# COMPILE ######################################################################

with DISTRIBUTION_STRATEGY.scope():
    # accuracy metrics, with group sizes of 1, 4 and the token dimension
    byte_accuracy = mlable.metrics.BinaryGroupAccuracy(name='byte_accuracy', group=1)
    character_accuracy = mlable.metrics.BinaryGroupAccuracy(name='character_accuracy', group=4)
    token_accuracy = mlable.metrics.BinaryGroupAccuracy(name='token_accuracy', group=PIPELINE_CONFIG['token_dim'])
    # start with a freshly initialized model
    MODEL = tokun.models.mlp.AutoEncoder(**TOKUN_CONFIG)
    # replace it with the saved model when importing is enabled and the file exists
    if IMPORT and os.path.isfile(IO_CONFIG['path']):
        MODEL = tf.keras.models.load_model(IO_CONFIG['path'], compile=False)
    # attach the optimizer, loss and metrics
    MODEL.compile(
        optimizer=tf.keras.optimizers.AdamW(**OPTIMIZER_CONFIG),
        loss=tf.keras.losses.BinaryCrossentropy(**LOSS_CONFIG),
        weighted_metrics=[
            byte_accuracy,
            character_accuracy,
            token_accuracy])
    # run the model, metrics and loss once on the sample batch (the targets stand in for the predictions)
    MODEL(__X, training=False)
    MODEL.compute_metrics(__X, __T, __T)
    MODEL.compute_loss(__X, __T, __T)

In [None]:
# INSPECT ######################################################################

# print the summary of the model (built in the previous cell)
MODEL.summary()

In [None]:
# loss reported by the compiled model on the sample batch
print(MODEL.compute_loss(__X, __T, MODEL(__X)))
# mean binary cross-entropy computed directly on the same batch, displayed as the output of the cell
# NOTE(review): presumably a sanity check that the two values agree -- confirm
tf.reduce_mean(tf.keras.losses.binary_crossentropy(__T, MODEL(__X), from_logits=True, axis=-1))

## Train

In [None]:
# TRAIN ########################################################################

if TRAINING:
    with DISTRIBUTION_STRATEGY.scope():
        # callbacks: checkpointing, tensorboard logging and early stop on NaN loss
        cp_callback = tf.keras.callbacks.ModelCheckpoint(**CHECKPOINT_CONFIG)
        tb_callback = tf.keras.callbacks.TensorBoard(**TENSORBOARD_CONFIG)
        tn_callback = tf.keras.callbacks.TerminateOnNaN()
        # the data is capped at 2048 batches for training and 128 batches for validation
        __train = DATASET_TRAIN.take(2048).prefetch(tf.data.AUTOTUNE)
        __valid = DATASET_TEST.take(128).prefetch(tf.data.AUTOTUNE)
        # fit the model and keep the history of the run
        TRAINING_HISTORY = MODEL.fit(
            x=__train,
            validation_data=__valid,
            callbacks=[cp_callback, tb_callback, tn_callback],
            **TRAINING_CONFIG)

## Dataviz

In [None]:
# DATA ########################################################################

# Text samples used to check the model by hand: a Korean paragraph, followed by three snippets of
# Python code (the last two differ only by their indentation width: 4 spaces vs 2 spaces).
SAMPLES = [
    """위키백과, 우리 모두의 백과사전.\nt-분포 확률적 임베딩(t-SNE)은 데이터의 차원 축소에 사용되는 기계 학습 알고리즘 중 하나로, 2002년 샘 로이스Sam Rowise와 제프리 힌튼에 의해 개발되었다.[1] t-SNE는 비선형 차원 축소 기법으로, 고차원 데이터를 특히 2, 3차원 등으로 줄여 가시화하는데에 유용하게 사용된다. 구체적으로 t-SNE는 비슷한 데이터는 근접한 2, 3차원의 지점으로, 다른 데이터는 멀리 떨어진 지점으로 맵핑한다.""",
    """class Encoder(tf.keras.models.Model):\n    def __init__(self, depth: int, token_dim: int, encoding_dim: int, embedding_dim: int, batch_dim: int=None, attention: bool=False, **kwargs) -> None:\n        super(Encoder, self).__init__(**kwargs)\n        self._encoder = tf.keras.Sequential([\n            tf.keras.Input(shape=(encoding_dim,), batch_size=batch_dim, name='input'), # (B * G ^ D, U)\n            tf.keras.layers.Dense(units=embedding_dim, activation=None, use_bias=False, kernel_initializer='glorot_uniform', bias_initializer=None, name='embed-1'),] # (B * G ^ D, U) => (B * G ^ D, E)\n            + [tokun.layers.TokenizeBlock(left_axis=-2, right_axis=-1, token_dim=token_dim, attention=attention, name='tokenize' + (__i + 1) * '-4') for __i in range(depth)]) # (B * G ^ i, E) => (B * G ^ (i-1), E)\n\n    def call(self, x: tf.Tensor) -> tf.Tensor:\n        return self._encoder(x)\n""",
    """class AutoEncoder(tf.keras.models.Model):\n    def __init__(self, token_dim: int, encoding_dim: int, embedding_dim: int, batch_dim: int=None, **kwargs) -> None:\n        super(AutoEncoder, self).__init__(**kwargs)\n        self._encoder = Encoder(token_dim=token_dim, encoding_dim=encoding_dim, embedding_dim=embedding_dim, batch_dim=batch_dim)\n        self._decoder = Decoder(token_dim=token_dim, encoding_dim=encoding_dim, embedding_dim=embedding_dim, batch_dim=batch_dim)\n\n    def call(self, x: tf.Tensor) -> tf.Tensor:\n        return self._decoder(self._encoder(x))""",
    """class AutoEncoder(tf.keras.models.Model):\n  def __init__(self, token_dim: int, encoding_dim: int, embedding_dim: int, batch_dim: int=None, **kwargs) -> None:\n    super(AutoEncoder, self).__init__(**kwargs)\n    self._encoder = Encoder(token_dim=token_dim, encoding_dim=encoding_dim, embedding_dim=embedding_dim, batch_dim=batch_dim)\n    self._decoder = Decoder(token_dim=token_dim, encoding_dim=encoding_dim, embedding_dim=embedding_dim, batch_dim=batch_dim)\n\n  def call(self, x: tf.Tensor) -> tf.Tensor:\n    return self._decoder(self._encoder(x))"""]

In [None]:
# COMPUTE ######################################################################

# index of the text sample to run through the model
__i = 1

# NOTE(review): the meaning of the five returned values is set by `tokun.pipeline.text.sample`;
# among them, only `__o` is read by the visible cells
__x, __e, __p, __y, __o = tokun.pipeline.text.sample(
    model=MODEL,
    text=SAMPLES[__i],
    token_dim=PIPELINE_CONFIG['token_dim'],
    threshold=0.5)

In [None]:
# original text
__text = SAMPLES[__i]
print('# INPUT ################################################################\n\n' + __text)
# first element of the model output
__decoded = __o[0]
print('\n# OUTPUT ###############################################################\n\n' + __decoded)
# comparison of the original text with the output
__score = tokun.pipeline.evaluate.compare(__text, __decoded)
print('\n# SCORE ################################################################\n\n' + str(__score))

In [None]:
# load the tensorboard extension, which provides the `%tensorboard` magic
%load_ext tensorboard

In [None]:
# open tensorboard on `.logs`, the root directory of the training logs (see TENSORBOARD_CONFIG['log_dir'])
%tensorboard --logdir .logs