## Import deps

In [None]:
# install / upgrade the notebook dependencies
# `%pip` (rather than `!pip`) installs into the environment of the running kernel
%pip install -U datasets mlable tokun llaminate



In [None]:
# authenticate against the Hugging Face Hub (interactive prompt, the token is not echoed)
!huggingface-cli login

# register the `%tensorboard` magic used in the "Logs" section
%load_ext tensorboard


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: read).

In [None]:
import datetime
import functools
import itertools
import math
import os
import random
import urllib.request

import datasets as ds
import tensorflow as tf
import tensorflow_datasets as tfds

import mlable.data
import mlable.io
import mlable.metrics
import mlable.optimizers

import tokun.data
import tokun.evaluation
import tokun.meta
import tokun.pipeline
import tokun.model

import llaminate.model
import llaminate.pipeline
import llaminate.utils

In [None]:
# report the TensorFlow version of the runtime
print("Tensorflow version " + tf.__version__)

Tensorflow version 2.15.0


## Setup the GPU / TPU

In [None]:
# DEVICES #####################################################################

# do not log the device on which each operation is placed
tf.debugging.set_log_device_placement(False)

# logical devices visible to this runtime, grouped by kind
CPU, GPU, TPU = (tf.config.list_logical_devices(__kind) for __kind in ('CPU', 'GPU', 'TPU'))

if not TPU:
    # no TPU: mirror over the GPUs when there are any, over the CPUs otherwise
    DISTRIBUTION_STRATEGY = tf.distribute.MirroredStrategy(GPU or CPU)
else:
    # connect to the TPU cluster and initialize it before creating the strategy
    RESOLVER = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(RESOLVER)
    tf.tpu.experimental.initialize_tpu_system(RESOLVER)
    DISTRIBUTION_STRATEGY = tf.distribute.TPUStrategy(RESOLVER)

print(DISTRIBUTION_STRATEGY)

<tensorflow.python.distribute.tpu_strategy.TPUStrategyV2 object at 0x7988ac94fa00>


## Mode

In [None]:
# TOGGLE ######################################################################

# switches read by the cells below to select what the notebook does
IMPORT = False # when True: scale down the base learning rate and reload the saved llaminate model if the file exists
FREEZE = True # freeze tokun weights
TRAINING = True # run the training cell
DEBUG = False # replace the transformer with the small `DebugModel`

## Defining The Metadata

In [None]:
# MODEL PARAMETERS ############################################################

# axis indices: sequence on axis 1, features on the last axis
N_SEQUENCE_AXIS = 1
N_FEATURE_AXIS = -1

# depth and number of attention heads of the transformer
N_LAYERS_NUM = 16
N_HEADS_NUM = 4

N_CACHE_DIM = 256 # 2048 in llama3-8B but tokun embeddings = 16 chr = 4 llama3 tokens
N_EMBED_DIM = 256
N_HIDDEN_DIM = 4 * N_EMBED_DIM # = 1024
N_HEAD_DIM = N_EMBED_DIM // N_HEADS_NUM # = 64, the embedding is split evenly between the heads

# NOTE(review): same value as LLAMINATE_MODEL_PATH defined with the derived parameters;
# this name is not referenced by the other cells visible in this notebook
LLAMINATE_PATH = 'llaminate.keras'

In [None]:
# TOKENIZER PARAMETERS ########################################################

# group sizes of the tokenizer: their product (64) is passed as `token_dim` to the preprocessing
TOKUN_DIM = [16, 4]
# = 16; presumably the number of characters per token, at 4 bytes per character -- TODO confirm
TOKUN_FACTOR = math.prod(TOKUN_DIM) // 4
TOKUN_VERSION = tokun.meta.version(units=TOKUN_DIM, axis=1)

# the pretrained tokenizer weights are fetched from the tokun repository and saved locally
TOKUN_LABEL = '7.7'
TOKUN_PATH = 'tokun.keras'
# the URL has 3 placeholders, filled by the items of TOKUN_VERSION followed by the label
TOKUN_URL = 'https://github.com/apehex/tokun/raw/main/models/{}/{}/{}.keras'.format(*TOKUN_VERSION, TOKUN_LABEL)

In [None]:
# TRAINING PARAMETERS #########################################################

# number of samples per batch and length of each sample
N_BATCH_DIM = 128
N_SAMPLE_DIM = N_CACHE_DIM * TOKUN_FACTOR

N_EPOCHS = 8
# epochs spent ramping up / holding the learning rate, read by the scheduler of the training cell
# NOTE(review): these were never assigned in the notebook (NameError on a fresh kernel);
# 0 / 0 is an assumption -- confirm against the intended schedule
N_EPOCHS_RAMPUP = 0
N_EPOCHS_SUSTAIN = 0

# base learning rate and Adam betas; the rate is divided by 10 when resuming from a saved model
R_0, B_1, B_2 = (0.1 if IMPORT else 1.) * 0.001, 0.9, 0.99
# bounds and decay factor handed to `mlable.optimizers.learning_rate_hokusai` by the training cell
# NOTE(review): assumed values, picked so that the peak rate matches the recorded log
# ("setting learning rate to 0.0001" on the first epoch) -- confirm
R_MIN, R_MAX, R_EXP = 0.01 * R_0, 0.1 * R_0, 0.8

CLASS_WEIGHTS = {__c: 0.3 if __c == 0 else 1. for __c in range(256)} # there are 3 times more 0s than other bytes

In [None]:
# DERIVED PARAMETERS ##########################################################

# timestamp of the run, so that each run logs to its own directory
DATETIME = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

# the model variant is identified by its depth and hidden width
LLAMINATE_VERSION = list(map(str, (N_LAYERS_NUM, N_HIDDEN_DIM)))
LLAMINATE_MODEL_PATH = 'llaminate.keras'
LLAMINATE_LOGS_PATH = os.path.join('.logs/', *LLAMINATE_VERSION, DATETIME)

## Loading The Data

In [None]:
# META ########################################################################

# TODO bigcode/the-stack
# TODO ArmelR/stack-exchange-instruction

# description of each dataset, keyed by a local label:
# - 'path' / 'name': repository and configuration given to `datasets.load_dataset`
# - 'train' / 'test': split expressions; datasets sliced from a single 'train' split use 90% / 10%
# - 'features': columns joined into a single text sample
DATASETS_META = {
    'pt-wikipedia': {
        'path': 'wikimedia/wikipedia',
        'name': '20231101.en',
        'train': 'train[:90%]',
        'test': 'train[-10%:]',
        'features': ['text'],},
    'ft-retro-ascii-art': {
        'path': 'jdpressman/retro-ascii-art-v1',
        'name': None,
        'train': 'train',
        'test': 'validation',
        'features': ['prompt', 'art_aic'],},
    'ft-stack-exchange': {
        'path': 'Alignment-Lab-AI/Stack-Exchange-April',
        'name': None,
        'train': 'train[:90%]',
        'test': 'train[-10%:]',
        'features': ['question', 'answer'],},
    'ft-math': {
        'path': 'hendrycks/competition_math',
        'name': None,
        'train': 'train',
        'test': 'test',
        'features': ['problem', 'solution'],},}

In [None]:
# DOWNLOAD ####################################################################

# for each dataset, load the train and test splits from the hub
# and expose them as shuffled `tf.data.Dataset` (no batching at this stage)
DATASETS = {
    __name: {
        __split: ds.load_dataset(
            path=__args['path'],
            name=__args['name'],
            split=__args[__split]).to_tf_dataset(shuffle=True, batch_size=None)
        for __split in ('train', 'test')}
    for __name, __args in DATASETS_META.items()}

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Resolving data files:   0%|          | 0/41 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/33 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/41 [00:00<?, ?it/s]

## Checking The Data

In [None]:
# STATS #######################################################################

STATS = {}

for __name, __splits in DATASETS.items():
    __fields = DATASETS_META[__name]['features']
    __iterator = iter(__splits['train'])
    # byte length of the first 128 training samples, with the features joined by the group separator
    __lengths = []
    for __i in range(128):
        __row = next(__iterator)
        __text = tf.strings.join(inputs=[__row[__f] for __f in __fields], separator='\x1d')
        __lengths.append(len(__text.numpy()))
    # the mean is computed on integers, so it is an integer too
    STATS[__name] = {
        'min': min(__lengths),
        'max': max(__lengths),
        'mean': tf.reduce_mean(__lengths).numpy()}

In [None]:
# sample lengths (in bytes) measured on each dataset
print(STATS)

{'pt-wikipedia': {'min': 131, 'max': 52493, 'mean': 3256}, 'ft-retro-ascii-art': {'min': 3068, 'max': 3286, 'mean': 3146}, 'ft-stack-exchange': {'min': 251, 'max': 7072, 'mean': 1539}, 'ft-math': {'min': 100, 'max': 2600, 'mean': 847}}


In [None]:
# iterator over the raw (not yet preprocessed) stack-exchange training samples
__b = iter(DATASETS['ft-stack-exchange']['train'])

In [None]:
# display one sample: question and answer joined by the '\x1d' separator
__s = next(__b)
tf.strings.join(inputs=[__s['question'], __s['answer']], separator='\x1d')

<tf.Tensor: shape=(), dtype=string, numpy=b"Use of static methods to implement logic within a class\n\nI've recently caught an odd behaviour in my own coding:  At some point in the last year I began creating protected static methods to implement the logic of my public methods, ensuring that the protected static version contains all of the logic, but is not affected by state.\ne.g.\nclass Demo\n{\n    string someAttribute;\n    public Demo(string someAttribute)\n    {\n        this.someAttribute = someAttribute;\n    }\n\n    // public method only calls the static method, passing all \n    // parameters sent to the public function as well as any \n    // of the object's attributes required by the function.\n    public string AppendSomeAttribute(string textToAppendTo) =>\n        AppendSomeAttribute(textToAppendTo, someAttribute);\n\n    // static method is not publicly exposed.  It contains all \n    // of the function's logic, but is not affected by state\n    protected static string A

## Preprocess

In [None]:
# ITERATE #####################################################################

# arguments shared by all the calls to the preprocessing pipeline
__preprocess = functools.partial(
    llaminate.pipeline.preprocess,
    token_dim=math.prod(TOKUN_DIM),
    embed_dim=N_EMBED_DIM,
    sample_dim=N_SAMPLE_DIM)

# NOTE: the splits are replaced in place, so running this cell twice preprocesses the data twice
for __name, __splits in DATASETS.items():
    for __split in ('train', 'test'):
        __splits[__split] = __preprocess(
            dataset=__splits[__split],
            features=DATASETS_META[__name]['features'])

In [None]:
# CONCATENATE #################################################################

# datasets left out of the mix; optionally also exclude 'pt-wikipedia'
__excluded = {'ft-retro-ascii-art'}
# iterate the dict rather than a set: the concatenation order then follows the declaration
# order of the datasets, instead of changing with the string hashing of each session
__selected = [__n for __n in DATASETS if __n not in __excluded]

# chain the selected datasets one after the other, separately for each split
DATASET_TRAIN = functools.reduce(lambda __l, __r: __l.concatenate(__r), [DATASETS[__n]['train'] for __n in __selected])
DATASET_TEST = functools.reduce(lambda __l, __r: __l.concatenate(__r), [DATASETS[__n]['test'] for __n in __selected])

In [None]:
# CHECK DATASET ###############################################################

# structure (shape and dtype) of the samples, for the training then the testing data
for __dataset in (DATASET_TRAIN, DATASET_TEST):
    print(__dataset.element_spec)

(TensorSpec(shape=(16384,), dtype=tf.uint8, name=None), TensorSpec(shape=(16384, 256), dtype=tf.float32, name=None))
(TensorSpec(shape=(16384,), dtype=tf.uint8, name=None), TensorSpec(shape=(16384, 256), dtype=tf.float32, name=None))


In [None]:
# number of samples in each split
for __template, __dataset in (('train: {:,} samples', DATASET_TRAIN), ('test:  {:,} samples', DATASET_TEST)):
    print(__template.format(__dataset.cardinality().numpy()))

train: 9,431,038 samples
test:  1,052,059 samples


## Downloading The Tokenizer

In [None]:
# IMPORT ######################################################################

# download the pretrained tokenizer weights to TOKUN_PATH (fetched again on every run)
urllib.request.urlretrieve(TOKUN_URL, TOKUN_PATH)

('tokun.keras', <http.client.HTTPMessage at 0x797ed43487f0>)

## Initializing The Model

In [None]:
# DEBUG MODEL #################################################################

class DebugModel(tf.keras.Sequential):
    """Simpler model to debug the notebook: a single embedding layer.

    It replaces the transformer when the `DEBUG` toggle is set.
    """

    def __init__(self, **kwargs) -> None:
        """Create the model with its only layer.

        Args:
            **kwargs: forwarded to `tf.keras.Sequential` (e.g. `name`), they used to be silently dropped.
        """
        super(DebugModel, self).__init__(
            layers=[
                tf.keras.layers.Embedding(
                    # NOTE(review): the embedding is indexed by the inputs, presumably byte values;
                    # N_EMBED_DIM happens to equal 256 -- confirm this is the intended vocabulary size
                    input_dim=N_EMBED_DIM,
                    output_dim=N_EMBED_DIM,
                    embeddings_initializer='glorot_uniform',
                    name='embed-1')],
            **kwargs)


In [None]:
# OVERALL SCOPE ###############################################################

# the models, metrics and optimizer are all created under the distribution strategy
with DISTRIBUTION_STRATEGY.scope():
    # TOKENIZER ###############################################################
    # pretrained tokenizer, loaded without its training configuration
    TOKUN = tf.keras.models.load_model(TOKUN_PATH, compile=False)
    TOKUN.trainable = not FREEZE # freeze the weights
    # disabled: compilation of the tokenizer on its own
    # NOTE(review): R_MAX is read by this snippet, it has to be defined before enabling it
    # TOKUN.compile(
    #     optimizer=tf.keras.optimizers.Adam(learning_rate=R_MAX),
    #     loss=tf.keras.losses.CategoricalCrossentropy(from_logits=False, label_smoothing=0., axis=-1, reduction=tf.keras.losses.Reduction.SUM_OVER_BATCH_SIZE, name='loss'),
    #     metrics=[byte_accuracy, character_accuracy, token_accuracy])

    # METRICS #################################################################
    # accuracy on groups of 1, 4 and math.prod(TOKUN_DIM) predictions, named after byte / character / token
    byte_accuracy = mlable.metrics.CategoricalGroupAccuracy(group=1, name='byte_accuracy')
    character_accuracy = mlable.metrics.CategoricalGroupAccuracy(group=4, name='character_accuracy')
    token_accuracy = mlable.metrics.CategoricalGroupAccuracy(group=math.prod(TOKUN_DIM), name='token_accuracy')

    # WEIGHTS #################################################################
    # resume from the saved model when asked and available, start from a new model otherwise
    if IMPORT and os.path.isfile(LLAMINATE_MODEL_PATH):
        LLAMINATE = tf.keras.models.load_model(LLAMINATE_MODEL_PATH, compile=False)
    else:
        LLAMINATE = llaminate.model.Transformer(num_layers=N_LAYERS_NUM, num_heads=N_HEADS_NUM, cache_dim=N_CACHE_DIM, embed_dim=N_EMBED_DIM, head_dim=N_HEAD_DIM, hidden_dim=N_HIDDEN_DIM)

    # INIT ####################################################################
    # plug the encoder and decoder of the tokenizer into the transformer
    LLAMINATE.set_tokenizer(encoder=TOKUN._encoder, decoder=TOKUN._decoder)
    # simpler model to debug
    # the transformer built above is discarded when DEBUG is set
    if DEBUG: LLAMINATE = DebugModel()

    # INPUT ###################################################################
    # disabled: wrapping of the model in a functional model with a fixed input shape
    # __input = tf.keras.Input(shape=(4 * TOKUN_FACTOR * N_CACHE_DIM,), batch_size=N_BATCH_DIM)
    # LLAMINATE = tf.keras.models.Model(__input, LLAMINATE(__input))

    # COMPILE #################################################################
    # Adam with the base learning rate R_0; the loss is computed on probabilities (from_logits=False)
    LLAMINATE.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=R_0, beta_1=B_1, beta_2=B_2),
        loss=tf.keras.losses.CategoricalCrossentropy(from_logits=False, label_smoothing=0., axis=-1, reduction=tf.keras.losses.Reduction.SUM_OVER_BATCH_SIZE, name='loss'),
        metrics=[byte_accuracy, character_accuracy, token_accuracy])

In [None]:
# iterator over batches of preprocessed test samples
__b = iter(DATASET_TEST.batch(N_BATCH_DIM))

In [None]:
# run one batch through the model to check the shapes: inputs, embeddings, predictions
# NOTE(review): `_encoder` is presumably set by `set_tokenizer`, so this fails on the `DebugModel` -- confirm
__x, __y = next(__b)
print(__x.shape)
print(LLAMINATE._encoder(__x, training=True).shape)
print(LLAMINATE(__x, training=True).shape)

(128, 16384)
(128, 256, 256)
(128, 16384, 256)


In [None]:
# layers and parameter counts of the tokenizer
TOKUN.summary()

Model: "auto_encoder_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 encoder_1 (Encoder)         multiple                  1377792   
                                                                 
 decoder_1 (Decoder)         multiple                  1382656   
                                                                 
Total params: 2760448 (10.53 MB)
Trainable params: 0 (0.00 Byte)
Non-trainable params: 2760448 (10.53 MB)
_________________________________________________________________


In [None]:
# layers and parameter counts of the transformer
LLAMINATE.summary()

Model: "transformer_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 block-0 (DecoderBlock)      multiple                  1049600   
                                                                 
 block-1 (DecoderBlock)      multiple                  1049600   
                                                                 
 block-2 (DecoderBlock)      multiple                  1049600   
                                                                 
 block-3 (DecoderBlock)      multiple                  1049600   
                                                                 
 block-4 (DecoderBlock)      multiple                  1049600   
                                                                 
 block-5 (DecoderBlock)      multiple                  1049600   
                                                                 
 block-6 (DecoderBlock)      multiple                

## Train

In [None]:
# TRAIN #######################################################################

if TRAINING:
    with DISTRIBUTION_STRATEGY.scope():
        # learning rate schedule: R_MIN, R_MAX, R_EXP, N_EPOCHS_RAMPUP and N_EPOCHS_SUSTAIN
        # must be defined by an earlier cell
        lr_callback = tf.keras.callbacks.LearningRateScheduler(
            functools.partial(
                mlable.optimizers.learning_rate_hokusai,
                lr_min=R_MIN,
                lr_max=R_MAX,
                lr_exp=R_EXP,
                rampup=N_EPOCHS_RAMPUP,
                sustain=N_EPOCHS_SUSTAIN),
            verbose=True)
        # save the whole model each time the validation loss improves
        cp_callback = tf.keras.callbacks.ModelCheckpoint(
            filepath=LLAMINATE_MODEL_PATH,
            monitor='val_loss',
            verbose=1,
            save_best_only=True,
            save_weights_only=False,
            mode='auto',
            save_freq='epoch')
        # write the training logs for TensorBoard
        tb_callback = tf.keras.callbacks.TensorBoard(log_dir=LLAMINATE_LOGS_PATH)
        # fit on the stack-exchange data only, with a validation after every epoch
        TRAINING_HISTORY = LLAMINATE.fit(
            x=DATASETS['ft-stack-exchange']['train'].batch(N_BATCH_DIM).prefetch(1),
            batch_size=None,
            epochs=N_EPOCHS,
            validation_split=None,
            validation_data=DATASETS['ft-stack-exchange']['test'].batch(N_BATCH_DIM).prefetch(1),
            validation_freq=list(range(1, N_EPOCHS + 1)),
            class_weight=CLASS_WEIGHTS,
            verbose=1,
            callbacks=[lr_callback, cp_callback, tb_callback])


Epoch 1: LearningRateScheduler setting learning rate to 0.0001.
Epoch 1/8


AttributeError: in user code:

    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1401, in train_function  *
        return step_function(self, iterator)
    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1384, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1373, in run_step
        outputs = model.train_step(data)
    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1154, in train_step
        self.optimizer.minimize(loss, self.trainable_variables, tape=tape)
    File "/usr/local/lib/python3.10/dist-packages/keras/src/optimizers/optimizer.py", line 544, in minimize
        self.apply_gradients(grads_and_vars)
    File "/usr/local/lib/python3.10/dist-packages/keras/src/optimizers/optimizer.py", line 1223, in apply_gradients
        return super().apply_gradients(grads_and_vars, name=name)
    File "/usr/local/lib/python3.10/dist-packages/keras/src/optimizers/optimizer.py", line 638, in apply_gradients
        self.build(trainable_variables)
    File "/usr/local/lib/python3.10/dist-packages/keras/src/optimizers/adam.py", line 145, in build
        self.add_variable_from_reference(
    File "/usr/local/lib/python3.10/dist-packages/keras/src/optimizers/optimizer.py", line 1124, in add_variable_from_reference
        with strategy.extended.colocate_vars_with(model_variable):

    AttributeError: 'NoneType' object has no attribute 'extended'


## Export

## Dataviz

In [None]:
# DATA ########################################################################

# text samples: a Korean encyclopedia extract, followed by Python source code
# the last two samples hold the same class, indented with 4 and 2 spaces respectively
# NOTE(review): not referenced by the other cells visible in this notebook
SAMPLES = [
    """위키백과, 우리 모두의 백과사전.\nt-분포 확률적 임베딩(t-SNE)은 데이터의 차원 축소에 사용되는 기계 학습 알고리즘 중 하나로, 2002년 샘 로이스Sam Rowise와 제프리 힌튼에 의해 개발되었다.[1] t-SNE는 비선형 차원 축소 기법으로, 고차원 데이터를 특히 2, 3차원 등으로 줄여 가시화하는데에 유용하게 사용된다. 구체적으로 t-SNE는 비슷한 데이터는 근접한 2, 3차원의 지점으로, 다른 데이터는 멀리 떨어진 지점으로 맵핑한다.""",
    """class Encoder(tf.keras.models.Model):\n    def __init__(self, depth: int, token_dim: int, encoding_dim: int, embedding_dim: int, latent_dim: int, batch_dim: int=None, attention: bool=False, **kwargs) -> None:\n        super(Encoder, self).__init__(**kwargs)\n        self._encoder = tf.keras.Sequential([\n            tf.keras.Input(shape=(encoding_dim,), batch_size=batch_dim, name='input'), # (B * G ^ D, U)\n            tf.keras.layers.Dense(units=embedding_dim, activation=None, use_bias=False, kernel_initializer='glorot_uniform', bias_initializer=None, name='embed-1'),] # (B * G ^ D, U) => (B * G ^ D, E)\n            + [tokun.layers.TokenizeBlock(left_axis=-2, right_axis=-1, token_dim=token_dim, latent_dim=latent_dim, attention=attention, name='tokenize' + (__i + 1) * '-4') for __i in range(depth)]) # (B * G ^ i, E) => (B * G ^ (i-1), E)\n\n    def call(self, x: tf.Tensor) -> tf.Tensor:\n        return self._encoder(x)\n""",
    """class AutoEncoder(tf.keras.models.Model):\n    def __init__(self, token_dim: int, encoding_dim: int, embedding_dim: int, latent_dim: int, batch_dim: int=None, **kwargs) -> None:\n        super(AutoEncoder, self).__init__(**kwargs)\n        self._encoder = Encoder(token_dim=token_dim, encoding_dim=encoding_dim, embedding_dim=embedding_dim, latent_dim=latent_dim, batch_dim=batch_dim)\n        self._decoder = Decoder(token_dim=token_dim, encoding_dim=encoding_dim, embedding_dim=embedding_dim, latent_dim=latent_dim, batch_dim=batch_dim)\n\n    def call(self, x: tf.Tensor) -> tf.Tensor:\n        return self._decoder(self._encoder(x))""",
    """class AutoEncoder(tf.keras.models.Model):\n  def __init__(self, token_dim: int, encoding_dim: int, embedding_dim: int, latent_dim: int, batch_dim: int=None, **kwargs) -> None:\n    super(AutoEncoder, self).__init__(**kwargs)\n    self._encoder = Encoder(token_dim=token_dim, encoding_dim=encoding_dim, embedding_dim=embedding_dim, latent_dim=latent_dim, batch_dim=batch_dim)\n    self._decoder = Decoder(token_dim=token_dim, encoding_dim=encoding_dim, embedding_dim=embedding_dim, latent_dim=latent_dim, batch_dim=batch_dim)\n\n  def call(self, x: tf.Tensor) -> tf.Tensor:\n    return self._decoder(self._encoder(x))"""]

In [None]:
# CACHE #######################################################################

# cache sized after the batch, the cache length, the heads and the layers of the model
# NOTE(review): neither `__cache` nor `__step` is read by the cells visible below, which call the model with `cache=None`
__cache = llaminate.utils.create_cache(batch_dim=N_BATCH_DIM, cache_dim=N_CACHE_DIM, head_dim=N_HEAD_DIM, num_layers=N_LAYERS_NUM, num_heads=N_HEADS_NUM)
__step = 4

In [None]:
# PREPROCESS ##################################################################

# encode a text prompt with the tokun pipeline, to feed it to the model
__prompt = """Skynet is an artificial neural network-based conscious group mind and artificial general superintelligence system that serves as the antagonistic force of the Terminator franchise."""
# `expand` evaluates to [1] since N_SEQUENCE_AXIS is 1
__inputs = tokun.pipeline.preprocess(text=__prompt, groups=TOKUN_DIM, expand=N_SEQUENCE_AXIS * [1], flatten=True)

In [None]:
# PREDICT #####################################################################

# the model is used as returning a single tensor everywhere else in the notebook
# (shape check after compilation, evaluation on a training sample): do not unpack a tuple here
# NOTE(review): the recorded error shows a model expecting 16384 inputs while the prompt
# yields 768; the prompt may have to be padded to the sample length -- TODO confirm
__predictions = LLAMINATE(inputs=__inputs, cache=None, position=0, training=False, mask=None)
# decode the predictions; the result is displayed as the output of the cell
tokun.pipeline.postprocess(__predictions)

ValueError: Input 0 of layer "model" is incompatible with the layer: expected shape=(None, 16384), found shape=(1, 768)

In [None]:
# iterator over the training data, one sample per batch
__batch = iter(DATASET_TRAIN.batch(1))

In [None]:
# inputs and targets of the next training sample
__x, __y = next(__batch)

In [None]:
# forward pass on the sample, in training mode and without cache
__p = LLAMINATE(inputs=__x, cache=None, position=0, training=True, mask=None)

# postprocess the inputs (one-hot encoded over 256 classes first), the targets and the predictions
# presumably back to text, to compare them -- TODO confirm
__xt = tokun.pipeline.postprocess(tf.one_hot(__x, depth=256))
__yt = tokun.pipeline.postprocess(__y)
__yp = tokun.pipeline.postprocess(__p)

In [None]:
# score of the predictions against the targets, followed by both postprocessed outputs
print(tokun.evaluation.compare(__yt, __yp))
print(__yt)
print(__yp)

In [None]:
# encode then decode the first 128 input values with the tokenizer alone (no transformer blocks)
# and keep the index of the highest score on the last axis
tf.argmax(LLAMINATE._decoder(LLAMINATE._encoder(__x[:, :128])), axis=-1)

In [None]:
# the same 128 raw input values, to compare with the round trip above
__x[:, :128]

## Logs

In [None]:
# browse the logs written under `.logs` with TensorBoard
%tensorboard --logdir .logs