## Import deps

In [None]:
!pip install -U datasets mlable tokun revml

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting mlable
  Downloading mlable-0.7.4-py3-none-any.whl.metadata (4.7 kB)
Collecting tokun
  Downloading tokun-0.11.1-py3-none-any.whl.metadata (7.7 kB)
Collecting revml
  Downloading revml-0.0.3-py3-none-any.whl.metadata (1.3 kB)
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3.6 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Collecting aiohttp (from datasets)
  Downloading aiohttp-3.10

In [None]:
import datetime
import functools
import itertools
import math
import os
import random
import urllib.request

import datasets as hd
import tensorflow as tf
import tensorflow_datasets as tfds

import mlable.data
import mlable.metrics
import mlable.ops

import revml.contract.decoder.model
import revml.contract.decoder.pipeline

import tokun.data
import tokun.evaluation
import tokun.meta
import tokun.model
import tokun.pipeline

In [None]:
# report the framework version backing this run
print("Tensorflow version ", tf.__version__, sep="")

Tensorflow version 2.15.0


## Setup the GPU / TPU

In [None]:
# MIXED PRECISION #############################################################

# every layer created after this call picks up the 'mixed_bfloat16' policy
tf.keras.mixed_precision.set_global_policy(policy='mixed_bfloat16')

In [None]:
# DEVICES #####################################################################

# keep the per-op device placement logs off
tf.debugging.set_log_device_placement(False)

# logical devices visible to the runtime, by kind
CPU, GPU, TPU = (
    tf.config.list_logical_devices(__kind)
    for __kind in ('CPU', 'GPU', 'TPU'))

# pick the strategy by order of preference: TPU > GPU > CPU
if not TPU:
    # mirror on the GPUs when there are any, else fall back to the CPUs
    DISTRIBUTION_STRATEGY = tf.distribute.MirroredStrategy(GPU or CPU)
else:
    # the TPU system has to be connected and initialized before use
    RESOLVER = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(RESOLVER)
    tf.tpu.experimental.initialize_tpu_system(RESOLVER)
    DISTRIBUTION_STRATEGY = tf.distribute.TPUStrategy(RESOLVER)

print(DISTRIBUTION_STRATEGY)

<tensorflow.python.distribute.tpu_strategy.TPUStrategyV2 object at 0x7b553d962a40>


## Mode

In [None]:
# TOGGLE ######################################################################

# IMPORT:   start from a saved decoder (also scales the learning rate down)
# DOWNLOAD: fetch the decoder weights from DECODER_META['url'], only with IMPORT
IMPORT, DOWNLOAD = False, False

# TRAINING: run the fitting cell
# BINARY:   8-dimensional binary outputs instead of 256-dimensional categorical ones
TRAINING, BINARY = True, True

## Defining The Metadata

In [None]:
# ENCODER PARAMETERS ##########################################################

# forwarded as keyword arguments to `tokun.meta.version`
ENCODER_CONFIG = dict(
    token_dim=[4, 4, 4],
    input_dim=256,
    embed_dim=256,
    output_dim=(8 if BINARY else 256), # 8 in binary mode, 256 otherwise
    sequence_axis=1)

In [None]:
# DERIVED ENCODING PARAMETERS #################################################

# everything below is computed from ENCODER_CONFIG, nothing is tuned here
ENCODER_META = dict(
    # number of character per embedding
    token_factor=math.prod(ENCODER_CONFIG['token_dim']) // 4,
    # running product of the token dimensions, in bytes
    token_sizes=list(itertools.accumulate(ENCODER_CONFIG['token_dim'], lambda __l, __r: __l * __r)),
    version=tokun.meta.version(**ENCODER_CONFIG),
    # local file the weights are saved to / loaded from
    path='encoder.keras',
    # remote location of the pretrained weights: the version parts, then the '5.7' file name
    url='https://github.com/apehex/tokun/raw/main/models/{}/{}/{}/{}.keras'.format(*tokun.meta.version(**ENCODER_CONFIG), '5.7'))

In [None]:
# DECODER PARAMETERS ##########################################################

# forwarded as keyword arguments to `revml.contract.decoder.model.Transformer`
DECODER_CONFIG = dict(
    num_layers=4,
    num_heads=4,
    embed_dim=256,
    head_dim=256 // 4,
    hidden_dim=256 * 4,
    input_dim=256,
    output_dim=(8 if BINARY else 256), # 8 in binary mode, 256 otherwise
    token_dim=[33],
    epsilon=1e-6,
    activation='gelu',
    output=('binary' if BINARY else 'categorical'))

In [None]:
# DERIVED DECODING PARAMETERS #################################################

# everything below is computed from DECODER_CONFIG or is a fixed label
DECODER_META = dict(
    # number of instructions per embedding
    token_factor=math.prod(DECODER_CONFIG['token_dim']) // 33,
    # running product of the token dimensions, in bytes
    token_sizes=list(itertools.accumulate(DECODER_CONFIG['token_dim'], lambda __l, __r: __l * __r)),
    version='0.1',
    # local file the decoder is saved to / loaded from
    path='decoder.keras',
    # no remote weights are published for the decoder
    url='')

In [None]:
# DATA PARAMETERS #############################################################

# forwarded as keyword arguments to `tf.data.Dataset.batch`
BATCH_CONFIG = dict(
    batch_size=128,
    drop_remainder=True,
    num_parallel_calls=tf.data.AUTOTUNE)

# one set of arguments for each side of the preprocessing function
PREPROCESSING_CONFIG = dict(
    encoder=dict(
        batch_dim=BATCH_CONFIG['batch_size'],
        sample_dim=math.prod(ENCODER_CONFIG['token_dim']) * 1024, # 1024 tokens per sample
        token_dim=math.prod(ENCODER_CONFIG['token_dim']),
        embed_dim=ENCODER_CONFIG['embed_dim']),
    decoder=dict(
        batch_dim=BATCH_CONFIG['batch_size'],
        sample_dim=math.prod(DECODER_CONFIG['token_dim']) * 1024, # 1024 tokens per sample
        token_dim=math.prod(DECODER_CONFIG['token_dim']),
        output_dim=DECODER_CONFIG['output_dim'],
        padding_weight=0.01))

In [None]:
# TRAINING PARAMETERS #########################################################

# keyword arguments of the AdamW optimizer
OPTIMIZER_CONFIG = {
    'learning_rate': 0.001 * (0.1 if IMPORT else 1.0), # 10 times lower when resuming from a saved model
    'weight_decay': 0.1,
    'beta_1': 0.9,
    'beta_2': 0.99,
    'clipnorm': 1.0,}

# keyword arguments of the cross-entropy loss (binary or categorical)
LOSS_CONFIG = {
    'from_logits': False,
    'label_smoothing': 0.,
    'axis': -1,
    'reduction': 'sum_over_batch_size',
    'name': 'ce_loss',}

# keyword arguments of the `ModelCheckpoint` callback: the whole model is saved after each epoch
CHECKPOINT_CONFIG = {
    'filepath': DECODER_META['path'],
    'monitor': 'val_loss',
    'mode': 'auto',
    'save_freq': 'epoch',
    'save_best_only': False,
    'save_weights_only': False,
    'verbose': 1,}

# keyword arguments of the `TensorBoard` callback
TENSORBOARD_CONFIG = {
    # one directory per version and per run: the version is a plain string, it must not be
    # unpacked or it is split into one path component per character ('0', '.', '1')
    'log_dir': os.path.join('.logs/', DECODER_META['version'], datetime.datetime.now().strftime("%Y%m%d-%H%M%S")),
    'histogram_freq': 1,
    'embeddings_freq': 0,
    'profile_batch': (128, 256),
    'write_graph': False,
    'write_images': True,}

# keyword arguments forwarded to `DECODER.fit`
TRAINING_CONFIG = {
    'epochs': 8,
    'batch_size': None, # the datasets are already batched
    'validation_split': None, # the validation data is given explicitly
    'validation_freq': list(range(1, 9)), # validate on each of the 8 epochs
    # NOTE(review): Keras `Model.fit` spells this argument `class_weight` (singular); as written it is
    # only accepted if the decoder's own `fit` takes `class_weights` -- confirm against revml
    'class_weights': {__c: 0.03 if __c == 0 else 1. for __c in range(DECODER_CONFIG['input_dim'])}, # there are 32 times more 0s than other bytes (most instructions have null data)
    'verbose': 1,}

## Download The Model Weights

In [None]:
# ENCODER #####################################################################

# the pretrained encoder is always fetched
urllib.request.urlretrieve(url=ENCODER_META['url'], filename=ENCODER_META['path'])

# DECODER #####################################################################

# the decoder is fetched only when both toggles are on
if IMPORT:
    if DOWNLOAD:
        urllib.request.urlretrieve(url=DECODER_META['url'], filename=DECODER_META['path'])

## Loading The Data

In [None]:
# EVMC DATASET ################################################################

# both splits come from the same repository and configuration
__load_evmc = functools.partial(hd.load_dataset, 'apehex/evm_contracts', name='hex-ethereum')

# first 90% of the 'cleaned' split for training, the rest for testing; samples are left unbatched
EVMC_TRAIN = __load_evmc(split='cleaned[:90%]').to_tf_dataset(shuffle=True, batch_size=None)
EVMC_TEST = __load_evmc(split='cleaned[90%:]').to_tf_dataset(shuffle=True, batch_size=None)

Resolving data files:   0%|          | 0/1687 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/100 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/1687 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/100 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/1687 [00:00<?, ?files/s]

Downloading data:   0%|          | 0/100 [00:00<?, ?files/s]

Generating train split:   0%|          | 0/1294247 [00:00<?, ? examples/s]

Generating cleaned split:   0%|          | 0/122800 [00:00<?, ? examples/s]

Resolving data files:   0%|          | 0/1687 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/100 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/1687 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/100 [00:00<?, ?it/s]

## Init The Models

In [None]:
# METRICS #####################################################################

# the metric and loss classes follow the output mode; they are instantiated later
if BINARY:
    _Accuracy = mlable.metrics.BinaryGroupAccuracy
    _Loss = tf.keras.losses.BinaryCrossentropy
else:
    _Accuracy = mlable.metrics.CategoricalGroupAccuracy
    _Loss = tf.keras.losses.CategoricalCrossentropy

In [None]:
# COMPILE #####################################################################

# models, metrics and optimizer are all created under the distribution strategy
with DISTRIBUTION_STRATEGY.scope():
    # metrics: accuracy over groups of 1 byte, 33 bytes and the largest decoder token size
    byte_accuracy = _Accuracy(group=1, name='byte_accuracy')
    instruction_accuracy = _Accuracy(group=33, name='instruction_accuracy')
    token_accuracy = _Accuracy(group=DECODER_META['token_sizes'][-1], name='token_accuracy')
    # encoder: pretrained and used as is, hence not compiled
    ENCODER = tf.keras.models.load_model(ENCODER_META['path'], compile=False)
    # decoder: resume from the saved file when asked and available, start from scratch otherwise
    if IMPORT and os.path.isfile(DECODER_META['path']):
        DECODER = tf.keras.models.load_model(DECODER_META['path'], compile=False)
    else:
        DECODER = revml.contract.decoder.model.Transformer(**DECODER_CONFIG)
    # compile: the metrics are weighted because the datasets yield sample weights
    DECODER.compile(
        optimizer=tf.keras.optimizers.AdamW(**OPTIMIZER_CONFIG),
        loss=_Loss(**LOSS_CONFIG),
        weighted_metrics=[byte_accuracy, instruction_accuracy, token_accuracy])

## Preprocess

In [None]:
# EVMC ########################################################################

# preprocessing function specialized with the shapes and the pretrained encoder
__preprocess = revml.contract.decoder.pipeline.preprocess(
    encoder_model=ENCODER._encoder,
    encoder_config=PREPROCESSING_CONFIG['encoder'],
    decoder_config=PREPROCESSING_CONFIG['decoder'])

# batch first, then preprocess whole batches in parallel
__train_batches = EVMC_TRAIN.batch(**BATCH_CONFIG)
EVMC_TRAIN = __train_batches.map(__preprocess, num_parallel_calls=tf.data.AUTOTUNE)

__test_batches = EVMC_TEST.batch(**BATCH_CONFIG)
EVMC_TEST = __test_batches.map(__preprocess, num_parallel_calls=tf.data.AUTOTUNE)

In [None]:
# INSPECT #####################################################################

# structure of the elements yielded by each dataset
for __dataset in (EVMC_TRAIN, EVMC_TEST):
    print(__dataset.element_spec)

# number of batches in each dataset
for __template, __dataset in (('evmc train: {:,}', EVMC_TRAIN), ('evmc test:  {:,}', EVMC_TEST)):
    print(__template.format(__dataset.cardinality().numpy()))

(TensorSpec(shape=(128, 33792), dtype=tf.int32, name=None), TensorSpec(shape=(128, 33792, 8), dtype=tf.float32, name=None), TensorSpec(shape=(128, 33792), dtype=tf.float32, name=None))
(TensorSpec(shape=(128, 33792), dtype=tf.int32, name=None), TensorSpec(shape=(128, 33792, 8), dtype=tf.float32, name=None), TensorSpec(shape=(128, 33792), dtype=tf.float32, name=None))
evmc train: 863
evmc test:  95


## Train

In [None]:
# TRAIN #######################################################################

if TRAINING:
    with DISTRIBUTION_STRATEGY.scope():
        # callbacks: checkpoint the model and log to TensorBoard
        cp_callback = tf.keras.callbacks.ModelCheckpoint(**CHECKPOINT_CONFIG)
        tb_callback = tf.keras.callbacks.TensorBoard(**TENSORBOARD_CONFIG)
        # inputs: prefetch so that the next batches are prepared while the current one is processed
        __train_data = EVMC_TRAIN.prefetch(tf.data.AUTOTUNE)
        __valid_data = EVMC_TEST.prefetch(tf.data.AUTOTUNE)
        # fit model: the history is kept for later inspection
        TRAINING_HISTORY = DECODER.fit(
            x=__train_data,
            validation_data=__valid_data,
            callbacks=[cp_callback, tb_callback],
            **TRAINING_CONFIG)

Epoch 1/8




Epoch 1: saving model to decoder.keras
Epoch 2/8




Epoch 2: saving model to decoder.keras
Epoch 3/8




Epoch 3: saving model to decoder.keras
Epoch 4/8
133/863 [===>..........................] - ETA: 1:05:57 - loss: 0.0026 - byte_accuracy: 0.9867 - instruction_accuracy: 0.7668 - token_accuracy: 0.7668

KeyboardInterrupt: 

In [None]:
# print the layer-by-layer overview of the decoder
DECODER.summary()

## Dataviz

In [None]:
# DATA ########################################################################

# Text samples for manual inspection, selected by index in the next cell:
# 0: Korean prose, 1: Python source of an `Encoder` class,
# 2 and 3: the same `AutoEncoder` class indented with 4 and 2 spaces respectively.
SAMPLES = [
    """위키백과, 우리 모두의 백과사전.\nt-분포 확률적 임베딩(t-SNE)은 데이터의 차원 축소에 사용되는 기계 학습 알고리즘 중 하나로, 2002년 샘 로이스Sam Rowise와 제프리 힌튼에 의해 개발되었다.[1] t-SNE는 비선형 차원 축소 기법으로, 고차원 데이터를 특히 2, 3차원 등으로 줄여 가시화하는데에 유용하게 사용된다. 구체적으로 t-SNE는 비슷한 데이터는 근접한 2, 3차원의 지점으로, 다른 데이터는 멀리 떨어진 지점으로 맵핑한다.""",
    """class Encoder(tf.keras.models.Model):\n    def __init__(self, depth: int, token_dim: int, encoding_dim: int, embedding_dim: int, batch_dim: int=None, attention: bool=False, **kwargs) -> None:\n        super(Encoder, self).__init__(**kwargs)\n        self._encoder = tf.keras.Sequential([\n            tf.keras.Input(shape=(encoding_dim,), batch_size=batch_dim, name='input'), # (B * G ^ D, U)\n            tf.keras.layers.Dense(units=embedding_dim, activation=None, use_bias=False, kernel_initializer='glorot_uniform', bias_initializer=None, name='embed-1'),] # (B * G ^ D, U) => (B * G ^ D, E)\n            + [tokun.layers.TokenizeBlock(left_axis=-2, right_axis=-1, token_dim=token_dim, attention=attention, name='tokenize' + (__i + 1) * '-4') for __i in range(depth)]) # (B * G ^ i, E) => (B * G ^ (i-1), E)\n\n    def call(self, x: tf.Tensor) -> tf.Tensor:\n        return self._encoder(x)\n""",
    """class AutoEncoder(tf.keras.models.Model):\n    def __init__(self, token_dim: int, encoding_dim: int, embedding_dim: int, batch_dim: int=None, **kwargs) -> None:\n        super(AutoEncoder, self).__init__(**kwargs)\n        self._encoder = Encoder(token_dim=token_dim, encoding_dim=encoding_dim, embedding_dim=embedding_dim, batch_dim=batch_dim)\n        self._decoder = Decoder(token_dim=token_dim, encoding_dim=encoding_dim, embedding_dim=embedding_dim, batch_dim=batch_dim)\n\n    def call(self, x: tf.Tensor) -> tf.Tensor:\n        return self._decoder(self._encoder(x))""",
    """class AutoEncoder(tf.keras.models.Model):\n  def __init__(self, token_dim: int, encoding_dim: int, embedding_dim: int, batch_dim: int=None, **kwargs) -> None:\n    super(AutoEncoder, self).__init__(**kwargs)\n    self._encoder = Encoder(token_dim=token_dim, encoding_dim=encoding_dim, embedding_dim=embedding_dim, batch_dim=batch_dim)\n    self._decoder = Decoder(token_dim=token_dim, encoding_dim=encoding_dim, embedding_dim=embedding_dim, batch_dim=batch_dim)\n\n  def call(self, x: tf.Tensor) -> tf.Tensor:\n    return self._decoder(self._encoder(x))"""]

In [None]:
# COMPUTE ######################################################################

# index of the sample to run through the model
__i = 0
# encode the text, padded to a whole number of tokens
# NOTE(review): the token size is taken from the model being called (DECODER);
# confirm this is the intended padding, and that DECODER accepts text inputs at all
__x = tokun.pipeline.preprocess(text=SAMPLES[__i], token_size=math.prod(DECODER_CONFIG['token_dim']), expand=[1])
# raw predictions of the model
__p = DECODER(__x)
# interpret the predictions deterministically, according to the output mode
__y = tokun.pipeline.postprocess(__p, binary=BINARY, random=False)
# back to text
__o = tokun.pipeline.unpack(data=__y)

In [None]:
# side by side: the selected sample, the text produced by the model and their comparison
__source = SAMPLES[__i]
__result = __o[0]

print('# INPUT ################################################################\n\n' + __source)
print('\n# OUTPUT ###############################################################\n\n' + __result)
print('\n# SCORE ################################################################\n\n' + str(tokun.evaluation.compare(__source, __result)))

## Inspect

In [None]:
# load the TensorBoard notebook extension, which provides the `%tensorboard` magic
%load_ext tensorboard

In [None]:
# browse the runs logged under `.logs`, the root of TENSORBOARD_CONFIG['log_dir']
%tensorboard --logdir .logs