## Import deps

In [1]:
!pip install -U datasets mlable tokun revml

Collecting datasets
  Downloading datasets-3.0.0-py3-none-any.whl.metadata (19 kB)
Collecting mlable
  Downloading mlable-0.8.6-py3-none-any.whl.metadata (4.7 kB)
Collecting tokun
  Downloading tokun-0.13.5-py3-none-any.whl.metadata (7.7 kB)
Collecting revml
  Downloading revml-0.6.8-py3-none-any.whl.metadata (1.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.6.1,>=2023.1.0 (from fsspec[http]<=2024.6.1,>=2023.1.0->datasets)
  Downloading fsspec-2024.6.1-py3-none-any.whl.metadata (11 kB)
Collecting aiohttp (from datasets)
  Downloading aiohttp-3.10.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.5 kB)
Collecting aiohappyeyeballs>=2.

In [2]:
import datetime
import functools
import itertools
import math
import os
import random
import urllib.request

import datasets as hd
import tensorflow as tf
import tensorflow_datasets as tfds

import mlable.data
import mlable.metrics
import mlable.sampling
import mlable.shaping

import revml.contract.bytecode
import revml.contract.model
import revml.contract.pipeline

import tokun.data
import tokun.evaluation
import tokun.meta
import tokun.model
import tokun.pipeline

Exception ignored in: <function _xla_gc_callback at 0x7bec67973eb0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/jax/_src/lib/__init__.py", line 98, in _xla_gc_callback
    def _xla_gc_callback(*args):
KeyboardInterrupt: 


In [3]:
# report the TensorFlow release used by this run
print("Tensorflow version " + tf.__version__)

Tensorflow version 2.15.0


## Set up the GPU / TPU

In [4]:
# MIXED PRECISION #############################################################

# global Keras dtype policy: layers created from here on compute in bfloat16
# while keeping their variables in float32
tf.keras.mixed_precision.set_global_policy('mixed_bfloat16')

In [5]:
# DEVICES #####################################################################

# keep the per-op device placement log quiet
tf.debugging.set_log_device_placement(False)

# logical devices visible to the runtime, one list per device type
CPU, GPU, TPU = (tf.config.list_logical_devices(__kind) for __kind in ('CPU', 'GPU', 'TPU'))

if not TPU:
    # no TPU listed: mirror over the GPUs when there are any, over the CPU otherwise
    DISTRIBUTION_STRATEGY = tf.distribute.MirroredStrategy(GPU or CPU)
else:
    # TPU listed: connect to the cluster and initialize it before creating the strategy
    RESOLVER = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(RESOLVER)
    tf.tpu.experimental.initialize_tpu_system(RESOLVER)
    DISTRIBUTION_STRATEGY = tf.distribute.TPUStrategy(RESOLVER)

print(DISTRIBUTION_STRATEGY)

KeyboardInterrupt: 

## Mode

In [None]:
# TOGGLE ######################################################################

IMPORT = False # restore the decoder from DECODER_META['path'] when the file exists; also scales the learning rate by 0.1
DOWNLOAD = False # fetch the weights from DECODER_META['url'] first; only honored together with IMPORT
TRAINING = True # run the fit loop
BINARY = True # binary accuracy + binary crossentropy when set, raw accuracy + mean squared error otherwise

## Defining The Metadata

In [None]:
# DATA PARAMETERS #############################################################

# keyword arguments forwarded as-is to `tf.data.Dataset.batch`
BATCH_CONFIG = {
    'batch_size': 64,
    'drop_remainder': True,
    'num_parallel_calls': tf.data.AUTOTUNE,}

# settings handed to `revml.contract.pipeline.preprocess_factory`, one group per side
# NOTE(review): 33 looks like the byte width of one encoded instruction (the
# instruction accuracy groups predictions by 33) -- confirm against revml
PIPELINE_CONFIG = {
    'encoder': {
        'batch_dim': BATCH_CONFIG['batch_size'],
        'sample_dim': 512 * 8 * 33, # in bytes != codepoints
        'input_dim': 8 * 33,
        'sequence_axis': 1,
        'feature_axis': -1,},
    'decoder': {
        'batch_dim': BATCH_CONFIG['batch_size'],
        'sample_dim': 512 * 2 * 33,
        'input_dim': 2 * 33,
        'sequence_axis': 1,
        'feature_axis': -1,
        # relative weights of the samples; presumably real data vs padding positions -- TODO confirm
        'data_weight': 1.0,
        'padding_weight': 0.01,}}

In [None]:
# DECODER PARAMETERS ##########################################################

# constructor arguments of `revml.contract.model.Transformer`
DECODER_CONFIG = {
  'num_layers': 8,
  'num_heads': 16,
  'input_dim': PIPELINE_CONFIG['decoder']['input_dim'], # taken from the decoder side of the pipeline settings
  'context_dim': PIPELINE_CONFIG['encoder']['input_dim'], # taken from the encoder side of the pipeline settings
  'embed_dim': 1056, # 4 * 33 * 8
  'head_dim': 1056 // 16, # embed_dim divided by num_heads
  'hidden_dim': 1056 * 4, # 4 times embed_dim
  'epsilon': 1e-6,}

In [None]:
# DERIVED DECODING PARAMETERS #################################################

DECODER_META = {
    'version': '0.1', # used to name the TensorBoard log directory
    'path': 'decoder.keras', # local file: checkpoints are written here and imported weights are read from here
    'url': '',} # remote location of the weights; empty, so it must be filled in before enabling DOWNLOAD

In [None]:
# TRAINING PARAMETERS #########################################################

# AdamW arguments; the base rate is scaled by 0.1 when resuming from imported weights
OPTIMIZER_CONFIG = {
    'learning_rate': 4 * 0.001 * (0.1 if IMPORT else 1.0),
    'weight_decay': 0.1,
    'beta_1': 0.9,
    'beta_2': 0.95,
    'clipnorm': 1.0,}

# cosine decay that starts from the optimizer rate, without warmup
SCHEDULER_CONFIG = {
    'initial_learning_rate': OPTIMIZER_CONFIG['learning_rate'],
    'decay_steps': 800 * 3,
    'alpha': 0.1,
    'name': 'cosine_lr',
    'warmup_target': None,
    'warmup_steps': 0,}

# extra arguments shared by all the accuracy metrics
METRICS_CONFIG = {
    # 'factor': 256,
    'depth': 8,}

# loss arguments
# NOTE(review): `from_logits`, `label_smoothing` and `axis` are BinaryCrossentropy options;
# the MeanSquaredError selected when BINARY is off does not list them -- verify before disabling BINARY
LOSS_CONFIG = {
    'from_logits': False,
    'label_smoothing': 0.,
    'axis': -1,
    'reduction': 'sum_over_batch_size',
    'name': 'loss',}

# the whole model is saved to the decoder path at the end of every epoch
CHECKPOINT_CONFIG = {
    'filepath': DECODER_META['path'],
    'monitor': 'val_loss',
    'mode': 'auto',
    'save_freq': 'epoch',
    'save_best_only': False,
    'save_weights_only': False,
    'verbose': 1,}

# one log directory per version and per run: .logs/<version>/<timestamp>
# the version is a plain string: it must be passed whole, since unpacking it with `*`
# would turn each of its characters into a separate path segment
TENSORBOARD_CONFIG = {
    'log_dir': os.path.join('.logs/', DECODER_META['version'], datetime.datetime.now().strftime("%Y%m%d-%H%M%S")),
    'histogram_freq': 1,
    'embeddings_freq': 0,
    'profile_batch': (128, 256),
    'write_graph': False,
    'write_images': True,}

# arguments of `fit`; the data comes from already batched datasets, hence no batch size / split here
TRAINING_CONFIG = {
    'epochs': 8,
    'batch_size': None,
    'validation_split': None,
    'validation_freq': list(range(1, 9)), # validate after each of the 8 epochs
    # 'class_weight': {__c: 0.03 if __c == 0 else 1. for __c in range(PIPELINE_CONFIG['decoder']['input_dim'])}, # there are 32 times more 0s than other bytes (most instructions have null data)
    'verbose': 1,}

## Download The Model Weights

In [None]:
# DECODER #####################################################################

# fetch the saved decoder only when both importing and downloading are requested
if IMPORT and DOWNLOAD:
    # fail early with an actionable message instead of urllib's "unknown url type" error
    if not DECODER_META['url']:
        raise ValueError("DECODER_META['url'] is empty: set it to the location of the weights or disable DOWNLOAD")
    urllib.request.urlretrieve(DECODER_META['url'], DECODER_META['path'])

## Loading The Data

In [None]:
# EVMC DATASET ################################################################

# same repository and configuration for both parts, only the slice differs
__source = {'path': 'apehex/evm_contracts', 'name': 'hex-ethereum'}
# shuffled and left unbatched: batching happens in the preprocessing step
__export = {'shuffle': True, 'batch_size': None}

# first 90% of the "cleaned" split for training, remaining 10% for testing
EVMC_TRAIN = hd.load_dataset(split='cleaned[:90%]', **__source).to_tf_dataset(**__export)
EVMC_TEST = hd.load_dataset(split='cleaned[90%:]', **__source).to_tf_dataset(**__export)

## Preprocess

In [None]:
# EVMC ########################################################################

# preprocessing fn specialized with the encoder / decoder settings
__preprocess = revml.contract.pipeline.preprocess_factory(
    encoder_config=PIPELINE_CONFIG['encoder'],
    decoder_config=PIPELINE_CONFIG['decoder'],)

# NOTE: both datasets are rebound to their processed version, so running this cell
# twice without reloading the data would batch them a second time
with DISTRIBUTION_STRATEGY.scope():
    # training data: batch first, then preprocess whole batches
    EVMC_TRAIN = EVMC_TRAIN.batch(**BATCH_CONFIG)
    EVMC_TRAIN = EVMC_TRAIN.map(__preprocess, num_parallel_calls=tf.data.AUTOTUNE)
    # testing data: same treatment
    EVMC_TEST = EVMC_TEST.batch(**BATCH_CONFIG)
    EVMC_TEST = EVMC_TEST.map(__preprocess, num_parallel_calls=tf.data.AUTOTUNE)

In [None]:
# INSPECT #####################################################################

# one preprocessed test batch, kept around to build the model later on
# presumably (inputs, contexts), targets, weights -- the layout `fit` consumes
__batch = next(iter(EVMC_TEST.take(1)))
(__X, __C), __T, __W = __batch

# structure of the elements
for __dataset in (EVMC_TRAIN, EVMC_TEST):
    print(__dataset.element_spec)

# number of batches
__train_count = EVMC_TRAIN.cardinality().numpy()
__test_count = EVMC_TEST.cardinality().numpy()
print('evmc train: {:,}'.format(__train_count))
print('evmc test:  {:,}'.format(__test_count))

## Init The Models

In [None]:
# DEBUG #######################################################################

class DebugModel(tf.keras.models.Model):
    """Minimal model made of a single bias-free dense layer with a sigmoid activation.

    Only the first element of the inputs is used, the rest is ignored.
    """

    def __init__(self, output_dim: int, **kwargs) -> None:
        """Create the projection head.

        Args:
            output_dim: number of units of the dense layer.
            **kwargs: forwarded to the Keras model constructor.
        """
        super().__init__(**kwargs)
        # the only layer of the model
        self._head = tf.keras.layers.Dense(
            units=output_dim,
            activation='sigmoid',
            use_bias=False,
            kernel_initializer='glorot_uniform',
            bias_initializer='zeros',
            name='head')

    def call(self, inputs: tuple, **kwargs) -> tf.Tensor:
        """Project the first element of `inputs` through the head."""
        first = inputs[0]
        return self._head(first)

In [None]:
# METRICS #####################################################################

# pick the accuracy and loss classes matching the output encoding
if BINARY:
    _Accuracy = mlable.metrics.BinaryGroupAccuracy
    _Loss = tf.keras.losses.BinaryCrossentropy
else:
    _Accuracy = mlable.metrics.RawGroupAccuracy
    _Loss = tf.keras.losses.MeanSquaredError

In [None]:
# COMPILE #####################################################################

with DISTRIBUTION_STRATEGY.scope():
    # COSINE LR ###############################################################
    # the schedule replaces the constant rate inside the optimizer arguments
    cosine_lr = tf.keras.optimizers.schedules.CosineDecay(**SCHEDULER_CONFIG)
    OPTIMIZER_CONFIG.update(learning_rate=cosine_lr)
    # metrics: same accuracy class, evaluated on groups of increasing size
    __accuracy = functools.partial(_Accuracy, **METRICS_CONFIG)
    byte_accuracy = __accuracy(group=1, name='byte_accuracy')
    instruction_accuracy = __accuracy(group=33, name='instruction_accuracy')
    token_accuracy = __accuracy(group=DECODER_CONFIG['input_dim'], name='token_accuracy')
    # decoder: fresh by default, replaced by the saved one when importing and the file exists
    DECODER = revml.contract.model.Transformer(**DECODER_CONFIG)
    if IMPORT and os.path.isfile(DECODER_META['path']):
        DECODER = tf.keras.models.load_model(DECODER_META['path'], compile=False)
    # build, by running the sample batch through the model
    DECODER((__X, __C))
    # compile
    __optimizer = tf.keras.optimizers.AdamW(**OPTIMIZER_CONFIG)
    __loss = _Loss(**LOSS_CONFIG)
    __metrics = [byte_accuracy, instruction_accuracy, token_accuracy]
    DECODER.compile(optimizer=__optimizer, loss=__loss, weighted_metrics=__metrics)

In [None]:
# layer-by-layer overview of the built decoder
DECODER.summary()

## Train

In [None]:
# TRAIN #######################################################################

if TRAINING:
    with DISTRIBUTION_STRATEGY.scope():
        # callbacks: periodic checkpoints + TensorBoard logs
        cp_callback = tf.keras.callbacks.ModelCheckpoint(**CHECKPOINT_CONFIG)
        tb_callback = tf.keras.callbacks.TensorBoard(**TENSORBOARD_CONFIG)
        __callbacks = [cp_callback, tb_callback]
        # prefetch so the input pipeline overlaps with the training steps
        __train_data = EVMC_TRAIN.prefetch(tf.data.AUTOTUNE)
        __test_data = EVMC_TEST.prefetch(tf.data.AUTOTUNE)
        # fit model
        TRAINING_HISTORY = DECODER.fit(
            x=__train_data,
            validation_data=__test_data,
            callbacks=__callbacks,
            **TRAINING_CONFIG)

## Dataviz

In [None]:
# DATA ########################################################################

# iterator over the preprocessed test batches; each `next` call advances to another batch
__i = iter(EVMC_TEST)

In [None]:
# pull the next test batch and split it into its parts
__inputs, __t, __w = next(__i)
__x, __c = __inputs
# forward pass of the decoder on that batch
__y = DECODER((__x, __c))

In [None]:
# flatten everything past the batch axis, one row per sample
__flat = tf.reshape(__c, shape=(BATCH_CONFIG['batch_size'], -1))
# then run the rows through the tokun codepoint and decode steps
__s = tokun.pipeline.decode(tokun.pipeline.codepoint(__flat))

In [None]:
# instruction-level accuracy of the predictions for the sampled batch
# NOTE(review): if this is a stateful Keras metric, the call also accumulates into the
# totals gathered during training / validation -- reset it first to get a per-batch value
instruction_accuracy(y_true=__t, y_pred=__y)

In [None]:
# sample the predictions (presumably bits back into bytes), then merge the last two axes
__yp = mlable.shaping.merge(
    mlable.sampling.binary(__y, depth=8, threshold=0.6),
    left_axis=-2,
    right_axis=-1,
    left=True)
# back to one HEX string per sample
__op = revml.contract.pipeline.detokenize(__yp)
# split each bytecode into its instructions, joined by pipes for display
__ip = [
    '|'.join(
        __chunk.hex()
        for __chunk in revml.contract.bytecode.iterate_over_instructions(bytes.fromhex(__hex.decode('utf-8'))))
    for __hex in __op.numpy().tolist()]

In [None]:
# same decoding as for the predictions, applied to the targets
__yt = mlable.shaping.merge(
    mlable.sampling.binary(__t, depth=8, threshold=0.6),
    left_axis=-2,
    right_axis=-1,
    left=True)
# back to one HEX string per sample
__ot = revml.contract.pipeline.detokenize(__yt)
# split each bytecode into its instructions, joined by pipes for display
__it = [
    '|'.join(
        __chunk.hex()
        for __chunk in revml.contract.bytecode.iterate_over_instructions(bytes.fromhex(__hex.decode('utf-8'))))
    for __hex in __ot.numpy().tolist()]

In [None]:
# expected instructions first, predicted ones right below
print(__it[0], __ip[0], sep='\n')

In [None]:
# expected bytecode first, predicted one right below
print(__ot[1].numpy(), __op[1].numpy(), sep='\n')

## Inspect

In [None]:
%load_ext tensorboard

In [None]:
# %tensorboard --logdir .logs