## Install And Import The Dependencies

In [None]:
!pip install -U datasets mlable tokun revml

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting mlable
  Downloading mlable-0.7.17-py3-none-any.whl.metadata (4.7 kB)
Collecting tokun
  Downloading tokun-0.13.3-py3-none-any.whl.metadata (7.7 kB)
Collecting revml
  Downloading revml-0.5.2-py3-none-any.whl.metadata (1.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting aiohttp (from datasets)
  Downloading aiohttp-3.10.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.5 kB)
Collecting aiohappyeyeballs>=2.3.0 (from aiohttp->datasets)
  Downloading aiohappyeyeballs-2.4.0-py3-none-any.whl.metadata (5.9 kB)
Collecting aiosignal>=1.1.2 (from aiohttp->datasets

In [None]:
import datetime
import functools
import itertools
import math
import os
import random
import urllib.request

import datasets as hd
import tensorflow as tf
import tensorflow_datasets as tfds

import mlable.data
import mlable.metrics
import mlable.ops

import revml.contract.model
import revml.contract.pipeline

import tokun.data
import tokun.evaluation
import tokun.meta
import tokun.model
import tokun.pipeline

In [None]:
# report the version of the framework actually loaded by the kernel
print(f"Tensorflow version {tf.__version__}")

Tensorflow version 2.15.0


## Set Up The GPU / TPU

In [None]:
# MIXED PRECISION #############################################################

# every layer created after this call inherits the 'mixed_bfloat16' policy
tf.keras.mixed_precision.set_global_policy(tf.keras.mixed_precision.Policy('mixed_bfloat16'))

In [None]:
# DEVICES #####################################################################

# keep the per-operation device placement logs off
tf.debugging.set_log_device_placement(False)

# logical devices visible to the runtime, listed by type
CPU, GPU, TPU = (tf.config.list_logical_devices(__kind) for __kind in ('CPU', 'GPU', 'TPU'))

if TPU:
    # connect to the TPU cluster and initialize it before creating the strategy
    RESOLVER = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(RESOLVER)
    tf.tpu.experimental.initialize_tpu_system(RESOLVER)
    DISTRIBUTION_STRATEGY = tf.distribute.TPUStrategy(RESOLVER)
else:
    # mirror on the GPUs when there are any, on the CPUs otherwise
    DISTRIBUTION_STRATEGY = tf.distribute.MirroredStrategy(GPU or CPU)

print(DISTRIBUTION_STRATEGY)

<tensorflow.python.distribute.tpu_strategy.TPUStrategyV2 object at 0x7fc900444550>


## Mode

In [None]:
# TOGGLE ######################################################################

# IMPORT: start from the saved decoder (with a reduced learning rate) instead of a fresh one
# DOWNLOAD: fetch the saved decoder from its URL first, only considered when IMPORT is on
IMPORT, DOWNLOAD = False, False

# TRAINING: run the fitting cell
TRAINING = True

# BINARY: select the binary variants of the loss, metrics and output dimension
BINARY = True

## Defining The Metadata

In [None]:
# DATA PARAMETERS #############################################################

# keyword arguments of the `batch` calls applied to the datasets
BATCH_CONFIG = dict(
    batch_size=128,
    drop_remainder=True,
    num_parallel_calls=tf.data.AUTOTUNE)

# the two sub-configs are handed separately to the preprocessing factory
PIPELINE_CONFIG = dict(
    encoder=dict(
        batch_dim=BATCH_CONFIG['batch_size'],
        sample_dim=128 * 256, # in bytes != codepoints => 8192 characters
        token_dim=256,
        input_dim=256, # bytes
        output_dim=256, # bytes
        sequence_axis=1,
        feature_axis=-1,
        output_dtype=tf.uint8),
    decoder=dict(
        batch_dim=BATCH_CONFIG['batch_size'],
        sample_dim=128 * 4 * 33,
        token_dim=4 * 33,
        input_dim=256, # bytes
        output_dim=256, # bytes
        sequence_axis=1,
        feature_axis=-1,
        data_weight=1.0,
        padding_weight=0.01,
        binary=BINARY))

In [None]:
# DECODER PARAMETERS ##########################################################

# keyword arguments of the decoder model constructor
DECODER_CONFIG = {
    'num_layers': 4,
    'num_heads': 4,
    'embed_dim': 1024,
    'head_dim': 1024 // 4,
    'hidden_dim': 1024 * 4,
    # binary mode: (bits per input value) x (token length), otherwise just the token length
    # `math.log2` is used rather than `math.log(x, 2)`: the latter divides two rounded logarithms
    # and `int` would truncate any result that lands just below the exact integer
    'output_dim': (
        int(math.log2(PIPELINE_CONFIG['decoder']['input_dim'])) * PIPELINE_CONFIG['decoder']['token_dim']
        if BINARY
        else PIPELINE_CONFIG['decoder']['token_dim']),
    'epsilon': 1e-6,}

In [None]:
# DERIVED DECODING PARAMETERS #################################################

# version tag, local file and remote location (empty: none) of the saved decoder
DECODER_META = dict(
    version='0.1',
    path='decoder.keras',
    url='')

In [None]:
# TRAINING PARAMETERS #########################################################

# keyword arguments of the AdamW optimizer
OPTIMIZER_CONFIG = dict(
    # the base rate is divided by 10 when resuming from imported weights
    learning_rate=0.001 * (0.1 if IMPORT else 1.0),
    weight_decay=0.1,
    beta_1=0.9,
    beta_2=0.99,
    clipnorm=1.0)

# keyword arguments shared by the accuracy metrics
METRICS_CONFIG = {
    # number of bits needed to encode one input value (8 for 256 values)
    # `math.log2` avoids truncating an inexact `math.log(x, 2)` quotient
    'depth': int(math.log2(PIPELINE_CONFIG['decoder']['input_dim'])),}

# keyword arguments accepted by both candidate losses
LOSS_CONFIG = {
    'reduction': 'sum_over_batch_size',
    'name': 'loss',}

# these arguments exist only on the binary cross-entropy used in binary mode:
# the mean squared error used otherwise rejects them with a TypeError
if BINARY:
    LOSS_CONFIG.update({
        'from_logits': False,
        'label_smoothing': 0.,
        'axis': -1,})

# keyword arguments of the checkpoint callback: the whole model is written at the end of every epoch
CHECKPOINT_CONFIG = dict(
    filepath=DECODER_META['path'],
    monitor='val_loss',
    mode='auto',
    save_freq='epoch',
    save_best_only=False,
    save_weights_only=False,
    verbose=1)

# keyword arguments of the TensorBoard callback
TENSORBOARD_CONFIG = {
    # one directory per run: .logs/<version>/<start time>
    # the version is a plain string: star-unpacking it would create one path component per character
    'log_dir': os.path.join('.logs/', DECODER_META['version'], datetime.datetime.now().strftime("%Y%m%d-%H%M%S")),
    'histogram_freq': 1,
    'embeddings_freq': 0,
    'profile_batch': (128, 256),
    'write_graph': False,
    'write_images': True,}

# keyword arguments of `fit`, on top of the datasets and callbacks
TRAINING_CONFIG = {
    'epochs': 8,
    'batch_size': None,
    'validation_split': None,
    # validate after each of the 8 epochs
    'validation_freq': [1, 2, 3, 4, 5, 6, 7, 8],
    # there are 32 times more 0s than other bytes (most instructions have null data)
    # NOTE(review): confirm that weights indexed by byte value apply as intended when the targets are binary
    'class_weight': {
        __b: (1. if __b != 0 else 0.03)
        for __b in range(PIPELINE_CONFIG['decoder']['input_dim'])},
    'verbose': 1,}

## Download The Model Weights

In [None]:
# DECODER #####################################################################

# fetch the saved decoder next to the notebook, only when both toggles are on
if IMPORT and DOWNLOAD:
    urllib.request.urlretrieve(
        url=DECODER_META['url'],
        filename=DECODER_META['path'])

## Loading The Data

In [None]:
# EVMC DATASET ################################################################

# both splits come from the same repository and configuration
__load = functools.partial(hd.load_dataset, 'apehex/evm_contracts', name='hex-ethereum')

# first 90% of the 'cleaned' split for training, last 10% for testing, both unbatched and shuffled
EVMC_TRAIN = __load(split='cleaned[:90%]').to_tf_dataset(batch_size=None, shuffle=True)
EVMC_TEST = __load(split='cleaned[90%:]').to_tf_dataset(batch_size=None, shuffle=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/5.58k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/1687 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/100 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/1687 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/100 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/1687 [00:00<?, ?files/s]

KeyboardInterrupt: 

## Preprocess

In [None]:
# EVMC ########################################################################

# preprocessing function specialized with the encoder / decoder settings
__preprocess = revml.contract.pipeline.preprocess_factory(
    encoder_config=PIPELINE_CONFIG['encoder'],
    decoder_config=PIPELINE_CONFIG['decoder'],)

def __prepare(dataset):
    """Group the samples in batches, then preprocess each batch in parallel."""
    __batched = dataset.batch(**BATCH_CONFIG)
    return __batched.map(__preprocess, num_parallel_calls=tf.data.AUTOTUNE)

# NOTE: the datasets are replaced in place, so running this cell twice batches them twice
EVMC_TRAIN = __prepare(EVMC_TRAIN)
EVMC_TEST = __prepare(EVMC_TEST)

In [None]:
# INSPECT #####################################################################

# structure of the elements produced by each dataset
for __dataset in (EVMC_TRAIN, EVMC_TEST):
    print(__dataset.element_spec)

# number of batches in each dataset
for __template, __dataset in (('evmc train: {:,}', EVMC_TRAIN), ('evmc test:  {:,}', EVMC_TEST)):
    print(__template.format(__dataset.cardinality().numpy()))

## Init The Models

In [None]:
# METRICS #####################################################################

# the accuracy and loss classes must match the encoding of the outputs
if BINARY:
    _Accuracy = mlable.metrics.BinaryGroupAccuracy
    _Loss = tf.keras.losses.BinaryCrossentropy
else:
    _Accuracy = mlable.metrics.RawGroupAccuracy
    _Loss = tf.keras.losses.MeanSquaredError

In [None]:
# COMPILE #####################################################################

# the variables of the model, optimizer and metrics have to be created under the strategy scope
with DISTRIBUTION_STRATEGY.scope():
    # metrics: accuracy over groups of 1, 33 and `token_dim` predictions
    # (presumably bytes, instructions and tokens, going by the names)
    byte_accuracy = _Accuracy(group=1, name='byte_accuracy', **METRICS_CONFIG)
    instruction_accuracy = _Accuracy(group=33, name='instruction_accuracy', **METRICS_CONFIG)
    token_accuracy = _Accuracy(group=PIPELINE_CONFIG['decoder']['token_dim'], name='token_accuracy', **METRICS_CONFIG)
    # decoder: restore the saved model when requested and available, build a fresh one otherwise
    # (a fresh model is no longer built just to be discarded by the import)
    if IMPORT and os.path.isfile(DECODER_META['path']):
        DECODER = tf.keras.models.load_model(DECODER_META['path'], compile=False)
    else:
        DECODER = revml.contract.model.Transformer(**DECODER_CONFIG)
    # compile
    DECODER.compile(
        optimizer=tf.keras.optimizers.AdamW(**OPTIMIZER_CONFIG),
        loss=_Loss(**LOSS_CONFIG),
        weighted_metrics=[byte_accuracy, instruction_accuracy, token_accuracy])

## Train

In [None]:
# TRAIN #######################################################################

if TRAINING:
    with DISTRIBUTION_STRATEGY.scope():
        # callbacks: save the model and log to TensorBoard
        cp_callback = tf.keras.callbacks.ModelCheckpoint(**CHECKPOINT_CONFIG)
        tb_callback = tf.keras.callbacks.TensorBoard(**TENSORBOARD_CONFIG)
        # static training settings, completed with the prefetched datasets and the callbacks
        __fit_kwargs = dict(
            TRAINING_CONFIG,
            x=EVMC_TRAIN.prefetch(tf.data.AUTOTUNE),
            validation_data=EVMC_TEST.prefetch(tf.data.AUTOTUNE),
            callbacks=[cp_callback, tb_callback])
        # fit model
        TRAINING_HISTORY = DECODER.fit(**__fit_kwargs)

In [None]:
# print the layers and parameter counts of the decoder (it has to be built, e.g. by the training above)
DECODER.summary()

## Dataviz

In [None]:
# DATA ########################################################################

# take the first batch of the test dataset
__i = iter(EVMC_TEST)
__inputs, __t, __w = next(__i)

# each element is ((x, c), t, w): a pair of model inputs, then presumably the targets and the sample weights
__x, __c = __inputs

In [None]:
# flatten each sample of the batch to a single row, rescale and hand the integers to the tokun decoder
# the number of rows follows the batch size setting instead of repeating the literal 128
# NOTE(review): the 0x40000 factor presumably undoes a scaling applied during preprocessing — confirm
tokun.pipeline.decode(tf.cast(0x40000 * tf.reshape(__c, (BATCH_CONFIG['batch_size'], -1)), tf.int32))

In [None]:
# flatten each sample of the batch to a single row, rescale and hand the integers to the revml detokenizer
# the number of rows follows the batch size setting instead of repeating the literal 128
# NOTE(review): the 256 factor presumably undoes a scaling applied during preprocessing — confirm
revml.contract.pipeline.detokenize(tf.cast(256 * tf.reshape(__x, (BATCH_CONFIG['batch_size'], -1)), tf.int32))

In [None]:
# forward pass of the decoder on the pair of inputs taken from the sampled test batch
DECODER((__x, __c))

In [None]:
# COMPUTE ######################################################################

# NOTE(review): `SAMPLES` and `N_TOKEN_DIM` are not defined in any of the cells visible here,
# so this cell raises a NameError on a fresh kernel unless they come from elsewhere — define them or drop the cell
# NOTE(review): the decoder receives a single tensor here but a pair `(__x, __c)` in the forward-pass cell — confirm the expected inputs
# `__i` and `__x` overwrite the iterator and batch inputs bound by the data cell
__i = 0
__x = tokun.pipeline.preprocess(text=SAMPLES[__i], token_size=math.prod(N_TOKEN_DIM), expand=[1])
__p = DECODER(__x)
__y = tokun.pipeline.postprocess(__p, binary=BINARY, random=False)
__o = tokun.pipeline.unpack(data=__y)

In [None]:
# original text of the selected sample
__source = SAMPLES[__i]
print('# INPUT ################################################################\n\n' + __source)

# first text unpacked from the model outputs
__decoded = __o[0]
print('\n# OUTPUT ###############################################################\n\n' + __decoded)

# comparison of the original text with the model output
__score = str(tokun.evaluation.compare(__source, __decoded))
print('\n# SCORE ################################################################\n\n' + __score)

## Inspect

In [None]:
# register the `%tensorboard` magic in the notebook
%load_ext tensorboard

In [None]:
# open TensorBoard on the root of the log directories written by the TensorBoard callback
%tensorboard --logdir .logs