## Install Dependencies

In [1]:
!pip install mlable tokun

%load_ext tensorboard

Collecting mlable
  Downloading mlable-0.4.2-py3-none-any.whl (16 kB)
Collecting tokun
  Downloading tokun-0.7.2-py3-none-any.whl (15 kB)
Installing collected packages: mlable, tokun
Successfully installed mlable-0.4.2 tokun-0.7.2


## Login To Hugging Face

In [2]:
# Interactive prompt asking for a Hugging Face access token; the upload cells
# of the export section push to a hub repository, so the token needs write access.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


## Import Dependencies

In [3]:
import itertools
import math
import urllib.request

import huggingface_hub as hh
import keras
import tensorflow as tf
import transformers as ht

import mlable.io
import tokun.evaluation
import tokun.huggingface
import tokun.meta
import tokun.model
import tokun.pipeline

In [4]:
# record the TensorFlow version used for this run
print(f"Tensorflow version {tf.__version__}")

Tensorflow version 2.15.0


## Setup the GPU / TPU

In [5]:
# keep the device placement logging off
tf.debugging.set_log_device_placement(False)

# list the logical devices of each kind, queried in the order CPU, GPU, TPU
CPU, GPU, TPU = (tf.config.list_logical_devices(kind) for kind in ('CPU', 'GPU', 'TPU'))

if not TPU:
    # no TPU: mirror over the GPUs when the list is not empty, over the CPUs otherwise
    DISTRIBUTION_STRATEGY = tf.distribute.MirroredStrategy(GPU or CPU)
else:
    # TPU: connect to the cluster and initialize the TPU system before creating the strategy
    RESOLVER = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(RESOLVER)
    tf.tpu.experimental.initialize_tpu_system(RESOLVER)
    DISTRIBUTION_STRATEGY = tf.distribute.TPUStrategy(RESOLVER)

print(DISTRIBUTION_STRATEGY)

<tensorflow.python.distribute.tpu_strategy.TPUStrategyV2 object at 0x7972338a3be0>


## Defining The Metadata

In [6]:
# PARAMETERS ##################################################################

# axis indices; N_SEQUENCE_AXIS is handed to `tokun.meta.version` in the derived metadata
N_SEQUENCE_AXIS = 1
N_FEATURE_AXIS = -1

# the model input is math.prod(N_TOKEN_DIM) * N_SEQUENCE_DIM = 64 * 512 = 32768 ids long
N_TOKEN_DIM = [4, 16] # G, for each block
N_SEQUENCE_DIM = 512

In [7]:
# DERIVED #####################################################################

# cumulative products of the token dimensions: [4, 16] => [4, 64]
N_TOKEN_SIZES = [math.prod(N_TOKEN_DIM[:i + 1]) for i in range(len(N_TOKEN_DIM))] # in bytes

LABEL = '6.3'
VERSION = tokun.meta.version(units=N_TOKEN_DIM, axis=N_SEQUENCE_AXIS)

# location of the pretrained weights: the version parts, then the label, fill the template
URL_IMPORT = (
    'https://github.com/apehex/tokun/raw/main/models/{}/{}/{}.keras'
    .format(*VERSION, LABEL))

# local file for the download and destination folder inside the hub repository
PATH_IMPORT = 'model.keras'
PATH_EXPORT = f'variants/{VERSION[0]}'

## Download The Model

In [8]:
# DOWNLOAD ###################################################################

# save the remote archive as PATH_IMPORT; the returned (filename, headers) pair is displayed
urllib.request.urlretrieve(url=URL_IMPORT, filename=PATH_IMPORT)

('model.keras', <http.client.HTTPMessage at 0x797233613d60>)

## Init The Tokenizer And Model

In [9]:
# TOKENIZER ###################################################################

# tokenizer with a vocabulary of 256 entries, used to encode the sample and exported further down
TOKENIZER = tokun.huggingface.ByteTokenizer(
    split_special_tokens=True,
    vocab_size=256)

In [10]:
# MODEL #######################################################################

with DISTRIBUTION_STRATEGY.scope():
    # load the downloaded archive under the distribution strategy
    MODEL = tf.keras.models.load_model(PATH_IMPORT)

    # the optimizer and the loss are instantiated inside the scope as well
    __optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)
    __loss = tf.keras.losses.CategoricalCrossentropy(
        from_logits=False,
        label_smoothing=0.,
        axis=-1,
        reduction=tf.keras.losses.Reduction.SUM_OVER_BATCH_SIZE,
        name='loss')

    MODEL.compile(optimizer=__optimizer, loss=__loss, metrics=['accuracy'])



In [11]:
# SPECIFY IO ##################################################################

# flat input of ids, with a fixed length of math.prod(N_TOKEN_DIM) * N_SEQUENCE_DIM
__length = math.prod(N_TOKEN_DIM) * N_SEQUENCE_DIM
__inputs = tf.keras.layers.Input(shape=(__length,), dtype=tf.int32)

# chain the encoder and the decoder of the loaded model on the symbolic input
__outputs = MODEL._decoder(MODEL._encoder(__inputs))

# functional model with explicit inputs and outputs, exported further down
TOKUN = tf.keras.models.Model(inputs=__inputs, outputs=__outputs)

In [12]:
# print the summary of the loaded model, then of the functional model built on its layers
for __model in (MODEL, TOKUN):
    __model.summary()

Model: "auto_encoder"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 encoder (Encoder)           (None, 512, 256)          1377792   
                                                                 
 decoder (Decoder)           (None, 32768, 256)        1382656   
                                                                 
Total params: 2760448 (10.53 MB)
Trainable params: 2760448 (10.53 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 32768)]           0         
                                                                 
 encoder (Encoder)           (None, 512, 256)          1377792   
                                                                 

## Check The Model And Tokenizer

In [13]:
# SAMPLE ######################################################################

# Korean text mixed with digits, Latin letters and punctuation, used as input for the round-trip check
__s = """위키백과, 우리 모두의 백과사전.\nt-분포 확률적 임베딩(t-SNE)은 데이터의 차원 축소에 사용되는 기계 학습 알고리즘 중 하나로, 2002년 샘 로이스Sam Rowise와 제프리 힌튼에 의해 개발되었다.[1] t-SNE는 비선형 차원 축소 기법으로, 고차원 데이터를 특히 2, 3차원 등으로 줄여 가시화하는데에 유용하게 사용된다. 구체적으로 t-SNE는 비슷한 데이터는 근접한 2, 3차원의 지점으로, 다른 데이터는 멀리 떨어진 지점으로 맵핑한다."""

In [14]:
# UTF-32 TOKENIZATION #########################################################

# encode the single sample, padded to the fixed input length of the model
__encoded = TOKENIZER.batch_encode_plus(
    batch_text_or_text_pairs=[__s],
    add_special_tokens=False,
    padding='max_length',
    max_length=math.prod(N_TOKEN_DIM) * N_SEQUENCE_DIM)

# only the ids are fed to the model
__x = tf.convert_to_tensor(__encoded['input_ids'])

In [15]:
# TEST THE DERIVED MODEL ######################################################

# layer 0 is the input layer, the encoder and the decoder come next
__encoder, __decoder = TOKUN.layers[1], TOKUN.layers[2]

__e = __encoder(__x)
__p = __decoder(__e)

# turn the predictions back into text
__y = tokun.pipeline.postprocess(__p)

In [16]:
# display the text decoded from the predictions, to compare it with the sample
__y

'위키백과, 우리 모두의 백과사전.\nt-분포 확률적 임베딩(t-SNE)은 데이터의 차원 축소에 사용되는 기계 학습 알고리즘 중 하나로, 2002년 샘 로이스Sam Rowise와 제프리 힌튼에 의해 개발되었다.[1] t-SNE는 비선형 차원 축소 기법으로, 고차원 데이터를 특히 2, 3차원 등으로 줄여 가시화하는데에 유용하게 사용된다. 구체적으로 t-SNE는 비슷한 데이터는 근접한 2, 3차원의 지점으로, 다른 데이터는 멀리 떨어진 지점으로 맵핑한다.'

## Export

In [28]:
# INIT HF API #################################################################

# hub client, used by the upload and download cells that follow
API = hh.HfApi()

In [47]:
# TOKENIZER ###################################################################

# write the tokenizer files to a local folder
TOKENIZER.save_pretrained(save_directory='tokenizer/')

# push the folder to the hub; the resulting commit info is displayed
API.upload_folder(
    repo_id='apehex/tokun',
    path_in_repo='tokenizer/',
    folder_path='tokenizer/')

CommitInfo(commit_url='https://huggingface.co/apehex/tokun/commit/4f0afb308af37af4bd577608a7513ad867bc8bd1', commit_message='Upload folder using huggingface_hub', commit_description='', oid='4f0afb308af37af4bd577608a7513ad867bc8bd1', pr_url=None, pr_revision=None, pr_num=None)

In [48]:
# MODEL #######################################################################

# save the functional model locally, along with its configuration
hh.save_pretrained_keras(
    model=TOKUN,
    config=TOKUN.get_config(),
    save_directory='model/')

# push the folder to the hub under the variant path; the resulting commit info is displayed
API.upload_folder(
    repo_id='apehex/tokun',
    path_in_repo=PATH_EXPORT,
    folder_path='model/')



Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

fingerprint.pb:   0%|          | 0.00/58.0 [00:00<?, ?B/s]

saved_model.pb:   0%|          | 0.00/560k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/apehex/tokun/commit/924c0c30bf1dde47b648ddf2ca2b6aa2fba881d9', commit_message='Upload folder using huggingface_hub', commit_description='', oid='924c0c30bf1dde47b648ddf2ca2b6aa2fba881d9', pr_url=None, pr_revision=None, pr_num=None)

## Import And Check

In [60]:
# DOWNLOAD REPO ###############################################################

# fetch the files of the hub repository into a local folder; the local path is displayed
API.snapshot_download(
    local_dir='tokun/',
    repo_id='apehex/tokun')

Fetching 12 files:   0%|          | 0/12 [00:00<?, ?it/s]

tokenizer/vocab.json:   0%|          | 0.00/3.24k [00:00<?, ?B/s]

variants/4x16/README.md:   0%|          | 0.00/190 [00:00<?, ?B/s]

variants/4x16/config.json:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/190 [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.69k [00:00<?, ?B/s]

tokenizer/special_tokens_map.json:   0%|          | 0.00/153 [00:00<?, ?B/s]

tokenizer/tokenizer_config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

variants/4x16/keras_metadata.pb: 0.00B [00:00, ?B/s]

fingerprint.pb:   0%|          | 0.00/58.0 [00:00<?, ?B/s]

variables.data-00000-of-00001:   0%|          | 0.00/11.1M [00:00<?, ?B/s]

saved_model.pb:   0%|          | 0.00/560k [00:00<?, ?B/s]

variants/4x16/variables/variables.index:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

'/content/tokun'

In [63]:
# MODEL #######################################################################

# The local path is built from PATH_EXPORT, the path the model was uploaded
# under, instead of a hard-coded variant name: it follows N_TOKEN_DIM and
# resolves to 'tokun/variants/4x16/' with the current parameters.
__tokun = hh.from_pretrained_keras('tokun/{}/'.format(PATH_EXPORT))

In [64]:
# TOKENIZER ###################################################################

# Same arguments as the TOKENIZER that was exported, so that the check of the
# imported model runs on an identical tokenization.
# NOTE(review): this still builds a new tokenizer rather than reading the files
# downloaded in 'tokun/tokenizer/' -- confirm whether loading them is intended.
__tokenizer = tokun.huggingface.ByteTokenizer(vocab_size=256, split_special_tokens=True)

In [65]:
# PREDICT #####################################################################

# same sample text as for the check of the model before export
__s = """위키백과, 우리 모두의 백과사전.\nt-분포 확률적 임베딩(t-SNE)은 데이터의 차원 축소에 사용되는 기계 학습 알고리즘 중 하나로, 2002년 샘 로이스Sam Rowise와 제프리 힌튼에 의해 개발되었다.[1] t-SNE는 비선형 차원 축소 기법으로, 고차원 데이터를 특히 2, 3차원 등으로 줄여 가시화하는데에 유용하게 사용된다. 구체적으로 t-SNE는 비슷한 데이터는 근접한 2, 3차원의 지점으로, 다른 데이터는 멀리 떨어진 지점으로 맵핑한다."""

# encode the single sample, padded to the fixed input length of the model
__encoded = __tokenizer.batch_encode_plus(
    batch_text_or_text_pairs=[__s],
    add_special_tokens=False,
    padding='max_length',
    max_length=math.prod(N_TOKEN_DIM) * N_SEQUENCE_DIM)
__x = tf.convert_to_tensor(__encoded['input_ids'])

# run the imported model and turn its predictions back into text
__p = __tokun(__x)
__y = tokun.pipeline.postprocess(__p)

In [66]:
# Display the predictions already computed by the previous cell (`__p = __tokun(__x)`)
# instead of running a second forward pass on the same input.
__p

<tf.Tensor: shape=(1, 32768, 256), dtype=float32, numpy=
array([[[9.9999994e-01, 2.1866686e-11, 1.5651647e-10, ...,
         4.7577155e-13, 4.4455870e-13, 4.2751217e-14],
        [9.9999994e-01, 5.3872085e-10, 3.2973860e-12, ...,
         6.7400167e-14, 2.7105212e-11, 8.4624812e-16],
        [8.5352190e-15, 2.2504400e-20, 1.4804147e-11, ...,
         6.3620600e-20, 9.4874379e-20, 2.4211412e-26],
        ...,
        [9.9999994e-01, 4.0094700e-10, 5.2562658e-13, ...,
         4.2942572e-16, 6.3796191e-13, 2.5679170e-17],
        [9.9999744e-01, 6.4139338e-10, 2.0796067e-08, ...,
         4.0128212e-20, 7.6753092e-12, 2.0559233e-27],
        [9.9999768e-01, 1.4892085e-07, 4.1791142e-07, ...,
         1.1084704e-19, 9.6512708e-11, 1.1062994e-26]]], dtype=float32)>

In [67]:
# display the text decoded from the predictions of the imported model, to compare it with the sample
__y

'위키백과, 우리 모두의 백과사전.\nt-분포 확률적 임베딩(t-SNE)은 데이터의 차원 축소에 사용되는 기계 학습 알고리즘 중 하나로, 2002년 샘 로이스Sam Rowise와 제프리 힌튼에 의해 개발되었다.[1] t-SNE는 비선형 차원 축소 기법으로, 고차원 데이터를 특히 2, 3차원 등으로 줄여 가시화하는데에 유용하게 사용된다. 구체적으로 t-SNE는 비슷한 데이터는 근접한 2, 3차원의 지점으로, 다른 데이터는 멀리 떨어진 지점으로 맵핑한다.'