## Install Dependencies

In [None]:
# Pin the versions this notebook was last run with, so that a fresh runtime
# reproduces the same environment; %pip installs into the running kernel.
%pip install mlable==0.4.6 tokun==0.7.3

Collecting mlable
  Downloading mlable-0.4.6-py3-none-any.whl (17 kB)
Collecting tokun
  Downloading tokun-0.7.3-py3-none-any.whl (15 kB)
Installing collected packages: mlable, tokun
Successfully installed mlable-0.4.6 tokun-0.7.3


In [None]:
# Load the `tensorboard` IPython extension.
# NOTE(review): nothing in the visible cells uses it afterwards — confirm it is still needed.
%load_ext tensorboard

## Login To Hugging Face

In [None]:
# Interactive login: prompts for a Hugging Face access token and saves it locally.
# The export cells further down upload to the Hub, so the token needs write permission.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


## Import Dependencies

In [None]:
import itertools
import math
import os
import urllib.request

import huggingface_hub as hh
import keras
import tensorflow as tf
import transformers as ht

import mlable.metrics

import tokun.evaluation
import tokun.huggingface
import tokun.meta
import tokun.model
import tokun.pipeline

In [None]:
# record the TensorFlow version the notebook ran with
print(f"Tensorflow version {tf.__version__}")

Tensorflow version 2.15.0


## Setup the GPU / TPU

In [None]:
# DEVICES #####################################################################

# device placement logging is switched off
tf.debugging.set_log_device_placement(False)

# logical devices visible to this runtime, listed per kind
CPU, GPU, TPU = (tf.config.list_logical_devices(kind) for kind in ('CPU', 'GPU', 'TPU'))

if TPU:
    # connect to the TPU cluster and initialize it before building the strategy
    RESOLVER = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(RESOLVER)
    tf.tpu.experimental.initialize_tpu_system(RESOLVER)
    DISTRIBUTION_STRATEGY = tf.distribute.TPUStrategy(RESOLVER)
else:
    # mirror across the GPUs when there are any, otherwise across the CPUs
    DISTRIBUTION_STRATEGY = tf.distribute.MirroredStrategy(GPU or CPU)

print(DISTRIBUTION_STRATEGY)

<tensorflow.python.distribute.tpu_strategy.TPUStrategyV2 object at 0x78086be45c30>


## Defining The Metadata

In [None]:
# PARAMETERS ##################################################################

# axis index handed to tokun.meta.version when the model version is derived
N_SEQUENCE_AXIS = 1
# NOTE(review): not referenced in any of the visible cells — confirm it is still needed
N_FEATURE_AXIS = -1

N_TOKEN_DIM = [4, 16] # G, for each block
# multiplied by prod(N_TOKEN_DIM) to get the fixed length of the model input
N_SEQUENCE_DIM = 512

In [None]:
# DERIVED #####################################################################

# running product of N_TOKEN_DIM: [4, 16] -> [4, 64]
N_TOKEN_SIZES = list(itertools.accumulate(N_TOKEN_DIM, lambda x, y: x * y)) # in bytes

# VERSION is unpacked into the URL template below; its first item names the export folder
VERSION = tokun.meta.version(units=N_TOKEN_DIM, axis=N_SEQUENCE_AXIS)
# passed after VERSION when formatting the URL — presumably the label of the weights; verify
LABEL = '6.1'

# remote location of the pretrained Keras archive
URL_IMPORT = 'https://github.com/apehex/tokun/raw/main/models/{}/{}/{}.keras'.format(*VERSION, LABEL)

# local file the archive is downloaded to, and later loaded from
PATH_IMPORT = 'model.keras'
# folder inside the Hugging Face repository that receives the exported model
PATH_EXPORT = 'variants/{}'.format(VERSION[0])

## Download The Model

In [None]:
# DOWNLOAD ###################################################################

# fetch the pretrained archive from URL_IMPORT and write it to PATH_IMPORT;
# the (filename, headers) tuple returned by urlretrieve is displayed as the cell output
urllib.request.urlretrieve(URL_IMPORT, PATH_IMPORT)

('model.keras', <http.client.HTTPMessage at 0x78086bddfd90>)

## Init The Tokenizer And Model

In [None]:
# TOKENIZER ###################################################################

# tokenizer from the tokun package; it encodes the sample text below and is exported to the Hub
TOKENIZER = tokun.huggingface.ByteTokenizer(vocab_size=256, split_special_tokens=True)

In [None]:
# MODEL #######################################################################

with DISTRIBUTION_STRATEGY.scope():
    # accuracy over groups of 1, 4 and N_TOKEN_SIZES[-1] predictions
    byte_accuracy = mlable.metrics.CategoricalGroupAccuracy(
        group=1,
        name='byte_accuracy')
    character_accuracy = mlable.metrics.CategoricalGroupAccuracy(
        group=4,
        name='character_accuracy')
    token_accuracy = mlable.metrics.CategoricalGroupAccuracy(
        group=N_TOKEN_SIZES[-1],
        name='token_accuracy')
    # restore the saved model without its compile state
    MODEL = tf.keras.models.load_model(PATH_IMPORT, compile=False)
    # compile explicitly, inside the strategy scope
    MODEL.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
        loss=tf.keras.losses.CategoricalCrossentropy(
            from_logits=False,
            label_smoothing=0.,
            axis=-1,
            reduction=tf.keras.losses.Reduction.SUM_OVER_BATCH_SIZE,
            name='loss',
        ),
        metrics=[byte_accuracy, character_accuracy, token_accuracy],
    )


In [None]:
# SPECIFY IO ##################################################################

# fixed-length integer input: prod(N_TOKEN_DIM) values for each of the N_SEQUENCE_DIM positions
__inputs = tf.keras.layers.Input(
    shape=(math.prod(N_TOKEN_DIM) * N_SEQUENCE_DIM,),
    dtype=tf.int32)

# chain the encoder and decoder of the loaded model on that input
__outputs = MODEL._decoder(MODEL._encoder(__inputs))

# functional model with explicit inputs and outputs; this is the model exported below
TOKUN = tf.keras.models.Model(inputs=__inputs, outputs=__outputs)

In [None]:
# print the layer tables of the loaded autoencoder and of the re-wired functional model
MODEL.summary()
TOKUN.summary()

Model: "auto_encoder"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 encoder (Encoder)           (None, 512, 256)          1377792   
                                                                 
 decoder (Decoder)           (None, 32768, 256)        1382656   
                                                                 
Total params: 2760448 (10.53 MB)
Trainable params: 2760448 (10.53 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 32768)]           0         
                                                                 
 encoder (Encoder)           (None, 512, 256)          1377792   
                                                                 

## Check The Model And Tokenizer

In [None]:
# SAMPLE ######################################################################

# Korean sample text that is encoded, passed through the model and decoded in the cells below
__s = """위키백과, 우리 모두의 백과사전.\nt-분포 확률적 임베딩(t-SNE)은 데이터의 차원 축소에 사용되는 기계 학습 알고리즘 중 하나로, 2002년 샘 로이스Sam Rowise와 제프리 힌튼에 의해 개발되었다.[1] t-SNE는 비선형 차원 축소 기법으로, 고차원 데이터를 특히 2, 3차원 등으로 줄여 가시화하는데에 유용하게 사용된다. 구체적으로 t-SNE는 비슷한 데이터는 근접한 2, 3차원의 지점으로, 다른 데이터는 멀리 떨어진 지점으로 맵핑한다."""

In [None]:
# UTF-32 TOKENIZATION #########################################################

# encode the sample as a batch of one, padded to the model input length,
# and keep only the token ids as a tensor
__x = tf.convert_to_tensor(
    TOKENIZER.batch_encode_plus(
        batch_text_or_text_pairs=[__s],
        padding='max_length',
        max_length=math.prod(N_TOKEN_DIM) * N_SEQUENCE_DIM,
        add_special_tokens=False,
    )['input_ids'])

In [None]:
# TEST THE DERIVED MODEL ######################################################

# layers[0] is the input layer, so the encoder and decoder sit at indices 1 and 2
__e = TOKUN.layers[1](__x) # encoder
__p = TOKUN.layers[2](__e) # decoder
# turn the decoder output back into text
__y = tokun.pipeline.postprocess(__p)

In [None]:
# display the decoded text, to be compared by eye with the sample __s
__y

'위키백과, 우리 모두의 백과사전.\nt-분포 확률적 임베딩(t-SNE)은 데이터의 차원 축소에 사용되는 기계 학습 알고리즘 중 하나로, 2002년 샘 로이스Sam Rowise와 제프리 힌튼에 의해 개발되었다.[1] t-SNE는 비선형 차원 축소 기법으로, 고차원 데이터를 특히 2, 3차원 등으로 줄여 가시화하는데에 유용하게 사용된다. 구체적으로 t-SNE는 비슷한 데이터는 근접한 2, 3차원의 지점으로, 다른 데이터는 멀리 떨어진 지점으로 맵핑한다.'

## Export

In [None]:
# INIT HF API #################################################################

# Hub client used by the upload and download cells below
API = hh.HfApi()

In [None]:
# TOKENIZER ###################################################################

# write the tokenizer files to a local folder, then push that folder to the Hub repository
TOKENIZER.save_pretrained(save_directory='tokenizer/')
API.upload_folder(repo_id='apehex/tokun', folder_path='tokenizer/', path_in_repo='tokenizer/')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


CommitInfo(commit_url='https://huggingface.co/apehex/tokun/commit/fb9a7a7281a1701a762d595a92b4049523053d54', commit_message='Upload folder using huggingface_hub', commit_description='', oid='fb9a7a7281a1701a762d595a92b4049523053d54', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# MODEL #######################################################################

# save the functional model and its config locally, then push the folder under PATH_EXPORT
hh.save_pretrained_keras(model=TOKUN, save_directory='model/', config=TOKUN.get_config())
API.upload_folder(repo_id='apehex/tokun', folder_path='model/', path_in_repo=PATH_EXPORT)



saved_model.pb:   0%|          | 0.00/559k [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

fingerprint.pb:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

variables.data-00000-of-00001:   0%|          | 0.00/11.1M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/apehex/tokun/commit/9b698d6c0a5375bbb2a05dba7beaea4874faf9aa', commit_message='Upload folder using huggingface_hub', commit_description='', oid='9b698d6c0a5375bbb2a05dba7beaea4874faf9aa', pr_url=None, pr_revision=None, pr_num=None)

## Import And Check

In [None]:
# DOWNLOAD REPO ###############################################################

# fetch the files of the Hub repository into the local folder 'tokun/'
API.snapshot_download(repo_id='apehex/tokun', local_dir='tokun/')

Fetching 34 files:   0%|          | 0/34 [00:00<?, ?it/s]

LICENSE.md:   0%|          | 0.00/34.5k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/7.39k [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.96k [00:00<?, ?B/s]

variants/16x4/keras_metadata.pb: 0.00B [00:00, ?B/s]

tokenizer/vocab.json:   0%|          | 0.00/3.24k [00:00<?, ?B/s]

variants/16x4/config.json:   0%|          | 0.00/1.09k [00:00<?, ?B/s]

tokenizer/special_tokens_map.json:   0%|          | 0.00/153 [00:00<?, ?B/s]

tokenizer/tokenizer_config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

fingerprint.pb:   0%|          | 0.00/55.0 [00:00<?, ?B/s]

variables.data-00000-of-00001:   0%|          | 0.00/11.1M [00:00<?, ?B/s]

saved_model.pb:   0%|          | 0.00/565k [00:00<?, ?B/s]

variants/16x4/variables/variables.index:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

variants/4x16/config.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

variants/4x16/keras_metadata.pb: 0.00B [00:00, ?B/s]

variants/4x16/README.md:   0%|          | 0.00/190 [00:00<?, ?B/s]

fingerprint.pb:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

fingerprint.pb:   0%|          | 0.00/55.0 [00:00<?, ?B/s]

variants/4x16/variables/variables.index:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

variants/4x4/config.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

variants/4x4/README.md:   0%|          | 0.00/190 [00:00<?, ?B/s]

variables.data-00000-of-00001:   0%|          | 0.00/11.1M [00:00<?, ?B/s]

saved_model.pb:   0%|          | 0.00/559k [00:00<?, ?B/s]

variables.data-00000-of-00001:   0%|          | 0.00/4.76M [00:00<?, ?B/s]

variants/4x4/keras_metadata.pb: 0.00B [00:00, ?B/s]

variants/4x4/variables/variables.index:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

saved_model.pb:   0%|          | 0.00/557k [00:00<?, ?B/s]

variants/16x4/README.md:   0%|          | 0.00/190 [00:00<?, ?B/s]

variants/4x4x4/config.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

variants/4x4x4/README.md:   0%|          | 0.00/190 [00:00<?, ?B/s]

variants/4x4x4/keras_metadata.pb: 0.00B [00:00, ?B/s]

variants/4x4x4/variables/variables.index:   0%|          | 0.00/1.64k [00:00<?, ?B/s]

saved_model.pb:   0%|          | 0.00/770k [00:00<?, ?B/s]

fingerprint.pb:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

variables.data-00000-of-00001:   0%|          | 0.00/6.87M [00:00<?, ?B/s]

'/content/tokun'

In [None]:
# MODEL #######################################################################

# reload the exported model from the local snapshot, at 'tokun/' + PATH_EXPORT
__tokun = hh.from_pretrained_keras(os.path.join('tokun/', PATH_EXPORT))

In [None]:
# TOKENIZER ###################################################################

# Build the tokenizer with the same settings as the one exported earlier in this
# notebook, so that the check encodes text exactly like the uploaded tokenizer.
# NOTE(review): loading the downloaded files under 'tokun/tokenizer/' would be a
# stricter check — confirm whether ByteTokenizer supports `from_pretrained`.
__tokenizer = tokun.huggingface.ByteTokenizer(vocab_size=256, split_special_tokens=True)

In [None]:
# PREDICT #####################################################################

# same sample text as in the first check
__s = """위키백과, 우리 모두의 백과사전.\nt-분포 확률적 임베딩(t-SNE)은 데이터의 차원 축소에 사용되는 기계 학습 알고리즘 중 하나로, 2002년 샘 로이스Sam Rowise와 제프리 힌튼에 의해 개발되었다.[1] t-SNE는 비선형 차원 축소 기법으로, 고차원 데이터를 특히 2, 3차원 등으로 줄여 가시화하는데에 유용하게 사용된다. 구체적으로 t-SNE는 비슷한 데이터는 근접한 2, 3차원의 지점으로, 다른 데이터는 멀리 떨어진 지점으로 맵핑한다."""

# encode as a batch of one, padded to the model input length, keeping only the ids
__x = tf.convert_to_tensor(
    __tokenizer.batch_encode_plus(
        batch_text_or_text_pairs=[__s],
        padding='max_length',
        max_length=math.prod(N_TOKEN_DIM) * N_SEQUENCE_DIM,
        add_special_tokens=False,
    )['input_ids'])

# run the reloaded model end to end, then turn its output back into text
__p = __tokun(__x)
__y = tokun.pipeline.postprocess(__p)

In [None]:
# display the text decoded by the reloaded model, to be compared by eye with the sample __s
__y

'위키백과, 우리 모두의 백과사전.\nt-분포 확률적 임베딩(t-SNE)은 데이터의 차원 축소에 사용되는 기계 학습 알고리즘 중 하나로, 2002년 샘 로이스Sam Rowise와 제프리 힌튼에 의해 개발되었다.[1] t-SNE는 비선형 차원 축소 기법으로, 고차원 데이터를 특히 2, 3차원 등으로 줄여 가시화하는데에 유용하게 사용된다. 구체적으로 t-SNE는 비슷한 데이터는 근접한 2, 3차원의 지점으로, 다른 데이터는 멀리 떨어진 지점으로 맵핑한다.'