## Enabling and testing the TPU

In [None]:
import datetime
import functools
import math
import os
import random

import tensorflow as tf

print("Tensorflow version " + tf.__version__)

Tensorflow version 2.12.0


In [None]:
# Detect the TPU attached to this runtime and build the distribution strategy
# used by the rest of the notebook.
try:
  TPU = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
except ValueError as error:
  # RuntimeError (an Exception subclass) instead of BaseException, so generic
  # `except Exception` handlers and tooling can see the failure; the original
  # ValueError is chained to keep the root cause in the traceback.
  raise RuntimeError('ERROR: Not connected to a TPU runtime!') from error

tf.config.experimental_connect_to_cluster(TPU)
tf.tpu.experimental.initialize_tpu_system(TPU)
tpu_strategy = tf.distribute.TPUStrategy(TPU)

print('Running on TPU ', TPU.cluster_spec().as_dict()['worker'])

Running on TPU  ['10.90.201.34:8470']


## Defining The Metadata

In [None]:
# META ########################################################################

# model dimensions
N_VOCABULARY_DIM = 37 # placeholder: reassigned to len(VOCABULARY) once the corpus is loaded
N_CONTEXT_DIM = 256 # number of characters in each input window
N_EMBEDDING_DIM = 512 # width of the character embeddings
N_HIDDEN_DIM = 4 * N_EMBEDDING_DIM # = 4 * N_ATTENTION_DIM * N_ATTENTION_HEAD
N_ATTENTION_HEAD = 8 # number of attention heads per block
N_ATTENTION_DIM = N_EMBEDDING_DIM // N_ATTENTION_HEAD # per-head key / value width
N_ATTENTION_BLOCK = 2 # number of stacked decoder blocks

# learning rate schedule, in epochs: linear ramp-up, plateau, then decay
N_EPOCHS = 16
N_EPOCHS_RAMPUP = 4
N_EPOCHS_SUSTAIN = 0

# batch size given to MODEL.fit
N_BATCH = 128

# base length for text generation (the sampling cell uses 8 * N_SAMPLE)
N_SAMPLE = 256

# learning rate bounds: the maximum is scaled by the number of TPU replicas
R_MIN = 0.00001
R_MAX = 0.0001 * tpu_strategy.num_replicas_in_sync
R_EXP = .8 # base of the exponential decay applied after the plateau

# free-form label for this experiment
VERSION = 'sat-keras-125k'

## Loading The Data

In [None]:
# DOWNLOAD ####################################################################

# Fetch the three plays used as the training corpus into `sample_data/`.
# `--continue` resumes a partial download; when the file is already complete
# the server answers "416 Range Not Satisfiable" and wget leaves it untouched.
# NOTE(review): assumes the `sample_data/` directory already exists — TODO confirm outside Colab.
!wget --show-progress --continue -O sample_data/hamlet.md https://raw.githubusercontent.com/apehex/mlable/main/.data/shakespeare/hamlet.md
!wget --show-progress --continue -O sample_data/othello.md https://raw.githubusercontent.com/apehex/mlable/main/.data/shakespeare/othello.md
!wget --show-progress --continue -O sample_data/macbeth.md https://raw.githubusercontent.com/apehex/mlable/main/.data/shakespeare/macbeth.md

--2024-02-02 12:02:53--  https://raw.githubusercontent.com/apehex/mlable/main/.data/shakespeare/hamlet.md
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 416 Range Not Satisfiable

    The file is already fully retrieved; nothing to do.

--2024-02-02 12:02:53--  https://raw.githubusercontent.com/apehex/mlable/main/.data/shakespeare/othello.md
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 416 Range Not Satisfiable

    The file is already fully retrieved; nothing to do.

--2024-02-02 12:02:53--  https://raw.githubusercontent.com/apehex/mlable/main/.data/shake

In [None]:
# LOAD ########################################################################

# Concatenate the plays into a single corpus string. The order is significant:
# the train / dev / test split below slices TEXT by position.
# Each file is opened in a `with` block so its handle is closed after reading
# (the handles were previously left open).
TEXT = ''
for __path in ('sample_data/othello.md', 'sample_data/hamlet.md', 'sample_data/macbeth.md'):
    with tf.io.gfile.GFile(__path, 'r') as __file:
        TEXT += __file.read()

## Datasets

In [None]:
# ENCODING / DECODING #########################################################

# CONSTANTS

# padding character: the NUL code point sorts before every other character,
# so it ends up first in the sorted vocabulary
BLANK = chr(0)

# LIST

def capture(text: str, blank: str=BLANK) -> list:
    """List the vocabulary of a text.

    Args:
        text: the corpus to scan.
        blank: padding symbol, always included in the vocabulary.

    Returns:
        The sorted list of the distinct characters of `text`, plus `blank`.
    """
    # `sorted` already returns a list: no intermediate copy needed
    return sorted(set(text) | {blank})

# MAPPINGS

def mappings(vocabulary: list) -> dict:
    """Build the character <=> index translators for a vocabulary.

    Args:
        vocabulary: ordered list of symbols; the entry at index 0 is used as
            the fallback ("blank") symbol.

    Returns:
        A dict with two callables: 'encode' maps a character to its index
        (0 when unknown) and 'decode' maps an index to its character (the
        blank symbol when unknown).
    """
    index_to_char = dict(enumerate(vocabulary))
    char_to_index = {char: index for index, char in enumerate(vocabulary)}
    # fallbacks for anything outside the vocabulary
    fallback_char = index_to_char[0]
    fallback_index = 0

    def to_index(c: str) -> int:
        """Character => index."""
        return char_to_index.get(c, fallback_index)

    def to_char(i: int) -> str:
        """Index => character."""
        return index_to_char.get(i, fallback_char)

    return {'encode': to_index, 'decode': to_char}

# ENCODING

def encode(text: str, stoi: callable) -> list:
    """Translate each character of `text` into its index using `stoi`."""
    return list(map(stoi, text))

def decode(sequence: list, itos: callable) -> str:
    """Translate a sequence of indices back into text.

    Args:
        sequence: iterable of token indices.
        itos: callable mapping one index to its character.

    Returns:
        The decoded characters joined into a single string.
    """
    return ''.join(itos(__i) for __i in sequence)

In [None]:
# TOKENIZE ####################################################################

# TEXT TO LIST

def tokenize(text: str, length: int, blank=BLANK):
    """Yield, for each character of `text`, the window of characters preceding it.

    The window starts as `length` repetitions of `blank`; after each yield it
    drops its oldest character and takes in the current character of `text`.
    """
    window = blank * length
    for char in text:
        yield window
        # slide by one: forget the oldest character, append the newest
        window = window[1:] + char

# TEXT TO VECTOR

def dataset(text: list, stoi: callable, depth: int, context: int) -> tuple:
    """Turn a text into (inputs, targets) tensors for next-character prediction.

    Args:
        text: the text to process, iterated character by character.
        stoi: callable mapping a character to its index.
        depth: size of the one-hot encoding of the targets.
        context: length of the window of preceding characters fed as input.

    Returns:
        A pair (x, y): x holds the int32 encoded context windows, one per
        character of `text`; y holds the float32 one-hot encoding of the
        character following each window.
    """
    # one encoded window per character of the text
    windows = [encode(text=window, stoi=stoi) for window in tokenize(text=text, length=context)]
    # the character to predict after each window
    targets = encode(text=text, stoi=stoi)
    features = tf.convert_to_tensor(value=windows, dtype=tf.dtypes.int32)
    labels = tf.one_hot(indices=targets, depth=depth, dtype=tf.dtypes.float32)
    return tf.constant(features), tf.constant(labels)

In [None]:
# MAPPINGS ####################################################################

# every distinct character of the corpus, plus the blank padding character
VOCABULARY = capture(text=TEXT)
# the actual vocabulary size replaces the value set in the metadata cell
N_VOCABULARY_DIM = len(VOCABULARY)

MAPPINGS = mappings(vocabulary=VOCABULARY)

# shorthands: character => index and index => character
_stoi, _itos = MAPPINGS['encode'], MAPPINGS['decode']

In [None]:
# SPLIT #######################################################################

# positional 80% / 10% / 10% split of the corpus
N1, N2 = int(0.8 * len(TEXT)), int(0.9 * len(TEXT))

X_TRAIN, Y_TRAIN = dataset(
    text=TEXT[:N1],
    stoi=_stoi,
    depth=N_VOCABULARY_DIM,
    context=N_CONTEXT_DIM)
X_DEV, Y_DEV = dataset(
    text=TEXT[N1:N2],
    stoi=_stoi,
    depth=N_VOCABULARY_DIM,
    context=N_CONTEXT_DIM)
X_TEST, Y_TEST = dataset(
    text=TEXT[N2:],
    stoi=_stoi,
    depth=N_VOCABULARY_DIM,
    context=N_CONTEXT_DIM)

## Blocks

In [None]:
# FEED FORWARD BLOCK ##########################################################

class ResidualFeedForwardBlock(tf.keras.layers.Layer):
    """Feed-forward sublayer wrapped in a residual connection.

    The input is layer-normalized, expanded to `hidden_dim` features by a
    ReLU dense layer, projected back to the input width by a linear dense
    layer, and finally added to the untouched input.
    """

    def __init__(
        self,
        hidden_dim: int,
        normalization_epsilon: float=0.001,
        **kwargs
    ):
        """Create the sublayers that do not depend on the input shape.

        Args:
            hidden_dim: number of units of the hidden dense layer.
            normalization_epsilon: epsilon of the layer normalization.
            **kwargs: passed to the base layer and to every sublayer.
        """
        super().__init__(**kwargs)
        self._hidden_dim = hidden_dim
        self._normalization = tf.keras.layers.LayerNormalization(
            axis=-1,
            epsilon=normalization_epsilon,
            center=True,
            scale=True,
            beta_initializer='zeros',
            gamma_initializer='glorot_uniform',
            beta_regularizer=None,
            gamma_regularizer=None,
            beta_constraint=None,
            gamma_constraint=None,
            **kwargs)
        self._hidden = tf.keras.layers.Dense(
            units=self._hidden_dim,
            activation='relu',
            use_bias=True,
            kernel_initializer='glorot_uniform',
            bias_initializer='zeros',
            kernel_regularizer=None,
            bias_regularizer=None,
            activity_regularizer=None,
            kernel_constraint=None,
            bias_constraint=None,
            **kwargs)
        # the output width is only known at build time
        self._projection = None

    def build(self, input_shape: tuple, **kwargs) -> None:
        """Create the projection layer and build every sublayer."""
        # the projection maps the hidden features back to the input width
        self._projection = tf.keras.layers.Dense(
            units=input_shape[-1],
            activation=None,
            use_bias=True,
            kernel_initializer='glorot_uniform',
            bias_initializer='zeros',
            kernel_regularizer=None,
            bias_regularizer=None,
            activity_regularizer=None,
            kernel_constraint=None,
            bias_constraint=None,
            **kwargs)
        # shape seen by the projection: the input shape with H as last axis
        hidden_shape = list(input_shape)[:-1] + [self._hidden_dim]
        self._normalization.build(input_shape=input_shape) # scale and offset over the last axis
        self._hidden.build(input_shape=input_shape) # (C, H)
        self._projection.build(input_shape=hidden_shape) # (H, C)
        # notify the model
        self.built = True

    def call(self, inputs: tf.Tensor, **kwargs):
        """Return `inputs` plus the feed-forward update computed from them."""
        # normalize the features
        outputs = self._normalization(inputs, **kwargs) # (B, T, C)
        # expand inside the hidden layer
        outputs = self._hidden(outputs, **kwargs) # (B, T, C) * (C, H) = (B, T, H)
        # project back to the input width
        outputs = self._projection(outputs, **kwargs) # (B, T, H) * (H, C) = (B, T, C)
        # residual connection
        return inputs + outputs # (B, T, C)

# ATTENTION BLOCK #############################################################

class ResidualSelfAttentionBlock(tf.keras.layers.Layer):
    """Causal multi-head self-attention wrapped in a residual connection.

    The input is layer-normalized, attended to itself with a causal mask,
    and the result is added to the untouched input.
    """

    def __init__(
        self,
        attention_head_dim: int,
        attention_head_count: int=1,
        normalization_epsilon: float=0.001,
        dropout: float=0.0,
        **kwargs
    ):
        """Create the normalization and attention sublayers.

        Args:
            attention_head_dim: size of the keys and values of each head.
            attention_head_count: number of attention heads.
            normalization_epsilon: epsilon of the layer normalization.
            dropout: dropout rate given to the attention layer.
            **kwargs: passed to the base layer and to every sublayer.
        """
        super().__init__(**kwargs)
        self._normalization = tf.keras.layers.LayerNormalization(
            axis=-1,
            epsilon=normalization_epsilon,
            center=True,
            scale=True,
            beta_initializer='zeros',
            gamma_initializer='glorot_uniform',
            beta_regularizer=None,
            gamma_regularizer=None,
            beta_constraint=None,
            gamma_constraint=None,
            **kwargs)
        self._attention = tf.keras.layers.MultiHeadAttention(
            num_heads=attention_head_count,
            key_dim=attention_head_dim,
            value_dim=attention_head_dim,
            dropout=dropout,
            use_bias=True,
            output_shape=None,
            attention_axes=None,
            kernel_initializer='glorot_uniform',
            bias_initializer='zeros',
            kernel_regularizer=None,
            bias_regularizer=None,
            activity_regularizer=None,
            kernel_constraint=None,
            bias_constraint=None,
            **kwargs)

    def build(self, input_shape: tuple, **kwargs) -> None:
        """Build both sublayers on the input shape."""
        self._normalization.build(input_shape=input_shape)
        self._attention.build(input_shape=input_shape)
        # notify the model
        self.built = True

    def call(self, inputs: tf.Tensor, training: bool=True, **kwargs):
        """Return `inputs` plus the self-attention update computed from them."""
        # normalize the features
        normalized = self._normalization(inputs, **kwargs) # (B, T, C)
        # self-attention: keys, queries and values all come from the same tensor
        attended = self._attention(
            key=normalized,
            query=normalized,
            value=normalized,
            return_attention_scores=False,
            training=training,
            use_causal_mask=True,
            **kwargs) # (B, T, H_d * H_c) = (B, T, C)
        # residual connection
        return inputs + attended # (B, T, C)

# META BLOCK ##################################################################

class ResidualSelfAttentionDecoderBlock(tf.keras.layers.Layer):
    """Decoder block: residual self-attention followed by residual feed-forward."""

    def __init__(
        self,
        hidden_dim: int,
        attention_head_dim: int,
        attention_head_count: int=1,
        normalization_epsilon: float=0.001,
        dropout: float=0.0,
        **kwargs
    ):
        """Create the two residual sublayers.

        Args:
            hidden_dim: hidden width of the feed-forward sublayer.
            attention_head_dim: size of the keys and values of each head.
            attention_head_count: number of attention heads.
            normalization_epsilon: epsilon shared by both layer normalizations.
            dropout: dropout rate given to the attention sublayer.
            **kwargs: passed to the base layer only.
        """
        super().__init__(**kwargs)
        self._feedforward = ResidualFeedForwardBlock(
            hidden_dim=hidden_dim,
            normalization_epsilon=normalization_epsilon)
        self._attention = ResidualSelfAttentionBlock(
            attention_head_dim=attention_head_dim,
            attention_head_count=attention_head_count,
            normalization_epsilon=normalization_epsilon,
            dropout=dropout)

    def build(self, input_shape: tuple, **kwargs) -> None:
        """Build both sublayers; each one preserves the input shape."""
        self._feedforward.build(input_shape=input_shape)
        self._attention.build(input_shape=input_shape)
        # notify the model
        self.built = True

    def call(self, inputs: tf.Tensor, training: bool=True, **kwargs):
        """Apply the attention sublayer, then the feed-forward sublayer."""
        # residual self-attention
        attended = self._attention(inputs, training=training, **kwargs) # (B, T, C)
        # residual feed-forward
        return self._feedforward(attended, **kwargs) # (B, T, C)

## Model

In [None]:
# MODEL #######################################################################

def create_model(
    n_context_dim: int=N_CONTEXT_DIM,
    n_vocabulary_dim: int=N_VOCABULARY_DIM,
    n_embedding_dim: int=N_EMBEDDING_DIM,
    n_hidden_dim: int=N_HIDDEN_DIM,
    n_attention_block: int=N_ATTENTION_BLOCK,
    n_attention_head: int=N_ATTENTION_HEAD,
    n_attention_dim: int=N_ATTENTION_DIM,
    lr_min: float=R_MIN
) -> tf.keras.Model:
    """Assemble and compile the next-character prediction model.

    The model stacks an embedding, `n_attention_block` decoder blocks, a
    flattening of the whole context and a softmax head over the vocabulary.

    Args:
        n_context_dim: number of tokens in each input window.
        n_vocabulary_dim: number of distinct tokens.
        n_embedding_dim: width of the token embeddings.
        n_hidden_dim: hidden width of the feed-forward sublayers.
        n_attention_block: number of decoder blocks.
        n_attention_head: number of attention heads per block.
        n_attention_dim: size of the keys and values of each head.
        lr_min: learning rate given to the Adam optimizer.

    Returns:
        The compiled `tf.keras.Sequential` model.
    """
    network = tf.keras.Sequential()
    # embedding
    stack = [
        tf.keras.layers.Embedding(
            input_dim=n_vocabulary_dim,
            output_dim=n_embedding_dim,
            embeddings_initializer='he_normal',
            name='embedding')]
    # blocks
    for index in range(n_attention_block):
        stack.append(
            ResidualSelfAttentionDecoderBlock(
                hidden_dim=n_hidden_dim,
                attention_head_dim=n_attention_dim,
                attention_head_count=n_attention_head,
                normalization_epsilon=0.001,
                dropout=0.0,
                name='decoder-block-' + str(index)))
    # head: flatten the context, then score every token of the vocabulary
    stack.append(
        tf.keras.layers.Reshape(
            target_shape=(n_context_dim * n_embedding_dim,),
            input_shape=(n_context_dim, n_embedding_dim)))
    stack.append(
        tf.keras.layers.Dense(
            units=n_vocabulary_dim,
            activation=None,
            use_bias=True,
            kernel_initializer='glorot_uniform',
            bias_initializer='zeros',
            name='head'))
    stack.append(tf.keras.layers.Softmax(axis=-1, name='softmax'))
    for layer in stack:
        network.add(layer)
    # compile: the model outputs probabilities, hence from_logits=False
    optimizer = tf.keras.optimizers.Adam(learning_rate=lr_min)
    loss = tf.keras.losses.CategoricalCrossentropy(
        from_logits=False,
        label_smoothing=0.,
        axis=-1,
        reduction=tf.keras.losses.Reduction.SUM_OVER_BATCH_SIZE,
        name='loss')
    network.compile(optimizer=optimizer, loss=loss)
    return network

In [None]:
# Creating the model inside the TPUStrategy scope means it will be trained on the TPU.
# NOTE(review): create_model() gives the Sequential model no input shape, so
# summary() may only work once the model has been built (e.g. by an earlier
# fit) — TODO confirm on a fresh kernel.
with tpu_strategy.scope():
  MODEL = create_model()
MODEL.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 512)         35840     
                                                                 
 decoder-block-0 (ResidualSe  (None, None, 512)        3152384   
 lfAttentionDecoderBlock)                                        
                                                                 
 decoder-block-1 (ResidualSe  (None, None, 512)        3152384   
 lfAttentionDecoderBlock)                                        
                                                                 
 reshape_2 (Reshape)         (None, 131072)            0         
                                                                 
 head (Dense)                (None, 70)                9175110   
                                                                 
 softmax (Softmax)           (None, 70)               

## Train

In [None]:
# LEARNING RATE ###############################################################

def lrfn(epoch: int, lr_min: float, lr_max: float, lr_exp: float, rampup: int, sustain: int):
  """Learning rate for a given epoch: linear ramp-up, plateau, exponential decay.

  Args:
    epoch: index of the epoch, starting at 0.
    lr_min: rate at epoch 0, and floor of the decay.
    lr_max: rate held during the plateau.
    lr_exp: base of the exponential decay.
    rampup: number of epochs of linear increase from lr_min to lr_max.
    sustain: number of epochs spent at lr_max after the ramp-up.

  Returns:
    The learning rate to use during `epoch`.
  """
  # linear warm-up
  if epoch < rampup:
    return lr_min + (epoch * (lr_max - lr_min) / rampup)
  # plateau
  if epoch < rampup + sustain:
    return lr_max
  # exponential decay toward lr_min
  return lr_min + (lr_max - lr_min) * lr_exp ** (epoch - rampup - sustain)

# Keras callback applying `lrfn` with the schedule constants of the metadata
# cell; `verbose=True` logs the rate chosen for each epoch.
lr_callback = tf.keras.callbacks.LearningRateScheduler(lambda epoch: lrfn(epoch, lr_min=R_MIN, lr_max=R_MAX, lr_exp=R_EXP, rampup=N_EPOCHS_RAMPUP, sustain=N_EPOCHS_SUSTAIN), verbose=True)

In [None]:
# TRAIN #######################################################################

# Fit on the training split; `lr_callback` sets the learning rate at the
# start of every epoch.
TRAINING_HISTORY = MODEL.fit(
    x=X_TRAIN,
    y=Y_TRAIN,
    batch_size=N_BATCH,
    epochs=N_EPOCHS,
    validation_split=None,
    validation_data=(X_DEV, Y_DEV), # evaluate on the held-out dev split
    validation_freq=[1, N_EPOCHS], # validate on the first and last epochs only
    verbose=2,
    callbacks=[lr_callback])


Epoch 1: LearningRateScheduler setting learning rate to 1e-05.
Epoch 1/16
2956/2956 - 118s - loss: 2.7572 - val_loss: 2.4572 - lr: 1.0000e-05 - 118s/epoch - 40ms/step

Epoch 2: LearningRateScheduler setting learning rate to 0.0002075.
Epoch 2/16
2956/2956 - 80s - loss: 2.2043 - lr: 2.0750e-04 - 80s/epoch - 27ms/step

Epoch 3: LearningRateScheduler setting learning rate to 0.00040500000000000003.
Epoch 3/16
2956/2956 - 80s - loss: 1.9937 - lr: 4.0500e-04 - 80s/epoch - 27ms/step

Epoch 4: LearningRateScheduler setting learning rate to 0.0006025000000000001.
Epoch 4/16
2956/2956 - 81s - loss: 1.8889 - lr: 6.0250e-04 - 81s/epoch - 28ms/step

Epoch 5: LearningRateScheduler setting learning rate to 0.0008.
Epoch 5/16
2956/2956 - 80s - loss: 1.8319 - lr: 8.0000e-04 - 80s/epoch - 27ms/step

Epoch 6: LearningRateScheduler setting learning rate to 0.0006420000000000001.
Epoch 6/16
2956/2956 - 80s - loss: 1.6138 - lr: 6.4200e-04 - 80s/epoch - 27ms/step

Epoch 7: LearningRateScheduler setting lea

In [None]:
def _next(model: tf.Module, x: tf.Tensor) -> int:
    """Return the index with the highest score in the model output for `x`.

    The model is run in inference mode, its output is squeezed and the
    argmax over the last axis is returned as a numpy value.
    """
    probabilities = model(x, training=False)
    best = tf.argmax(tf.squeeze(probabilities), axis=-1)
    return best.numpy()

def sample(model: tf.Module, context: int, depth: int, length: int, itos: callable) -> str:
    """Generate text by greedy decoding from a random first token.

    Args:
        model: the trained model, called on windows of `context` indices.
        context: number of indices in each input window.
        depth: size of the vocabulary; the first token is drawn in [0, depth).
        length: number of tokens to predict.
        itos: callable mapping one index to its character.

    Returns:
        The first character followed by the `length` predicted characters.
    """
    # randrange never returns `depth` itself, unlike int(uniform(0, depth))
    # whose upper bound may be reached through floating-point rounding
    __start = random.randrange(depth)
    __chars = [itos(__start)]
    # left-pad the window with the blank index (0)
    __ngram = (context - 1) * [0,] + [__start]
    for _ in range(length):
        __x = tf.convert_to_tensor(value=[__ngram], dtype=tf.dtypes.int32)
        # plain int: keeps the window free of numpy scalars
        __n = int(_next(model=model, x=__x))
        # emit every prediction, including the very first one (it used to be
        # pushed into the window without ever reaching the output)
        __chars.append(itos(__n))
        # slide the window by one token
        __ngram = __ngram[1:] + [__n]
    return ''.join(__chars)

# Sampler bound to the trained model: predicts 8 * N_SAMPLE tokens per call
# (2048 with N_SAMPLE = 256).
sample_2048 = functools.partial(sample, model=MODEL, context=N_CONTEXT_DIM, depth=N_VOCABULARY_DIM, length=8*N_SAMPLE, itos=_itos)

In [None]:
print(sample_2048())

D OThele thereethen thereen

> I lo be the sereent the the sere the mant the the reat mere there sere me the the llatuth, the the munt thimund outh huth urd outhin thour, the hour hare sher shithe serene sweld sine chis sore, to thes lfonst to the teresbyoe we

> ean  f with  loinghas   int ere lee seiteoeor ateyoo

>e  aail lis oy ing ty ain aibles end al wirlath

> in ham shis ghasd fait the oust on s mothd ane mfit,

> To houed in lave thet aid fith he isseren,

> chat bure ance poom med maghand oughers

> And heam all ane to ghes whis fon the linge;

> And line shill ar mare sin tor and mice for hionge

> 's ponsust in ta mont willous anot mofersto,

> Tha with th the wiak se tigh y alave so hepr

> The kith mead and ous calf in with ars wime.

> To he poo mare tore tour co fare he theere

> And hith y or ghe riow then. I me not me dire,

> 't on mencers's are lock of poo minteris.

**OTHELLO**

> I he bleand she therewis pow thain seme thenct

> whot my thaunstersime tore he buthe