# Transformers with Keras: Translation
https://keras.io/examples/nlp/neural_machine_translation_with_transformer/

In [None]:
from tensorflow.compat.v1 import keras
from tensorflow.compat.v1.keras import backend as K
import tensorflow.compat.v1 as tf
from tensorflow.compat.v1.keras import layers


import os
import numpy as np
import matplotlib.pyplot as plt

In [None]:
from tensorflow.core.protobuf import rewriter_config_pb2


# Session-level TensorFlow configuration, consumed later by tf.Session(config=config).
config = tf.ConfigProto()
# Fall back to another device when an op cannot be placed where requested.
config.allow_soft_placement = True
# Log which device every op ends up on.
config.log_device_placement = True
# Grow GPU memory usage on demand rather than reserving it all up front.
config.gpu_options.allow_growth = True

In [None]:
# Mixed-precision and Keras backend setup. The statements below mutate global
# TensorFlow/Keras state, so their order matters.
os.environ['TF_ENABLE_AUTO_MIXED_PRECISION'] = '1'


from tensorflow.keras.mixed_precision import experimental as mixed_precision

# Install 'mixed_float16' as the global Keras dtype policy.
policy = mixed_precision.Policy('mixed_float16')
mixed_precision.set_policy(policy)

# Report what the policy resolved to.
print('Compute dtype: %s' % policy.compute_dtype)
print('Variable dtype: %s' % policy.variable_dtype)

loss_scale = policy.loss_scale
print('Loss scale: %s' % loss_scale)


# Switch the Keras backend default float type to float16.
float_type = 'float16'
print("keras backend float size before: ", K.floatx())
# The default epsilon is 1e-7, which is too small for float16. Without raising
# it we get NaN predictions because of divide-by-zero problems.
K.set_epsilon(1e-4) 
K.set_floatx(float_type)

# Create a session from the GPU config built earlier and register it with Keras.
sess = tf.Session(config=config)
K.set_session(sess)

In [None]:

# G is the number of physical GPUs TensorFlow can see; the batch size is
# derived from it further down (BS = 16*G).
G = len(tf.config.experimental.list_physical_devices('GPU'))
print("Num GPUs Available: ", G)
print("tf version ", tf.__version__)
print("keras version ", tf.keras.__version__)
# Printed again here so it can be compared with the "before" value logged
# during backend setup.
print("keras backend float size: ", K.floatx())

# Data loading

In [None]:

# Load the source (x) and target (y) sequence arrays from disk.
x_train = np.load('data/x_train_SMALL.npy')
y_train = np.load('data/y_train_SMALL.npy')

# NOTE(review): the validation arrays are the very same objects as the training
# arrays, so val_loss (used by early stopping and checkpointing below) is
# measured on training data. Presumably a placeholder for the SMALL dataset;
# confirm before trusting validation metrics.
x_val = x_train
y_val = y_train


print(len(x_train), "Training sequences")
print(len(x_val), "Validation sequences")


# Sanity-check array shapes; the model inputs below are declared as
# (SEQ_LEN, FEATS) per example.
print(x_train.shape)
print(y_train.shape)
print(x_val.shape)
print(y_val.shape)


# Hyperparams

In [None]:
## Data shape
NUM_FEATS = 60  # size of the feature axis of every time step
# Alias: the embedding layer and the model-building cells refer to this
# constant as FEATS, which was previously never defined (NameError).
FEATS = NUM_FEATS
SEQ_LEN = 50
vocab_size = 300 # number of discrete tokens
maxlen = SEQ_LEN # Setting this to reflect sequence length


## Model dimensions
embed_dim = 32  # Embedding size for each token
num_heads = 4  # Number of attention heads
latent_dim = 256  # Hidden layer size in feed forward network inside transformer


## Training params

patience = 5
# 16 samples per GPU. With no visible GPU (G == 0) fall back to a single
# replica's worth instead of a batch size of 0, which tf.data rejects.
BS = 16 * max(G, 1)
epochs = 500

In [None]:
def format_dataset(inp, targ):
    """Arrange an (input, target) pair as ``(features, labels)`` for ``fit``.

    The feature dict is keyed by the names of the two model inputs:
    ``inp`` under "encoder_inputs" and ``targ`` under "decoder_inputs".
    ``targ`` is also returned unchanged as the label.

    NOTE(review): the decoder input and the label are the same tensor (no
    one-step shift between them) -- confirm this is intended.
    """
    features = {}
    features["encoder_inputs"] = inp
    features["decoder_inputs"] = targ
    return features, targ


def make_dataset(x, y, bs, shuffle=True):
    """Build a batched ``tf.data`` pipeline of ``(features, labels)`` pairs.

    Args:
        x: Encoder-side examples, sliced along the first axis.
        y: Decoder-side / target examples, aligned with ``x``.
        bs: Batch size.
        shuffle: Whether to reshuffle the examples on every pass (default
            True, matching the previous always-shuffle behaviour).

    Returns:
        A dataset yielding ``format_dataset``-shaped batches.
    """
    # from_tensor_slices slices arrays along axis 0 directly; converting them
    # to Python lists first only added an extra copy.
    dataset = tf.data.Dataset.from_tensor_slices((x, y))
    # Cache BEFORE shuffling: caching after shuffle() stores one fixed order,
    # so later epochs would replay it instead of reshuffling.
    dataset = dataset.cache()
    if shuffle:
        # Shuffle individual examples (before batching), not whole batches.
        dataset = dataset.shuffle(2048)
    dataset = dataset.batch(bs)
    dataset = dataset.map(format_dataset)
    # prefetch goes last so input preparation overlaps with the training step.
    return dataset.prefetch(tf.data.experimental.AUTOTUNE)

# Build the training and validation pipelines with the same batch size BS.
train_ds = make_dataset(x_train, y_train,BS)
val_ds = make_dataset(x_val, y_val,BS)

In [None]:
# Pull a single batch and print the shape of each model input and of the
# targets as a sanity check on the pipeline.
for inputs, targets in train_ds.take(1):
    print(f'inputs["encoder_inputs"].shape: {inputs["encoder_inputs"].shape}')
    print(f'inputs["decoder_inputs"].shape: {inputs["decoder_inputs"].shape}')
    print(f"targets.shape: {targets.shape}")

## Learning rate schedule

In [None]:
class CustomSchedule(keras.optimizers.schedules.LearningRateSchedule):
    """Learning-rate schedule with linear warm-up followed by linear decay.

    The rate is piecewise constant within an epoch: the optimizer step is
    converted to an epoch index using ``steps_per_epoch``.

    Args:
        init_lr: Learning rate at epoch 0.
        lr_after_warmup: Peak learning rate reached at the end of warm-up.
        final_lr: Floor the rate decays down to.
        warmup_epochs: Number of warm-up epochs.
        decay_epochs: Number of epochs over which the rate decays from
            ``lr_after_warmup`` to ``final_lr``.
        steps_per_epoch: Optimizer steps per epoch (int or float).
    """

    def __init__(
        self,
        init_lr=0.00001,
        lr_after_warmup=0.001,
        final_lr=0.00001,
        warmup_epochs=15,
        decay_epochs=85,
        steps_per_epoch=203,
    ):
        super().__init__()
        self.init_lr = init_lr
        self.lr_after_warmup = lr_after_warmup
        self.final_lr = final_lr
        self.warmup_epochs = warmup_epochs
        self.decay_epochs = decay_epochs
        self.steps_per_epoch = steps_per_epoch

    def calculate_lr(self, epoch):
        """ linear warm up - linear decay """
        # Work in float32: an integer epoch tensor cannot be combined with the
        # Python float coefficients below.
        epoch = tf.cast(epoch, tf.float32)
        # warmup_epochs == 1 used to divide by zero; clamp the span to 1.
        warmup_span = max(self.warmup_epochs - 1, 1)
        warmup_lr = (
            self.init_lr
            + ((self.lr_after_warmup - self.init_lr) / warmup_span) * epoch
        )
        decay_lr = tf.math.maximum(
            self.final_lr,
            self.lr_after_warmup
            - (epoch - self.warmup_epochs)
            * (self.lr_after_warmup - self.final_lr)
            / (self.decay_epochs),
        )
        # The warm-up line is the smaller of the two while warming up, the
        # decay line afterwards.
        return tf.math.minimum(warmup_lr, decay_lr)

    def __call__(self, step):
        # Some optimizer implementations pass an integer step counter; cast
        # first so the division works for both int and float steps and for a
        # fractional steps_per_epoch.
        step = tf.cast(step, tf.float32)
        epoch = tf.math.floor(step / float(self.steps_per_epoch))
        return self.calculate_lr(epoch)

    def get_config(self):
        """Return the constructor arguments so the schedule can be serialized."""
        return {
            "init_lr": self.init_lr,
            "lr_after_warmup": self.lr_after_warmup,
            "final_lr": self.final_lr,
            "warmup_epochs": self.warmup_epochs,
            "decay_epochs": self.decay_epochs,
            "steps_per_epoch": self.steps_per_epoch,
        }


# Embedding

In [None]:
class PositionalEmbedding(layers.Layer):
    """Token embedding with merged feature axis plus a learned position embedding.

    Every token of a ``(batch, seq_len, num_feats)`` integer input is embedded
    to ``embed_dim`` values; the feature and embedding axes are then flattened
    into one axis of size ``num_feats * embed_dim`` and a per-position
    embedding of the same width is added.

    Args:
        sequence_length: Number of positions the position embedding covers.
        vocab_size: Number of distinct token ids.
        embed_dim: Embedding size of a single token.
        num_feats: Size of the feature axis of the input. Defaults to the
            module-level ``NUM_FEATS``.
        **kwargs: Forwarded to ``layers.Layer``.
    """

    def __init__(self, sequence_length, vocab_size, embed_dim, num_feats=None, **kwargs):
        super(PositionalEmbedding, self).__init__(**kwargs)
        if num_feats is None:
            # The layer used to read an undefined global named FEATS here.
            num_feats = NUM_FEATS

        # Use the constructor argument rather than the global SEQ_LEN so the
        # layer honours the sequence length it was actually given.
        self.token_embeddings = layers.Embedding(
            input_dim=vocab_size, output_dim=embed_dim, input_length=sequence_length)

        # Token embeddings of all features are merged into one axis, so the
        # position embedding width must match num_feats * embed_dim.
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=embed_dim * num_feats)

        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.num_feats = num_feats

    def call(self, inputs):
        embedded_tokens = self.token_embeddings(inputs)  # (bs, seqlen, feats, embdim)

        # Merge the feature and embedding dimensions. The merged size is given
        # as a Python int so the static last dimension stays known to the
        # layers that consume this output.
        emb_shape = tf.shape(embedded_tokens)
        embedded_tokens = tf.reshape(
            embedded_tokens,
            [emb_shape[0], emb_shape[1], self.num_feats * self.embed_dim],
        )  # (bs, seqlen, feats*embdim)

        # One embedding row per position; broadcast over the batch axis.
        positions = tf.range(start=0, limit=emb_shape[1], delta=1)
        embedded_positions = self.position_embeddings(positions)

        return embedded_tokens + embedded_positions

    def compute_mask(self, inputs, mask=None):
        # True wherever the raw token id is non-zero. The mask keeps the shape
        # of `inputs` (feature axis included), not of the merged output.
        return tf.math.not_equal(inputs, 0)


# Encoder

In [None]:
class TransformerEncoder(layers.Layer):
    """Single Transformer encoder block: self-attention followed by a feed-forward net.

    Both sub-layers are wrapped in a residual connection followed by layer
    normalization.

    Args:
        embed_dim: Width of the input/output features; also used as the
            per-head ``key_dim`` of the attention layer.
        dense_dim: Hidden width of the feed-forward network.
        num_heads: Number of attention heads.
        **kwargs: Forwarded to ``layers.Layer``.
    """

    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super(TransformerEncoder, self).__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.dense_proj = keras.Sequential(
            [layers.Dense(dense_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.supports_masking = True

    def call(self, inputs, mask=None):
        # No padding mask is ever applied: all sequences have the same length.
        # Bind the name unconditionally -- it used to be assigned only inside
        # the branch below, raising UnboundLocalError whenever mask was None.
        padding_mask = None
        if mask is not None:
            print(f"encoder mask is not none: {mask}- but will be set to none")

        attention_output = self.attention(
            query=inputs, value=inputs, key=inputs, attention_mask=padding_mask
        )

        # Residual + norm around attention, then around the feed-forward net.
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm_2(proj_input + proj_output)

# Decoder

In [None]:
class TransformerDecoder(layers.Layer):
    """Single Transformer decoder block with causal self-attention and cross-attention.

    The block output is reshaped from ``(batch, seq, embed_dim)`` to
    ``(batch, seq, num_feats, embed_dim // num_feats)``, undoing the merge of
    the feature and embedding axes done by the embedding layer.

    Args:
        embed_dim: Width of the (merged) input features; also used as the
            per-head ``key_dim`` of both attention layers.
        latent_dim: Hidden width of the feed-forward network.
        num_heads: Number of attention heads.
        num_feats: Size of the feature axis to restore on the output. The
            default of 60 matches the value that used to be hard-coded.
        **kwargs: Forwarded to ``layers.Layer``.

    Raises:
        ValueError: If ``embed_dim`` is not a multiple of ``num_feats``.
    """

    def __init__(self, embed_dim, latent_dim, num_heads, num_feats=60, **kwargs):
        super(TransformerDecoder, self).__init__(**kwargs)
        if embed_dim % num_feats != 0:
            raise ValueError(
                f"embed_dim ({embed_dim}) must be a multiple of num_feats ({num_feats})"
            )
        self.embed_dim = embed_dim
        self.latent_dim = latent_dim
        self.num_heads = num_heads
        self.num_feats = num_feats
        self.attention_1 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.attention_2 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.dense_proj = keras.Sequential(
            [layers.Dense(latent_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.layernorm_3 = layers.LayerNormalization()
        self.supports_masking = True

    def call(self, inputs, encoder_outputs, mask=None):
        causal_mask = self.get_causal_attention_mask(inputs)
        # No padding mask is ever applied: all sequences have the same length.
        # Bind the name unconditionally -- it used to be assigned only inside
        # the branch below, raising UnboundLocalError whenever mask was None.
        padding_mask = None
        if mask is not None:
            print(f"mask is not None: {mask}\n but will be set to None")

        # Masked self-attention over the decoder inputs.
        attention_output_1 = self.attention_1(
            query=inputs, value=inputs, key=inputs, attention_mask=causal_mask
        )
        out_1 = self.layernorm_1(inputs + attention_output_1)

        # Cross-attention: decoder queries attend to the encoder outputs.
        attention_output_2 = self.attention_2(
            query=out_1,
            value=encoder_outputs,
            key=encoder_outputs,
            attention_mask=padding_mask,
        )
        out_2 = self.layernorm_2(out_1 + attention_output_2)

        proj_output = self.dense_proj(out_2)
        ret = self.layernorm_3(out_2 + proj_output)

        # Split the merged feat+emb axis back apart. The sizes come from the
        # layer's own configuration instead of a literal 60 and the global
        # `embed_dim`.
        ret_shape = tf.shape(ret)
        ret = tf.reshape(
            ret,
            (ret_shape[0], ret_shape[1], self.num_feats, self.embed_dim // self.num_feats),
        )
        print(f"decoder ret reshaped  = {ret}")

        return ret

    def get_causal_attention_mask(self, inputs):
        """Return a (batch, seq, seq) int32 mask with 1 where key index <= query index."""
        input_shape = tf.shape(inputs)
        batch_size, sequence_length = input_shape[0], input_shape[1]
        i = tf.range(sequence_length)[:, tf.newaxis]
        j = tf.range(sequence_length)
        # Lower-triangular matrix: position i may attend to positions j <= i.
        mask = tf.cast(i >= j, dtype="int32")
        mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
        # Repeat the single mask once per batch element.
        mult = tf.concat(
            [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)],
            axis=0,
        )
        return tf.tile(mask, mult)

# Transformer 

In [None]:


# Build the encoder, the decoder and the end-to-end model under a
# MirroredStrategy scope.
# The feature-axis constant defined in the hyperparameters is NUM_FEATS; this
# cell used to reference an undefined name `FEATS`.
mirrored_strategy = tf.distribute.MirroredStrategy()
with mirrored_strategy.scope():
    # --- Encoder: embedding + one encoder block ---
    encoder_inputs = keras.Input(
        shape=(SEQ_LEN, NUM_FEATS,), dtype="int64", name="encoder_inputs"
    )
    x = PositionalEmbedding(SEQ_LEN, vocab_size, embed_dim)(encoder_inputs)
    print(x.shape)
    encoder_outputs = TransformerEncoder(embed_dim * NUM_FEATS, latent_dim, num_heads)(x)
    encoder = keras.Model(encoder_inputs, encoder_outputs)

    # --- Decoder: embedding + one decoder block + per-token softmax ---
    decoder_inputs = keras.Input(
        shape=(SEQ_LEN, NUM_FEATS,), dtype="int64", name="decoder_inputs"
    )
    encoded_seq_inputs = keras.Input(
        shape=(None, embed_dim * NUM_FEATS,), name="decoder_state_inputs"
    )
    x = PositionalEmbedding(SEQ_LEN, vocab_size, embed_dim)(decoder_inputs)
    x = TransformerDecoder(embed_dim * NUM_FEATS, latent_dim, num_heads)(x, encoded_seq_inputs)
    x = layers.Dropout(0.5)(x)
    decoder_outputs = layers.Dense(vocab_size, activation="softmax")(x)
    decoder = keras.Model([decoder_inputs, encoded_seq_inputs], decoder_outputs)

    # --- End-to-end model: encoder outputs feed the decoder ---
    decoder_outputs = decoder([decoder_inputs, encoder_outputs])
    transformer = keras.Model(
        [encoder_inputs, decoder_inputs], decoder_outputs, name="transformer"
    )
    

# Training

In [None]:

# Learning-rate schedule and optimizer.
learning_rate = CustomSchedule(
    init_lr=0.0001,
    lr_after_warmup=0.001,
    final_lr=0.0001,
    warmup_epochs=15,
    decay_epochs=85,
    # Whole number of optimizer steps per epoch. The dataset is batched without
    # dropping the remainder, so a partial last batch still counts as a step;
    # plain `/` produced a fractional value.
    steps_per_epoch=int(np.ceil(x_train.shape[0] / BS)),
)
optimizer = tf.keras.optimizers.Adam(learning_rate)

# Stop when val_loss has not improved for `patience` epochs (the hyperparameter
# defined above, previously shadowed by a literal 5 here).
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', mode='min', patience=patience, verbose=1
)

# Keep only the weights of the best epoch by val_loss.
checkpoint_filepath = '../data/models/checkpoint-transformer_translator.h5'
# Make sure the target directory exists before the first save is attempted.
os.makedirs(os.path.dirname(checkpoint_filepath), exist_ok=True)
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)


transformer.summary()
transformer.compile(
    optimizer=optimizer, loss="sparse_categorical_crossentropy", metrics=["accuracy"]
)
history = transformer.fit(
    train_ds,
    epochs=epochs,
    validation_data=val_ds,
    callbacks=[early_stopping, checkpoint],
)


# Generate

In [None]:
# Reload the saved model: rebuild the exact architecture used for training,
# then restore the best checkpointed weights into it.
# The feature-axis constant defined in the hyperparameters is NUM_FEATS; this
# cell used to reference an undefined name `FEATS`.

# --- Encoder ---
encoder_inputs = keras.Input(
    shape=(SEQ_LEN, NUM_FEATS,), dtype="int64", name="encoder_inputs"
)
x = PositionalEmbedding(SEQ_LEN, vocab_size, embed_dim)(encoder_inputs)
encoder_outputs = TransformerEncoder(embed_dim * NUM_FEATS, latent_dim, num_heads)(x)
encoder = keras.Model(encoder_inputs, encoder_outputs)

# --- Decoder ---
decoder_inputs = keras.Input(
    shape=(SEQ_LEN, NUM_FEATS,), dtype="int64", name="decoder_inputs"
)
encoded_seq_inputs = keras.Input(
    shape=(None, embed_dim * NUM_FEATS,), name="decoder_state_inputs"
)
x = PositionalEmbedding(SEQ_LEN, vocab_size, embed_dim)(decoder_inputs)
x = TransformerDecoder(embed_dim * NUM_FEATS, latent_dim, num_heads)(x, encoded_seq_inputs)
x = layers.Dropout(0.5)(x)
decoder_outputs = layers.Dense(vocab_size, activation="softmax")(x)
decoder = keras.Model([decoder_inputs, encoded_seq_inputs], decoder_outputs)

# --- End-to-end model ---
decoder_outputs = decoder([decoder_inputs, encoder_outputs])
transformer_reload = keras.Model(
    [encoder_inputs, decoder_inputs], decoder_outputs, name="transformer"
)

# Weights were saved with save_weights_only=True, so only load_weights applies.
transformer_reload.load_weights(checkpoint_filepath)

In [None]:
# Take the first training example; the 0:1 slice keeps the leading batch axis.
inp = x_train[0:1,:]
targ = y_train[0:1,:]
print(inp.shape)
print(inp)
print(targ.shape)


# The reloaded model is called on [encoder input, decoder input]; the target
# sequence itself is supplied as the decoder input.
# NOTE(review): this evaluates the model with the ground truth visible to the
# decoder rather than generating step by step -- confirm that is intended.
pred = transformer_reload([inp, targ])
print(f"prediction shape: {pred.shape}")
# Pick the highest-scoring token id along the last (vocabulary) axis.
predicted = np.argmax(pred, axis=-1)
print(predicted.shape)


In [None]:
def write_ex_to_tsv(ex, fn, out_dir='../animations/tsv/', freq=30):
    """Write one example to a tab-separated motion file with a metadata header.

    The file starts with ten header rows (frame count, camera count, marker
    count, frequency, ..., marker names) followed by one row per frame.

    Args:
        ex: 2-D array-like with a ``shape`` attribute; ``ex.shape[0]`` is the
            number of frames and each row holds the values of one frame.
        fn: File name without extension; ``.tsv`` is appended.
        out_dir: Directory to write into. Defaults to the location that used
            to be hard-coded; it is created if missing.
        freq: Value written to the FREQUENCY header row (default 30).
    """
    # Imported locally so the function is self-contained: `csv` is not
    # imported anywhere at file level, so the original raised NameError.
    import csv
    import os

    num_frames = ex.shape[0]
    marker_names = ['MARKER_NAMES','Head','neck','rsho','relb','rwri','rhan','lsho','lelb','lwri','lhan','back','root','rhip','lhip','rknee','lknee','rank','lank', 'rfoot', 'lfoot']

    header_rows = [
        ['NO_OF_FRAMES', num_frames],
        ['NO_OF_CAMERAS', 0],
        # First entry of marker_names is the row label, not a marker (-> 20).
        ['NO_OF_MARKERS', len(marker_names) - 1],
        ['FREQUENCY', freq],
        ['NO_OF_ANALOG', 0],
        ['ANALOG_FREQUENCY', 0],
        ['DESCRIPTION--', ''],
        ['TIME_STAMP--', ''],
        ['DATA_INCLUDED', '3D'],
        marker_names,
    ]

    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, fn + '.tsv')

    # newline='' lets the csv module control line endings itself; without it
    # some platforms emit an extra carriage return per row.
    with open(out_path, 'wt', newline='') as out_file:
        tsv_writer = csv.writer(out_file, delimiter='\t')
        tsv_writer.writerows(header_rows)
        # One output row per frame.
        tsv_writer.writerows(ex)
            


In [None]:
# Export the prediction, the input and the target of the first example
# (index 0 drops the batch axis) as PRED.tsv, INPUT.tsv and TARGET.tsv.
write_ex_to_tsv(predicted[0], 'PRED')
write_ex_to_tsv(inp[0], 'INPUT')
write_ex_to_tsv(targ[0], 'TARGET')