# Transformers with Keras: Translation
https://keras.io/examples/audio/transformer_asr/

In [1]:
from tensorflow.compat.v1 import keras
from tensorflow.compat.v1.keras import backend as K
import tensorflow.compat.v1 as tf
from tensorflow.compat.v1.keras import layers


import os
import numpy as np
import matplotlib.pyplot as plt

In [2]:

from tensorflow.core.protobuf import rewriter_config_pb2
#tf.disable_eager_execution()

# set GPU config
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.log_device_placement=True
config.allow_soft_placement=True

In [3]:
# os.environ["CUDA_VISIBLE_DEVICES"] = ""
# os.environ["CUDA_VISIBLE_DEVICES"]= "0" # first gpu
#os.environ["CUDA_VISIBLE_DEVICES"]= "" # second gpu
# os.environ["CUDA_VISIBLE_DEVICES"]= "2" # third gpu

config = tf.ConfigProto()
G = len(tf.config.experimental.list_physical_devices('GPU'))
print("Num GPUs Available: ", G)
print("tf version ", tf.__version__)
print("keras version ", tf.keras.__version__)

Num GPUs Available:  2
tf version  2.6.0
keras version  2.6.0


In [4]:
os.environ['TF_ENABLE_AUTO_MIXED_PRECISION'] = '1'


from tensorflow.keras.mixed_precision import experimental as mixed_precision

policy = mixed_precision.Policy('mixed_float16')
mixed_precision.set_policy(policy)

print('Compute dtype: %s' % policy.compute_dtype)
print('Variable dtype: %s' % policy.variable_dtype)

loss_scale = policy.loss_scale
print('Loss scale: %s' % loss_scale)


# default is 1e-7 which is too small for float16.  Without adjusting the epsilon, we will get NaN predictions because of divide by zero problems
float_type = 'float16'
print("keras backend float size before: ", K.floatx())
K.set_epsilon(1e-4) 
K.set_floatx(float_type)

# set the new configs
sess = tf.Session(config=config)
K.set_session(sess)

INFO:tensorflow:Mixed precision compatibility check (mixed_float16): OK
Your GPUs will likely run quickly with dtype policy mixed_float16 as they all have compute capability of at least 7.0
Instructions for updating:
Use tf.keras.mixed_precision.LossScaleOptimizer instead. LossScaleOptimizer now has all the functionality of DynamicLossScale
Compute dtype: float16
Variable dtype: float32
Loss scale: DynamicLossScale(current_loss_scale=32768.0, num_good_steps=0, initial_loss_scale=32768.0, increment_period=2000, multiplier=2.0)
keras backend float size before:  float32


In [5]:

G = len(tf.config.experimental.list_physical_devices('GPU'))
print("Num GPUs Available: ", G)
print("tf version ", tf.__version__)
print("keras version ", tf.keras.__version__)
print("keras backend float size: ", K.floatx())

Num GPUs Available:  2
tf version  2.6.0
keras version  2.6.0
keras backend float size:  float16


# Data loading

In [6]:

x_train = np.load('../data/x_train.npy')
y_train = np.load('../data/y_train.npy')
x_val = np.load('../data/x_val.npy')
y_val = np.load('../data/y_val.npy')

print(x_train.dtype)
x_train = x_train.astype('int32')
print(x_train.dtype)


y_train = y_train.astype('int32')
x_val = x_val.astype('int32')
y_val = y_val.astype('int32')





#testing data squish
#x_train = x_train.reshape(x_train.shape[0], x_train.shape[1]*x_train.shape[2])
#y_train = y_train.reshape(y_train.shape[0], y_train.shape[1]*y_train.shape[2])
#x_val = x_val.reshape(x_val.shape[0], x_val.shape[1]*x_val.shape[2])
#y_val = y_val.reshape(y_val.shape[0], y_val.shape[1]*y_val.shape[2])

#x_train = np.transpose(x_train, (0,2,1))
#x_val = np.transpose(x_val, (0,2,1))
#y_train = np.transpose(y_train, (0,2,1))
#y_val = np.transpose(y_val, (0,2,1))


# OVERFITTING TEST
x_train = x_train[0:2, :]
y_train = y_train[0:2, :]
x_val = x_train
y_val = y_train


print(len(x_train), "Training sequences")
print(len(x_val), "Validation sequences")


print(x_train.shape)
print(y_train.shape)
print(x_val.shape)
print(y_val.shape)


float32
int32
2 Training sequences
2 Validation sequences
(2, 50, 60)
(2, 50, 60)
(2, 50, 60)
(2, 50, 60)


# Hyperparams

In [7]:
## Data shape
NUM_FEATS = 60
SEQ_LEN = 50
vocab_size = 300 #max index of discrete data
maxlen = SEQ_LEN#*NUM_FEATS # Setting this to reflect sequence length


## Model dimensions
EMBED_DIM = 5  # Embedding size for each token
NUM_HEADS = 5  # Number of attention heads
ff_dim = 512  # Hidden layer size in feed forward network inside transformer
dense_dim = 20
#inputshape = (NUM_FEATS, SEQ_LEN)

## Training params

patience = 10
BS = 2*G
num_epochs = 50000

In [8]:
def create_tf_dataset(data, bs=4):#(source_data, target_data, bs=4):
    #audio_ds = create_audio_ds(data)
    #text_ds = create_text_ds(data)
    #ds = tf.data.Dataset.zip((source_data, target_data))
    ds = data.map(lambda x, y: {"source": x, "target": y})
    ds = ds.batch(bs)
    #ds = ds.prefetch(tf.data.experimental.AUTOTUNE)
    return ds


#split = int(len(data) * 0.99)
#train_data = data[:split]
#test_data = data[split:]


train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
test_dataset = tf.data.Dataset.from_tensor_slices((x_val, y_val))

print(train_dataset)
print(BS)
#for z in test_dataset:
#  print(z.numpy())


DS = create_tf_dataset(train_dataset, bs=BS)
VAL_DS = create_tf_dataset(test_dataset, bs=BS)

<DatasetV1Adapter shapes: ((50, 60), (50, 60)), types: (tf.int32, tf.int32)>
4


## Learning rate schedule

In [9]:
class CustomSchedule(keras.optimizers.schedules.LearningRateSchedule):
    def __init__(
        self,
        init_lr=0.00001,
        lr_after_warmup=0.001,
        final_lr=0.00001,
        warmup_epochs=15,
        decay_epochs=85,
        steps_per_epoch=203,
    ):
        super().__init__()
        self.init_lr = init_lr
        self.lr_after_warmup = lr_after_warmup
        self.final_lr = final_lr
        self.warmup_epochs = warmup_epochs
        self.decay_epochs = decay_epochs
        self.steps_per_epoch = steps_per_epoch

    def calculate_lr(self, epoch):
        """ linear warm up - linear decay """
        warmup_lr = (
            self.init_lr
            + ((self.lr_after_warmup - self.init_lr) / (self.warmup_epochs - 1)) * epoch
        )
        decay_lr = tf.math.maximum(
            self.final_lr,
            self.lr_after_warmup
            - (epoch - self.warmup_epochs)
            * (self.lr_after_warmup - self.final_lr)
            / (self.decay_epochs),
        )
        return tf.math.minimum(warmup_lr, decay_lr)

    def __call__(self, step):
        epoch = step // self.steps_per_epoch
        return self.calculate_lr(epoch)


# Embedding

In [10]:
class TokenEmbedding(layers.Layer):
    def __init__(self, num_vocab=1000, maxlen=100, num_hid=64):
        super().__init__()
        self.emb = tf.keras.layers.Embedding(num_vocab, num_hid)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=num_hid)

    def call(self, x):
        print("TOKEN EMBEDDING WAS CALLED")
        maxlen = tf.shape(x)[-1]
        x = self.emb(x)
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        return x + positions


class SpeechFeatureEmbedding(layers.Layer):
    def __init__(self, num_hid=64, maxlen=100):
        super().__init__()
        self.conv1 = tf.keras.layers.Conv1D(
            num_hid, 11, strides=2, padding="same", activation="relu"
        )
        self.conv2 = tf.keras.layers.Conv1D(
            num_hid, 11, strides=2, padding="same", activation="relu"
        )
        self.conv3 = tf.keras.layers.Conv1D(
            num_hid, 11, strides=2, padding="same", activation="relu"
        )
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=num_hid)

    def call(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        return self.conv3(x)

class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super(TokenAndPositionEmbedding, self).__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size+1, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen+1, output_dim=embed_dim)

    def call(self, x):
        #x_shape = x.shape
        #print("hello I am embedding")
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        #print("embedding went fine!")
        return x + positions


# Encoder

In [11]:
class TransformerEncoder(layers.Layer):
    def __init__(self, embed_dim, num_heads, feed_forward_dim, rate=0.3):
        super().__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [
                layers.Dense(feed_forward_dim, activation="relu"),
                layers.Dense(embed_dim),
            ]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training):
        #print("Hello I am ENCODING")
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)



# Decoder

In [12]:
class TransformerDecoder(layers.Layer):
    def __init__(self, embed_dim, num_heads, feed_forward_dim, dropout_rate=0.3):
        super().__init__()
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = layers.LayerNormalization(epsilon=1e-6)
        self.self_att = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim, #attention_axes=(0,)
        )
        #print(f"hello before mha {embed_dim}")
        self.enc_att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim
                                                )
        #print("hello after")
        self.self_dropout = layers.Dropout(0.5)
        self.enc_dropout = layers.Dropout(0.1)
        self.ffn_dropout = layers.Dropout(0.1)
        self.ffn = keras.Sequential(
            [
                layers.Dense(feed_forward_dim, activation="relu"),
                layers.Dense(embed_dim),
            ]
        )
        
    #def causal_attention_mask(size):
    #    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    #   return mask  # (seq_len, seq_len)
    
    #def causal_attention_mask(size):
    #    """hello"""
    #    mask = 1-tf.linalg.band.part(tf.ones((size,size)), -1, 0)
    #    return mask

    def causal_attention_mask1(self, batch_size, n_dest, n_src, dtype):
        """Masks the upper half of the dot product matrix in self attention.

        This prevents flow of information from future tokens to current token.
        1's in the lower triangle, counting from the lower right corner.
        """
        i = tf.range(n_dest)[:, None]
        j = tf.range(n_src)
        m = i >= j - n_src + n_dest
        mask = tf.cast(m, dtype)
        mask = tf.reshape(mask, [1, n_dest, n_src])
        mult = tf.concat(
            [tf.expand_dims(1#batch_size
                            , -1), tf.constant([1, 1], dtype=tf.int32
                                             )], 0
        )
        
        #print(f"n_dest {n_dest}, n_src {n_src}")
        mask = tf.tile(mask, mult)
        return mask
        
        #or
        #mask = 1-tf.linalg.band_part(tf.ones((n_dest,n_src)), -1, 0)
        #return mask
        
        
    def causal_attention_mask(self, inputs):
        input_shape = tf.shape(inputs)
        batch_size, sequence_length = input_shape[0], input_shape[1]
        i = tf.range(sequence_length)[:, tf.newaxis]
        j = tf.range(sequence_length)
        mask = tf.cast(i >= j, dtype="int32")
        mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
        mult = tf.concat(
            [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)],
            axis=0,
        )
        ret = tf.tile(mask, mult)
        print(ret)
        return ret
    
    
    def causal_attention_mask_3D(self,inputs):
        input_shape = tf.shape(inputs)
        batch_size, sequence_length,feats = input_shape[0], input_shape[1], input_shape[2]#inputs.shape[0], inputs.shape[1], inputs.shape[2]
        #print(f"inputs for mask: {input_shape}")
        i = tf.range(sequence_length)[:, tf.newaxis]
        j = tf.range(feats)
        print(f"i: {i}\n j: {j}")
        mask = tf.cast(i >= j, dtype="int32")
        mask = tf.reshape(mask, (1, input_shape[1], input_shape[2]))
        mult = tf.concat(
            [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)],
            axis=0,
        )
        
        ret = tf.tile(mask, mult)
        
        return ret
    
    def test_mask_3D(self, inputs):
        input_shape = tf.shape(inputs)
        batch_size, sequence_length,feats = input_shape[0], input_shape[1], input_shape[2]
        bs = inputs.shape[0]
        sl = inputs.shape[1]
        f = inputs.shape[2]
        mask = np.zeros((sl,f, sl, f),dtype="int32")
        #print(mask.shape)
        for ind in range(sl):
            mask[ind,:,0:ind+1,:] += 1       

        mult = tf.concat(
            [tf.expand_dims(batch_size, -1), tf.constant([1, 1, 1], dtype=tf.int32)],
            axis=0,
        )
        #print(mult)

        ret = tf.tile(mask, mult)
        #print(ret)
        return ret
        
    def test_mask(self,inputs):
            # test mask shape:
            input_shape = tf.shape(inputs)
            batch_size, sequence_length,feats = input_shape[0], input_shape[1], input_shape[2]
            bs = inputs.shape[0]
            sl = inputs.shape[1]
            f = inputs.shape[2]
            mask = np.zeros((sl,f, sl, f))
            print(mask.shape)
            
            
            mult = tf.concat(
                [tf.expand_dims(batch_size, -1), tf.constant([1, 1, 1], dtype=tf.int32)],
                axis=0,
            )
            print(mult)

            ret = tf.tile(mask, mult)
            #print(ret)
            return ret
           
            
    def call(self, enc_out, target):
        #print(f"decoder call, enc_out: {enc_out}, target: {target}")
        #print("enc_out.shape ", enc_out.shape)
        #input_shape = tf.shape(target)
        input_shape = target.shape
        #print("DECODER hello I am input shape to be masked", input_shape)
        batch_size = input_shape[0]
        #print("DECODER hello I am batch size: ", batch_size)
        seq_len = input_shape[1]
        #new_mask = self.causal_attention_mask_3D(target)
        
        test_mask = self.test_mask_3D(target)
        #print(f"DECODER hello I made a mask: {new_mask}")
        #new_mask = self.causal_attention_mask(batch_size, seq_len, seq_len, tf.bool)
        target_att = self.self_att(target, target, attention_mask=test_mask)#causal_mask)
        
        #print("DECODER hello I did target attention with causal mask")
        target_norm = self.layernorm1(target + self.self_dropout(target_att))
        enc_out = self.enc_att(target_norm, enc_out)
        #print("DECODER hello I got past encoder attention too")
        enc_out_norm = self.layernorm2(self.enc_dropout(enc_out) + target_norm)
        ffn_out = self.ffn(enc_out_norm)
        ffn_out_norm = self.layernorm3(enc_out_norm + self.ffn_dropout(ffn_out))
        #print("DECODER helo I return from decoder call")
        return ffn_out_norm


# Transformer 

In [20]:

class Transformer(keras.Model):
    
    def __init__(
        self,
        num_hid=64,
        num_head=2,
        num_feed_forward=128,
        source_maxlen=100,
        target_maxlen=100,
        num_layers_enc=6,
        num_layers_dec=1,
        num_classes=101,
    ):
        super().__init__()
        self.loss_metric = keras.metrics.Mean(name="loss")
        self.num_layers_enc = num_layers_enc
        self.num_layers_dec = num_layers_dec
        self.target_maxlen = target_maxlen
        self.num_classes = num_classes
        
        self.enc_input = TokenEmbedding(num_vocab=num_classes, maxlen=target_maxlen, num_hid=num_hid)
        #TokenAndPositionEmbedding(source_maxlen, vocab_size, num_hid)#
        #SpeechFeatureEmbedding(num_hid=num_hid, maxlen=source_maxlen) #Use tokenembedding here too?
        
        self.dec_input = TokenEmbedding(num_vocab=num_classes, maxlen=target_maxlen, num_hid=num_hid)

        self.encoder = keras.Sequential(
            [self.enc_input]
            + [
                TransformerEncoder(num_hid, num_head, num_feed_forward)
                for _ in range(num_layers_enc)
            ]
        )

        for i in range(num_layers_dec):
            setattr(
                self,
                f"dec_layer_{i}",
                TransformerDecoder(num_hid, num_head, num_feed_forward),
            )


        self.classifier = layers.Dense(num_classes)

    def decode(self, enc_out, target):
        y = self.dec_input(target)
        #print("yes hello decode")
        for i in range(self.num_layers_dec):
            #print(f"encoder out: {enc_out}, y: {y}")
            y = getattr(self, f"dec_layer_{i}")(enc_out, y)
            #print(f"I got an attribute {y}")
        return y

    def call(self, inputs):
        #print("I am in call")
        #print(f"inputs {inputs}")
        
        source = inputs[0] #input[0]use source/target when eager is disabled
        target = inputs[1]
        
        #print(f"model CALL source={source}")
        #print(f"model CALL target={target}")
        
        #print("I found inputs and targets")
        x = self.encoder(source)
        #print(f"I made x by calling encoder {x}")
        y = self.decode(x, target)
        #print(f"I made y by calling decoder {y}")
        return self.classifier(y)

    @property
    def metrics(self):
        return [self.loss_metric]

    def train_step(self, batch):
        """Processes one batch inside model.fit()."""
        #print("TRAIN STEP")
        source = batch["source"]
        print(f"TRAIN STEP source: {source}")
        target = batch["target"]
        print(f"TRAIN STEP target: {target}")
        dec_input = target[:, :-1]
        print(f"TRAIN STEP dec_input: {dec_input}")
        dec_target = target[:, 1:]
        print(f"TRAIN STEP dec_target: {dec_target}")
        with tf.GradientTape() as tape:
            #print("DEBUG1")
            preds = self([source, dec_input])
            #print(f"shape of predictions: {preds}")
            #print("DEBUG2")
            one_hot = tf.one_hot(dec_target, depth=self.num_classes)
            print(f"shape of onehot: {one_hot}")
            #print("DEBUG3")
            mask = tf.math.logical_not(tf.math.equal(dec_target, 0))
            loss = self.compiled_loss(one_hot, preds, sample_weight=mask)
            #print(f"I calculated loss {loss}")
        trainable_vars = self.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))
        self.loss_metric.update_state(loss)
        print("-------I DID BATCH STEP-----!")
        
        return {"loss": self.loss_metric.result()}

    def test_step(self, batch):
        print("*****TEST STEP WAS CALLED")
        source = batch["source"]
        target = batch["target"]
        dec_input = target[:, :-1]
        dec_target = target[:, 1:]
        preds = self([source, dec_input])
        one_hot = tf.one_hot(dec_target, depth=self.num_classes)
        mask = tf.math.logical_not(tf.math.equal(dec_target, 0))
        loss = self.compiled_loss(one_hot, preds, sample_weight=mask)
        self.loss_metric.update_state(loss)
        return {"loss": self.loss_metric.result()}

    def generate(self, source, target_start_token_idx):
        """Performs inference over one batch of inputs using greedy decoding."""
        print("****GENERATE STEP WAS CALLED")
        bs = tf.shape(source)[0]
        enc = self.encoder(source)
        dec_input = tf.ones((bs, 1), dtype=tf.int32) * target_start_token_idx
        dec_logits = []
        for i in range(self.target_maxlen - 1):
            print("Trying to decode - generate")
            dec_out = self.decode(enc, dec_input)
            print("got past decode")
            logits = self.classifier(dec_out)
            logits = tf.argmax(logits, axis=-1, output_type=tf.int32)
            last_logit = tf.expand_dims(logits[:, -1], axis=-1)
            dec_logits.append(last_logit)
            dec_input = tf.concat([dec_input, last_logit], axis=-1)
        return dec_input


# Training

In [25]:

#batch = next(iter(val_ds))

## The vocabulary to convert predicted indices into characters
#idx_to_char = vectorizer.get_vocabulary()
#display_cb = DisplayOutputs(
#    batch, idx_to_char, target_start_token_idx=2, target_end_token_idx=3
#)  # set the arguments as per vocabulary index for '<' and '>'

loss_fn = tf.keras.losses.CategoricalCrossentropy(
        from_logits=True, label_smoothing=0.1,
    )
learning_rate = CustomSchedule(
       init_lr=0.00001,
        lr_after_warmup=0.0001,
        final_lr=0.00001,
        warmup_epochs=30,
        decay_epochs=55,
        steps_per_epoch= x_train.shape[0]/BS,
    )
optimizer = keras.optimizers.Adam(learning_rate=0.00001)#learning_rate)

mirrored_strategy = tf.distribute.MirroredStrategy()

with mirrored_strategy.scope():
    model = Transformer(
            num_hid=64,#EMBED_DIM,
            num_head=8,#NUM_HEADS,
            num_feed_forward=512,#512#ff_dim,
            target_maxlen=maxlen,
            source_maxlen=maxlen,
            num_layers_enc=6,#4,
            num_layers_dec=6,#4,
            num_classes=vocab_size,
        )
    
    model.compile(optimizer=optimizer, loss=loss_fn, metrics=["accuracy"])


early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', patience=patience, verbose=1)
checkpoint_filepath = '/tmp/checkpoint'
checkpoint = keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_filepath,
        save_weights_only=True,
        monitor='val_loss',
        mode='min',
        save_best_only=True, verbose=1)


#num_epochs = 1
history = model.fit(DS, validation_data=VAL_DS, callbacks=[early_stopping, checkpoint], epochs=num_epochs, shuffle=True)
model.summary()


#print(history.history.get('acc')[-1])
#history = model.fit(x_train, y_train, validation_data=(x_val, y_val), callbacks=[early_stopping, checkpoint], epochs=num_epochs)
#history = model.fit(ds, valiinputdation_data=val_ds, callbacks=[display_cb], epochs=1)


INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1')
  opt = tf.keras.mixed_precision.LossScaleOptimizer(opt)

Epoch 1/50000
TRAIN STEP source: Tensor("cond_2/Identity:0", shape=(None, 50, 60), dtype=int32, device=/job:localhost/replica:0/task:0/device:GPU:0)
TRAIN STEP target: Tensor("cond_2/Identity_1:0", shape=(None, 50, 60), dtype=int32, device=/job:localhost/replica:0/task:0/device:GPU:0)
TRAIN STEP dec_input: Tensor("strided_slice:0", shape=(None, 49, 60), dtype=int32, device=/job:localhost/replica:0/task:0/device:GPU:0)
TRAIN STEP dec_target: Tensor("strided_slice_1:0", shape=(None, 49, 60), dtype=int32, device=/job:localhost/replica:0/task:0/device:GPU:0)
TOKEN EMBEDDING WAS CALLED
TOKEN EMBEDDING WAS CALLED
TOKEN EMBEDDING WAS CALLED
shape of onehot: Tensor("one_hot:0", shape=(None, 49, 60, 300), dtype=float32, device=/job:localhost/replica:0/task:0/device:GPU:0)
TRAIN STEP source: 

# Testing masking functions

In [None]:
#td = TransformerDecoder(embed_dim=3, num_heads=4, feed_forward_dim=5)

def attention_mask_3D(inputs):
        input_shape = tf.shape(inputs)
        batch_size, sequence_length,feats = input_shape[0], input_shape[1],input_shape[2]#inputs.shape[0], inputs.shape[1], inputs.shape[2]
        #print(f"inputs for mask: {input_shape}")
        i = tf.range(sequence_length)[:, tf.newaxis]
        j = tf.range(sequence_length)
        k = tf.range(feats)#[:, tf.newaxis]
        print(f"i: {i}\nj: {j}\nk: {k}")
        
        
        
        mask_seqs = [tf.cast(i >= j, dtype="int32") for val in k]
        #print(mask_seqs)
        #print(mask)
        mask=np.asarray(mask_seqs)
        mask = tf.reshape(mask_seqs, (input_shape[1],input_shape[2], input_shape[1], input_shape[2]))
        #print(mask.shape)
        
        # make copies for each batch using tile
        mult = tf.concat(
            [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)],
            axis=0,
        )
        print(mult)
        
        ret = tf.tile(mask, mult)
        #print(ret)
        return ret
    
test = np.ones((4,5, 3, 2)) # bs, seq_len, feats, embed_dim
#print(np.ones((5, 3)))
mask = attention_mask_3D(test)
print(mask)


In [None]:
def causal_attention_mask(inputs):
        input_shape = tf.shape(inputs)
        batch_size, sequence_length = input_shape[0], input_shape[1]
        i = tf.range(sequence_length)[:, tf.newaxis]
        j = tf.range(sequence_length)
        mask = tf.cast(i >= j, dtype="int32")
        mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
        print(mask)
        mult = tf.concat(
            [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)],
            axis=0,
        )
        print(mult)
        ret = tf.tile(mask, mult)
        #print(ret)
        return ret

test = np.ones((4,10,3)) # bs, seq_len, embed_dim
mask = causal_attention_mask(test)
#print(mask)


In [None]:
def test_mask_works(self,inputs):
    # test mask shape:
    input_shape = tf.shape(inputs)
    batch_size, sequence_length,feats = input_shape[0], input_shape[1], input_shape[2]
    bs = inputs.shape[0]
    sl = inputs.shape[1]
    f = inputs.shape[2]
    mask = np.zeros((sl,f, sl, f))
    print(mask.shape)
           
            
    return mask
        
        
test = np.ones((4,5, 3, 2)) # bs, seq_len, feats, embed_dim
#print(np.ones((5, 3)))
mask = test_mask_works(test)
#print(mask)

In [None]:
def test_mask(inputs):
    # test mask shape:
    input_shape = tf.shape(inputs)
    batch_size, sequence_length,feats = input_shape[0], input_shape[1], input_shape[2]
    bs = inputs.shape[0]
    sl = inputs.shape[1]
    f = inputs.shape[2]
    mask = np.zeros((sl,f, sl, f),dtype="int32")
    #print(mask.shape)
    for ind in range(sl):
        mask[ind,:,0:ind+1,:] += 1       
            
    mult = tf.concat(
        [tf.expand_dims(batch_size, -1), tf.constant([1, 1, 1], dtype=tf.int32)],
        axis=0,
    )
    #print(mult)

    ret = tf.tile(mask, mult)
    #print(ret)
    return ret

test = np.ones((1,5, 3, 2)) # bs, seq_len, feats, embed_dim
#print(np.ones((5, 3)))
mask = test_mask(test)
print(mask)

In [None]:
print(history.history.keys())
print(history.history['loss'])

# Generate

In [21]:
reconstructed_model = keras.models.load_model("transformer3D_overfit")

In [46]:
# OVERFIT TEST 3D

#x_test = x_test.reshape(x_test.shape[0], x_test.shape[1]*x_test.shape[2])
#y_test = y_test.reshape(y_test.shape[0], y_test.shape[1]*y_test.shape[2])



source = x_train[0:1] #x_test[] # test example
target = y_train[0:1] #y_test[]
print(source.shape)
target_start_token_idx = 0 # start of target sequence
#print(model.evaluate(source, target))

preds = model.generate(source, target_start_token_idx)
preds = preds.numpy()
print(preds.shape)
print("preds: ", preds)

(1, 50, 60)
****GENERATE STEP WAS CALLED
TOKEN EMBEDDING WAS CALLED
Trying to decode - generate
TOKEN EMBEDDING WAS CALLED


InvalidArgumentError: Expected input 0 to have rank 4 but got: 3 [Op:Einsum]

In [None]:
# OVERFIT TEST

#x_test = x_test.reshape(x_test.shape[0], x_test.shape[1]*x_test.shape[2])
#y_test = y_test.reshape(y_test.shape[0], y_test.shape[1]*y_test.shape[2])


source = x_train[0:1,:]#x_test[] # test example
target = y_train[0:1,:]#y_test[]
print(source.shape)
target_start_token_idx = 0 # start of target sequence

#print(model.evaluate(source, target))

preds = model.generate(source, target_start_token_idx)
preds = preds.numpy()
print(preds.shape)
print(preds)

In [None]:
# get test data
x_test = np.load('../data/x_test.npy')
y_test = np.load('../data/y_test.npy')

x_test = x_test.astype('int32')
y_test = y_test.astype('int32')




#testing data squish
#x_test = x_test.reshape(x_test.shape[0], x_test.shape[1]*x_test.shape[2])
#y_test = y_test.reshape(y_test.shape[0], y_test.shape[1]*y_test.shape[2])

source = x_test[0:1,:]#x_test[] # test example
target = y_test[0:1,:]#y_test[]
print(source.shape)
target_start_token_idx = 0 # start of target sequence

preds = model.generate(source, target_start_token_idx)
preds = preds.numpy()
print(preds.shape)
print(preds)



In [None]:
print(preds[0].shape)

In [None]:
import csv

def write_ex_to_tsv(ex, fn):
    #num_frames = ex.shape[0]
    
    # squished
    num_frames = 50 
    
    freq = 30
    marker_names = ['MARKER_NAMES','Head','neck','lsho','lelb','lwri','lhan','rsho','relb','rwri','rhan','t10','root','lhip','lknee','lank','lfoot','rhip','rknee','rank','rfoot']#['MARKER_NAMES','Head','neck','rsho','relb','rwri','rhan','lsho','lelb','lwri','lhan','back','root','rhip','lhip','rknee','lknee','rank','lank', 'rfoot', 'lfoot']
    
    with open('../animations/tsv/'+fn+'.tsv', 'wt') as out_file:
        tsv_writer = csv.writer(out_file, delimiter='\t')
        tsv_writer.writerow(['NO_OF_FRAMES', num_frames])
        tsv_writer.writerow(['NO_OF_CAMERAS', 0])
        tsv_writer.writerow(['NO_OF_MARKERS', 20])
        tsv_writer.writerow(['FREQUENCY', freq])
        tsv_writer.writerow(['NO_OF_ANALOG', 0])
        tsv_writer.writerow(['ANALOG_FREQUENCY', 0])
        tsv_writer.writerow(['DESCRIPTION--', ''])
        tsv_writer.writerow(['TIME_STAMP--', ''])
        tsv_writer.writerow(['DATA_INCLUDED', '3D'])
        tsv_writer.writerow(marker_names)
        
        # squished:
        idx = 0
        
        for frame in range(num_frames):
            #tsv_writer.writerow(ex[frame,:])
            
            # squished:    
            tsv_writer.writerow(ex[idx:idx+60])
            idx = idx+60

In [None]:
write_ex_to_tsv(preds[0], 'test-overfit2')






# To dos
- Debug training error
- implement a function for generating from trained model- Possibly rework the code below and write tsv files at certain epochs

In [None]:
class DisplayOutputs(keras.callbacks.Callback):
    def __init__(
        self, batch, idx_to_token, target_start_token_idx=27, target_end_token_idx=28
    ):
        """Displays a batch of outputs after every epoch

        Args:
            batch: A test batch containing the keys "source" and "target"
            idx_to_token: A List containing the vocabulary tokens corresponding to their indices
            target_start_token_idx: A start token index in the target vocabulary
            target_end_token_idx: An end token index in the target vocabulary
        """
        self.batch = batch
        self.target_start_token_idx = target_start_token_idx
        self.target_end_token_idx = target_end_token_idx
        self.idx_to_char = idx_to_token

    def on_epoch_end(self, epoch, logs=None):
        if epoch % 5 != 0:
            return
        source = self.batch["source"]
        target = self.batch["target"].numpy()
        bs = tf.shape(source)[0]
        preds = self.model.generate(source, self.target_start_token_idx)
        preds = preds.numpy()
        for i in range(bs):
            target_text = "".join([self.idx_to_char[_] for _ in target[i, :]])
            prediction = ""
            for idx in preds[i, :]:
                prediction += self.idx_to_char[idx]
                if idx == self.target_end_token_idx:
                    break
            print(f"target:     {target_text.replace('-','')}")
            print(f"prediction: {prediction}\n")


# hmm

Jiaman uses masked cross entropy negative log likelihood - implement this myself? 
https://github.com/lijiaman/motion_transformer/blob/3e36d6ac6a0b96b19255ed26af1fd351855190f3/two_stream_transformer_discrete_main.py

def maskedCrossEntropy(probs, labels, mask):
    # probs: BS X T X J X n_cls, labels: BS X T X J, mask: BS X T X 1
    B, T, J, C = probs.size()
    prob_flat = probs.contiguous().view(-1, C)
    logp_flat = F.log_softmax(prob_flat, dim=1) # (BxT,C) log probabilities
    logp = logp_flat.view(B,T,J,C)
    mask = mask.repeat(1, 1, J) # BS X T X J
    labels = labels.float()*mask.float()
    logp = torch.gather(logp, 3, labels.unsqueeze(3).long()).squeeze(-1) # (B,T,J)
    negative_log_likelihood = -(logp*mask.float())

    # return negative_log_likelihood.mean()
    return torch.sum(negative_log_likelihood)/torch.sum(mask)