In [11]:
import tensorflow as tf
from tensorflow.keras import layers
import keras
import numpy as np

# Hyperparameters
num_heads = 4
key_dim = 64
ff_dim = 256
input_vocab_size = 1000
target_vocab_size = 1000
max_len = 50
embed_dim = num_heads * key_dim  # Must match num_heads * key_dim

In [12]:
class Encoder(layers.Layer):
    def __init__(self,num_heads,key_dim,ff_dim):
        super().__init__()
        self.attn=layers.MultiHeadAttention(num_heads,key_dim)
        self.ffn=keras.Sequential([
            layers.Dense(ff_dim,activation='relu'),
            layers.Dense(embed_dim)
        ])
        self.norm1=layers.LayerNormalization()
        self.norm2=layers.LayerNormalization()
        self.dropout1 = layers.Dropout(0.1)
        self.dropout2 = layers.Dropout(0.1)
    
    def call(self,inputs):
        attn_out=self.attn(inputs,inputs)
        x1=self.norm1(attn_out+inputs)
        x2=self.ffn(x1)
        out=self.norm2(x1+x2)
        return out
encoder=Encoder(num_heads,key_dim,ff_dim)

In [13]:
x=np.random.random(size=(20,80,256))
y=np.random.randint(0,1000,size=(20,80))

In [14]:
input=np.random.random(size=(1,20,64*4))
encoder(input)

<tf.Tensor: shape=(1, 20, 256), dtype=float32, numpy=
array([[[-1.2127826 , -0.77087265,  0.77632517, ...,  0.8525477 ,
         -0.348576  ,  0.869121  ],
        [-0.2408125 , -0.34218946, -0.40675944, ...,  1.0770398 ,
          0.17444578,  0.3869732 ],
        [ 1.0243182 ,  0.03374228, -1.9451551 , ...,  0.3192826 ,
          1.5060017 ,  1.551663  ],
        ...,
        [-1.433679  , -0.05109433, -0.92366725, ..., -0.4154136 ,
          0.4992351 , -0.11111915],
        [ 0.08867569, -2.5041466 ,  0.80557925, ..., -0.32401872,
          0.9066082 ,  0.9615029 ],
        [ 2.3308926 , -0.59212214,  0.8988739 , ..., -1.6763623 ,
          0.36125395, -0.8192442 ]]], dtype=float32)>

In [15]:
class Decoder(layers.Layer):
    def __init__(self,num_heads,key_dim,ff_dim):
        super().__init__()
        self.attn1=layers.MultiHeadAttention(num_heads,key_dim)
        self.attn2=layers.MultiHeadAttention(num_heads,key_dim)
        self.norm1=layers.LayerNormalization()
        self.norm2=layers.LayerNormalization()
        self.norm3=layers.LayerNormalization()
        self.ffn=keras.Sequential([
            layers.Dense(ff_dim,activation='relu'),
            layers.Dense(embed_dim)
        ])
        self.final=layers.Dense(1000)

    def call(self,inputs,encoder_out):
        attn1=self.attn1(inputs,inputs)
        norm1=self.norm1(attn1+inputs)
        attn2=self.attn2(inputs,encoder_out,encoder_out)
        norm2=self.norm2(attn2+norm1)
        ffn=self.ffn(norm2)
        norm3=self.norm3(ffn+norm2)
        out=self.final(norm3)
        return out
decoder=Decoder(num_heads,key_dim,ff_dim)

In [16]:
output=np.random.random(size=(1,20,64*4))

In [17]:
output.shape

(1, 20, 256)

In [18]:
decoder(input,input)

<tf.Tensor: shape=(1, 20, 1000), dtype=float32, numpy=
array([[[-0.05834087, -0.18001312, -0.06784771, ..., -0.8585679 ,
         -0.35062277, -0.90648115],
        [ 0.48854625, -0.63146025,  1.4996128 , ...,  0.10885578,
         -0.7751333 ,  0.25545335],
        [ 0.25171676, -0.6617282 ,  0.06840431, ..., -0.9687251 ,
          0.40246302,  0.16524458],
        ...,
        [ 0.16209333,  0.54144454,  0.6200061 , ..., -0.27395976,
          0.4443381 ,  1.187688  ],
        [ 0.2857694 ,  1.5800495 , -0.3480262 , ..., -0.49839073,
         -0.6645607 ,  0.02672759],
        [-0.5273811 , -0.94447374,  0.215056  , ...,  0.15271834,
         -0.5571054 , -0.97469854]]], dtype=float32)>

In [20]:
import tensorflow as tf
class Transformer(keras.Model):
    def __init__(self,encoder,decoder):
        super().__init__()
        self.encoder=encoder
        self.decoder=decoder
        
    def call(self,inputs):
        en_out=self.encoder(inputs)
        dec_out=self.decoder(inputs,en_out)
        return dec_out
    
    def train_step(self,data):
        print('in train_step')
        x,y=data
        with tf.GradientTape() as tape:
            y_pred=self(x)
            loss=self.compiled_loss(y,y_pred)
        train_vars=self.trainable_variables
        grads=tape.gradient(loss,train_vars)
        self.optimizer.apply_gradients(zip(grads,train_vars))
        self.compiled_metrics.update_state(y, y_pred)

        return {m.name: m.result() for m in self.metrics}



In [21]:
model=Transformer(encoder,decoder)

In [22]:
model.compile(loss=keras.losses.SparseCategoricalCrossentropy(),optimizer='adam')

In [23]:
model.fit(x,y,epochs=1)

in train_step


```
for metric in self.metrics:
    metric.update_state(y, y_pred)
```

  return self._compiled_metrics_update_state(


in train_step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step - loss: -0.0100


<keras.src.callbacks.history.History at 0x23d4fbfaf90>