In [13]:
import tensorflow as tf
from tensorflow.keras import layers
import keras

# Hyperparameters
num_heads = 4
key_dim = 64
ff_dim = 256
input_vocab_size = 1000
target_vocab_size = 1000
max_len = 50
embed_dim = num_heads * key_dim  # Must match num_heads * key_dim

In [26]:
class Encoder(layers.Layer):
    def __init__(self,num_heads,key_dim,ff_dim):
        super().__init__()
        self.attn=layers.MultiHeadAttention(num_heads,key_dim)
        self.ffn=keras.Sequential([
            layers.Dense(ff_dim,activation='relu'),
            layers.Dense(embed_dim)
        ])
        self.norm1=layers.LayerNormalization()
        self.norm2=layers.LayerNormalization()
        self.dropout1 = layers.Dropout(0.1)
        self.dropout2 = layers.Dropout(0.1)
    
    def call(self,inputs):
        attn_out=self.attn(inputs,inputs)
        x1=self.norm1(attn_out+inputs)
        x2=self.ffn(x1)
        out=self.norm2(x1+x2)
        return out
encoder=Encoder(num_heads,key_dim,ff_dim)

In [49]:
x=np.random.random(size=(20,80,256))
y=np.random.randint(0,1000,size=(20,80))

In [34]:
input=np.random.random(size=(1,20,64*4))
encoder(input)

<tf.Tensor: shape=(1, 20, 256), dtype=float32, numpy=
array([[[-0.649084  ,  1.0761093 , -0.52660125, ...,  0.26298004,
          0.06401635, -1.8109443 ],
        [ 0.1479908 ,  0.89725554,  0.4548781 , ..., -0.49070308,
         -0.7247118 , -1.5038882 ],
        [-0.18679012, -0.61070246,  0.5352549 , ..., -0.34154418,
         -0.21008793,  0.84406525],
        ...,
        [ 0.35534573,  0.05886101, -0.12305738, ..., -0.6793685 ,
         -0.33180696,  0.18863481],
        [ 0.56061625, -1.1070285 , -0.2135569 , ...,  0.3142345 ,
          0.03185473,  0.05495362],
        [-0.8543104 ,  0.376284  ,  0.585863  , ..., -2.0291612 ,
          0.187307  , -0.5297316 ]]], dtype=float32)>

In [59]:
class Decoder(layers.Layer):
    def __init__(self,num_heads,key_dim,ff_dim):
        super().__init__()
        self.attn1=layers.MultiHeadAttention(num_heads,key_dim)
        self.attn2=layers.MultiHeadAttention(num_heads,key_dim)
        self.norm1=layers.LayerNormalization()
        self.norm2=layers.LayerNormalization()
        self.norm3=layers.LayerNormalization()
        self.ffn=keras.Sequential([
            layers.Dense(ff_dim,activation='relu'),
            layers.Dense(embed_dim)
        ])
        self.final=layers.Dense(1000)

    def call(self,inputs,encoder_out):
        attn1=self.attn1(inputs,inputs)
        norm1=self.norm1(attn1+inputs)
        attn2=self.attn2(inputs,encoder_out,encoder_out)
        norm2=self.norm2(attn2+norm1)
        ffn=self.ffn(norm2)
        norm3=self.norm3(ffn+norm2)
        out=self.final(norm3)
        return out
decoder=Decoder(num_heads,key_dim,ff_dim)

In [42]:
output=np.random.random(size=(1,20,64*4))

In [44]:
output.shape

(1, 20, 256)

In [60]:
decoder(input,input)

<tf.Tensor: shape=(1, 20, 1000), dtype=float32, numpy=
array([[[ 0.46030194,  0.2590694 ,  1.1781337 , ...,  0.67232203,
          0.21596876, -0.88003135],
        [ 0.48037672,  0.1798131 ,  0.53229946, ..., -1.506984  ,
          1.6327589 ,  0.08765519],
        [-0.34970826, -0.17879637,  0.86428535, ...,  0.19337237,
         -0.76814914,  0.457581  ],
        ...,
        [ 0.6966606 , -0.62689054,  0.76753   , ...,  0.914107  ,
          0.66620576, -0.67119473],
        [-0.42851877,  0.5160184 ,  0.3727733 , ..., -0.84890187,
          1.3006573 ,  0.10884823],
        [ 0.47528273, -0.12156303, -0.06013846, ..., -0.832541  ,
         -0.06490389, -0.61822474]]], dtype=float32)>

In [61]:
import tensorflow as tf
class Transformer(keras.Model):
    def __init__(self,encoder,decoder):
        super().__init__()
        self.encoder=encoder
        self.decoder=decoder
        
    def call(self,inputs):
        en_out=self.encoder(inputs)
        dec_out=self.decoder(inputs,en_out)
        return dec_out
    
    def train_step(self,data):
        x,y=data
        with tf.GradientTape() as tape:
            y_pred=self(x)
            loss=self.compiled_loss(y,y_pred)
        train_vars=self.trainable_variables
        grads=tape.gradient(loss,train_vars)
        self.optimizer.apply_gradients(zip(grads,train_vars))
        self.compiled_metrics.update_state(y, y_pred)

        return {m.name: m.result() for m in self.metrics}



In [62]:
model=Transformer(encoder,decoder)

In [63]:
model.compile(loss=keras.losses.SparseCategoricalCrossentropy(),optimizer='adam')

In [66]:
model.fit(x,y,epochs=20)

Epoch 1/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 121ms/step - loss: -0.4207
Epoch 2/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 111ms/step - loss: -0.4427
Epoch 3/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 110ms/step - loss: -0.4638
Epoch 4/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 111ms/step - loss: -0.4790
Epoch 5/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 114ms/step - loss: -0.4929
Epoch 6/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 112ms/step - loss: -0.4933
Epoch 7/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 114ms/step - loss: -0.4929
Epoch 8/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 117ms/step - loss: -0.4906
Epoch 9/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 122ms/step - loss: -0.4973
Epoch 10/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 111ms/step - loss: -0.5073

<keras.src.callbacks.history.History at 0x18adc1be5d0>