<a href="https://colab.research.google.com/github/ashaduzzaman-sarker/Text-classification-Sentiment-Analysis/blob/main/Text_Sentiment_classification_with_Transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Implement a Transformer block as a Keras layer and use it for text classification

## Imports

In [2]:
!pip install --upgrade keras tensorflow

Collecting keras
  Downloading keras-3.4.1-py3-none-any.whl.metadata (5.8 kB)
Collecting tensorflow
  Downloading tensorflow-2.17.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Collecting namex (from keras)
  Downloading namex-0.0.8-py3-none-any.whl.metadata (246 bytes)
Collecting optree (from keras)
  Downloading optree-0.12.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (47 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.8/47.8 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Collecting ml-dtypes (from keras)
  Downloading ml_dtypes-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (20 kB)
Collecting tensorboard<2.18,>=2.17 (from tensorflow)
  Downloading tensorboard-2.17.0-py3-none-any.whl.metadata (1.6 kB)
Downloading keras-3.4.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownload

In [1]:
import keras
from keras import ops
from keras import layers

## Implementing a Transformer block as a layer

![](https://www.researchgate.net/publication/365188912/figure/fig5/AS:11431281095394256@1667877161012/Illustration-of-Transformer-blocks-Vaswani-et-al-2017-the-dashed-rectangle-in-the.ppm)

In [10]:
class TransformerBlock(layers.Layer):
  """Transformer encoder block: self-attention followed by a feed-forward net.

  Each of the two sub-layers is followed by dropout, a residual connection
  and layer normalization (normalization is applied after the residual add).

  Args:
    embed_dim: Size of the last axis of the input. It is also used as the
      per-head `key_dim` of the attention layer and as the output width of
      the feed-forward network, so both residual additions line up.
    num_heads: Number of attention heads.
    ff_dim: Width of the hidden ReLU layer of the feed-forward network.
    rate: Dropout rate applied to the output of each sub-layer.
    **kwargs: Standard `keras.layers.Layer` arguments (`name`, `dtype`,
      `trainable`, ...), forwarded to the base class. Accepting them is what
      allows the layer to be rebuilt from its config when a model is loaded.
  """

  def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
    super().__init__(**kwargs)
    # Keep the constructor arguments so get_config() can report them.
    self.embed_dim = embed_dim
    self.num_heads = num_heads
    self.ff_dim = ff_dim
    self.rate = rate

    self.attn = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
    self.ffn = keras.Sequential(
        [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
    )
    self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
    self.dropout1 = layers.Dropout(rate)
    self.dropout2 = layers.Dropout(rate)

  def call(self, inputs):
    """Applies the block to `inputs` and returns a tensor of the same shape.

    `inputs` is presumably (batch, sequence, embed_dim) -- the residual
    additions below require its last axis to match the sub-layer outputs.
    """
    # Self-attention: the same tensor is used as query and value.
    attn_output = self.attn(inputs, inputs)
    attn_output = self.dropout1(attn_output)
    out1 = self.layernorm1(inputs + attn_output)
    # Feed-forward network with its own residual connection.
    ffn_output = self.ffn(out1)
    ffn_output = self.dropout2(ffn_output)
    return self.layernorm2(out1 + ffn_output)

  def get_config(self):
    """Returns the constructor arguments so the layer can be re-created."""
    config = super().get_config()
    config.update(
        {
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "ff_dim": self.ff_dim,
            "rate": self.rate,
        }
    )
    return config

## Implement embedding layer

The layer uses two separate embedding layers and adds their outputs together:
- one for the tokens
- one for the token indices (positions)

In [9]:
class TokenAndPositionEmbedding(layers.Layer):
  """Sums a learned token embedding with a learned position embedding.

  Args:
    maxlen: Number of rows in the position-embedding table, i.e. the number
      of distinct positions that can be embedded.
    vocab_size: Number of rows in the token-embedding table.
    embed_dim: Width of both embedding tables (and of the output).
    **kwargs: Standard `keras.layers.Layer` arguments (`name`, `dtype`,
      `trainable`, ...), forwarded to the base class. Accepting them is what
      allows the layer to be rebuilt from its config when a model is loaded.
  """

  def __init__(self, maxlen, vocab_size, embed_dim, **kwargs):
    super().__init__(**kwargs)
    # Keep the constructor arguments so get_config() can report them.
    self.maxlen = maxlen
    self.vocab_size = vocab_size
    self.embed_dim = embed_dim

    self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
    self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

  def call(self, x):
    """Embeds the token ids in `x` and adds the embedding of each position.

    Positions are 0 .. (size of the last axis of `x`) - 1; `x` is presumably
    a (batch, sequence) tensor of token ids -- confirm against the caller.
    """
    # Named seq_len (not maxlen) to avoid confusion with the constructor
    # argument: this is the length of the sequences actually passed in.
    seq_len = ops.shape(x)[-1]
    positions = ops.arange(start=0, stop=seq_len, step=1)
    positions = self.pos_emb(positions)
    x = self.token_emb(x)
    # The position embeddings are broadcast across the leading axes of x.
    return x + positions

  def get_config(self):
    """Returns the constructor arguments so the layer can be re-created."""
    config = super().get_config()
    config.update(
        {
            "maxlen": self.maxlen,
            "vocab_size": self.vocab_size,
            "embed_dim": self.embed_dim,
        }
    )
    return config

## Download and prepare IMDB dataset

In [4]:
# Dataset configuration.
vocab_size = 20_000  # Passed as `num_words` when loading the dataset
maxlen = 200  # Sequence length (in tokens) used for every review

# load_data returns two (sequences, labels) pairs; the second pair is used
# as the validation set in this notebook.
(x_train, y_train), (x_val, y_val) = keras.datasets.imdb.load_data(
    num_words=vocab_size
)

print(f"{len(x_train)} Training sequences")
print(f"{len(x_val)} Validation sequences")

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1m17464789/17464789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
25000 Training sequences
25000 Validation sequences


In [7]:
# Bring every review to exactly `maxlen` tokens so the splits become
# rectangular arrays. NOTE(review): per the Keras docs, pad_sequences pads and
# truncates at the front by default, so long reviews keep their last tokens.
x_train, x_val = (
    keras.utils.pad_sequences(sequences, maxlen=maxlen)
    for sequences in (x_train, x_val)
)

## Create classifier model using transformer layer

- **Transformer Layer Output:** Produces one vector per time step in the input sequence.
- **Mean Pooling:** Calculate the mean of all vectors across time steps.
- **Feed Forward Network:** Apply a feed-forward network on the mean vector.
- **Text Classification:** Use the output of the feed-forward network for text classification.

In [11]:
# Model hyperparameters.
embed_dim = 32  # Width of each token/position embedding vector
num_heads = 2  # Attention heads inside the transformer block
ff_dim = 32  # Hidden width of the transformer's feed-forward network

# Encoder: token + position embeddings followed by one transformer block.
inputs = layers.Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(embedding_layer(inputs))

# Classification head: average over the sequence axis, then a small
# dense network ending in a 2-way softmax.
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(20, activation="relu")(x)
x = layers.Dropout(0.1)(x)
outputs = layers.Dense(2, activation="softmax")(x)

model = keras.Model(inputs=inputs, outputs=outputs)

## Train and Evaluate

In [12]:
# The model ends in a 2-unit softmax, and this loss takes integer class ids
# as targets (no one-hot encoding needed).
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

# Keep the returned History object so the per-epoch metrics stay available.
history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=32,
    epochs=10,
    validation_data=(x_val, y_val),
)

Epoch 1/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m123s[0m 151ms/step - accuracy: 0.6992 - loss: 0.5330 - val_accuracy: 0.8433 - val_loss: 0.3408
Epoch 2/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 149ms/step - accuracy: 0.9269 - loss: 0.1960 - val_accuracy: 0.8753 - val_loss: 0.3103
Epoch 3/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m115s[0m 147ms/step - accuracy: 0.9642 - loss: 0.1097 - val_accuracy: 0.8618 - val_loss: 0.3620
Epoch 4/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 148ms/step - accuracy: 0.9781 - loss: 0.0718 - val_accuracy: 0.8554 - val_loss: 0.4355
Epoch 5/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m125s[0m 160ms/step - accuracy: 0.9857 - loss: 0.0474 - val_accuracy: 0.8432 - val_loss: 0.5797
Epoch 6/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 159ms/step - accuracy: 0.9922 - loss: 0.0306 - val_accuracy: 0.8441 - val_loss: 0.6184
Epoc