# Text classification with Transformer

**Author:** [Apoorv Nandan](https://twitter.com/NandanApoorv)<br>
**Date created:** 2020/05/10<br>
**Last modified:** 2020/05/10<br>
**Description:** Implement a Transformer block as a Keras layer and use it for text classification.

## Setup

In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

## Implement a Transformer block as a layer

In [3]:

class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)


## Implement embedding layer

Two seperate embedding layers, one for tokens, one for token index (positions).

In [4]:

class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super(TokenAndPositionEmbedding, self).__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions


## Download and prepare dataset

In [11]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [12]:
!cp -r /content/drive/MyDrive/T2T/Data /content/T2T

In [13]:
!pip install tensor2tensor

Collecting tensor2tensor
[?25l  Downloading https://files.pythonhosted.org/packages/d6/7c/9e87d30cefad5cbc390bb7f626efb3ded9b19416b8160f1a1278da81b218/tensor2tensor-1.15.7-py2.py3-none-any.whl (1.4MB)
[K     |▎                               | 10kB 14.8MB/s eta 0:00:01[K     |▌                               | 20kB 20.6MB/s eta 0:00:01[K     |▊                               | 30kB 14.2MB/s eta 0:00:01[K     |█                               | 40kB 10.5MB/s eta 0:00:01[K     |█▏                              | 51kB 8.5MB/s eta 0:00:01[K     |█▍                              | 61kB 7.8MB/s eta 0:00:01[K     |█▋                              | 71kB 8.6MB/s eta 0:00:01[K     |█▉                              | 81kB 8.5MB/s eta 0:00:01[K     |██                              | 92kB 8.9MB/s eta 0:00:01[K     |██▎                             | 102kB 8.6MB/s eta 0:00:01[K     |██▌                             | 112kB 8.6MB/s eta 0:00:01[K     |██▊                             | 

In [22]:
vocab_size = 1800  # Only consider the top 20k words
maxlen = 200  # Only consider the first x words of each sample  -- What should I set this to?


training_set = ["/content/T2T/EFC401_B4.tfrecord", "/content/T2T/EFC401_B6.tfrecord", "/content/T2T/EFC401_B8.tfrecord", "/content/T2T/EFC401_B12.tfrecord", "/content/T2T/EFC401_B17.tfrecord", "/content/T2T/EFC401_B18.tfrecord", "/content/T2T/EFC401_B20.tfrecord", "/content/T2T/EFC401_B32.tfrecord", "/content/T2T/EFC401_B34.tfrecord", "/content/T2T/EFC401_B41.tfrecord", "/content/T2T/EFC401_B57.tfrecord", "/content/T2T/EFC401_B61.tfrecord", "/content/T2T/EFC401_B66.tfrecord", "/content/T2T/EFC401_B69.tfrecord","/content/T2T/EFC401_B73.tfrecord", "/content/T2T/EFC401_B77.tfrecord"]
testing_set = ["/content/T2T/EFC401_B87.tfrecord"]   
validation_set = ["/content/T2T/EFC401_B83.tfrecord"]
train_dataset, test_dataset, validation_dataset = tf.data.TFRecordDataset(training_set), tf.data.TFRecordDataset(testing_set), tf.data.TFRecordDataset(validation_set)
print(train_dataset)

#train_dataset, test_dataset, validation_dataset = tf.data.TFRecordDataset(training_set), tf.data.TFRecordDataset(testing_set), tf.data.TFRecordDataset(validation_set)

#(x_train, y_train), (x_val, y_val) = keras.datasets.imdb.load_data(num_words=vocab_size)
#print(len(train_dataset), "Training sequences") -- error, dataset length is unknown/infinite.  Can I just ignore this...
#print(len(validation_dataset), "Validation sequences")
train_dataset = keras.preprocessing.sequence.pad_sequences(training_set)
validation_dataset = keras.preprocessing.sequence.pad_sequences(validation_dataset)

<TFRecordDatasetV2 shapes: (), types: tf.string>


ValueError: ignored

## Create classifier model using transformer layer

Transformer layer outputs one vector for each time step of our input sequence.
Here, we take the mean across all time steps and
use a feed forward network on top of it to classify text.

In [None]:

embed_dim = 32  # Embedding size for each token
num_heads = 2  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed forward network inside transformer

inputs = layers.Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(20, activation="relu")(x)
x = layers.Dropout(0.1)(x)
outputs = layers.Dense(2, activation="softmax")(x)

model = keras.Model(inputs=inputs, outputs=outputs)


## Train and Evaluate

In [None]:
model.compile("adam", "sparse_categorical_crossentropy", metrics=["accuracy"])
history = model.fit(
    x_train, y_train, batch_size=32, epochs=2, validation_data=(x_val, y_val)
)