In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import BatchNormalization, Input, Dense, Dropout, GlobalAveragePooling1D, Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras import backend as K

In [2]:
class PositionEncoding(Model):
    def __init__(self, model_dim):
        super().__init__()
        self.model_dim = model_dim
        
    def call(self, inputs):
        seq_length = inputs.shape[1]
        position_encodings = np.zeros((seq_length, self.model_dim))
        for pos in range(seq_length):
            for i in range(self.model_dim):
                position_encodings[pos, i] = pos / np.power(10000, (i-i%2) / self.model_dim)
        position_encodings[:, 0::2] = np.sin(position_encodings[:, 0::2])
        position_encodings[:, 1::2] = np.cos(position_encodings[:, 1::2])
        position_encodings = tf.expand_dims(tf.cast(position_encodings, tf.float32), axis=0)
        return position_encodings

In [3]:
class ScaledDotProductAttention(Model):
    def __init__(self, masking=True, future=False, dropout=0):
        super().__init__()
        self.masking = masking
        self.future = future
        self.dropout = dropout
        self.masking_num = -2**32+1
        
        
    def mask(self, inputs, masks):
        masks = tf.cast(masks, tf.float32)
        masks = tf.tile(masks, [tf.shape(inputs)[0] // tf.shape(masks)[0], 1])
        masks = tf.expand_dims(masks, axis=1)
        outputs = inputs + masks * self.masking_num
        return outputs
    
    def future_mask(self, inputs):
        diag_vals = tf.ones_like(inputs[0])
        tril = tf.linalg.LinearOperatorLowerTriangular(diag_vals).to_dense()
        future_masks = tf.tile(tf.expand_dims(tril, 0), [tf.shape(inputs)[0], 1, 1])
        paddings = tf.ones_like(future_masks) * self.masking_num
        outputs = tf.where(tf.equal(future_masks, 0), paddings, inputs)
        return outputs
        
    def call(self, inputs):
        if self.masking:
            tf.assert_equal(len(inputs), 4)
            queries, keys, values, masks = inputs
        else:
            tf.assert_equal(len(inputs), 3)
            queries, keys, values = inputs
        
        # dtype
        if queries.dtype != tf.float32:
            queries = tf.cast(queries, tf.float32)
        if keys.dtype != tf.float32:
            keys = tf.cast(keys, tf.float32)
        if values.dtype != tf.float32:
            values = tf.cast(values, tf.float32)
        
        matmul = tf.matmul(queries, tf.transpose(keys, [0, 2, 1]))
        scaled_matmul = matmul / tf.sqrt(tf.cast(queries.shape[-1], tf.float32))
        if self.masking:
            scaled_matmul = self.mask(scaled_matmul, masks)
        if self.future:
            scaled_matmul = self.future_mask(scaled_matmul)
        
        softmax_out = tf.nn.softmax(scaled_matmul)
        
        out = tf.nn.dropout(softmax_out, self.dropout)
        outputs = tf.matmul(out, values)
        
        return outputs    

In [4]:
class MultiHeadAttention(Model):
    def __init__(self, n_heads, head_dim, dropout=.1, masking=True, 
                 future=False, trainable=True):
        super().__init__()
        self.n_heads = n_heads
        self.head_dim = head_dim
        self.dropout = dropout
        self.masking = masking
        self.future = future
        self.trainable = trainable
        self.q_dense = Dense(head_dim*n_heads)
        self.k_dense = Dense(head_dim*n_heads)
        self.v_dense = Dense(head_dim*n_heads)
        
        
    
    def call(self, inputs):
        if self.masking:
            tf.assert_equal(len(inputs), 4)
            queries, keys, values, masks = inputs
        else:
            tf.assert_equal(len(inputs), 3)
            queries, keys, values = inputs
        
        queries_linear = self.q_dense(queries)
        keys_linear = self.k_dense(keys)
        values_linear = self.v_dense(values)
        
        queries_multi_heads = tf.concat(tf.split(queries_linear, self.n_heads, axis=2), axis=0)
        keys_multi_heads = tf.concat(tf.split(keys_linear, self.n_heads, axis=2), axis=0)
        values_multi_heads = tf.concat(tf.split(values_linear, self.n_heads, axis=2), axis=0)
        
        if self.masking:
            att_inputs = [queries_multi_heads, keys_multi_heads, values_multi_heads, masks]
        else:
            att_inputs = [queries_multi_heads, keys_multi_heads, values_multi_heads]
        
        attention = ScaledDotProductAttention(masking=self.masking, future=self.future, 
                                             dropout=self.dropout)
        att_out = attention(att_inputs)
#         print(att_out.shape)
        
        outputs = tf.concat(tf.split(att_out, self.n_heads, axis=0), axis=2)
#         print(outputs.shape)
        
        return outputs

In [16]:
vocab_size = 5000
max_len = 256
model_dim = 512
batch_size = 64
epochs = 10

(x_train, y_train), (x_test, y_test) = imdb.load_data(maxlen=max_len, num_words=vocab_size)
x_train = sequence.pad_sequences(x_train, maxlen=max_len)
x_test = sequence.pad_sequences(x_test, maxlen=max_len)
x_train_masks = tf.equal(x_train, 0)
x_test_masks = tf.equal(x_test, 0)
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

inputs = Input(shape=(max_len,), name='inputs')
masks = Input(shape=(max_len,), name='masks')
embeddings = Embedding(vocab_size, model_dim)(inputs)
encodings = PositionEncoding(model_dim)(embeddings)
print(embeddings.shape, encodings.shape)
encodings = tf.keras.layers.add([embeddings, encodings])
x = MultiHeadAttention(8, 64)([encodings, encodings, encodings, masks])
x = GlobalAveragePooling1D(data_format='channels_last')(x)
x = Dropout(0.2)(x)
x = Dense(10, activation='relu')(x)
outputs = Dense(2, activation='softmax')(x)
model = Model(inputs=[inputs, masks], outputs=outputs)

(None, 256, 512) (1, 256, 512)


In [6]:
# plot_model(model)

In [7]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
inputs (InputLayer)             [(None, 256)]        0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 256, 512)     2560000     inputs[0][0]                     
__________________________________________________________________________________________________
position_encoding (PositionEnco (1, 256, 512)        0           embedding[0][0]                  
__________________________________________________________________________________________________
add (Add)                       (None, 256, 512)     0           embedding[0][0]                  
                                                                 position_encoding[0][0]      

In [8]:
model.compile(optimizer=Adam(beta_1=0.9, beta_2=0.98, epsilon=1e-9), 
    loss='categorical_crossentropy', metrics=['accuracy'])

print("Model Training ... ")
es = EarlyStopping(patience=5)
model.fit([x_train, x_train_masks], y_train, 
          batch_size=batch_size, epochs=epochs, 
          validation_split=0.2, callbacks=[es])

test_metrics = model.evaluate([x_test, x_test_masks], y_test, batch_size=batch_size, verbose=0)
print("loss on Test: %.4f" % test_metrics[0])
print("accu on Test: %.4f" % test_metrics[1])

Model Training ... 
Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
loss on Test: 0.6935
accu on Test: 0.4936


In [17]:
class PositionWiseFeedForward(Model):
    def __init__(self, model_dim, inner_dim, trainable=True):
        super().__init__()
        self.model_dim = model_dim
        self.inner_dim = inner_dim
        self.trainable = trainable
        self.dense0 = Dense(inner_dim, activation='relu')
        self.dense1 = Dense(model_dim)
        
    def call(self, inputs):
        inner_out = self.dense0(inputs)
        outputs = self.dense1(inner_out)
        return outputs

In [18]:
class LayerNormalization(Model):
    def __init__(self, epsilon=1e-8):
        super().__init__()
        self.epsilon = epsilon
        
    def call(self, inputs):
        outputs = BatchNormalization(epsilon=self.epsilon)(inputs)
        return outputs

In [78]:
class Transformer(Model):
    def __init__(self, vocab_size, model_dim, n_heads=8, encoder_stack=6, 
                decoder_stack=6, feed_forward_size=2048, dropout=0.1):
        super().__init__()
        self.vocab_size = vocab_size
        self.model_dim = model_dim
        self.n_heads = n_heads
        self.encoder_stack = encoder_stack
        self.decoder_stack = decoder_stack
        self.feed_forward_size = feed_forward_size
        self.dropout = dropout
        self.embedding_layer = Embedding(vocab_size, model_dim)
        
    def encoder(self, inputs):
        masks = tf.equal(inputs, 0)
        embeddings = self.embedding_layer(inputs)
        position_encodings = PositionEncoding(self.model_dim)(embeddings)
        position_encodings = tf.tile(position_encodings, [tf.shape(inputs)[0], 1, 1])
        print(embeddings.shape, position_encodings.shape)
        encodings = tf.keras.layers.add([embeddings, position_encodings])
        encodings = tf.nn.dropout(encodings, self.dropout)
        
        for i in range(self.encoder_stack):
            attention = MultiHeadAttention(self.n_heads, 
                                           self.model_dim // self.n_heads)
            attention_input = [encodings, encodings, encodings, masks]
            attention_out = attention(attention_input)
            attention_out = tf.keras.layers.add([attention_out, encodings])
            attention_out = LayerNormalization()(attention_out)
#             print(attention_out.shape)
            ff = PositionWiseFeedForward(self.model_dim, self.feed_forward_size)
            ff_out = ff(attention_out)
#             print(ff_out.shape)
            ff_out = tf.keras.layers.add([ff_out, attention_out])
            encodings = LayerNormalization()((ff_out))
        
        return encodings, masks
        
    def decoder(self, inputs):
        decoder_inputs, encoder_encodings, encoder_masks = inputs
        
        decoder_masks = tf.equal(decoder_inputs, 0)
        embeddings = self.embedding_layer(decoder_inputs)
        position_encodings = PositionEncoding(self.model_dim)(embeddings)
        encodings = embeddings + position_encodings
        encodings = tf.nn.dropout(encodings, self.dropout)
        
        for i in range(self.decoder_stack):
            masked_attention = MultiHeadAttention(self.n_heads, 
                                                  self.model_dim // self.n_heads, 
                                                 future=True)
            masked_attention_input = [encodings, encodings, encodings, decoder_masks]
            masked_attention_out = masked_attention(masked_attention_input)
            
            masked_attention_out = tf.keras.layers.add([masked_attention_out, encodings])
            masked_attention_out = LayerNormalization()(masked_attention_out)
            
            attention = MultiHeadAttention(self.n_heads, 
                                          self.model_dim // self.n_heads)
            attention_input = [masked_attention_out, encoder_encodings, 
                               encoder_encodings, encoder_masks]
            attention_out = attention(attention_input)
            attention_out = tf.keras.layers.add([attention_out, masked_attention_out])
            attention_out = LayerNormalization()(attention_out)
            
            ff = PositionWiseFeedForward(self.model_dim, self.feed_forward_size)
            ff_out = ff(attention_out)
            ff_out = tf.keras.layers.add([ff_out, attention_out])
            encodings = LayerNormalization()(ff_out)
            
        
        linear_projection = tf.matmul(encodings, tf.transpose(self.embedding_layer.weights, perm=[0, 2, 1]))
        outputs = tf.nn.softmax(linear_projection)
        return outputs
    
    def call(self, inputs):
        encoder_inputs, decoder_inputs = inputs
        encoder_encodings, encoder_masks = self.encoder(encoder_inputs)
        decoder_outputs = self.decoder([decoder_inputs, encoder_encodings, 
                                       encoder_masks])
        return decoder_outputs

In [83]:
vocab_size = 5000
max_len = 256
model_dim = 512
batch_size = 32
epochs = 10

(x_train, y_train), (x_test, y_test) = imdb.load_data(maxlen=max_len, num_words=vocab_size)
x_train = sequence.pad_sequences(x_train, maxlen=max_len)
x_test = sequence.pad_sequences(x_test, maxlen=max_len)
x_train_masks = tf.equal(x_train, 0)
x_test_masks = tf.equal(x_test, 0)
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

encoder_inputs = Input(shape=(max_len,), name='encoder_inputs')
decoder_inputs = Input(shape=(max_len,), name='decoder_inputs')
outputs = Transformer(vocab_size, model_dim)([encoder_inputs, decoder_inputs])
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=outputs)

# outputs = GlobalAveragePooling1D(data_format='channels_first')(outputs)
# outputs = Dropout(0.5)(outputs)
# outputs = Dense(128, activation='relu')(outputs)
# outputs = Dense(2, activation='softmax')(outputs)
# model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=outputs)

(None, 256, 512) (None, 256, 512)


In [80]:
model.summary()

Model: "model_15"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_inputs (InputLayer)     [(None, 256)]        0                                            
__________________________________________________________________________________________________
decoder_inputs (InputLayer)     [(None, 256)]        0                                            
__________________________________________________________________________________________________
transformer_20 (Transformer)    (None, 256, 5000)    2560000     encoder_inputs[0][0]             
                                                                 decoder_inputs[0][0]             
__________________________________________________________________________________________________
global_average_pooling1d_15 (Gl (None, 256)          0           transformer_20[0][0]      

In [82]:
# model.compile(optimizer=Adam(beta_1=0.9, beta_2=0.98, epsilon=1e-9), 
#     loss='categorical_crossentropy', metrics=['accuracy'])

# print("Model Training ... ")
# es = EarlyStopping(patience=5)
# model.fit([x_train, x_train_masks], y_train, 
#           batch_size=batch_size, epochs=epochs, 
#           validation_split=0.2, callbacks=[es])

# test_metrics = model.evaluate([x_test, x_test_masks], y_test, batch_size=batch_size, verbose=0)
# print("loss on Test: %.4f" % test_metrics[0])
# print("accu on Test: %.4f" % test_metrics[1])