In [1]:
import sys
import os
sys.path.append(os.getcwd())
import time
import random
import re
import json
import pickle
from typing import List, Tuple, Dict, Callable, Optional, Any, Sequence, Mapping, NamedTuple

In [2]:
import tensorflow as tf
import numpy as np
import matplotlib as plt

In [3]:
from utils.config import Config
from data_loader.ptb_datasource import PTBDataSource

In [6]:
# Build the hyper-parameter container with its default values and echo it,
# so the settings used for this run are recorded in the cell output.
config = Config()
print(config)

Config(num_units=512, num_layers=6, num_heads=8, batch_size=128, max_length=50, dropout_in_rate=0.1, dropout_out_rate=0.2, learning_rate=0.001, grad_clip=5.0, is_layer_norm=False, data_path='./data/', log_dir='./logs/')


In [7]:
# Create the PTB data source from the config; its vocab_size is printed here
# so the vocabulary size in use is visible in the cell output.
data = PTBDataSource(config)
print(data.vocab_size)

10000


In [None]:
class TransformerEncoder:
    """Transformer encoder that pools a token sequence into one class distribution.

    The graph embeds the input token ids, adds sinusoidal positional encodings
    and runs ``num_blocks`` encoder blocks.  Each block applies self-attention
    and a feed-forward network to the token representations, and lets a single
    learned query vector attend over the tokens.  The final state of that
    learned query is projected with a softmax layer to ``num_outputs`` classes.

    Token id 0 is treated as padding, matching the zeroed row 0 of the
    embedding table built in ``_embedding``.

    Attributes created on construction:
        is_training:     scalar bool placeholder (switches dropout in ``_dense``).
        inputs_data:     int32 placeholder, [batch_size, max_length] token ids.
        targets_classes: int32 placeholder, [batch_size, vocab_size].
        outputs_prob:    softmax probabilities, [batch_size, num_outputs].
        predicted_class: argmax of ``outputs_prob`` over the last axis.
    """

    def __init__(self, config, vocab_size, reuse=None):
        """Build the placeholders and the model graph.

        Args:
            config: object exposing ``num_units``, ``num_layers`` and
                ``num_heads``; an optional ``num_inner_units`` overrides the
                feed-forward inner width (default ``4 * num_units``).
            vocab_size: number of distinct token ids; also used as the number
                of output classes.
            reuse: forwarded to the top-level ``tf.variable_scope``.
        """
        self._config = config
        self.vocab_size = vocab_size

        # Hyper-parameters read by the graph-building methods below.
        self.num_units = config.num_units
        self.num_heads = config.num_heads
        self.num_blocks = config.num_layers
        self.num_inner_units = getattr(config, 'num_inner_units', 4 * self.num_units)
        # targets_classes is [batch_size, vocab_size], so there is one output per token id.
        self.num_outputs = vocab_size

        self._create_placeholder()
        self._create_model(reuse)

    def _create_placeholder(self):
        """Create the feed placeholders used by the model."""
        self.is_training = tf.placeholder(shape=(), dtype=tf.bool, name='is_training')
        self.inputs_data = tf.placeholder(shape=[None, None], name='inputs_data', dtype=tf.int32)  # batch_size x max_length
        self.targets_classes = tf.placeholder(shape=[None, self.vocab_size], name='targets_data', dtype=tf.int32)  # batch_size x num_outputs

    def _create_model(self, reuse):
        """Wire embedding -> positional encoding -> encoder -> softmax output."""
        with tf.variable_scope('transformer', reuse=reuse):
            inputs = self.inputs_data
            # 1.0 for real tokens, 0.0 for padding (token id 0); reused by every attention layer.
            key_masks = tf.to_float(tf.not_equal(inputs, 0))  # [batch_size, max_length]
            embedded_inputs = self._embedding(inputs)  # [batch_size, max_length, num_units]
            embedded_inputs += self._positional_encoding(inputs)  # [batch_size, max_length, num_units]
            encoded_query = self._encode(embedded_inputs, key_masks=key_masks)  # [batch_size, num_units]
            self.outputs_prob = self._dense(encoded_query, self.num_outputs, 'outputs', tf.nn.softmax)
            self.predicted_class = tf.argmax(self.outputs_prob, axis=-1)

    def _encode(self,
                inputs,  # [batch_size, max_length, num_units]
                scope: str='encoder',
                key_masks=None):
        """Run the encoder blocks and return the pooled sequence encoding.

        Args:
            inputs: token representations, [batch_size, max_length, num_units].
            scope: variable scope name.
            key_masks: optional [batch_size, max_length] mask, non-zero for
                real tokens.  When omitted, each attention layer falls back to
                detecting all-zero key vectors.

        Returns:
            Tensor [batch_size, num_units]: the learned query after it has
            attended over the token representations in every block.
        """
        with tf.variable_scope(scope):
            queries = inputs
            # A single learned query vector, shared by all examples.
            encoded_queries = tf.get_variable('encoded_queries',
                                              dtype=tf.float32,
                                              shape=[1, self.num_units],
                                              initializer=tf.contrib.layers.xavier_initializer())
            # Replicate it per example: [batch_size, 1, num_units].
            encoded_queries = tf.tile(tf.expand_dims(encoded_queries, 0), [tf.shape(inputs)[0], 1, 1])
            for i in range(self.num_blocks):
                with tf.variable_scope('block_{}'.format(i)):
                    # Token self-attention with residual connection.
                    original_queries = queries
                    queries = self._multihead_attention(
                        keys=queries,
                        queries=queries,
                        num_units=self.num_units,
                        num_heads=self.num_heads,
                        causality=False,
                        scope='self_attention',  # distinct scope: two attentions live in this block
                        reuse=None,
                        key_masks=key_masks
                    )
                    queries += original_queries
                    queries = self._normalize(queries, scope='mh_normalize')

                    # The learned query attends over the tokens (pooling), with residual connection.
                    original_queries = encoded_queries
                    encoded_queries = self._multihead_attention(
                        keys=queries,
                        queries=encoded_queries,
                        num_units=self.num_units,
                        num_heads=self.num_heads,
                        causality=False,
                        scope='pooling_attention',
                        reuse=None,
                        key_masks=key_masks
                    )
                    encoded_queries += original_queries
                    encoded_queries = self._normalize(encoded_queries, scope='pool_normalize')

                    # Position-wise feed-forward on the tokens, with residual connection.
                    original_queries = queries
                    queries = self._feedforward(queries, [self.num_inner_units, self.num_units])
                    queries += original_queries
                    queries = self._normalize(queries, scope='ff_normalize')
        # Drop the singleton query axis: [batch_size, 1, num_units] -> [batch_size, num_units].
        return tf.squeeze(encoded_queries, axis=1)

    def _multihead_attention(self,
                             keys,  # [batch_size, key_length, embedded_size]
                             queries,  # [batch_size, query_length, embedded_size]
                             num_units: int,
                             num_heads: int=8,
                             causality: bool=False,
                             scope: str='multihead_attention',
                             reuse: Optional[bool]=None,
                             key_masks=None):
        """Scaled dot-product multi-head attention.

        Args:
            keys: tensor the keys and values are projected from.
            queries: tensor the queries are projected from.
            num_units: total projection width; must be divisible by ``num_heads``.
            num_heads: number of attention heads.
            causality: when True, a query position may only attend to key
                positions at or before it.
            scope: variable scope name.
            reuse: forwarded to ``tf.variable_scope``.
            key_masks: optional [batch_size, key_length] tensor, non-zero for
                keys that may be attended to.  When None, keys whose input
                vector sums to zero are treated as padding.

        Returns:
            Tensor [batch_size, query_length, num_units].

        Raises:
            ValueError: if ``num_units`` is not divisible by ``num_heads``.
        """
        if num_units % num_heads != 0:
            raise ValueError('num_units ({}) must be divisible by num_heads ({})'.format(num_units, num_heads))
        with tf.variable_scope(scope, reuse=reuse):
            num_heads_units = num_units / num_heads

            # Derive the mask from the raw keys, before the biased projection makes every row non-zero.
            if key_masks is None:
                key_masks = tf.sign(tf.abs(tf.reduce_sum(keys, axis=-1)))  # [batch_size, key_length]
            key_masks = tf.cast(key_masks, tf.float32)

            # Keys and values are both projected from the raw key input.
            projected_keys = self._dense(keys, num_units, 'keys')  # [batch_size, key_length, num_units]
            projected_values = self._dense(keys, num_units, 'values')
            projected_queries = self._dense(queries, num_units, 'queries')

            mh_keys = tf.concat(tf.split(projected_keys, num_heads, axis=2), axis=0)  # [batch_size*num_heads, key_length, num_units/num_heads]
            mh_values = tf.concat(tf.split(projected_values, num_heads, axis=2), axis=0)
            mh_queries = tf.concat(tf.split(projected_queries, num_heads, axis=2), axis=0)

            key_masks = tf.tile(key_masks, [num_heads, 1])  # [batch_size*num_heads, key_length]
            key_masks = tf.tile(tf.expand_dims(key_masks, 1), [1, tf.shape(queries)[1], 1])  # [batch_size*num_heads, query_length, key_length]

            # Q . K^T: swap the last two key axes so the head dimension is contracted.
            outputs = tf.matmul(mh_queries, tf.transpose(mh_keys, [0, 2, 1]))  # [batch_size*num_heads, query_length, key_length]
            outputs = outputs / (num_heads_units)**0.5

            # Large negative logits vanish under the softmax.
            paddings = tf.ones_like(outputs)*(-2**32+1)
            outputs = tf.where(tf.equal(key_masks, 0), paddings, outputs)

            if causality:
                # Lower-triangular ones: query i sees keys 0..i only.
                tril = tf.matrix_band_part(tf.ones_like(outputs), -1, 0)
                outputs = tf.where(tf.equal(tril, 0), paddings, outputs)

            outputs = tf.nn.softmax(outputs)
            outputs = tf.matmul(outputs, mh_values)  # [batch_size*num_heads, query_length, num_units/num_heads]

            outputs = tf.concat(tf.split(outputs, num_heads, axis=0), axis=2)  # [batch_size, query_length, num_units]

            outputs = self._dense(outputs, num_units, 'output')
        return outputs

    def _feedforward(self,
                     inputs,
                     num_units: List[int],  # output width of each successive layer
                     scope: str='feedforward',
                     reuse: Optional[bool]=None):
        """Position-wise feed-forward stack.

        Every layer but the last uses ReLU; the last layer is linear so the
        residual update added by the caller is not restricted to
        non-negative values.

        Args:
            inputs: tensor whose last axis is the feature axis.
            num_units: output width of each layer, in order.
            scope: variable scope name.
            reuse: forwarded to ``tf.variable_scope``.

        Returns:
            Tensor with last axis of size ``num_units[-1]`` (``inputs``
            unchanged when ``num_units`` is empty).
        """
        with tf.variable_scope(scope, reuse=reuse):
            layer = inputs
            last_index = len(num_units) - 1
            for (i, units) in enumerate(num_units):
                activation = tf.nn.relu if i < last_index else None
                layer = self._dense(layer, units, 'dense_{}'.format(i), activation)
        return layer

    def _positional_encoding(self,
                             inputs,
                             is_zero_pad: bool=True,
                             scope: str='positional_encoding'):
        """Sinusoidal positional encodings for a batch of token ids.

        Channel ``2i`` holds ``sin(pos / 10000^(2i/num_units))`` and channel
        ``2i+1`` holds the matching cosine.

        Args:
            inputs: int tensor of token ids, [batch_size, max_length].
            is_zero_pad: when True, positions holding token id 0 (padding)
                get an all-zero encoding.
            scope: name scope for the created ops.

        Returns:
            float32 tensor [batch_size, max_length, num_units].
        """
        with tf.name_scope(scope):
            channels = np.arange(self.num_units)
            # Each (even, odd) channel pair shares one frequency.
            inv_freq = (1.0 / np.power(10000.0, 2.0 * (channels // 2) / self.num_units)).astype(np.float32)
            # cos(x) == sin(x + pi/2): shifting odd channels avoids interleaving two tensors.
            phase = ((channels % 2) * (np.pi / 2.0)).astype(np.float32)

            positions = tf.to_float(tf.range(tf.shape(inputs)[1]))  # [max_length]
            encoding = tf.sin(tf.expand_dims(positions, 1) * inv_freq + phase)  # [max_length, num_units]
            outputs = tf.tile(tf.expand_dims(encoding, 0), [tf.shape(inputs)[0], 1, 1])
            if is_zero_pad:
                not_padding = tf.to_float(tf.not_equal(inputs, 0))  # [batch_size, max_length]
                outputs = outputs * tf.expand_dims(not_padding, -1)
        return outputs

    def _embedding(self,
                   inputs,
                   is_zero_pad: bool=True,
                   is_scale: bool=True,
                   scope: str='embedding',
                   reuse: Optional[bool]=None):
        """Look up token embeddings.

        Args:
            inputs: int tensor of token ids.
            is_zero_pad: when True, token id 0 always maps to a zero vector.
            is_scale: when True, embeddings are multiplied by sqrt(num_units).
            scope: variable scope name.
            reuse: forwarded to ``tf.variable_scope``.

        Returns:
            float32 tensor of shape ``inputs.shape + [num_units]``.
        """
        with tf.variable_scope(scope, reuse=reuse):
            lookup_table = tf.get_variable(
                'lookup_table',
                shape=[self.vocab_size, self.num_units],
                dtype=tf.float32,
                initializer=tf.contrib.layers.xavier_initializer()
            )
            if is_zero_pad:
                # Replace row 0 with constant zeros so padding never contributes.
                lookup_table = tf.concat((tf.zeros(shape=[1, self.num_units]), lookup_table[1:, :]), 0)
            embedded = tf.nn.embedding_lookup(lookup_table, inputs)
            if is_scale:
                embedded = embedded * self.num_units ** 0.5
        return embedded

    def _normalize(self,
                   inputs,
                   epsilon=1e-8,
                   scope='normalize',
                   reuse: Optional[bool]=None):
        """Layer normalisation over the last axis with learned gain and bias.

        Args:
            inputs: tensor whose last axis has a statically known size.
            epsilon: added to the variance for numerical stability.
            scope: variable scope name.
            reuse: forwarded to ``tf.variable_scope``.

        Returns:
            Tensor of the same shape as ``inputs``.
        """
        with tf.variable_scope(scope, reuse=reuse):
            # Plain int rather than a Dimension object, so it can be used in a shape list.
            param_dim = inputs.get_shape().as_list()[-1]
            mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)

            beta = tf.get_variable('beta', initializer=tf.zeros([param_dim]))
            gamma = tf.get_variable('gamma', initializer=tf.ones([param_dim]))
            normalized = (inputs - mean)/((variance+epsilon) **0.5)
            normalized = normalized * gamma + beta
        return normalized

    def _dense(self,
               inputs,
               num_units: int,
               scope: str,
               activation=None,
               dropout_rate: Optional[float]=None):
        """Fully connected layer, optionally followed by dropout.

        Args:
            inputs: tensor whose last axis is the feature axis.
            num_units: output width.
            scope: variable scope name wrapping the layer.
            activation: optional activation function.
            dropout_rate: when truthy, dropout with this rate is applied,
                active only while ``is_training`` is fed True.

        Returns:
            Tensor with last axis of size ``num_units``.
        """
        with tf.variable_scope(scope):
            layer = tf.layers.dense(inputs, num_units, activation, name='dense')
            if dropout_rate:
                layer = tf.layers.dropout(layer, dropout_rate, training=self.is_training, name='dropout')
        return layer
    
    

In [None]:
# Smoke test: build the model in a fresh graph so repeated runs of this cell
# do not collide with variables left in the default graph.
with tf.Graph().as_default():
    # The constructor requires the config and the vocabulary size.
    model = TransformerEncoder(config, data.vocab_size)
    # Touch the output tensors to confirm the graph was fully constructed.
    model.outputs_prob
    model.predicted_class