update transformer, dataloader to fix
Former-commit-id: 548a917
Shi Haoran committed Nov 7, 2017
1 parent 620a76a commit c7dc812
Showing 6 changed files with 244 additions and 45 deletions.
46 changes: 27 additions & 19 deletions examples/transformer.py
@@ -14,7 +14,7 @@
# We shall wrap all these modules
from txtgen.data import database
from txtgen.modules import ConstantConnector
from txtgen.modules import BasicRNNDecoder, get_helper
from txtgen.modules import TransformerEncoder, TransformerDecoder
from txtgen.losses import mle_losses
from txtgen.core import optimization as opt
from txtgen import context
@@ -28,13 +28,26 @@
data_hparams = {
"num_epochs": 10,
"seed": 123,
"dataset": {
"batch_size":3,
"source_dataset": {
"files": ['data/sent.txt'],
"vocab_file": 'data/vocab.txt'
},
"target_dataset": {
"files": ['data/sent.txt'],
"vocab_share": True,
"reader_share": True,
"processing":{
"eos_token": "<TARGET_EOS>"
}
}
}
# Construct the database
src_db, tgt_db = database.PairedTextDataBase(data_hparams)
text_database = database.PairedTextDataBase(data_hparams)
print('database finished')

##TODO(haoran) bug: the text_database cannot be called here
text_data_batch = text_database()
# Get data minibatch, which is a dictionary:
# {
# "text": text_tensor, # text string minibatch,
@@ -43,34 +56,29 @@
# "text_ids": text_id_tensor, # a 2D int tensor of token ids with shape
# # `[batch_size, max_seq_length]`
# }
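# A minimal fetch sketch (not part of this commit): assuming the usual
# TF1 queue-runner setup and that `text_data_batch` is the dict of
# tensors described above once the TODO is resolved.
# with tf.Session() as sess:
#     sess.run(tf.global_variables_initializer())
#     sess.run(tf.local_variables_initializer())
#     sess.run(tf.tables_initializer())
#     coord = tf.train.Coordinator()
#     threads = tf.train.start_queue_runners(sess=sess, coord=coord)
#     batch = sess.run(text_data_batch)
#     print(batch.keys())   # inspect the available fields
#     coord.request_stop()
#     coord.join(threads)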
src_data_batch, tgt_data_batch = src_db(), tgt_db
### Build model

# Build decoder. Simply use the default hyperparameters.
#decoder = rnn_decoders.BasicRNNDecoder(vocab_size=text_db.vocab.vocab_size)
encoder = BasicRNNEncoder(vocab_size = src_db.vovab.vocab_size)
decoder = BasicRNNDecoder(vocab_size=tgt_db.vocab.vocab_size)
encoder = TransformerEncoder(vocab_size=text_database.source_vocab.vocab_size)
decoder = TransformerDecoder(vocab_size=text_database.target_vocab.vocab_size)

# Build connector, which simply feeds zero state to decoder as initial state
connector = ConstantConnector(decoder.state_size)
print('encoder decoder finished')
src_text = text_data_batch['source_text']
tgt_text = text_data_batch['target_text']

# Build helper used in training.
# We shall probably improve the interface here.
helper_train = get_helper(
decoder.hparams.helper_train.type,
inputs=data_batch['text_ids'][:, :-1],
sequence_length=data_batch['length'] - 1,
embedding=decoder.embedding)

encoder_output = encoder(src_text['text_ids'][:, :-1],
sequence_length=src_text['length']-1)
# Decode
outputs, final_state, sequence_lengths = decoder(
helper=helper_train, initial_state=connector(text_db.batch_size))
initial_state=connector(text_database._hparams.batch_size))

# Build loss
mle_loss = mle_losses.average_sequence_sparse_softmax_cross_entropy(
labels=data_batch['text_ids'][:, 1:],
logits=outputs.rnn_output,
sequence_length=sequence_lengths - 1)
labels=tgt_text['text_ids'][:, 1:],
logits=outputs.output_logits,
sequence_length=sequence_lengths-1)

# Build train op. Only config the optimizer while using default settings
# for other hyperparameters.
94 changes: 76 additions & 18 deletions txtgen/core/layers.py
@@ -9,7 +9,8 @@

import tensorflow as tf
import tensorflow.contrib.rnn as rnn

import numpy as np
from txtgen import context
from txtgen.hyperparams import HParams
from txtgen.core.utils import get_instance, switch_dropout

@@ -249,7 +250,7 @@ def get_embedding(hparams=None,
initializer=init_values,
trainable=hparams["trainable"])

def sinuoid_positional_encoding(inputs,
def sinusoid_positional_encoding(inputs,
zero_pad=True,
scale=True,
reuse=None,
@@ -265,31 +266,30 @@ def sinuoid_positional_encoding(inputs,
scope: [String], Optional scope for 'variable_scope'
position_duration: [Int], default=10000
"""
batch_size, max_time, hidden_dim = inputs.get_shape().as_list()
with tf.variable_scope(scope, reuse=reuse):
batch_size, max_time, hidden_dim = inputs.get_shape().as_list()
input_one = tf.tile(tf.expand_dims(tf.range(max_time), 0), [batch_size, 1]) #batch_size * max_time
position_block = tf.tile(tf.expand_dims(tf.range(max_time), 1), [1, num_units // 2])
unit_block = tf.tile(tf.expand_dims(tf.range(hidden_dim // 2), 0), [max_time, 1])
rad_block = tf.pow(tf.div(position_block, tf.multiply(position_duration, 1)), tf.div(unit_block, hidden_dim // 2))
position_idx = tf.tile(tf.expand_dims(tf.range(max_time), 0), [batch_size, 1]) #batch_size * max_time
position_enc = np.array([
[pos /np.power(10000, 2.*i/hidden_dim) for i in range(hidden_dim)]
for pos in range(max_time)])

sin_block = tf.sin(tf.cast(rad_block, tf.float32))
cos_block = tf.cos(tf.cast(rad_block, tf.float32))
lookup_table = tf.concat([sin_block, cos_block], axis = 1)
position_enc[:, 0::2] = np.sin(position_enc[:, 0::2])
position_enc[:, 1::2] = np.cos(position_enc[:, 1::2])

lookup_table = tf.convert_to_tensor(position_enc)
if zero_pad:
lookup_table = tf.concat((tf.zeros(shape = [1, num_units]), lookup_table[1:, :]), 0)
outputs = tf.nn.embedding_lookup(lookup_table, input_one)
lookup_table = tf.concat((tf.zeros(shape=[1, hidden_dim]),
lookup_table[1:, :]), 0)
outputs = tf.nn.embedding_lookup(lookup_table, position_idx)
if scale:
outputs = outputs * math.sqrt(hidden_dim)
outputs = outputs * hidden_dim**0.5
return outputs
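# Standalone NumPy illustration of the encoding above (not part of this
# commit); the `max_time` and `hidden_dim` values are arbitrary.
# import numpy as np
# max_time, hidden_dim = 5, 8
# position_enc = np.array(
#     [[pos / np.power(10000, 2. * i / hidden_dim) for i in range(hidden_dim)]
#      for pos in range(max_time)])
# position_enc[:, 0::2] = np.sin(position_enc[:, 0::2])  # even dims -> sin
# position_enc[:, 1::2] = np.cos(position_enc[:, 1::2])  # odd dims  -> cos
# print(position_enc.shape)  # (5, 8): one encoding vector per position
# print(position_enc[0])     # position 0 encodes to [0, 1, 0, 1, ...]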


def multihead_attention(queries,
keys,
num_units= None,
num_heads=8,
dropout_rate=0,
is_training=True,
causality = False,
scope = 'multihead_attention',
reuse= None):
@@ -299,7 +299,6 @@ def multihead_attention(queries,
keys: A 3d tensor with shape of [N, T_k, C_k].
num_units: A scalar. Attention size.
dropout_rate: A floating point number.
is_training: Boolean. Controller of mechanism for dropout.
causality: Boolean. Should be true, units that reference the future are masked
num_heads: An int. Number of heads.
scope: Optional scope for `variable_scope`.
@@ -355,7 +354,7 @@ def multihead_attention(queries,
outputs *= query_masks # broadcasting. (N, T_q, C)

# Dropouts
outputs = tf.layers.dropout(outputs, rate=dropout_rate, training=tf.convert_to_tensor(is_training))
outputs = tf.layers.dropout(outputs, rate=dropout_rate, training=context.is_train())

# Weighted sum
outputs = tf.matmul(outputs, V_) # ( h*N, T_q, C/h)
@@ -367,6 +366,65 @@
outputs += queries

# Normalize
outputs = normalize(outputs) # (N, T_q, C)

return outputs
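# Illustrative self-attention usage (not part of this commit); the sizes
# are placeholders and `normalize` is the layer-norm helper defined below.
# inputs = tf.random_normal([32, 20, 512])      # [batch, time, channels]
# attended = multihead_attention(
#     queries=inputs,
#     keys=inputs,                  # self-attention: keys == queries
#     num_units=512,
#     num_heads=8,
#     dropout_rate=0.1,
#     causality=True,               # mask attention to future positions
#     scope="demo_self_attention")
# attended = normalize(attended)    # layer-normalize the block output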



def poswise_feedforward(attended_dec, scope="poswise_feedforward", reuse=None):
'''Point-wise feed-forward net.
Args:
attended_dec: A 3d tensor with shape of [N, T, C].
scope: Optional scope for `variable_scope`.
reuse: Boolean, whether to reuse the weights of a previous layer
by the same name.
Returns:
A 3d tensor with the same shape and dtype as `attended_dec`.
'''
hidden_dim = attended_dec.get_shape().as_list()[-1]
with tf.variable_scope(scope, reuse=reuse):
outputs = tf.layers.conv1d(inputs = attended_dec,
filters=hidden_dim*4,
kernel_size=1,
activation=tf.nn.relu,
use_bias=True)
outputs = tf.layers.conv1d(inputs = outputs,
filters=hidden_dim,
kernel_size=1,
activation=None,
use_bias=True)
outputs += attended_dec #residual connection
return outputs
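# Continuing the sketch above (not part of this commit): the feed-forward
# block is applied to the attended output and then layer-normalized, as in
# the decoder blocks added in this commit.
# ffn_out = poswise_feedforward(attended)
# ffn_out = normalize(ffn_out)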

def normalize(inputs,
epsilon = 1e-8,
scope="ln",
reuse=None):
'''Applies layer normalization.
Args:
inputs: A tensor with 2 or more dimensions, where the first dimension has
`batch_size`.
epsilon: A floating number. A very small number for preventing ZeroDivision Error.
scope: Optional scope for `variable_scope`.
reuse: Boolean, whether to reuse the weights of a previous layer
by the same name.
Returns:
A tensor with the same shape and data dtype as `inputs`.
'''
with tf.variable_scope(scope, reuse=reuse):
inputs_shape = inputs.get_shape()
params_shape = inputs_shape[-1:]

mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)
beta= tf.Variable(tf.zeros(params_shape))
gamma = tf.Variable(tf.ones(params_shape))
normalized = (inputs - mean) / ( (variance + epsilon) ** (.5) )
outputs = gamma * normalized + beta

return outputs
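# Tiny NumPy check (not part of this commit) of what `normalize` computes
# with gamma=1 and beta=0: each feature vector is shifted to zero mean and
# scaled to unit variance.
# import numpy as np
# x = np.array([[1.0, 2.0, 3.0],
#               [10.0, 10.0, 10.0]])
# mean = x.mean(axis=-1, keepdims=True)
# var = x.var(axis=-1, keepdims=True)
# print((x - mean) / np.sqrt(var + 1e-8))
# # [[-1.2247  0.      1.2247]
# #  [ 0.      0.      0.    ]]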

2 changes: 1 addition & 1 deletion txtgen/modules/decoders/__init__.py
@@ -12,4 +12,4 @@
from txtgen.modules.decoders.rnn_decoder_base import *
from txtgen.modules.decoders.rnn_decoders import *
from txtgen.modules.decoders.rnn_decoder_helpers import *

from txtgen.modules.decoders.transformer_decoders import *
131 changes: 131 additions & 0 deletions txtgen/modules/decoders/transformer_decoders.py
@@ -0,0 +1,131 @@
"""
Transformer decoders, following "Attention Is All You Need".
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
# pylint: disable=no-name-in-module, too-many-arguments, too-many-locals
# pylint: disable=not-context-manager
import tensorflow as tf
from tensorflow.contrib.seq2seq import Decoder as TFDecoder
from tensorflow.python.framework import tensor_shape, dtypes

from txtgen.modules.module_base import ModuleBase
from txtgen.core import layers
from txtgen import context

class TransformerDecoderOutput(
collections.namedtuple("TransformerDecoderOutput",
("output_logits", "sample_ids"))):
"""the output logits and sampled_ids"""
pass

class TransformerDecoder(ModuleBase, TFDecoder):
"""decoder for transformer: Attention is all you need
"""
def __init__(self,
embedding=None,
vocab_size=None,
hparams=None):
ModuleBase.__init__(self, hparams)
self._embedding = None
self._vocab_size = vocab_size
if self._hparams.embedding_enabled:
if embedding is None and vocab_size is None:
raise ValueError("If 'embedding' is not provided, "
"'vocab_size' must be specified.")
if isinstance(embedding, tf.Variable):
self._embedding = embedding
else:
self._embedding = layers.get_embedding(
self._hparams.embedding, embedding, vocab_size,
self.variable_scope)
embed_dim = self._embedding.get_shape().as_list()[1]
if self._hparams.zero_pad:
self._embedding = tf.concat((tf.zeros(shape=[1, embed_dim]),
self._embedding[1:, :]), 0)
if self._hparams.embedding.trainable:
self._add_trainable_variable(self._embedding)

@staticmethod
def default_hparams():
return {
"embedding_enabled": True,
"embedding": layers.default_embedding_hparams(),
"name":"transformer_decoder",
"num_heads":8,
"num_units":64,
}
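# Hypothetical construction sketch (not part of this commit), assuming
# hparams passed to the constructor are merged with these defaults as in
# other txtgen modules; `vocab_size` here is supplied by the caller.
# decoder = TransformerDecoder(
#     vocab_size=10000,
#     hparams={
#         "num_heads": 8,
#         "num_units": 64,
#     })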

def initialize(self, name=None):
return self._helper.initialize() + (self._initial_state,)

def _build(self, inputs, encoder_output):
# max_decoding_length_train = self._hparams.max_decoding_length_train
# if max_decoding_length_train is None:
# max_decoding_length_train = utils.MAX_SEQ_LENGTH
# if max_decoding_length_infer is None:
# max_decoding_length_infer = utils.MAX_SEQ_LENGTH
# max_decoding_length = tf.cond(
# context.is_train(),
# lambda: max_decoding_length_train,
# lambda: max_decoding_length_infer)
if self._embedding is not None:
tgt_embedding = tf.nn.embedding_lookup(self._embedding, inputs)
else:
tgt_embedding = inputs
num_units = tgt_embedding.get_shape().as_list()[2]
if self.scale:
tgt_embedding = tgt_embedding * num_units**0.5
if self._hparams.sinusoid:
position_dec_embeds = layers.sinusoid_positional_encoding(tgt_embedding,
scope = "dec_pe")
dec_input = tf.layers.dropout(tgt_embedding + position_dec_embeds,
rate = self._hparams.encoder.dropout_rate,
training = context.is_train())
hparams = self._hparams
with tf.variable_scope(self.variable_scope):
for i in range(self._hparams.decoder.num_blocks):
with tf.variable_scope("num_blocks_{}".format(i)):
attended_dec = layers.multihead_attention(queries = dec_input,
keys = dec_input,
num_units = hparams.hidden_units,
num_heads = hparams.num_heads,
is_training = context.is_train(),
dropout_rate = hparams.dropout_rate,
causality = True,
scope = "self_attention")
attended_dec = layers.normalize(attended_dec)

attended_dec = layers.multihead_attention(queries = dec_input,
keys = encoder_output,
num_units = hparams.hidden_units,
num_heads = hparams.num_heads,
dropout_rate = hparams.dropout_rate,
is_training = context.is_train(),
causality = False,
scope = "vanilla_attention")
attended_dec = layers.normalize(attended_dec)

attended_dec = layers.poswise_feedforward(attended_dec)
attended_dec = layers.normalize(attended_dec)
#[batch, seq_len, hidden_units]
self.logits = tf.layers.dense(attended_dec, self._vocab_size)
self.preds = tf.to_int32(tf.arg_max(self.logits, dimension=-1))
#[batch, seq_len]
return self.logits, self.preds

@property
def output_size(self):
return TransformerDecoderOutput(
output_logits=tensor_shape.TensorShape([None, None, self._vocab_size]),
sample_ids=tensor_shape.TensorShape([None, None])
)

@property
def output_dtype(self):
return TransformerDecoderOutput(
output_logits=dtypes.float32, sample_ids=dtypes.int32)

2 changes: 1 addition & 1 deletion txtgen/modules/encoders/__init__.py
@@ -12,4 +12,4 @@
from txtgen.modules.encoders.encoder_base import *
from txtgen.modules.encoders.rnn_encoders import *
from txtgen.modules.encoders.hierarchical_encoders import *

from txtgen.modules.encoders.transformer_encoders import *
