Commit 615411b
fixed transformer decoder docs
ZhitingHu committed Feb 21, 2019
1 parent a2e28b2 commit 615411b
Showing 2 changed files with 18 additions and 13 deletions.
16 changes: 11 additions & 5 deletions texar/core/layers.py
@@ -448,7 +448,8 @@ def get_activation_fn(fn_name="identity", kwargs=None):
     if fn_name is None:
         return None

-    fn_modules = ['tensorflow', 'tensorflow.nn', 'texar.custom', 'texar.core.layers']
+    fn_modules = ['tensorflow', 'tensorflow.nn', 'texar.custom',
+                  'texar.core.layers']
     activation_fn_ = utils.get_function(fn_name, fn_modules)
     activation_fn = activation_fn_
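
For context, get_activation_fn resolves an activation function by name, searching the modules listed in fn_modules. A minimal usage sketch, assuming only the lookup behavior shown in this hunk:

    from texar.core import layers

    # 'relu' resolves from tensorflow.nn; 'gelu' from texar.core.layers,
    # following the fn_modules search order above.
    relu_fn = layers.get_activation_fn('relu')
    gelu_fn = layers.get_activation_fn('gelu')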

@@ -1175,14 +1176,16 @@ def default_average_pooling3d_kwargs():
 def layer_normalize(inputs,
                     scope=None,
                     **kwargs):
-    '''Applies layer normalization. averaging over the last dimension
+    """Applies layer normalization. Normalizes over the last dimension.
     Args:
         inputs: A tensor with 2 or more dimensions, where the first
-            dimension has `batch_size`.
-        scope: Optional scope for `variable_scope`.
+            dimension must be `batch_size`.
+        scope (optional): variable scope.
     Returns:
         A tensor with the same shape and data dtype as `inputs`.
-    '''
+    """
     return tf.contrib.layers.layer_norm(
         inputs=inputs, begin_norm_axis=-1, begin_params_axis=-1, scope=scope,
         **kwargs
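
A minimal usage sketch of layer_normalize as documented above (assumes TensorFlow 1.x with tf.contrib available; shapes are illustrative):

    import tensorflow as tf
    from texar.core import layers

    inputs = tf.random_normal([32, 10, 256])   # [batch_size, time, dim]
    outputs = layers.layer_normalize(inputs)   # normalized over the last dimension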
@@ -1191,10 +1194,13 @@ def layer_normalize(inputs,

 def gelu(input_tensor):
     """Gaussian Error Linear Unit.
     This is a smoother version of the RELU.
     Original paper: https://arxiv.org/abs/1606.08415
     Args:
         input_tensor: float Tensor to perform activation.
     Returns:
         `input_tensor` with the GELU activation applied.
     """
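For reference, GELU is commonly implemented with the tanh approximation from the cited paper; the helper below sketches that approximation and may differ from texar's exact implementation:

    import math
    import tensorflow as tf

    def gelu_approx(x):
        # 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
        cdf = 0.5 * (1.0 + tf.tanh(
            math.sqrt(2.0 / math.pi) * (x + 0.044715 * tf.pow(x, 3))))
        return x * cdf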
15 changes: 7 additions & 8 deletions texar/modules/decoders/transformer_decoders.py
@@ -62,18 +62,17 @@ class TransformerDecoderOutput(


 class TransformerDecoder(ModuleBase, TFDecoder):
-    """Transformer decoder that applies multi-head attention for
+    """Transformer decoder that applies multi-head self-attention for
     sequence decoding.
-    Stacked :class:`~texar.modules.encoders.MultiheadAttentionEncoder` for
-    encoder-decoder attention and self attention,
-    :class:`~texar.modules.FeedForwardNetwork` and residual connections.
-    Use the passed `embedding` variable as the parameters of the
-    transform layer from output to logits.
+    It is a stack of :class:`~texar.modules.encoders.MultiheadAttentionEncoder`,
+    :class:`~texar.modules.FeedForwardNetwork`, and residual connections.
     Args:
         embedding: A Tensor of shape `[vocab_size, dim]` containing the
-            word embedding. The Tensor is used as the decoder output layer.
+            word embedding matrix. The Tensor is used as the decoder output
+            layer that computes logits over vocabulary. Ignored if
+            `hparams['embedding_tie']` is False.
         hparams (dict or HParams, optional): Hyperparameters. Missing
             hyperparameter will be set to default values. See
             :meth:`default_hparams` for the hyperparameter sturcture and
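
A construction sketch matching the embedding argument documented above; the tx.modules import path and keyword usage are assumptions about this version of the texar API, while the embedding_tie hparam comes from the hunk below:

    import tensorflow as tf
    import texar as tx

    vocab_size, dim = 30000, 512
    embedding = tf.get_variable('word_embedding', shape=[vocab_size, dim])

    decoder = tx.modules.TransformerDecoder(
        embedding=embedding,                  # reused as the output (logits) layer
        hparams={'embedding_tie': True})      # missing hparams fall back to defaults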
@@ -208,7 +207,7 @@ def default_hparams():
         "embedding_tie" : bool
             Whether to use the word embedding matrix as the output layer
-            that computes logits. If `False`, an additional dense layer
+            that computes logits. If `False`, a new dense layer
             is created.
         "output_layer_bias" : bool
