From 5a9cf0535e84a006110480e5d55be044c9143cad Mon Sep 17 00:00:00 2001
From: Pengzhi Gao
Date: Wed, 5 Feb 2020 00:36:36 -0500
Subject: [PATCH 1/2] Fix docs issue in T5

---
 .../encoder_decoders/t5_encoder_decoder.py |  4 ++--
 texar/torch/modules/pretrained/t5.py       | 21 +++++++++++--------
 2 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/texar/torch/modules/encoder_decoders/t5_encoder_decoder.py b/texar/torch/modules/encoder_decoders/t5_encoder_decoder.py
index 3ee7bb306..52aa2de3c 100644
--- a/texar/torch/modules/encoder_decoders/t5_encoder_decoder.py
+++ b/texar/torch/modules/encoder_decoders/t5_encoder_decoder.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-T5 Model
+T5 Model.
 """

 from typing import Optional, Union
@@ -100,7 +100,7 @@ def reset_parameters(self):
     def default_hparams():
         r"""Returns a dictionary of hyperparameters with default values.

-        * The encoder arch is determined by the constructor argument
+        * The model arch is determined by the constructor argument
           :attr:`pretrained_model_name` if it's specified. In this case,
           `hparams` are ignored.
         * Otherwise, the encoder arch is determined by
diff --git a/texar/torch/modules/pretrained/t5.py b/texar/torch/modules/pretrained/t5.py
index 4c5fcd78c..c7684fbef 100644
--- a/texar/torch/modules/pretrained/t5.py
+++ b/texar/torch/modules/pretrained/t5.py
@@ -62,16 +62,16 @@ def _generate_t5_file_list(ckpt_tuple: tuple) -> List[str]:

 class PretrainedT5Mixin(PretrainedMixin, ABC):
     r"""A mixin class to support loading pre-trained checkpoints for modules
-    that implement the T5 model.
+    that implement the T5 model.

-    The T5 model treats multiple NLP tasks in a similar manner by encoding the
-    different tasks as text directives in the input stream. This enables a
-    single model to be trained supervised on a wide variety of NLP tasks.
-
-    The T5 model examines factors relevant for leveraging transfer learning
-    at scale from pure unsupervised pre-training to supervised tasks. It is
-    discussed in much detail in `Exploring the Limits of Transfer Learning
-    with a Unified Text-to-Text Transformer` from Google.
+    The T5 model was proposed in
+    `Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer`_
+    by `Raffel et al.` from Google. It treats multiple NLP tasks in a similar
+    manner by encoding the different tasks as text directives in the input
+    stream. This enables a single model to be trained supervised on a wide
+    variety of NLP tasks. The T5 model examines factors relevant for leveraging
+    transfer learning at scale from pure unsupervised pre-training to
+    supervised tasks.

     The available T5 models are as follows:

@@ -89,6 +89,9 @@ class PretrainedT5Mixin(PretrainedMixin, ABC):
       decoding stack.
     * :class:`~texar.torch.modules.T5EncoderDecoder` as a raw pre-trained
       model.
+
+    .. _`Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer`:
+        https://arxiv.org/abs/1910.10683
     """

     _MODEL_NAME = "T5"

From 82590c8f6b452b6779b55b329a0b2f11f2b63b4c Mon Sep 17 00:00:00 2001
From: Pengzhi Gao
Date: Fri, 7 Feb 2020 12:06:58 -0500
Subject: [PATCH 2/2] Polish T5 module

---
 texar/torch/modules/decoders/t5_decoder.py     | 14 ++--
 .../encoder_decoders/t5_encoder_decoder.py     | 73 ++++++++++---------
 texar/torch/modules/encoders/t5_encoder.py     | 10 +--
 3 files changed, 50 insertions(+), 47 deletions(-)

diff --git a/texar/torch/modules/decoders/t5_decoder.py b/texar/torch/modules/decoders/t5_decoder.py
index 58c4f2c27..67b088c75 100644
--- a/texar/torch/modules/decoders/t5_decoder.py
+++ b/texar/torch/modules/decoders/t5_decoder.py
@@ -34,7 +34,7 @@ class T5Decoder(TransformerDecoder):
     position representation for sequence decoding.

     It is a stack of
-    :class:`~texar.torch.modules.MultiheadRPRAttention`,
+    :class:`~texar.torch.modules.pretrained.t5_utils.MultiheadRPRAttention`,
     :class:`~texar.torch.modules.FeedForwardNetwork`, and residual connections.

     Args:
@@ -86,21 +86,18 @@
     def __init__(self,
                  token_embedder: Optional[TokenEmbedder] = None,
                  token_pos_embedder: Optional[TokenPosEmbedder] = None,
                  vocab_size: Optional[int] = None,
-                 output_layer: Optional[Union[nn.Module,
-                                               torch.Tensor,
-                                               ]] = None,
+                 output_layer: Optional[Union[nn.Module, torch.Tensor]] = None,
                  hparams=None):
         super().__init__(
             token_embedder, token_pos_embedder,
-            vocab_size=vocab_size,
-            output_layer=output_layer,
-            hparams=hparams)
+            vocab_size=vocab_size, output_layer=output_layer, hparams=hparams)
         self.final_layer_norm = T5LayerNorm(self._input_size,  # type: ignore
                                             eps=self._hparams.eps)

     def initialize_blocks(self):
-
+        r"""Helper function to initialize blocks.
+        """
         for i in range(self._hparams.num_blocks):
             attn_module = MultiheadRPRAttention(
                 self._input_size,
@@ -161,6 +158,7 @@ def default_hparams():
                 'relative_attention_num_buckets': 32
             },
             "initializer": None,
+            "eps": 1e-6,
             "name": "t5_decoder"

             # Additional for TransformerDecoder
diff --git a/texar/torch/modules/encoder_decoders/t5_encoder_decoder.py b/texar/torch/modules/encoder_decoders/t5_encoder_decoder.py
index 52aa2de3c..a6d498a77 100644
--- a/texar/torch/modules/encoder_decoders/t5_encoder_decoder.py
+++ b/texar/torch/modules/encoder_decoders/t5_encoder_decoder.py
@@ -39,9 +39,8 @@ class T5EncoderDecoder(EncoderDecoderBase, PretrainedT5Mixin):

     This module basically stacks
     :class:`~texar.torch.modules.WordEmbedder`,
-    :class:`~texar.torch.modules.TransformerEncoder`,
-    :class:`~texar.torch.modules.TransformerDecoder` and a dense
-    pooler.
+    :class:`~texar.torch.modules.T5Encoder`, and
+    :class:`~texar.torch.modules.T5Decoder`.

     Args:
         pretrained_model_name (optional): a `str`, the name
@@ -103,7 +102,7 @@ def default_hparams():
         * The model arch is determined by the constructor argument
           :attr:`pretrained_model_name` if it's specified. In this case,
           `hparams` are ignored.
-        * Otherwise, the encoder arch is determined by
+        * Otherwise, the model arch is determined by
           `hparams['pretrained_model_name']` if it's specified. All other
           configurations in `hparams` are ignored.
         * If the above two are `None`, the encoder arch is defined by the
@@ -112,7 +111,7 @@ def default_hparams():
         .. code-block:: python

            {
-                "pretrained_model_name": "bert-base-uncased",
+                "pretrained_model_name": "T5-Small",
                "embed": {
                    "dim": 768,
                    "name": "word_embeddings"
@@ -128,10 +127,12 @@ def default_hparams():
                        "num_heads": 12,
                        "num_units": 768,
                        "output_dim": 768,
-                        "use_bias": True
+                        "use_bias": False,
+                        "is_decoder": False,
+                        "relative_attention_num_buckets": 32,
                    },
-                    "relative_attention_num_buckets": 32,
-                    "name": "t5encoder",
+                    "eps": 1e-6,
+                    "name": "encoder",
                    "num_blocks": 12,
                    "poswise_feedforward": {
                        "layers": [
@@ -139,7 +140,7 @@ def default_hparams():
                                "kwargs": {
                                    "in_features": 768,
                                    "out_features": 3072,
-                                    "bias": True
+                                    "bias": False
                                },
                                "type": "Linear"
                            },
@@ -148,7 +149,7 @@ def default_hparams():
                                "kwargs": {
                                    "in_features": 3072,
                                    "out_features": 768,
-                                    "bias": True
+                                    "bias": False
                                },
                                "type": "Linear"
                            }
@@ -158,6 +159,7 @@ def default_hparams():
                },

                "decoder": {
+                    "eps": 1e-6,
                    "dim": 768,
                    "embedding_dropout": 0.1,
                    "multihead_attention": {
@@ -166,11 +168,11 @@ def default_hparams():
                        "num_heads": 12,
                        "num_units": 768,
                        "output_dim": 768,
-                        "use_bias": True,
+                        "use_bias": False,
+                        "is_decoder": True,
                        "relative_attention_num_buckets": 32,
                    },
-
-                    "name": "t5coder",
+                    "name": "decoder",
                    "num_blocks": 12,
                    "poswise_feedforward": {
                        "layers": [
@@ -178,7 +180,7 @@ def default_hparams():
                                "kwargs": {
                                    "in_features": 768,
                                    "out_features": 3072,
-                                    "bias": True
+                                    "bias": False
                                },
                                "type": "Linear"
                            },
@@ -187,7 +189,7 @@ def default_hparams():
                                "kwargs": {
                                    "in_features": 3072,
                                    "out_features": 768,
-                                    "bias": True
+                                    "bias": False
                                },
                                "type": "Linear"
                            }
@@ -202,34 +204,30 @@ def default_hparams():

        Here:

-        The default parameters are values for uncased BERT-Base model.
+        The default parameters are values for T5-Small model.

        `"pretrained_model_name"`: str or None
-            The name of the pre-trained BERT model. If None, the model
+            The name of the pre-trained T5 model. If None, the model
            will be randomly initialized.

        `"embed"`: dict
            Hyperparameters for word embedding layer.

        `"vocab_size"`: int
-            The vocabulary size of `inputs` in BERT model.
-
-        `"type_vocab_size"`: int
-            The vocabulary size of the `segment_ids` passed into `BertModel`.
-
-        `"position_embed"`: dict
-            Hyperparameters for position embedding layer.
-
-        `"position_size"`: int
-            The maximum sequence length that this model might ever be used with.
+            The vocabulary size of `inputs` in T5 model.

        `"encoder"`: dict
-            Hyperparameters for the T5Encoder.
+            Hyperparameters for the `T5Encoder`.
            See :func:`~texar.torch.modules.T5Encoder.default_hparams`
            for details.

+        `"decoder"`: dict
+            Hyperparameters for the `T5Decoder`.
+            See :func:`~texar.torch.modules.T5Decoder.default_hparams`
+            for details.
+
        `"hidden_size"`: int
-            Size of the pooler dense layer.
+            Size of the hidden layer.

        `"initializer"`: dict, optional
            Hyperparameters of the default initializer that initializes
@@ -301,7 +299,7 @@ def default_hparams():
                    'is_decoder': True,
                    'relative_attention_num_buckets': 32
                },
-                'name': 'encoder',
+                'name': 'decoder',
                'num_blocks': 12,
                'poswise_feedforward': {
                    'layers': [
@@ -335,10 +333,10 @@
    def forward(self,  # type: ignore
                inputs: Union[torch.Tensor, torch.LongTensor],
                sequence_length: Optional[torch.LongTensor] = None):
-        r"""
+        r"""Performs encoding and decoding.

        Args:
-            inputs: Either a **2D Tensor** of shape `[batch_size, max_time]`,
+            inputs: Either a **2D Tensor** of shape ``[batch_size, max_time]``,
                containing the ids of tokens in input sequences, or
                a **3D Tensor** of shape `[batch_size, max_time, vocab_size]`,
                containing soft token ids (i.e., weights or probabilities)
@@ -348,6 +346,14 @@ def forward(self,  # type: ignore
                lengths are masked out automatically.

        Returns:
+            A pair :attr:`(encoder_output, decoder_output)`
+
+            - :attr:`encoder_output`: A Tensor of shape
+              `[batch_size, max_time, dim]` containing the encoded vectors.
+
+            - :attr:`decoder_output`: An instance of
+              :class:`~texar.torch.modules.TransformerDecoderOutput` which
+              contains `sample_id` and `logits`.
        """
        if inputs.dim() == 2:
            word_embeds = self.word_embedder(ids=inputs)
@@ -373,7 +379,6 @@ def forward(self,  # type: ignore

    @property
    def output_size(self):
-        r"""The feature size of :meth:`forward` output
-        :attr:`pooled_output`.
+        r"""The feature size of :meth:`forward` output of the encoder.
        """
        return self._hparams.hidden_size
diff --git a/texar/torch/modules/encoders/t5_encoder.py b/texar/torch/modules/encoders/t5_encoder.py
index d5e741363..a8d0cca20 100644
--- a/texar/torch/modules/encoders/t5_encoder.py
+++ b/texar/torch/modules/encoders/t5_encoder.py
@@ -25,10 +25,10 @@

 class T5Encoder(TransformerEncoder):
    r"""Transformer based encoder that applies multi-head self attention with
-    relative positional representations for encoding sequences for T5.
+    relative positional representations for encoding sequences for T5.

    This module basically stacks
-    :class:`~texar.torch.modules.MultiheadRPRAttention`,
+    :class:`~texar.torch.modules.pretrained.t5_utils.MultiheadRPRAttention`,
    :class:`~texar.torch.modules.FeedForwardNetwork` and residual connections.
    This module supports the standard T5 architecture proposed in
    `(Raffel et al.) "Exploring the Limits of Transfer Learning with a Unified
@@ -49,8 +49,7 @@ def __init__(self, hparams=None):
                                        eps=self._hparams.eps)

    def initialize_blocks(self):
-        r""" Helper function to initialize blocks.
-
+        r"""Helper function to initialize blocks.
        """
        for i in range(self._hparams.num_blocks):
            mh_attn = MultiheadRPRAttention(
@@ -106,6 +105,7 @@ def default_hparams():
                'relative_attention_num_buckets': 32
            },
            "initializer": None,
+            "eps": 1e-6,
            "name": "t5_encoder"
        }

@@ -179,7 +179,7 @@ def forward(self,  # type: ignore
        Args:
            inputs: A 3D Tensor of shape ``[batch_size, max_time, dim]``,
                containing the embedding of input sequences. Note that
-                the embedding dimension `dim` must equal "dim" in
+                the embedding dimension `dim` must equal `"dim"` in
                :attr:`hparams`. The input embedding is typically an
                aggregation of word embedding and position embedding.
            sequence_length: A 1D :tensor:`LongTensor` of shape
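
Reviewer note (not part of either patch): a minimal usage sketch of the module these commits document, assuming texar-pytorch with both patches applied and network access to fetch the "T5-Small" checkpoint. Names, hparams, and shapes follow the docstrings above; the toy tensors are made up for illustration.

    import torch
    import texar.torch as tx

    # Build the raw pre-trained model described by PretrainedT5Mixin;
    # "T5-Small" is the default documented in default_hparams().
    model = tx.modules.T5EncoderDecoder(pretrained_model_name="T5-Small")

    # Toy batch of token ids; 32128 is the documented default "vocab_size".
    inputs = torch.randint(0, 32128, (2, 16))   # [batch_size, max_time]
    sequence_length = torch.tensor([16, 12])    # valid lengths per example

    # Per the new "Returns" section, forward() yields a pair.
    encoder_output, decoder_output = model(inputs, sequence_length)

    print(encoder_output.shape)         # [batch_size, max_time, dim]
    print(decoder_output.logits.shape)  # TransformerDecoderOutput: logits, sample_id
    print(model.output_size)            # equals hparams.hidden_size

If downloading a checkpoint is undesirable (e.g., in CI), constructing with hparams={"pretrained_model_name": None} should give a randomly initialized model, per the `"pretrained_model_name"` description in the patched docstring.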