From 5a9cf0535e84a006110480e5d55be044c9143cad Mon Sep 17 00:00:00 2001
From: Pengzhi Gao
Date: Wed, 5 Feb 2020 00:36:36 -0500
Subject: [PATCH 1/2] Fix docs issue in T5

---
 .../encoder_decoders/t5_encoder_decoder.py |  4 ++--
 texar/torch/modules/pretrained/t5.py       | 21 +++++++++++--------
 2 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/texar/torch/modules/encoder_decoders/t5_encoder_decoder.py b/texar/torch/modules/encoder_decoders/t5_encoder_decoder.py
index 3ee7bb306..52aa2de3c 100644
--- a/texar/torch/modules/encoder_decoders/t5_encoder_decoder.py
+++ b/texar/torch/modules/encoder_decoders/t5_encoder_decoder.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-T5 Model
+T5 Model.
 """

 from typing import Optional, Union
@@ -100,7 +100,7 @@ def reset_parameters(self):
     def default_hparams():
         r"""Returns a dictionary of hyperparameters with default values.

-        * The encoder arch is determined by the constructor argument
+        * The model arch is determined by the constructor argument
           :attr:`pretrained_model_name` if it's specified. In this case,
           `hparams` are ignored.
         * Otherwise, the encoder arch is determined by
diff --git a/texar/torch/modules/pretrained/t5.py b/texar/torch/modules/pretrained/t5.py
index 4c5fcd78c..c7684fbef 100644
--- a/texar/torch/modules/pretrained/t5.py
+++ b/texar/torch/modules/pretrained/t5.py
@@ -62,16 +62,16 @@ def _generate_t5_file_list(ckpt_tuple: tuple) -> List[str]:

 class PretrainedT5Mixin(PretrainedMixin, ABC):
     r"""A mixin class to support loading pre-trained checkpoints for modules
-    that implement the T5 model.
+    that implement the T5 model.

-    The T5 model treats multiple NLP tasks in a similar manner by encoding the
-    different tasks as text directives in the input stream. This enables a
-    single model to be trained supervised on a wide variety of NLP tasks.
-
-    The T5 model examines factors relevant for leveraging transfer learning
-    at scale from pure unsupervised pre-training to supervised tasks. It is
-    discussed in much detail in `Exploring the Limits of Transfer Learning
-    with a Unified Text-to-Text Transformer` from Google.
+    The T5 model was proposed in
+    `Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer`_
+    by `Raffel et al.` from Google. It treats multiple NLP tasks in a similar
+    manner by encoding the different tasks as text directives in the input
+    stream. This enables a single model to be trained supervised on a wide
+    variety of NLP tasks. The T5 model examines factors relevant for leveraging
+    transfer learning at scale from pure unsupervised pre-training to
+    supervised tasks.

     The available T5 models are as follows:

@@ -89,6 +89,9 @@ class PretrainedT5Mixin(PretrainedMixin, ABC):
       decoding stack.
     * :class:`~texar.torch.modules.T5EncoderDecoder` as a raw pre-trained
       model.
+
+    .. _`Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer`:
+        https://arxiv.org/abs/1910.10683
     """

     _MODEL_NAME = "T5"

From 82590c8f6b452b6779b55b329a0b2f11f2b63b4c Mon Sep 17 00:00:00 2001
From: Pengzhi Gao
Date: Fri, 7 Feb 2020 12:06:58 -0500
Subject: [PATCH 2/2] Polish T5 module

---
 texar/torch/modules/decoders/t5_decoder.py     | 14 ++--
 .../encoder_decoders/t5_encoder_decoder.py     | 73 ++++++++++---------
 texar/torch/modules/encoders/t5_encoder.py     | 10 +--
 3 files changed, 50 insertions(+), 47 deletions(-)

diff --git a/texar/torch/modules/decoders/t5_decoder.py b/texar/torch/modules/decoders/t5_decoder.py
index 58c4f2c27..67b088c75 100644
--- a/texar/torch/modules/decoders/t5_decoder.py
+++ b/texar/torch/modules/decoders/t5_decoder.py
@@ -34,7 +34,7 @@ class T5Decoder(TransformerDecoder):
     position representation for sequence decoding.

     It is a stack of
-    :class:`~texar.torch.modules.MultiheadRPRAttention`,
+    :class:`~texar.torch.modules.pretrained.t5_utils.MultiheadRPRAttention`,
     :class:`~texar.torch.modules.FeedForwardNetwork`, and residual connections.

     Args:
@@ -86,21 +86,18 @@
     def __init__(self,
                  token_embedder: Optional[TokenEmbedder] = None,
                  token_pos_embedder: Optional[TokenPosEmbedder] = None,
                  vocab_size: Optional[int] = None,
-                 output_layer: Optional[Union[nn.Module,
-                                               torch.Tensor,
-                                               ]] = None,
+                 output_layer: Optional[Union[nn.Module, torch.Tensor]] = None,
                  hparams=None):
         super().__init__(
             token_embedder, token_pos_embedder,
-            vocab_size=vocab_size,
-            output_layer=output_layer,
-            hparams=hparams)
+            vocab_size=vocab_size, output_layer=output_layer, hparams=hparams)
         self.final_layer_norm = T5LayerNorm(self._input_size,  # type: ignore
                                             eps=self._hparams.eps)

     def initialize_blocks(self):
-
+        r"""Helper function to initialize blocks.
+        """
         for i in range(self._hparams.num_blocks):
             attn_module = MultiheadRPRAttention(
                 self._input_size,
@@ -161,6 +158,7 @@ def default_hparams():
                 'relative_attention_num_buckets': 32
             },
             "initializer": None,
+            "eps": 1e-6,
             "name": "t5_decoder"

             # Additional for TransformerDecoder
diff --git a/texar/torch/modules/encoder_decoders/t5_encoder_decoder.py b/texar/torch/modules/encoder_decoders/t5_encoder_decoder.py
index 52aa2de3c..a6d498a77 100644
--- a/texar/torch/modules/encoder_decoders/t5_encoder_decoder.py
+++ b/texar/torch/modules/encoder_decoders/t5_encoder_decoder.py
@@ -39,9 +39,8 @@ class T5EncoderDecoder(EncoderDecoderBase, PretrainedT5Mixin):

     This module basically stacks
     :class:`~texar.torch.modules.WordEmbedder`,
-    :class:`~texar.torch.modules.TransformerEncoder`,
-    :class:`~texar.torch.modules.TransformerDecoder` and a dense
-    pooler.
+    :class:`~texar.torch.modules.T5Encoder`, and
+    :class:`~texar.torch.modules.T5Decoder`.

     Args:
         pretrained_model_name (optional): a `str`, the name
@@ -103,7 +102,7 @@ def default_hparams():
         * The model arch is determined by the constructor argument
           :attr:`pretrained_model_name` if it's specified. In this case,
           `hparams` are ignored.
-        * Otherwise, the encoder arch is determined by
+        * Otherwise, the model arch is determined by
           `hparams['pretrained_model_name']` if it's specified. All other
           configurations in `hparams` are ignored.
         * If the above two are `None`, the encoder arch is defined by the
@@ -112,7 +111,7 @@ def default_hparams():
         .. code-block:: python

            {
-                "pretrained_model_name": "bert-base-uncased",
+                "pretrained_model_name": "T5-Small",
                "embed": {
                    "dim": 768,
                    "name": "word_embeddings"
@@ -128,10 +127,12 @@ def default_hparams():
                        "num_heads": 12,
                        "num_units": 768,
                        "output_dim": 768,
-                        "use_bias": True
+                        "use_bias": False,
+                        "is_decoder": False,
+                        "relative_attention_num_buckets": 32,
                    },
-                    "relative_attention_num_buckets": 32,
-                    "name": "t5encoder",
+                    "eps": 1e-6,
+                    "name": "encoder",
                    "num_blocks": 12,
                    "poswise_feedforward": {
                        "layers": [
@@ -139,7 +140,7 @@ def default_hparams():
                                "kwargs": {
                                    "in_features": 768,
                                    "out_features": 3072,
-                                    "bias": True
+                                    "bias": False
                                },
                                "type": "Linear"
                            },
@@ -148,7 +149,7 @@ def default_hparams():
                                "kwargs": {
                                    "in_features": 3072,
                                    "out_features": 768,
-                                    "bias": True
+                                    "bias": False
                                },
                                "type": "Linear"
                            }
@@ -158,6 +159,7 @@ def default_hparams():
                },

                "decoder": {
+                    "eps": 1e-6,
                    "dim": 768,
                    "embedding_dropout": 0.1,
                    "multihead_attention": {
@@ -166,11 +168,11 @@ def default_hparams():
                        "num_heads": 12,
                        "num_units": 768,
                        "output_dim": 768,
-                        "use_bias": True,
+                        "use_bias": False,
+                        "is_decoder": True,
                        "relative_attention_num_buckets": 32,
                    },
-
-                    "name": "t5coder",
+                    "name": "decoder",
                    "num_blocks": 12,
                    "poswise_feedforward": {
                        "layers": [
@@ -178,7 +180,7 @@ def default_hparams():
                                "kwargs": {
                                    "in_features": 768,
                                    "out_features": 3072,
-                                    "bias": True
+                                    "bias": False
                                },
                                "type": "Linear"
                            },
@@ -187,7 +189,7 @@ def default_hparams():
                                "kwargs": {
                                    "in_features": 3072,
                                    "out_features": 768,
-                                    "bias": True
+                                    "bias": False
                                },
                                "type": "Linear"
                            }
@@ -202,34 +204,30 @@ def default_hparams():

        Here:

-        The default parameters are values for uncased BERT-Base model.
+        The default parameters are values for T5-Small model.

        `"pretrained_model_name"`: str or None
-            The name of the pre-trained BERT model. If None, the model
+            The name of the pre-trained T5 model. If None, the model
            will be randomly initialized.

        `"embed"`: dict
            Hyperparameters for word embedding layer.

        `"vocab_size"`: int
-            The vocabulary size of `inputs` in BERT model.
-
-        `"type_vocab_size"`: int
-            The vocabulary size of the `segment_ids` passed into `BertModel`.
-
-        `"position_embed"`: dict
-            Hyperparameters for position embedding layer.
-
-        `"position_size"`: int
-            The maximum sequence length that this model might ever be used with.
+            The vocabulary size of `inputs` in T5 model.

        `"encoder"`: dict
-            Hyperparameters for the T5Encoder.
+            Hyperparameters for the `T5Encoder`.
            See :func:`~texar.torch.modules.T5Encoder.default_hparams`
            for details.

+        `"decoder"`: dict
+            Hyperparameters for the `T5Decoder`.
+            See :func:`~texar.torch.modules.T5Decoder.default_hparams`
+            for details.
+
        `"hidden_size"`: int
-            Size of the pooler dense layer.
+            Size of the hidden layer.

        `"initializer"`: dict, optional
            Hyperparameters of the default initializer that initializes
@@ -301,7 +299,7 @@ def default_hparams():
                    'is_decoder': True,
                    'relative_attention_num_buckets': 32
                },
-                'name': 'encoder',
+                'name': 'decoder',
                'num_blocks': 12,
                'poswise_feedforward': {
                    'layers': [
@@ -335,10 +333,10 @@
    def forward(self,  # type: ignore
                inputs: Union[torch.Tensor, torch.LongTensor],
                sequence_length: Optional[torch.LongTensor] = None):
-        r"""
+        r"""Performs encoding and decoding.

        Args:
-            inputs: Either a **2D Tensor** of shape `[batch_size, max_time]`,
+            inputs: Either a **2D Tensor** of shape ``[batch_size, max_time]``,
                containing the ids of tokens in input sequences, or
                a **3D Tensor** of shape `[batch_size, max_time, vocab_size]`,
                containing soft token ids (i.e., weights or probabilities)
@@ -348,6 +346,14 @@ def forward(self,  # type: ignore
                lengths are masked out automatically.

        Returns:
+            A pair :attr:`(encoder_output, decoder_output)`
+
+            - :attr:`encoder_output`: A Tensor of shape
+              `[batch_size, max_time, dim]` containing the encoded vectors.
+
+            - :attr:`decoder_output`: An instance of
+              :class:`~texar.torch.modules.TransformerDecoderOutput` which
+              contains `sample_id` and `logits`.
        """
        if inputs.dim() == 2:
            word_embeds = self.word_embedder(ids=inputs)
@@ -373,7 +379,6 @@ def forward(self,  # type: ignore

    @property
    def output_size(self):
-        r"""The feature size of :meth:`forward` output
-        :attr:`pooled_output`.
+        r"""The feature size of :meth:`forward` output of the encoder.
        """
        return self._hparams.hidden_size
diff --git a/texar/torch/modules/encoders/t5_encoder.py b/texar/torch/modules/encoders/t5_encoder.py
index d5e741363..a8d0cca20 100644
--- a/texar/torch/modules/encoders/t5_encoder.py
+++ b/texar/torch/modules/encoders/t5_encoder.py
@@ -25,10 +25,10 @@

 class T5Encoder(TransformerEncoder):
    r"""Transformer based encoder that applies multi-head self attention with
-    relative positional representations for encoding sequences for T5.
+    relative positional representations for encoding sequences for T5.

    This module basically stacks
-    :class:`~texar.torch.modules.MultiheadRPRAttention`,
+    :class:`~texar.torch.modules.pretrained.t5_utils.MultiheadRPRAttention`,
    :class:`~texar.torch.modules.FeedForwardNetwork` and residual connections.
    This module supports the standard T5 architecture proposed in
    `(Raffel et al.) "Exploring the Limits of Transfer Learning with a Unified
@@ -49,8 +49,7 @@ def __init__(self, hparams=None):
                                        eps=self._hparams.eps)

    def initialize_blocks(self):
-        r""" Helper function to initialize blocks.
-
+        r"""Helper function to initialize blocks.
        """
        for i in range(self._hparams.num_blocks):
            mh_attn = MultiheadRPRAttention(
@@ -106,6 +105,7 @@ def default_hparams():
                'relative_attention_num_buckets': 32
            },
            "initializer": None,
+            "eps": 1e-6,
            "name": "t5_encoder"
        }

@@ -179,7 +179,7 @@ def forward(self,  # type: ignore
        Args:
            inputs: A 3D Tensor of shape ``[batch_size, max_time, dim]``,
                containing the embedding of input sequences. Note that
-                the embedding dimension `dim` must equal "dim" in
+                the embedding dimension `dim` must equal `"dim"` in
                :attr:`hparams`. The input embedding is typically an
                aggregation of word embedding and position embedding.
            sequence_length: A 1D :tensor:`LongTensor` of shape
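
Reviewer note (not part of either patch): a minimal usage sketch of the module these commits document, assuming texar-pytorch with both patches applied and network access to fetch the "T5-Small" checkpoint. Names, hparams, and shapes follow the docstrings above; the toy tensors are made up for illustration.

    import torch
    import texar.torch as tx

    # Build the raw pre-trained model described by PretrainedT5Mixin;
    # "T5-Small" is the default documented in default_hparams().
    model = tx.modules.T5EncoderDecoder(pretrained_model_name="T5-Small")

    # Toy batch of token ids; 32128 is the documented default "vocab_size".
    inputs = torch.randint(0, 32128, (2, 16))   # [batch_size, max_time]
    sequence_length = torch.tensor([16, 12])    # valid lengths per example

    # Per the new "Returns" section, forward() yields a pair.
    encoder_output, decoder_output = model(inputs, sequence_length)

    print(encoder_output.shape)         # [batch_size, max_time, dim]
    print(decoder_output.logits.shape)  # TransformerDecoderOutput: logits, sample_id
    print(model.output_size)            # equals hparams.hidden_size

If downloading a checkpoint is undesirable (e.g., in CI), constructing with hparams={"pretrained_model_name": None} should give a randomly initialized model, per the `"pretrained_model_name"` description in the patched docstring.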