14 changes: 6 additions & 8 deletions texar/torch/modules/decoders/t5_decoder.py
@@ -34,7 +34,7 @@ class T5Decoder(TransformerDecoder):
position representation for sequence decoding.

It is a stack of
:class:`~texar.torch.modules.MultiheadRPRAttention`,
:class:`~texar.torch.modules.pretrained.t5_utils.MultiheadRPRAttention`,
:class:`~texar.torch.modules.FeedForwardNetwork`, and residual connections.

Args:
@@ -86,21 +86,18 @@ def __init__(self,
token_embedder: Optional[TokenEmbedder] = None,
token_pos_embedder: Optional[TokenPosEmbedder] = None,
vocab_size: Optional[int] = None,
output_layer: Optional[Union[nn.Module,
torch.Tensor,
]] = None,
output_layer: Optional[Union[nn.Module, torch.Tensor]] = None,
hparams=None):
super().__init__(
token_embedder, token_pos_embedder,
vocab_size=vocab_size,
output_layer=output_layer,
hparams=hparams)
vocab_size=vocab_size, output_layer=output_layer, hparams=hparams)

self.final_layer_norm = T5LayerNorm(self._input_size, # type: ignore
eps=self._hparams.eps)

def initialize_blocks(self):

r"""Helper function to initialize blocks.
"""
for i in range(self._hparams.num_blocks):
attn_module = MultiheadRPRAttention(
self._input_size,
@@ -161,6 +158,7 @@ def default_hparams():
'relative_attention_num_buckets': 32
},
"initializer": None,
"eps": 1e-6,
"name": "t5_decoder"

# Additional for TransformerDecoder
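The `eps` entry added to the decoder's default hparams above feeds the final `T5LayerNorm`. A minimal construction sketch under stated assumptions (texar-torch installed; the vocabulary size, embedding dimension, and the standalone `WordEmbedder` below are illustrative, not part of this change):

    from texar.torch.modules import WordEmbedder
    from texar.torch.modules.decoders.t5_decoder import T5Decoder

    # T5 uses relative position representations, so no position embedder
    # is passed; the word embedder here is purely illustrative.
    embedder = WordEmbedder(vocab_size=32128, hparams={"dim": 512})
    decoder = T5Decoder(
        token_embedder=embedder,
        vocab_size=32128,                      # illustrative vocabulary size
        hparams={"dim": 512, "eps": 1e-6},     # "eps" is the new layer-norm epsilon
    )
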
77 changes: 41 additions & 36 deletions texar/torch/modules/encoder_decoders/t5_encoder_decoder.py
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
T5 Model
T5 Model.
"""

from typing import Optional, Union
@@ -39,9 +39,8 @@ class T5EncoderDecoder(EncoderDecoderBase, PretrainedT5Mixin):

This module basically stacks
:class:`~texar.torch.modules.WordEmbedder`,
:class:`~texar.torch.modules.TransformerEncoder`,
:class:`~texar.torch.modules.TransformerDecoder` and a dense
pooler.
:class:`~texar.torch.modules.T5Encoder`, and
:class:`~texar.torch.modules.T5Decoder`.

Args:
pretrained_model_name (optional): a `str`, the name
@@ -100,10 +99,10 @@ def reset_parameters(self):
def default_hparams():
r"""Returns a dictionary of hyperparameters with default values.

* The encoder arch is determined by the constructor argument
* The model arch is determined by the constructor argument
:attr:`pretrained_model_name` if it's specified. In this case,
`hparams` are ignored.
* Otherwise, the encoder arch is determined by
* Otherwise, the model arch is determined by
`hparams['pretrained_model_name']` if it's specified. All other
configurations in `hparams` are ignored.
* If the above two are `None`, the encoder arch is defined by the
@@ -112,7 +111,7 @@ def default_hparams():
.. code-block:: python

{
"pretrained_model_name": "bert-base-uncased",
"pretrained_model_name": "T5-Small",
"embed": {
"dim": 768,
"name": "word_embeddings"
@@ -128,18 +127,20 @@
"num_heads": 12,
"num_units": 768,
"output_dim": 768,
"use_bias": True
"use_bias": False,
"is_decoder": False,
"relative_attention_num_buckets": 32,
},
"relative_attention_num_buckets": 32,
"name": "t5encoder",
"eps": 1e-6,
"name": "encoder",
"num_blocks": 12,
"poswise_feedforward": {
"layers": [
{
"kwargs": {
"in_features": 768,
"out_features": 3072,
"bias": True
"bias": False
},
"type": "Linear"
},
@@ -148,7 +149,7 @@
"kwargs": {
"in_features": 3072,
"out_features": 768,
"bias": True
"bias": False
},
"type": "Linear"
}
@@ -158,6 +159,7 @@
},

"decoder": {
"eps": 1e-6,
"dim": 768,
"embedding_dropout": 0.1,
"multihead_attention": {
@@ -166,19 +168,19 @@
"num_heads": 12,
"num_units": 768,
"output_dim": 768,
"use_bias": True,
"use_bias": False,
"is_decoder": True,
"relative_attention_num_buckets": 32,
},

"name": "t5coder",
"name": "decoder",
"num_blocks": 12,
"poswise_feedforward": {
"layers": [
{
"kwargs": {
"in_features": 768,
"out_features": 3072,
"bias": True
"bias": False
},
"type": "Linear"
},
@@ -187,7 +189,7 @@
"kwargs": {
"in_features": 3072,
"out_features": 768,
"bias": True
"bias": False
},
"type": "Linear"
}
@@ -202,34 +204,30 @@

Here:

The default parameters are values for uncased BERT-Base model.
The default parameters are values for T5-Small model.

`"pretrained_model_name"`: str or None
The name of the pre-trained BERT model. If None, the model
The name of the pre-trained T5 model. If None, the model
will be randomly initialized.

`"embed"`: dict
Hyperparameters for word embedding layer.

`"vocab_size"`: int
The vocabulary size of `inputs` in BERT model.

`"type_vocab_size"`: int
The vocabulary size of the `segment_ids` passed into `BertModel`.

`"position_embed"`: dict
Hyperparameters for position embedding layer.

`"position_size"`: int
The maximum sequence length that this model might ever be used with.
The vocabulary size of `inputs` in T5 model.

`"encoder"`: dict
Hyperparameters for the T5Encoder.
Hyperparameters for the `T5Encoder`.
See :func:`~texar.torch.modules.T5Encoder.default_hparams`
for details.

`"decoder"`: dict
Hyperparameters for the `T5Decoder`.
See :func:`~texar.torch.modules.T5Decoder.default_hparams`
for details.

`"hidden_size"`: int
Size of the pooler dense layer.
Size of the hidden layer.

`"initializer"`: dict, optional
Hyperparameters of the default initializer that initializes
@@ -301,7 +299,7 @@ def default_hparams():
'is_decoder': True,
'relative_attention_num_buckets': 32
},
'name': 'encoder',
'name': 'decoder',
'num_blocks': 12,
'poswise_feedforward': {
'layers': [
@@ -335,10 +333,10 @@ def default_hparams():
def forward(self, # type: ignore
inputs: Union[torch.Tensor, torch.LongTensor],
sequence_length: Optional[torch.LongTensor] = None):
r"""
r"""Performs encoding and decoding.

Args:
inputs: Either a **2D Tensor** of shape `[batch_size, max_time]`,
inputs: Either a **2D Tensor** of shape ``[batch_size, max_time]``,
containing the ids of tokens in input sequences, or
a **3D Tensor** of shape `[batch_size, max_time, vocab_size]`,
containing soft token ids (i.e., weights or probabilities)
@@ -348,6 +346,14 @@ def forward(self, # type: ignore
lengths are masked out automatically.

Returns:
A pair :attr:`(encoder_output, decoder_output)`

- :attr:`encoder_output`: A Tensor of shape
`[batch_size, max_time, dim]` containing the encoded vectors.

- :attr:`decoder_output`: An instance of
:class:`~texar.torch.modules.TransformerDecoderOutput` which
contains `sample_id` and `logits`.
"""
if inputs.dim() == 2:
word_embeds = self.word_embedder(ids=inputs)
@@ -373,7 +379,6 @@ def forward(self, # type: ignore

@property
def output_size(self):
r"""The feature size of :meth:`forward` output
:attr:`pooled_output`.
r"""The feature size of :meth:`forward` output of the encoder.
"""
return self._hparams.hidden_size
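To make the updated return documentation concrete, here is a short usage sketch (an assumption-laden illustration, not part of the diff: texar-torch installed, `"T5-Small"` taken from the defaults above, random token ids as input):

    import torch
    from texar.torch.modules import T5EncoderDecoder

    model = T5EncoderDecoder(pretrained_model_name="T5-Small")
    ids = torch.randint(0, 1000, (2, 16))      # [batch_size, max_time] token ids
    lengths = torch.tensor([16, 12])
    encoder_output, decoder_output = model(inputs=ids, sequence_length=lengths)
    # encoder_output: [batch_size, max_time, dim]
    # decoder_output: TransformerDecoderOutput carrying `logits` and `sample_id`
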
10 changes: 5 additions & 5 deletions texar/torch/modules/encoders/t5_encoder.py
@@ -25,10 +25,10 @@

class T5Encoder(TransformerEncoder):
r"""Transformer based encoder that applies multi-head self attention with
relative positional representations for encoding sequences for T5.
relative positional representations for encoding sequences for T5.

This module basically stacks
:class:`~texar.torch.modules.MultiheadRPRAttention`,
:class:`~texar.torch.modules.pretrained.t5_utils.MultiheadRPRAttention`,
:class:`~texar.torch.modules.FeedForwardNetwork` and residual connections.
This module supports the standard T5 architecture proposed in
`(Raffel et al.) "Exploring the Limits of Transfer Learning with a Unified
@@ -49,8 +49,7 @@ def __init__(self, hparams=None):
eps=self._hparams.eps)

def initialize_blocks(self):
r""" Helper function to initialize blocks.

r"""Helper function to initialize blocks.
"""
for i in range(self._hparams.num_blocks):
mh_attn = MultiheadRPRAttention(
@@ -106,6 +105,7 @@ def default_hparams():
'relative_attention_num_buckets': 32
},
"initializer": None,
"eps": 1e-6,
"name": "t5_encoder"
}

@@ -179,7 +179,7 @@ def forward(self, # type: ignore
Args:
inputs: A 3D Tensor of shape ``[batch_size, max_time, dim]``,
containing the embedding of input sequences. Note that
the embedding dimension `dim` must equal "dim" in
the embedding dimension `dim` must equal `"dim"` in
:attr:`hparams`. The input embedding is typically an
aggregation of word embedding and position embedding.
sequence_length: A 1D :tensor:`LongTensor` of shape
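For reference, a minimal sketch of calling the encoder directly with pre-computed embeddings; the only assumption beyond this diff is that default hparams are used, and the embedding dimension is read back from the module so it matches whatever `"dim"` resolves to:

    import torch
    from texar.torch.modules import T5Encoder

    encoder = T5Encoder()                      # default hparams
    dim = encoder.hparams.dim                  # must match the last axis of `inputs`
    embeds = torch.randn(2, 16, dim)           # [batch_size, max_time, dim]
    lengths = torch.tensor([16, 10])
    outputs = encoder(inputs=embeds, sequence_length=lengths)
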
21 changes: 12 additions & 9 deletions texar/torch/modules/pretrained/t5.py
@@ -62,16 +62,16 @@ def _generate_t5_file_list(ckpt_tuple: tuple) -> List[str]:

class PretrainedT5Mixin(PretrainedMixin, ABC):
r"""A mixin class to support loading pre-trained checkpoints for modules
that implement the T5 model.
that implement the T5 model.

The T5 model treats multiple NLP tasks in a similar manner by encoding the
different tasks as text directives in the input stream. This enables a
single model to be trained supervised on a wide variety of NLP tasks.

The T5 model examines factors relevant for leveraging transfer learning
at scale from pure unsupervised pre-training to supervised tasks. It is
discussed in much detail in `Exploring the Limits of Transfer Learning
with a Unified Text-to-Text Transformer` from Google.
The T5 model was proposed in
`Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer`_
by `Raffel et al.` from Google. It treats multiple NLP tasks in a similar
manner by encoding the different tasks as text directives in the input
stream. This enables a single model to be trained supervised on a wide
variety of NLP tasks. The T5 model examines factors relevant for leveraging
transfer learning at scale from pure unsupervised pre-training to
supervised tasks.

The available T5 models are as follows:

@@ -89,6 +89,9 @@ class PretrainedT5Mixin(PretrainedMixin, ABC):
decoding stack.
* :class:`~texar.torch.modules.T5EncoderDecoder` as a raw pre-trained
model.

.. _`Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer`:
https://arxiv.org/abs/1910.10683
"""
_MODEL_NAME = "T5"

Expand Down