This repository has been archived by the owner on Dec 16, 2022. It is now read-only.

Merge branch 'TransformerToolkitUpdates' into Tango
dirkgr committed Jun 18, 2021
2 parents e93ef1d + 32fda86 commit 43a200b
Showing 4 changed files with 9 additions and 27 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -12,6 +12,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Added `on_backward` training callback which allows for control over backpropagation and gradient manipulation.
- Added `AdversarialBiasMitigator`, a Model wrapper to adversarially mitigate biases in predictions produced by a pretrained model for a downstream task.
- Added `which_loss` parameter to `ensure_model_can_train_save_and_load` in `ModelTestCase` to specify which loss to test.
- The activation layer in the transformer toolkit can now be queried for its output dimension (see the sketch after this file's diff).
- `TransformerEmbeddings` now takes, but ignores, a parameter for the attention mask. This is needed for compatibility with some other modules that get called the same way and use the mask.
- `TransformerPooler` can now be instantiated from a pretrained transformer module, just like the other modules in the transformer toolkit.

### Fixed

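The activation-layer entry above refers to code that is not among the files changed in this diff. As a rough sketch only, assuming the accessor follows AllenNLP's usual `get_output_dim()` convention (the exact method name is not confirmed here) and mirroring the `ActivationLayer` constructor call visible in `transformer_pooler.py` further down:

# Sketch only: the import path and get_output_dim() are assumptions based on
# AllenNLP conventions; the positional arguments (hidden_size, intermediate_size,
# activation) mirror the super().__init__ call shown in transformer_pooler.py below.
from allennlp.modules.transformer.activation_layer import ActivationLayer

layer = ActivationLayer(768, 3072, "relu")
print(layer.get_output_dim())  # expected to report the layer's output width, i.e. 3072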
3 changes: 2 additions & 1 deletion allennlp/modules/transformer/transformer_embeddings.py
@@ -104,7 +104,7 @@ class TransformerEmbeddings(Embeddings):
Optionally apply a linear transform after the dropout, projecting to `output_size`.
"""

_pretrained_relevant_module = ["embeddings", "bert.embeddings"]
_pretrained_relevant_module = ["embeddings", "bert.embeddings", "roberta.embeddings"]
_pretrained_mapping = {
"LayerNorm": "layer_norm",
"word_embeddings": "embeddings.word_embeddings",
@@ -163,6 +163,7 @@ def forward( # type: ignore
input_ids: torch.Tensor,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
) -> torch.Tensor:

"""
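To make the `attention_mask` change above concrete: the parameter is accepted purely for signature compatibility and has no effect on the output. A minimal sketch, assuming `from_pretrained_module` accepts a Hugging Face model name here as it does for other toolkit modules, and that the weights can be downloaded:

import torch

from allennlp.modules.transformer import TransformerEmbeddings

# Assumption: loading by model name works for this module; the relevant
# submodule is located via the _pretrained_relevant_module candidates above.
embeddings = TransformerEmbeddings.from_pretrained_module("bert-base-cased")
embeddings.eval()  # disable dropout so the two calls below are comparable

input_ids = torch.tensor([[101, 2023, 102]])  # arbitrary valid token ids
mask = torch.ones_like(input_ids)

# The mask is accepted but ignored, so both calls produce the same embeddings.
with_mask = embeddings(input_ids, attention_mask=mask)
without_mask = embeddings(input_ids)
assert torch.allclose(with_mask, without_mask)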
28 changes: 3 additions & 25 deletions allennlp/modules/transformer/transformer_pooler.py
@@ -1,4 +1,4 @@
from typing import Dict, Optional, Any, Union, TYPE_CHECKING
from typing import Union, TYPE_CHECKING

import torch

@@ -11,7 +11,7 @@

class TransformerPooler(ActivationLayer, FromParams):

_pretrained_relevant_module = ["pooler", "bert.pooler"]
_pretrained_relevant_module = ["pooler", "bert.pooler", "roberta.pooler"]

def __init__(
self,
@@ -21,28 +21,6 @@ def __init__(
):
super().__init__(hidden_size, intermediate_size, activation, pool=True)

@classmethod
def _get_input_arguments(
cls,
pretrained_module: torch.nn.Module,
source: str = "huggingface",
mapping: Optional[Dict[str, str]] = None,
**kwargs,
) -> Dict[str, Any]:
final_kwargs = {}

final_kwargs["hidden_size"] = pretrained_module.dense.in_features
final_kwargs["intermediate_size"] = pretrained_module.dense.out_features
final_kwargs["activation"] = pretrained_module.activation

final_kwargs.update(kwargs)

return final_kwargs

@classmethod
def _from_config(cls, config: "PretrainedConfig", **kwargs):
return cls(
config.hidden_size,
config.hidden_size,
"tanh" # BERT has this hardcoded
)
return cls(config.hidden_size, config.hidden_size, "tanh") # BERT has this hardcoded
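The upshot of this file's diff: `TransformerPooler` no longer needs its own `_get_input_arguments` override, because construction from a pretrained checkpoint now goes through the shared `_from_config` path, and the new "roberta.pooler" candidate lets that work for RoBERTa-style checkpoints as well. A usage sketch under the same assumption as above (that `from_pretrained_module` takes a model name):

import torch

from allennlp.modules.transformer import TransformerPooler

# Assumption: loading by Hugging Face model name; "roberta-base" exercises the
# newly added "roberta.pooler" entry in _pretrained_relevant_module.
pooler = TransformerPooler.from_pretrained_module("roberta-base")

# Per _from_config, hidden and intermediate sizes are both config.hidden_size
# (768 for roberta-base) and the activation is tanh.
hidden_states = torch.randn(2, 8, 768)  # (batch, seq_len, hidden_size)
pooled = pooler(hidden_states)          # expected shape: (2, 768)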
2 changes: 1 addition & 1 deletion allennlp/modules/transformer/transformer_stack.py
@@ -56,7 +56,7 @@ class TransformerStack(TransformerModule, FromParams):
"""

_pretrained_mapping = {"layer": "layers"}
_pretrained_relevant_module = ["encoder", "bert.encoder"]
_pretrained_relevant_module = ["encoder", "bert.encoder", "roberta.encoder"]

def __init__(
self,
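For context, `_pretrained_relevant_module` lists candidate submodule names that the toolkit's loading machinery searches for inside a pretrained model (roughly, the first candidate that matches is used, with `_pretrained_mapping` handling renames such as "layer" -> "layers"). Adding the roberta.* names in this commit lets the embeddings, stack, and pooler find their counterparts in RoBERTa-style checkpoints. A hedged sketch, again assuming loading by model name:

from allennlp.modules.transformer import TransformerStack

# Assumption: from_pretrained_module accepts a Hugging Face model name. With
# "roberta.encoder" listed as a candidate, the loader can locate the encoder
# inside a RoBERTa checkpoint and remap "layer" -> "layers" per _pretrained_mapping.
stack = TransformerStack.from_pretrained_module("roberta-base")
print(len(stack.layers))  # .layers inferred from the mapping above; 12 for roberta-base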
