Commit c020674

clean up unused classifier_dropout in config (huggingface#20596)
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2 people authored and amyeroberts committed Dec 7, 2022
1 parent 9be5692 commit c020674
Showing 6 changed files with 0 additions and 24 deletions.
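The removed attribute was declared and stored by each of these configs but never read by the corresponding modeling code. As a quick sanity check before reading the diffs, a grep-style scan over a local clone can confirm this; the sketch below is hypothetical and assumes the usual src/transformers/models/<name>/modeling_*.py layout of the repository.

    # Sanity-check sketch: confirm no modeling file still reads the attribute.
    # Assumes a local checkout of huggingface/transformers at this commit.
    from pathlib import Path

    MODELS = ["blenderbot", "blenderbot_small", "chinese_clip",
              "marian", "pegasus", "speech_to_text_2"]

    for name in MODELS:
        for path in Path(f"src/transformers/models/{name}").glob("modeling_*.py"):
            if "classifier_dropout" in path.read_text():
                print(f"{path}: still references classifier_dropout")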

4 changes: 0 additions & 4 deletions src/transformers/models/blenderbot/configuration_blenderbot.py

@@ -71,8 +71,6 @@ class BlenderbotConfig(PretrainedConfig):
             The dropout ratio for the attention probabilities.
         activation_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for activations inside the fully connected layer.
-        classifier_dropout (`float`, *optional*, defaults to 0.0):
-            The dropout ratio for classifier.
         max_position_embeddings (`int`, *optional*, defaults to 128):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
@@ -131,7 +129,6 @@ def __init__(
         activation_dropout=0.0,
         init_std=0.02,
         decoder_start_token_id=1,
-        classifier_dropout=0.0,
         scale_embedding=False,
         pad_token_id=0,
         bos_token_id=1,
@@ -156,7 +153,6 @@ def __init__(
         self.init_std = init_std
         self.encoder_layerdrop = encoder_layerdrop
         self.decoder_layerdrop = decoder_layerdrop
-        self.classifier_dropout = classifier_dropout
         self.use_cache = use_cache
         self.num_hidden_layers = encoder_layers
         self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
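With the three deletions above, a freshly constructed BlenderbotConfig no longer defines the dead attribute at all. A minimal sketch of the user-visible effect, assuming a transformers version that includes this commit:

    from transformers import BlenderbotConfig

    config = BlenderbotConfig()
    # No Blenderbot module ever read this attribute, so it is simply gone.
    print(hasattr(config, "classifier_dropout"))  # False after this commit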

4 changes: 0 additions & 4 deletions src/transformers/models/blenderbot_small/configuration_blenderbot_small.py

@@ -71,8 +71,6 @@ class BlenderbotSmallConfig(PretrainedConfig):
             The dropout ratio for the attention probabilities.
         activation_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for activations inside the fully connected layer.
-        classifier_dropout (`float`, *optional*, defaults to 0.0):
-            The dropout ratio for classifier.
         max_position_embeddings (`int`, *optional*, defaults to 512):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
@@ -131,7 +129,6 @@ def __init__(
         activation_dropout=0.0,
         init_std=0.02,
         decoder_start_token_id=1,
-        classifier_dropout=0.0,
         scale_embedding=False,
         pad_token_id=0,
         bos_token_id=1,
@@ -155,7 +152,6 @@ def __init__(
         self.init_std = init_std
         self.encoder_layerdrop = encoder_layerdrop
         self.decoder_layerdrop = decoder_layerdrop
-        self.classifier_dropout = classifier_dropout
         self.use_cache = use_cache
         self.num_hidden_layers = encoder_layers
         self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True

4 changes: 0 additions & 4 deletions src/transformers/models/chinese_clip/configuration_chinese_clip.py

@@ -87,8 +87,6 @@ class ChineseCLIPTextConfig(PretrainedConfig):
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
             relevant if `config.is_decoder=True`.
-        classifier_dropout (`float`, *optional*):
-            The dropout ratio for the classification head.
 
     Example:
 
@@ -124,7 +122,6 @@ def __init__(
         pad_token_id=0,
         position_embedding_type="absolute",
         use_cache=True,
-        classifier_dropout=None,
         **kwargs
     ):
         super().__init__(pad_token_id=pad_token_id, **kwargs)
@@ -144,7 +141,6 @@ def __init__(
         self.layer_norm_eps = layer_norm_eps
         self.position_embedding_type = position_embedding_type
         self.use_cache = use_cache
-        self.classifier_dropout = classifier_dropout
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
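Beyond the constructor, the removal also cleans up serialization: to_dict() only reflects attributes the config actually sets, so newly saved config.json files stop advertising the unused knob. A small sketch, assuming ChineseCLIPTextConfig is importable from the top-level transformers namespace:

    from transformers import ChineseCLIPTextConfig

    config = ChineseCLIPTextConfig()
    # Newly serialized configs no longer carry the removed, unused key.
    print("classifier_dropout" in config.to_dict())  # False after this commit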

4 changes: 0 additions & 4 deletions src/transformers/models/marian/configuration_marian.py

@@ -69,8 +69,6 @@ class MarianConfig(PretrainedConfig):
             The dropout ratio for the attention probabilities.
         activation_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for activations inside the fully connected layer.
-        classifier_dropout (`float`, *optional*, defaults to 0.0):
-            The dropout ratio for classifier.
         max_position_embeddings (`int`, *optional*, defaults to 1024):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
@@ -130,7 +128,6 @@ def __init__(
         activation_dropout=0.0,
         init_std=0.02,
         decoder_start_token_id=58100,
-        classifier_dropout=0.0,
         scale_embedding=False,
         pad_token_id=58100,
         eos_token_id=0,
@@ -155,7 +152,6 @@ def __init__(
         self.init_std = init_std
         self.encoder_layerdrop = encoder_layerdrop
         self.decoder_layerdrop = decoder_layerdrop
-        self.classifier_dropout = classifier_dropout
         self.use_cache = use_cache
         self.num_hidden_layers = encoder_layers
         self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
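Dropping the explicit parameter should not break older checkpoints: PretrainedConfig absorbs unrecognized keys from a saved config.json (or from **kwargs) as plain attributes, so a stale classifier_dropout entry is carried along harmlessly and simply never consumed. A hedged sketch of that behavior:

    from transformers import MarianConfig

    # Simulate loading an older config.json that still contains the removed key.
    old_dict = MarianConfig().to_dict()
    old_dict["classifier_dropout"] = 0.0
    config = MarianConfig.from_dict(old_dict)
    # The stale key survives as a plain attribute but is no longer used anywhere.
    print(getattr(config, "classifier_dropout", None))  # 0.0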

4 changes: 0 additions & 4 deletions src/transformers/models/pegasus/configuration_pegasus.py

@@ -64,8 +64,6 @@ class PegasusConfig(PretrainedConfig):
             The dropout ratio for the attention probabilities.
         activation_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for activations inside the fully connected layer.
-        classifier_dropout (`float`, *optional*, defaults to 0.0):
-            The dropout ratio for classifier.
         max_position_embeddings (`int`, *optional*, defaults to 1024):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
@@ -124,7 +122,6 @@ def __init__(
         activation_dropout=0.0,
         init_std=0.02,
         decoder_start_token_id=0,
-        classifier_dropout=0.0,
         scale_embedding=False,
         pad_token_id=0,
         eos_token_id=1,
@@ -147,7 +144,6 @@ def __init__(
         self.init_std = init_std
         self.encoder_layerdrop = encoder_layerdrop
         self.decoder_layerdrop = decoder_layerdrop
-        self.classifier_dropout = classifier_dropout
         self.use_cache = use_cache
         self.num_hidden_layers = encoder_layers
         self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True

4 changes: 0 additions & 4 deletions src/transformers/models/speech_to_text_2/configuration_speech_to_text_2.py

@@ -60,8 +60,6 @@ class Speech2Text2Config(PretrainedConfig):
             The dropout ratio for the attention probabilities.
         activation_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for activations inside the fully connected layer.
-        classifier_dropout (`float`, *optional*, defaults to 0.0):
-            The dropout ratio for classifier.
         init_std (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
             https://arxiv.org/abs/1909.11556>`__ for more details.
@@ -109,7 +107,6 @@ def __init__(
         activation_dropout=0.0,
         init_std=0.02,
         decoder_start_token_id=2,
-        classifier_dropout=0.0,
         scale_embedding=True,
         pad_token_id=1,
         bos_token_id=0,
@@ -129,7 +126,6 @@ def __init__(
         self.activation_function = activation_function
         self.init_std = init_std
         self.decoder_layerdrop = decoder_layerdrop
-        self.classifier_dropout = classifier_dropout
         self.use_cache = use_cache
         self.num_hidden_layers = decoder_layers
         self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
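Taken together, the six deletions leave none of these configs exposing a dropout knob that no model consumes. A closing sanity check over all touched classes, again assuming a transformers version that includes this commit:

    from transformers import (BlenderbotConfig, BlenderbotSmallConfig,
                              ChineseCLIPTextConfig, MarianConfig,
                              PegasusConfig, Speech2Text2Config)

    for cls in (BlenderbotConfig, BlenderbotSmallConfig, ChineseCLIPTextConfig,
                MarianConfig, PegasusConfig, Speech2Text2Config):
        # Each default config should have dropped the unused attribute.
        assert not hasattr(cls(), "classifier_dropout"), cls.__name__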
