Update GPT-2 Model List (#170)
* Update GPT-2 model list
gpengzhi committed Aug 23, 2019
1 parent 462559e commit a1f3ac5
Showing 11 changed files with 86 additions and 45 deletions.
24 changes: 11 additions & 13 deletions examples/gpt-2/README.md
@@ -41,7 +41,7 @@ pip install -r requirements.txt

This mode will initialize an interactive interface, which allows users to type in the context sentence. The model then
generates a continuation of the context. The example supports both Top-K and Top-P sample decoding. By default, the GPT-2
`117M` model with Top-K sample decoding is used.
`gpt2-small` model with Top-K sample decoding is used.

```bash
python gpt2_generate_main.py --interactive \
@@ -68,20 +68,18 @@ For *top-p decoding*:
- `--top-p`: Select tokens with cumulative probability of at most `p` as candidates for sampling. Do not specify it
if you want to use top-k decoding.

To use the GPT-2 `345M` model, specify `--pretrained-model-name`:
To use the GPT-2 `gpt2-medium` or `gpt2-large` model, specify `--pretrained-model-name`:

```bash
python gpt2_generate_main.py --interactive \
--max-decoding-length=100 \
--temperature=0.7 \
--top-k=40 \
--pretrained-model-name=345M
--pretrained-model-name=gpt2-medium
```

Here:

- `pretrained-model-name`: Name of the pre-trained checkpoint to load. Available options are: `117M` and `345M`.
Defaults to `345M`.
- `pretrained-model-name`: Name of the pre-trained checkpoint to load. Available options are: `gpt2-small`, `gpt2-medium`, and `gpt2-large`. Defaults to `gpt2-small`.
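The accepted names can also be listed programmatically. A small sketch (editorial, not part of this diff; it assumes `texar-pytorch` is installed and reuses the same helper that the scripts in this commit pass to the argparse `choices`):

```python
import texar.torch as tx

# Print the checkpoint names accepted by --pretrained-model-name.
# GPT2Decoder.available_checkpoints() is the classmethod used for the
# argparse `choices` in the scripts changed by this commit.
print(tx.modules.GPT2Decoder.available_checkpoints())
```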

To use Top-P sample decoding, specify `--top-p`:

@@ -90,7 +88,7 @@ python gpt2_generate_main.py --interactive \
--max-decoding-length=100 \
--temperature=0.7 \
--top-p=40 \
--pretrained-model-name=345M
--pretrained-model-name=gpt2-medium
```

Here:
@@ -102,7 +100,7 @@ Here:
**Example input:**

```
Model input >>> Micheal Jordan is the greatest player in history !
Model input >>> Michael Jordan is the greatest player in history !
```

**Example output:**
@@ -138,7 +136,7 @@ Here:
- `--nsamples`: Total number of samples to generate, must be divisible by the batch size.
- `--batch-size`: The batch size. Each iteration generates this many samples.

To use GPT-2 `345M` model, `--pretrained-model-name` as above.
To use GPT-2 `gpt2-medium` or `gpt2-large` model, specify `--pretrained-model-name` as above.

**Example output:**

@@ -170,7 +168,7 @@ perform processing such as truncation, BPE encoding, adding special tokens, etc:
python prepare_data.py --data-dir data/toy \
--max-seq-length=128 \
--output-dir=data/toy \
--pretrained-model-name=117M
--pretrained-model-name=gpt2-small
```

- `--data-dir`: The directory of raw data, wherein data files must be named as 'train.txt', 'dev.txt', or 'test.txt'. It
@@ -179,7 +177,7 @@ python prepare_data.py --data-dir data/toy \
automatically added. Longer sequences will be trimmed.
- `--output-dir`: The output path where the resulting pickled files will be put. By default, it is set to be the same
as `--data-dir`.
- `--pretrained-model-name`: The name of a pre-trained model to load selected in the list of: `117M`, `345M`.
- `--pretrained-model-name`: The name of a pre-trained model to load selected in the list of: `gpt2-small`, `gpt2-medium`, and `gpt2-large`.

The above command will output pickled files in the specified output directory. E.g., if `train.txt` is provided under
`data_dir`, the output file `train.pkl` will be produced under `output_dir`.
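As a quick sanity check, the pickled output can be loaded back with the standard library. A hypothetical snippet (editorial; the record layout is not documented in this excerpt, so it only inspects the object, and the path assumes the `--data-dir data/toy --output-dir data/toy` invocation shown above):

```python
import pickle

# Hypothetical sanity check on prepare_data.py's output.
with open("data/toy/train.pkl", "rb") as f:
    data = pickle.load(f)

print(type(data))
print(len(data) if hasattr(data, "__len__") else "no length")
```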
@@ -202,11 +200,11 @@ Here:
using your own data.
- `--output-dir`: The output path where checkpoints are saved.

By default, the GPT-2 `117M` model is used. To use the GPT-2 `345M` model instead, specify relevant arguments as below:
By default, the GPT-2 `gpt2-small` model is used. To use the GPT-2 `gpt2-medium` or `gpt2-large` model instead, specify relevant arguments as below:

```bash
python gpt2_train_main.py --do-train --do-eval \
--pretrained-model-name=345M \
--pretrained-model-name=gpt2-medium \
--config-train=configs.config_train \
--output-dir=output
```
2 changes: 1 addition & 1 deletion examples/gpt-2/gpt2_generate_main.py
@@ -27,7 +27,7 @@
'--checkpoint', type=str, default=None,
help="Model checkpoint to load model weights from.")
parser.add_argument(
"--pretrained-model-name", type=str, default="117M",
"--pretrained-model-name", type=str, default="gpt2-small",
choices=tx.modules.GPT2Decoder.available_checkpoints(),
help="Name of the pre-trained checkpoint to load.")
parser.add_argument(
2 changes: 1 addition & 1 deletion examples/gpt-2/gpt2_train_main.py
@@ -28,7 +28,7 @@
'--checkpoint', type=str, default=None,
help="Model checkpoint to load model weights from.")
parser.add_argument(
"--pretrained-model-name", type=str, default="117M",
"--pretrained-model-name", type=str, default="gpt2-small",
choices=tx.modules.GPT2Decoder.available_checkpoints(),
help="Name of the pre-trained checkpoint to load.")
parser.add_argument(
2 changes: 1 addition & 1 deletion examples/gpt-2/prepare_data.py
@@ -34,7 +34,7 @@
help="The output directory where the pickle files will be generated. "
"By default it is set to be the same as `--data-dir`.")
parser.add_argument(
"--pretrained-model-name", type=str, default="117M",
"--pretrained-model-name", type=str, default="gpt2-small",
choices=tx.modules.GPT2Decoder.available_checkpoints(),
help="Name of the pre-trained checkpoint to load.")
parser.add_argument(
2 changes: 1 addition & 1 deletion texar/torch/modules/classifiers/gpt2_classifier.py
@@ -45,7 +45,7 @@ class GPT2Classifier(ClassifierBase, PretrainedGPT2Mixin):
Args:
pretrained_model_name (optional): a `str`, the name
of pre-trained model (e.g., ``117M``). Please refer to
of pre-trained model (e.g., ``gpt2-small``). Please refer to
:class:`~texar.torch.modules.pretrained.PretrainedGPT2Mixin` for
all supported models.
If `None`, the model name in :attr:`hparams` is used.
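For illustration, a minimal usage sketch of the classifier with the renamed checkpoint (editorial, not part of this diff; the `(logits, preds)` return and the `sequence_length` argument are assumptions based on Texar's classifier interface, and constructing the module downloads the checkpoint):

```python
import torch
import texar.torch as tx

# Sketch only: classify two token-id sequences with a GPT-2 backbone.
classifier = tx.modules.GPT2Classifier(pretrained_model_name="gpt2-small")
input_ids = torch.randint(0, 50257, (2, 16))          # [batch, time]
logits, preds = classifier(input_ids,
                           sequence_length=torch.tensor([16, 16]))
print(logits.shape, preds.shape)
```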
8 changes: 4 additions & 4 deletions texar/torch/modules/decoders/gpt2_decoder.py
@@ -41,7 +41,7 @@ class GPT2Decoder(TransformerDecoder, PretrainedGPT2Mixin):
Args:
pretrained_model_name (optional): a `str`, the name
of pre-trained model (e.g., ``117M``). Please refer to
of pre-trained model (e.g., ``gpt2-small``). Please refer to
:class:`~texar.torch.modules.pretrained.PretrainedGPT2Mixin` for
all supported models.
If `None`, the model name in :attr:`hparams` is used.
@@ -104,7 +104,7 @@ def default_hparams():
{
"name": "gpt2_decoder",
"pretrained_model_name": "117M",
"pretrained_model_name": "gpt2-small",
"vocab_size": 50257,
"context_size": 1024,
"embedding_size": 768,
@@ -170,7 +170,7 @@ def default_hparams():
Here:
The default parameters are values for 117M GPT2 model.
The default parameters are values for 124M GPT2 model.
`"pretrained_model_name"`: str or None
The name of the pre-trained GPT2 model. If None, the model
@@ -239,7 +239,7 @@ def default_hparams():
'name': 'ffn'
},

'pretrained_model_name': '117M',
'pretrained_model_name': 'gpt2-small',
'vocab_size': 50257,
'context_size': 1024,
'embedding_size': 768,
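A short sketch of how the renamed checkpoints map onto these hparams (editorial, not part of this diff; constructing the module downloads the corresponding OpenAI checkpoint):

```python
import texar.torch as tx

# The old '117M'/'345M' names are kept as deprecated aliases by this commit;
# the new names select the 124M/355M/774M checkpoints directly.
decoder = tx.modules.GPT2Decoder(pretrained_model_name="gpt2-medium")
print(decoder.hparams.num_blocks)   # 24 for gpt2-medium (12 for small, 36 for large)
```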
23 changes: 16 additions & 7 deletions texar/torch/modules/decoders/gpt2_decoder_test.py
@@ -7,7 +7,8 @@

from texar.torch.modules.decoders import decoder_helpers
from texar.torch.modules.decoders.gpt2_decoder import GPT2Decoder
from texar.torch.modules.decoders.transformer_decoders import TransformerDecoderOutput
from texar.torch.modules.decoders.transformer_decoders import \
TransformerDecoderOutput
from texar.torch.utils.test import pretrained_test


@@ -29,16 +30,16 @@ def test_hparams(self):
"""
# case 1: set "pretrained_model_name" by constructor argument
hparams = {
"pretrained_model_name": "345M",
"pretrained_model_name": "gpt2-medium",
}
decoder = GPT2Decoder(pretrained_model_name="117M",
decoder = GPT2Decoder(pretrained_model_name="gpt2-small",
hparams=hparams)
self.assertEqual(decoder.hparams.num_blocks, 12)
_ = decoder(self.inputs)

# case 2: set "pretrained_model_name" by hparams
hparams = {
"pretrained_model_name": "117M",
"pretrained_model_name": "gpt2-small",
"num_blocks": 6,
}
decoder = GPT2Decoder(hparams=hparams)
@@ -67,19 +68,27 @@ def test_trainable_variables(self):
def get_variable_num(n_layers: int) -> int:
return 1 + 1 + n_layers * 26 + 2

# case 1: GPT2 117M
# case 1: GPT2 small
decoder = GPT2Decoder()
self.assertEqual(len(decoder.trainable_variables), get_variable_num(12))
_ = decoder(self.inputs)

# case 2: GPT2 345M
# case 2: GPT2 medium
hparams = {
"pretrained_model_name": "345M",
"pretrained_model_name": "gpt2-medium",
}
decoder = GPT2Decoder(hparams=hparams)
self.assertEqual(len(decoder.trainable_variables), get_variable_num(24))
_ = decoder(self.inputs)

# case 3: GPT2 large
hparams = {
"pretrained_model_name": "gpt2-large",
}
decoder = GPT2Decoder(hparams=hparams)
self.assertEqual(len(decoder.trainable_variables), get_variable_num(36))
_ = decoder(self.inputs)

# case 4: self-designed GPT2
hparams = {
"pretrained_model_name": None,
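The variable counts asserted above follow directly from the test helper; a quick worked check (editorial; the breakdown named in the comments is an inference, the arithmetic is not):

```python
def get_variable_num(n_layers: int) -> int:
    # Presumably: word embedding + position embedding
    # + 26 parameter tensors per decoder block + 2 for the final layer norm.
    return 1 + 1 + n_layers * 26 + 2

for name, n_layers in [("gpt2-small", 12), ("gpt2-medium", 24), ("gpt2-large", 36)]:
    print(name, get_variable_num(n_layers))
# gpt2-small 316, gpt2-medium 628, gpt2-large 940
```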
8 changes: 4 additions & 4 deletions texar/torch/modules/encoders/gpt2_encoder.py
@@ -39,7 +39,7 @@ class GPT2Encoder(TransformerEncoder, PretrainedGPT2Mixin):
Args:
pretrained_model_name (optional): a `str`, the name
of pre-trained model (e.g., ``117M``). Please refer to
of pre-trained model (e.g., ``gpt2-small``). Please refer to
:class:`~texar.torch.modules.pretrained.PretrainedGPT2Mixin` for
all supported models.
If `None`, the model name in :attr:`hparams` is used.
@@ -93,7 +93,7 @@ def default_hparams():
.. code-block:: python
{
"pretrained_model_name": "117M",
"pretrained_model_name": "gpt2-small",
"vocab_size": 50257,
"context_size": 1024,
"embedding_size": 768,
@@ -159,7 +159,7 @@ def default_hparams():
Here:
The default parameters are values for 117M GPT2 model.
The default parameters are values for 124M GPT2 model.
`"pretrained_model_name"`: str or None
The name of the pre-trained GPT2 model. If None, the model
@@ -238,7 +238,7 @@ def default_hparams():
'name': 'ffn'
},

'pretrained_model_name': '117M',
'pretrained_model_name': 'gpt2-small',
'vocab_size': 50257,
'context_size': 1024,
'embedding_size': 768,
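A minimal encoding sketch with the default checkpoint (editorial, not part of this diff; the output shape follows the hparams listed above, and the first run downloads the gpt2-small weights):

```python
import torch
import texar.torch as tx

# Encode a small batch of random token ids with the default gpt2-small encoder.
encoder = tx.modules.GPT2Encoder()            # pretrained_model_name defaults to "gpt2-small"
token_ids = torch.randint(0, 50257, (2, 8))   # [batch, time]
outputs = encoder(token_ids)
print(outputs.shape)                          # expected: torch.Size([2, 8, 768])
```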
22 changes: 15 additions & 7 deletions texar/torch/modules/encoders/gpt2_encoder_test.py
@@ -31,16 +31,16 @@ def test_hparams(self):
"""
# case 1: set "pretrained_model_name" by constructor argument
hparams = {
"pretrained_model_name": "345M",
"pretrained_model_name": "gpt2-medium",
}
encoder = GPT2Encoder(pretrained_model_name="117M",
encoder = GPT2Encoder(pretrained_model_name="gpt2-small",
hparams=hparams)
self.assertEqual(encoder.hparams.num_blocks, 12)
_ = encoder(self.inputs)

# case 2: set "pretrained_model_name" by hparams
hparams = {
"pretrained_model_name": "117M",
"pretrained_model_name": "gpt2-small",
"num_blocks": 6,
}
encoder = GPT2Encoder(hparams=hparams)
@@ -70,19 +70,27 @@ def test_trainable_variables(self):
def get_variable_num(n_layers: int) -> int:
return 1 + 1 + n_layers * 16 + 2

# case 1: GPT2 117M
# case 1: GPT2 small
encoder = GPT2Encoder()
self.assertEqual(len(encoder.trainable_variables), get_variable_num(12))
_ = encoder(self.inputs)

# case 2: GPT2 345M
# case 2: GPT2 medium
hparams = {
"pretrained_model_name": "345M",
"pretrained_model_name": "gpt2-medium",
}
encoder = GPT2Encoder(hparams=hparams)
self.assertEqual(len(encoder.trainable_variables), get_variable_num(24))
_ = encoder(self.inputs)

# case 3: GPT2 large
hparams = {
"pretrained_model_name": "gpt2-large",
}
encoder = GPT2Encoder(hparams=hparams)
self.assertEqual(len(encoder.trainable_variables), get_variable_num(36))
_ = encoder(self.inputs)

# case 4: self-designed GPT2
hparams = {
"pretrained_model_name": None,
@@ -95,7 +103,7 @@ def get_variable_num(n_layers: int) -> int:
def test_encode(self):
r"""Tests encoding.
"""
# case 1: GPT2 117M
# case 1: GPT2 small
hparams = {
"pretrained_model_name": None,
}
34 changes: 30 additions & 4 deletions texar/torch/modules/pretrained/pretrained_gpt2.py
@@ -18,6 +18,7 @@
import json
import os
import sys
import warnings
from abc import ABC
from typing import Any, Dict

@@ -46,18 +47,43 @@ class PretrainedGPT2Mixin(PretrainedMixin, ABC):
The available GPT2 models are as follows:
* ``117M``: Small version of GPT-2, 117M parameters.
* ``345M``: Medium version of GPT-2, 345M parameters.
* ``gpt2-small``: Small version of GPT-2, 124M parameters.
* ``gpt2-medium``: Medium version of GPT-2, 355M parameters.
* ``gpt2-large``: Large version of GPT-2, 774M parameters.
.. _`Language Models are Unsupervised Multitask Learners`:
https://openai.com/blog/better-language-models/
"""
_MODEL_NAME = "GPT2"
_MODEL2URL = {
'117M': [_GPT2_PATH + f"117M/{file}" for file in _CHECKPOINT_FILES],
'345M': [_GPT2_PATH + f"345M/{file}" for file in _CHECKPOINT_FILES],
'gpt2-small': [_GPT2_PATH + f"124M/{file}"
for file in _CHECKPOINT_FILES],
'gpt2-medium': [_GPT2_PATH + f"355M/{file}"
for file in _CHECKPOINT_FILES],
'gpt2-large': [_GPT2_PATH + f"774M/{file}"
for file in _CHECKPOINT_FILES],
}

# Raise warning for the deprecated pre-trained model names
class MyDict(dict):
def __contains__(self, key):
if key == '117M':
warnings.warn("Pre-trained model name '117M' is deprecated, "
"use 'gpt2-small' instead.", UserWarning)
return True
elif key == '345M':
warnings.warn("Pre-trained model name '345M' is deprecated, "
"use 'gpt2-medium' instead.", UserWarning)
return True
else:
return super().__contains__(key)
_DEPRECATED_MODEL2URL = {
'117M': [_GPT2_PATH + f"124M/{file}" for file in _CHECKPOINT_FILES],
'345M': [_GPT2_PATH + f"355M/{file}" for file in _CHECKPOINT_FILES],
}
_MODEL2URL.update(_DEPRECATED_MODEL2URL)
_MODEL2URL = MyDict(_MODEL2URL) # type: ignore

@classmethod
def _transform_config(cls, pretrained_model_name: str,
cache_dir: str) -> Dict[str, Any]:
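The deprecation shim added above can be exercised directly through the overridden membership test. A sketch (editorial; the import path is inferred from the file location in this diff, and `_MODEL2URL` is a private attribute used here only for illustration):

```python
import warnings

from texar.torch.modules.pretrained.pretrained_gpt2 import PretrainedGPT2Mixin

# Old names still resolve (to the 124M/355M URLs) but emit a UserWarning;
# the new names pass through silently.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    assert "117M" in PretrainedGPT2Mixin._MODEL2URL
    assert "gpt2-small" in PretrainedGPT2Mixin._MODEL2URL

print([str(w.message) for w in caught])
# ["Pre-trained model name '117M' is deprecated, use 'gpt2-small' instead."]
```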
4 changes: 2 additions & 2 deletions texar/torch/modules/pretrained/pretrained_gpt2_test.py
@@ -16,7 +16,7 @@ class GPT2UtilsTest(unittest.TestCase):
@pretrained_test
def test_load_pretrained_gpt2_AND_transform_gpt2_to_texar_config(self):
pretrained_model_dir = PretrainedGPT2Mixin.download_checkpoint(
pretrained_model_name="117M")
pretrained_model_name="gpt2-small")

info = list(os.walk(pretrained_model_dir))
_, _, files = info[0]
@@ -29,7 +29,7 @@ def test_load_pretrained_gpt2_AND_transform_gpt2_to_texar_config(self):
self.assertIn('vocab.bpe', files)

model_config = PretrainedGPT2Mixin._transform_config(
pretrained_model_name="117M",
pretrained_model_name="gpt2-small",
cache_dir=pretrained_model_dir)

exp_config = {
