Add Support for DeepSpeed ZeRO Stage 1 (#1059)
mjdenkowski committed Aug 25, 2022
1 parent d80169d commit 9690498
Showing 25 changed files with 651 additions and 61 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/push_pr.yml
@@ -30,6 +30,8 @@ jobs:
        run: python -m pip install --upgrade pip
      - name: Sockeye requirements
        run: pip install -r requirements/requirements.txt
      - name: DeepSpeed requirements
        run: pip install -r requirements/requirements.deepspeed.txt
      - name: Development requirements
        run: pip install -r requirements/requirements.dev.txt
      - name: Unit tests
2 changes: 2 additions & 0 deletions .github/workflows/python-publish.yml
@@ -24,6 +24,8 @@ jobs:
          pip install setuptools wheel twine
      - name: Sockeye requirements
        run: pip install -r requirements/requirements.txt
      - name: DeepSpeed requirements
        run: pip install -r requirements/requirements.deepspeed.txt
      - name: Development requirements
        run: pip install -r requirements/requirements.dev.txt
      - name: Unit tests
10 changes: 10 additions & 0 deletions CHANGELOG.md
@@ -11,6 +11,16 @@ Note that Sockeye has checks in place to not translate with an old model that wa

Each version section may have subsections for: _Added_, _Changed_, _Removed_, _Deprecated_, and _Fixed_.

## [3.1.20]

### Added

- Added training support for [DeepSpeed](https://www.deepspeed.ai/).
- Installation: `pip install deepspeed`
- Usage: `deepspeed --no_python ... sockeye-train ...`
- DeepSpeed mode uses Zero Redundancy Optimizer (ZeRO) stage 1 ([Rajbhandari et al., 2019](https://arxiv.org/abs/1910.02054v3)).
- Run in FP16 mode with `--deepspeed-fp16` or BF16 mode with `--deepspeed-bf16`.

## [3.1.19]

### Added
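For reference, a full launch command combining the options noted in this changelog entry might look like the following. The `pip install deepspeed`, `deepspeed --no_python`, `sockeye-train`, and `--deepspeed-fp16` pieces come from the commit itself; the data paths, output directory, and other `sockeye-train` options are illustrative only.

```bash
# Install DeepSpeed support (see requirements/requirements.deepspeed.txt).
pip install deepspeed

# Launch sockeye-train under the DeepSpeed launcher. The launcher injects
# --local_rank, which switches training into DeepSpeed mode (ZeRO stage 1).
# Data paths and the extra sockeye-train options below are illustrative.
deepspeed --no_python sockeye-train \
    --prepared-data train_data \
    --validation-source dev.src \
    --validation-target dev.trg \
    --output model_dir \
    --deepspeed-fp16
```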
1 change: 1 addition & 0 deletions MANIFEST.in
@@ -7,6 +7,7 @@ include .pylintrc
include .flake8
include typechecked-files
include test/data/config_with_missing_attributes.yaml
recursive-include test/data/deepspeed *
include test/data/model_3.0.x/*
include sockeye/git_version.py
include *.bib
1 change: 1 addition & 0 deletions requirements/requirements.deepspeed.txt
@@ -0,0 +1 @@
deepspeed
1 change: 1 addition & 0 deletions setup.py
@@ -76,6 +76,7 @@ def get_requirements(filename):
entry_points = {
    'console_scripts': [
        'sockeye-average = sockeye.average:main',
        'sockeye-convert-deepspeed = sockeye.convert_deepspeed:main',
        'sockeye-embeddings = sockeye.embeddings:main',
        'sockeye-evaluate = sockeye.evaluate:main',
        'sockeye-lexicon = sockeye.lexicon:main',
2 changes: 1 addition & 1 deletion sockeye/__init__.py
@@ -11,4 +11,4 @@
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.

__version__ = '3.1.19'
__version__ = '3.1.20'
18 changes: 18 additions & 0 deletions sockeye/arguments.py
@@ -1043,6 +1043,24 @@ def add_training_args(params):
                              nargs='*',
                              help="Manually specify names of parameters to fix during training. Default: %(default)s.")

    # DeepSpeed arguments
    train_params.add_argument('--local_rank',
                              type=int_greater_or_equal(0),
                              default=None,
                              help='The DeepSpeed launcher (`deepspeed`) automatically adds this argument. When it is '
                                   'present, training runs in DeepSpeed mode. This argument does not need to be '
                                   'specified manually.')
    train_params.add_argument('--deepspeed-fp16',
                              action='store_true',
                              default=False,
                              help='Run the model in float16 mode with float32 master weights and dynamic loss '
                                   'scaling. This is similar to --apex-amp. Default: %(default)s.')
    train_params.add_argument('--deepspeed-bf16',
                              action='store_true',
                              default=False,
                              help='Run the model in bfloat16 mode, which does not require loss scaling. '
                                   'Default: %(default)s.')

    train_params.add_argument(C.TRAIN_ARGS_MONITOR_BLEU,
                              default=500,
                              type=int,
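This excerpt does not show how these flags are translated into a DeepSpeed configuration. Conceptually, they map onto standard DeepSpeed config fields; the sketch below illustrates that mapping under ZeRO stage 1 and is not the code this commit adds — the function name and batch-size argument are assumptions.

```python
# Illustrative only: roughly how --deepspeed-fp16 / --deepspeed-bf16 and ZeRO
# stage 1 correspond to standard DeepSpeed config fields. Not the commit's code.
from typing import Any, Dict


def build_deepspeed_config(fp16: bool, bf16: bool, batch_size: int) -> Dict[str, Any]:
    config: Dict[str, Any] = {
        'train_micro_batch_size_per_gpu': batch_size,
        'zero_optimization': {'stage': 1},  # Zero Redundancy Optimizer, stage 1
    }
    if fp16:
        # float16 with float32 master weights and dynamic loss scaling
        config['fp16'] = {'enabled': True}
    if bf16:
        # bfloat16 does not require loss scaling
        config['bf16'] = {'enabled': True}
    return config
```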
3 changes: 2 additions & 1 deletion sockeye/constants.py
@@ -162,6 +162,7 @@
TRAINING_STATE_DIRNAME = "training_state"
TRAINING_STATE_TEMP_DIRNAME = "tmp.training_state"
TRAINING_STATE_TEMP_DELETENAME = "delete.training_state"
TRAINING_STATE_DEEPSPEED = "deepspeed"

OPT_STATE_LAST = "optimizer_last.pkl"
OPT_STATE_BEST = "optimizer_best.pkl"
@@ -180,7 +181,7 @@
# Arguments that may differ and still resume training
ARGS_MAY_DIFFER = ["device_id", "device_ids", "overwrite_output", "use_tensorboard", "quiet", "align_plot_prefix",
"sure_align_threshold", "keep_last_params", "seed", "max_updates", "min_updates", "max_num_epochs",
"min_num_epochs", "max_samples", "min_samples", "max_checkpoints", "max_seconds"]
"min_num_epochs", "max_samples", "min_samples", "max_checkpoints", "max_seconds", "local_rank"]

# Other argument constants
TRAINING_ARG_SOURCE = "--source"
90 changes: 90 additions & 0 deletions sockeye/convert_deepspeed.py
@@ -0,0 +1,90 @@
# Copyright 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You may not
# use this file except in compliance with the License. A copy of the License
# is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is distributed on
# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.

import argparse
import gc
import logging
import os
import shutil

from . import constants as C
from . import model

try:
    import deepspeed
    import deepspeed.utils.zero_to_fp32
except ImportError:
    pass


logger = logging.getLogger(__name__)


def convert_checkpoint_to_params(model_config_fname: str, checkpoint_dirname: str, params_fname: str):
    # Create a temporary SockeyeModel
    model_config = model.SockeyeModel.load_config(model_config_fname)
    sockeye_model = model.SockeyeModel(model_config)
    # Gather the float32 params on CPU
    state_dict = deepspeed.utils.zero_to_fp32.get_fp32_state_dict_from_zero_checkpoint(checkpoint_dirname)
    # Strip the first prefix from each param name to match the SockeyeModel
    # Ex: 'model.encoder.layers...' -> 'encoder.layers...'
    state_dict = {name[name.find('.') + 1:]: param for (name, param) in state_dict.items()}
    # Load the float32 params. Use non-strict mode because shared and constant
    # params are not included in the DeepSpeed-generated state dict.
    sockeye_model.load_state_dict(state_dict, strict=False)
    # Save the float32 params to disk
    sockeye_model.save_parameters(params_fname)
    # Cleanup
    del sockeye_model
    gc.collect()


def convert_model_checkpoints(model_dirname: str, keep_deepspeed: bool = False):
    model_config_fname = os.path.join(model_dirname, C.CONFIG_NAME)
    # Find and convert params.00000, etc.
    for fname in os.listdir(model_dirname):
        if fname.startswith(C.PARAMS_PREFIX) and fname[len(C.PARAMS_PREFIX):].isdigit():
            params_fname = os.path.join(model_dirname, fname)
            if os.path.isdir(params_fname):
                logger.info(f'Converting checkpoint {params_fname}')
                # Move directory checkpoint to e.g., params.00000.ds
                checkpoint_dirname = params_fname + '.ds'
                shutil.move(params_fname, checkpoint_dirname)
                # Create params file for directory checkpoint
                convert_checkpoint_to_params(model_config_fname, checkpoint_dirname, params_fname)
                if not keep_deepspeed:
                    shutil.rmtree(checkpoint_dirname)
    # Update params.best
    params_best_fname = os.path.join(model_dirname, C.PARAMS_BEST_NAME)
    if os.path.exists(params_best_fname) and os.path.islink(params_best_fname):
        logger.info(f'Updating {params_best_fname}')
        params_best_target = os.readlink(params_best_fname)
        os.remove(params_best_fname)
        os.symlink(params_best_target, params_best_fname)


def main():
    params = argparse.ArgumentParser(
        description="Convert DeepSpeed checkpoints to regular parameter files in a Sockeye model directory.")
    params.add_argument('--model', '-m',
                        required=True,
                        help='Model directory containing DeepSpeed checkpoints.')
    params.add_argument('--keep-deepspeed', '-k',
                        action='store_true',
                        help='Keep DeepSpeed checkpoints (renamed e.g., params.00000.ds).')
    args = params.parse_args()
    convert_model_checkpoints(args.model, keep_deepspeed=args.keep_deepspeed)


if __name__ == "__main__":
    main()
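Once DeepSpeed training finishes, the `sockeye-convert-deepspeed` console script registered in setup.py converts the directory-style ZeRO checkpoints in a model directory back into regular Sockeye parameter files. A typical invocation might look like this; the model directory name is illustrative, while the `--model`/`-m` and `--keep-deepspeed`/`-k` options come from the file above.

```bash
# Convert DeepSpeed checkpoints (params.00000 directories, etc.) into regular
# parameter files; add -k/--keep-deepspeed to keep the original checkpoints
# under names like params.00000.ds.
sockeye-convert-deepspeed --model model_dir
```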
5 changes: 3 additions & 2 deletions sockeye/optimizers.py
@@ -61,8 +61,9 @@ def get_optimizer(config: OptimizerConfig) -> Tuple[Type[torch.optim.Optimizer],
    # https://pytorch.org/tutorials/recipes/recipes/tuning_guide.html
    zero_grad_kwargs = {'set_to_none': True}

    # Use Apex's fused optimizers if Apex is available
    if config.running_on_gpu:
    # Use Apex's fused optimizers if Apex is available and we aren't using
    # DeepSpeed, which includes its own optimizers.
    if config.running_on_gpu and not utils.using_deepspeed():
        try:
            from apex.optimizers import FusedAdam, FusedSGD
            adam_impl = FusedAdam
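The `utils.using_deepspeed()` helper called in this hunk is not part of the excerpt. As a rough, self-contained sketch of the selection logic the hunk implements (the function and parameter names below are illustrative, not Sockeye's API):

```python
# Illustrative sketch of the optimizer-selection fallback in this hunk: prefer
# Apex's fused Adam on GPU unless DeepSpeed is active (DeepSpeed ships its own
# optimizers), and fall back to torch.optim.Adam otherwise.
from typing import Type

import torch


def pick_adam_impl(running_on_gpu: bool, using_deepspeed: bool) -> Type[torch.optim.Optimizer]:
    if running_on_gpu and not using_deepspeed:
        try:
            from apex.optimizers import FusedAdam
            return FusedAdam
        except ImportError:
            pass  # Apex not installed; use the standard PyTorch optimizer
    return torch.optim.Adam
```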
