Sockeye 2 heafield quantize pr2 (#812)
* Quantize CLI, Docker build update, version/changelog update.
mjdenkowski committed May 22, 2020
1 parent e4553d3 commit 50393fc
Showing 15 changed files with 311 additions and 181 deletions.
13 changes: 13 additions & 0 deletions CHANGELOG.md
@@ -11,6 +11,19 @@ Note that Sockeye has checks in place to not translate with an old model that wa

Each version section may have subsections for: _Added_, _Changed_, _Removed_, _Deprecated_, and _Fixed_.

## [2.1.6]

### Changed

- Updated Dockerfiles optimized for CPU (intgemm int8 inference, full MKL support) and GPU (distributed training with Horovod). See [sockeye_contrib/docker](sockeye_contrib/docker).

### Added

- Official support for int8 quantization with [intgemm](https://github.com/kpu/intgemm) (see the sketch after this list):
  - This requires the "intgemm" fork of MXNet ([kpuatamazon/incubator-mxnet/intgemm](https://github.com/kpuatamazon/incubator-mxnet/tree/intgemm)). This is the version of MXNet used in the Sockeye CPU docker image (see [sockeye_contrib/docker](sockeye_contrib/docker)).
  - Use `sockeye.translate --dtype int8` to quantize a trained float32 model at runtime.
  - Use the `sockeye.quantize` CLI to annotate a float32 model with int8 scaling factors for fast runtime quantization.
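
A minimal sketch of the runtime path (the model directory name is hypothetical; the intgemm fork of MXNet must be installed, as noted above):

```python
# Minimal sketch: quantize a trained float32 model to int8 at load time.
# 'my_model' is a hypothetical directory; requires the intgemm fork of MXNet.
import sockeye.model

model, source_vocabs, target_vocab = sockeye.model.load_model('my_model', dtype='int8')
```

This is roughly the path that `sockeye.translate --dtype int8` takes when loading the model.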

## [2.1.5]

### Changed
2 changes: 1 addition & 1 deletion requirements/requirements.horovod.txt
@@ -1,2 +1,2 @@
horovod==0.18.1
horovod==0.19.1
mpi4py
1 change: 1 addition & 0 deletions setup.py
@@ -82,6 +82,7 @@ def get_requirements(filename):
'sockeye-lexicon = sockeye.lexicon:main',
'sockeye-init-embed = sockeye.init_embedding:main',
'sockeye-prepare-data = sockeye.prepare_data:main',
'sockeye-quantize = sockeye.quantize:main',
'sockeye-score = sockeye.score:main',
'sockeye-train = sockeye.train:main',
'sockeye-translate = sockeye.translate:main',
4 changes: 2 additions & 2 deletions sockeye/__init__.py
@@ -1,4 +1,4 @@
# Copyright 2017--2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
# Copyright 2017--2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You may not
# use this file except in compliance with the License. A copy of the License
@@ -11,4 +11,4 @@
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.

__version__ = '2.1.5'
__version__ = '2.1.6'
5 changes: 3 additions & 2 deletions sockeye/beam_search.py
@@ -593,8 +593,9 @@ def forward(self,
full_to_reduced = dict((val, i) for i, val in enumerate(vocab_slice_ids))
raw_constraint_list = [[[full_to_reduced[x] for x in phr] for phr in sent] for sent in
raw_constraint_list]
#Pad to a multiple of 8.
vocab_slice_ids = np.pad(vocab_slice_ids, (0,7-((len(vocab_slice_ids)-1) % 8)), mode='constant', constant_values = self.eos_id)
# Pad to a multiple of 8.
vocab_slice_ids = np.pad(vocab_slice_ids, (0, 7 - ((len(vocab_slice_ids) - 1) % 8)),
mode='constant', constant_values = self.eos_id)
vocab_slice_ids = mx.nd.array(vocab_slice_ids, ctx=self.context, dtype='int32')

if vocab_slice_ids.shape[0] < self.beam_size + 1:
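
The new padding keeps the length of `vocab_slice_ids` a multiple of 8, presumably to satisfy the shape requirements of the int8 output layer. A small standalone check of the pad-length arithmetic, with hypothetical values:

```python
import numpy as np

# Hypothetical ids and EOS value, only to illustrate the arithmetic used above.
vocab_slice_ids = np.arange(13)                 # 13 candidate target ids
eos_id = 3                                      # padding value, as in the code above
pad = 7 - ((len(vocab_slice_ids) - 1) % 8)      # 7 - (12 % 8) = 3
padded = np.pad(vocab_slice_ids, (0, pad), mode='constant', constant_values=eos_id)
assert padded.shape[0] % 8 == 0                 # 13 + 3 = 16, a multiple of 8
```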
2 changes: 2 additions & 0 deletions sockeye/constants.py
@@ -185,6 +185,7 @@

VERSION_NAME = "version"
CONFIG_NAME = "config"
CONFIG_NAME_FLOAT32 = CONFIG_NAME + ".float32"
LOG_NAME = "log"
JSON_SUFFIX = ".json"
VOCAB_SRC_PREFIX = "vocab.src"
@@ -195,6 +196,7 @@
PARAMS_PREFIX = "params."
PARAMS_NAME = PARAMS_PREFIX + "%05d"
PARAMS_BEST_NAME = "params.best"
PARAMS_BEST_NAME_FLOAT32 = PARAMS_BEST_NAME + ".float32"
DECODE_OUT_NAME = "decode.output.%05d"
DECODE_IN_NAME = "decode.source.%d"
DECODE_REF_NAME = "decode.target"
18 changes: 9 additions & 9 deletions sockeye/layers.py
@@ -129,9 +129,9 @@ def __init__(self,
if weight is None or dtype == C.DTYPE_INT8:
if dtype == C.DTYPE_INT8:
self.scaling = self.params.get('scaling', shape=(1,), init=mx.initializer.Constant(-1.0), dtype=C.DTYPE_FP32, allow_deferred_init=False)
#This is only for inference but MXNet tries to create an
#initializer anyway, then fails because most random
#generators don't support int8 output.
# This is only for inference but MXNet tries to create an
# initializer anyway, then fails because most random
# generators don't support int8 output.
weight_initializer = 'zeros'
self.weight = self.params.get("weight",
shape=(vocab_size, hidden_size),
@@ -444,7 +444,7 @@ def __init__(self,

self.depth_att = depth_att
with self.name_scope():
self.ff_in = quantization.QuantizableDense(in_units=depth_att, units=depth_att * 3, flatten=False, use_bias=False, prefix='i2h_', dtype = dtype)
self.ff_in = quantization.QuantizableDense(in_units=depth_att, units=depth_att * 3, flatten=False, use_bias=False, prefix='i2h_', dtype=dtype)

def hybrid_forward(self, F,
inputs: mx.sym.Symbol,
@@ -526,9 +526,9 @@ def __init__(self,
super().__init__(prefix, depth_att, heads, depth_out, dropout, dtype)

with self.name_scope():
self.ff_q = quantization.QuantizableDense(in_units=depth_out, units=depth_att, flatten=False, use_bias=False, prefix='q2h_', dtype = dtype)
self.ff_k = quantization.QuantizableDense(in_units=depth_key_value, units=depth_att, flatten=False, use_bias=False, prefix='k2h_', dtype = dtype)
self.ff_v = quantization.QuantizableDense(in_units=depth_key_value, units=depth_att, flatten=False, use_bias=False, prefix='v2h_', dtype = dtype)
self.ff_q = quantization.QuantizableDense(in_units=depth_out, units=depth_att, flatten=False, use_bias=False, prefix='q2h_', dtype=dtype)
self.ff_k = quantization.QuantizableDense(in_units=depth_key_value, units=depth_att, flatten=False, use_bias=False, prefix='k2h_', dtype=dtype)
self.ff_v = quantization.QuantizableDense(in_units=depth_key_value, units=depth_att, flatten=False, use_bias=False, prefix='v2h_', dtype=dtype)

def project_and_isolate_heads(self, F, memory: mx.sym.Symbol) -> Tuple[mx.sym.Symbol, mx.sym.Symbol]:
"""
@@ -617,8 +617,8 @@ def __init__(self,
super().__init__(prefix=prefix)
self.num_hidden = num_hidden
with self.name_scope():
self.q2h = quantization.QuantizableDense(units=num_hidden, flatten=False, use_bias=True, dtype = dtype)
self.kv2h = quantization.QuantizableDense(units=num_hidden * 2, flatten=False, use_bias=True, dtype = dtype)
self.q2h = quantization.QuantizableDense(units=num_hidden, flatten=False, use_bias=True, dtype=dtype)
self.kv2h = quantization.QuantizableDense(units=num_hidden * 2, flatten=False, use_bias=True, dtype=dtype)
self.dot_att = DotAttentionCell()

def hybrid_forward(self, F,
26 changes: 16 additions & 10 deletions sockeye/model.py
@@ -50,7 +50,8 @@ class ModelConfig(Config):
:param weight_tying_type: Determines which weights get tied.
:param lhuc: LHUC (Vilar 2018) is applied at some part of the model.
:param dtype: Data type of model parameters. Default: float32.
:param intgemm_custom_lib: Path to intgemm custom operator library used for dtype is int8. Default: libintgemm.so in the same directory as this script.
:param intgemm_custom_lib: Path to the intgemm custom operator library used when dtype is int8. Default: libintgemm.so
    in the same directory as this script.
"""

def __init__(self,
@@ -120,7 +121,8 @@ def __init__(self, config: ModelConfig, inference_only: bool = False, prefix: st

# encoder & decoder first (to know the decoder depth)
self.encoder = encoder.get_encoder(self.config.config_encoder, prefix=self.prefix, dtype=config.dtype)
self.decoder = decoder.get_decoder(self.config.config_decoder, inference_only=inference_only, prefix=self.prefix, dtype=config.dtype)
self.decoder = decoder.get_decoder(self.config.config_decoder, inference_only=inference_only,
prefix=self.prefix, dtype=config.dtype)

self.output_layer = layers.OutputLayer(hidden_size=self.decoder.get_num_hidden(),
vocab_size=self.config.vocab_target_size,
@@ -452,7 +454,7 @@ def load_model(model_folder: str,
checkpoint: Optional[int] = None,
hybridize: bool = True,
inference_only: bool = False,
for_disk_saving: str = None,
for_disk_saving: Optional[str] = None,
allow_missing: bool = False,
set_grad_req_null: bool = True) -> Tuple[SockeyeModel, List[vocab.Vocab], vocab.Vocab]:
"""
@@ -490,15 +492,19 @@
else:
params_fname = os.path.join(model_folder, C.PARAMS_NAME % checkpoint)

if (dtype == C.DTYPE_INT8 or model_config.dtype == C.DTYPE_INT8 or for_disk_saving is not None) and "intgemm_fully_connected" not in dir(mx.nd.contrib):
#We're going to use int8 but it's not compiled into mxnet.
if (dtype == C.DTYPE_INT8 or
model_config.dtype == C.DTYPE_INT8 or
for_disk_saving is not None) and "intgemm_fully_connected" not in dir(mx.nd.contrib):
# We're going to use int8 but it's not compiled into mxnet.
path = os.path.abspath(model_config.intgemm_custom_lib)
try:
mx.library.load(path)
except(mx.base.MXNetError):
raise NotImplementedError("8-bit int inference requested but intgemm was not compiled into MXNet and a custom operator library was not found in `" + path + "`. Compile the custom operator then set the path using intgemm_custom_lib in the config file.")
except mx.base.MXNetError:
raise NotImplementedError("8-bit int inference requested but intgemm was not compiled into MXNet and a "
"custom operator library was not found in `%s`. Compile the custom "
"operator then set the path using intgemm_custom_lib in the config file." % path)

#Are we converting the model to 8-bit?
# Are we converting the model to 8-bit?
quantizing = model_config.dtype != C.DTYPE_INT8 and (dtype == C.DTYPE_INT8 or for_disk_saving is not None)
if quantizing:
model_config.dtype = C.DTYPE_INT8 # Ensure the scaling factor parameters are created.
Expand Down Expand Up @@ -535,12 +541,12 @@ def load_model(model_folder: str,
ignore_extra=True, #Scaling factors may be present in float32 models.
cast_dtype=cast_dtype,
dtype_source=dtype_source)

params = model.collect_params()
if set_grad_req_null:
for param in params.values():
param.grad_req = 'null'

if for_disk_saving is not None:
#Saving scaling factors and possibly int8 values to disk.
if not quantizing:
70 changes: 36 additions & 34 deletions sockeye/quantization.py
@@ -1,4 +1,4 @@
# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
# Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You may not
# use this file except in compliance with the License. A copy of the License
@@ -11,14 +11,17 @@
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.

import mxnet as mx
import logging
import math
from . import constants as C

import mxnet as mx
from mxnet.gluon.nn.activations import Activation
import logging

from . import constants as C

logger = logging.getLogger(__name__)


# Modified from the source to mxnet.gluon.nn.basic_layers.Dense which is:
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
@@ -130,7 +133,7 @@ def cast(self, dtype):
#No casting an already quantized matrix.
logger.warning("Ignoring casting on int8 matrix")

def hybrid_forward(self, F, x, weight, scaling = None, bias=None):
def hybrid_forward(self, F, x, weight, scaling=None, bias=None):
if self._dtype == C.DTYPE_INT8:
if bias is not None:
act = F.contrib.intgemm_fully_connected(x, weight, scaling, bias, no_bias=False, num_hidden=self._units,
@@ -155,28 +158,29 @@ def __repr__(self):
layout='{0} -> {1}'.format(shape[1] if shape[1] else None, shape[0]))


#Minimize mean squared error of quantizing a tensor, returning the top value
#(i.e. the one that quantizes to 127). Scaling = 127.0 / return value.
def optimize_quantization_mse(tensor, rounds = 10):
#This is a convex optimization problem. EM works but makes slow steps.
#Instead of EM, use binary search in the direction minimization suggests.
def optimize_quantization_mse(tensor, rounds=10):
"""
Minimize mean squared error of quantizing a tensor, returning the top value
(i.e. the one that quantizes to 127). Scaling = 127.0 / return value.
This is a convex optimization problem. EM works but makes slow steps.
Instead of EM, use binary search in the direction minimization suggests.
"""
best_mse = math.inf
best_top = None
maxabs = mx.nd.contrib.intgemm_maxabsolute(tensor)
# For converting python numbers to MXNet NDArray
one = mx.nd.ones(shape=(1,))
low = 0.0
high = maxabs
for i in range(rounds):
for _ in range(rounds):
value = (low + high) / 2.0
quant = mx.nd.contrib.intgemm_prepare_data(tensor, value)
quant_float = mx.nd.cast(quant, dtype=C.DTYPE_FP32)
mse = (quant_float * (value / 127.0) - tensor).norm().asscalar() / math.sqrt(float(tensor.size))
if mse < best_mse:
best_mse = mse
best_top = value
#This optimizes scaling subject to cluster assignment.
#It can be used for EM but the step is really slow, so use it for direction.
# This optimizes scaling subject to cluster assignment.
# It can be used for EM but the step is really slow, so use it for direction.
scale = mx.nd.sum(quant_float * quant_float) / mx.nd.sum(quant_float * tensor)
top = 127.0 / scale.asscalar()
if top < value:
@@ -185,18 +189,19 @@ def optimize_quantization_mse(tensor, rounds = 10):
low = value
return best_top


def extract_quant_max(tensor_param: mx.gluon.parameter.Parameter, scaling_param: mx.gluon.parameter.Parameter) -> float:
"""
Extract or tune the scaling factor for a parameter.
"""
scaling = scaling_param.data()
if scaling.asscalar() < 0:
#Bogus auto initialized scaling factor.
b_max = optimize_quantization_mse(tensor_param.data())
scaling_param.set_data(b_max / 127.0)
else:
b_max = scaling * 127.0
return b_max
"""
Extract or tune the scaling factor for a parameter.
"""
scaling = scaling_param.data()
if scaling.asscalar() < 0:
# Bogus auto initialized scaling factor.
b_max = optimize_quantization_mse(tensor_param.data())
scaling_param.set_data(b_max / 127.0)
else:
b_max = scaling * 127.0
return b_max


def convert_weights_disk_format(params: mx.gluon.parameter.ParameterDict, dtype_store: str):
@@ -221,6 +226,7 @@ def convert_weights_disk_format(params: mx.gluon.parameter.ParameterDict, dtype_
param.set_data(quantized)
param.dtype = C.DTYPE_INT8


def convert_weights_cpu_dependent(params: mx.gluon.parameter.ParameterDict):
"""
Convert weights from disk format to intgemm's CPU-dependent format for
@@ -235,15 +241,11 @@ def convert_weights_cpu_dependent(params: mx.gluon.parameter.ParameterDict):
scaling_name = name[0:-6] + "scaling"
if scaling_name in params:
if param.dtype == C.DTYPE_INT8:
#Already fully quantized, just rearrange.
weight = mx.nd.contrib.intgemm_prepare_weight(
param.data(), already_quantized = True)
# Already fully quantized, just rearrange.
weight = mx.nd.contrib.intgemm_prepare_weight(param.data(), already_quantized = True)
else:
#Use offline scaling factor if available.
# Use offline scaling factor if available.
b_max = extract_quant_max(param, params[scaling_name])
weight = mx.nd.contrib.intgemm_prepare_weight(
param.data(),
b_max)
weight = mx.nd.contrib.intgemm_prepare_weight(param.data(), b_max)
param.set_data(weight)
param.dtype = C.DTYPE_INT8

59 changes: 59 additions & 0 deletions sockeye/quantize.py
@@ -0,0 +1,59 @@
# Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You may not
# use this file except in compliance with the License. A copy of the License
# is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is distributed on
# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.

import argparse
import logging
import os

import sockeye.constants as C
from sockeye.log import setup_main_logger, log_sockeye_version
import sockeye.model
from sockeye.utils import check_condition

logger = logging.getLogger(__name__)


def annotate_model_params(model_dir: str):
log_sockeye_version(logger)

params_best = os.path.join(model_dir, C.PARAMS_BEST_NAME)
params_best_float32 = os.path.join(model_dir, C.PARAMS_BEST_NAME_FLOAT32)
config = os.path.join(model_dir, C.CONFIG_NAME)
config_float32 = os.path.join(model_dir, C.CONFIG_NAME_FLOAT32)

for fname in params_best_float32, config_float32:
check_condition(not os.path.exists(fname),
'File "%s" exists, indicating this model has already been quantized.' % fname)

# Load model and compute scaling factors
model = sockeye.model.load_model(model_dir, for_disk_saving='float32', dtype='int8')
# Move original params and config files
os.rename(params_best, params_best_float32)
os.rename(config, config_float32)
# Write new params and config files with annotated scaling factors
model[0].save_parameters(params_best)
model[0].save_config(model_dir)


def main():
setup_main_logger(console=True, file_logging=False)
params = argparse.ArgumentParser(
description='Annotate trained model with scaling factors for fast loading/quantization for int8 inference.')
params.add_argument('--model', '-m', required=True, help='Trained Sockeye model directory.')
args = params.parse_args()

annotate_model_params(args.model)


if __name__ == '__main__':
main()
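
A minimal usage sketch for the new module (the model directory name is hypothetical; this is equivalent to the `sockeye-quantize` console script added in setup.py):

```python
# Minimal sketch: annotate a trained model in place; 'my_model' is hypothetical.
from sockeye.quantize import annotate_model_params

annotate_model_params('my_model')
# Expected contents afterwards, based on the code above:
#   params.best          - parameters annotated with int8 scaling factors
#   params.best.float32  - backup of the original float32 parameters
#   config               - rewritten config
#   config.float32       - backup of the original config
```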