apache · samskalicky · Nov 16, 2020 · Oct 20, 2020 · Nov 6, 2020 · Nov 10, 2020
@@ -88,15 +88,15 @@ The `optimize_for` API takes at least 1 argument, `backend` which is a string th
 For the Gluon API, `hybridize` can be called on HybridBlocks to execute a graph pass on the internal CachedOp Symbol.
 
 ```python
-block.hybridize(backend=None, backend_opts=None, **kwargs)
+block.hybridize(backend=None, **kwargs)
 ```
 
-The `hybridize` function prepares the HybridBlock to be converted into a backend symbol. The `backend` argument is a string that identifies which pass that will be executed on the model. The `backend_opts` takes other user-specified options that will be passed to the backend APIs. The actual pass runs once just before the first the forward pass.
+The `hybridize` function prepares the HybridBlock to be converted into a backend symbol. The `backend` argument is a string that identifies which pass that will be executed on the model. `**kwargs` might contain other user-specified options that will be passed to the backend APIs. The actual pass runs once just before the first the forward pass.
 
 If you just want to run a graph pass on the HybridBlock but not run a complete forward pass, you can use the `optimize_for` API that combines the work done in the `hybridize` API with part of the work done in the forward pass.
 
 ```python
-block.optimize_for(x, backend=None, backend_opts=None, **kwargs)
+block.optimize_for(x, backend=None, **kwargs)
 ```
 
 When the `optimize_for` API is called on a HybridBlock it runs the graph pass immediately. This lets users export the modified model without running a complete forward pass.

@@ -107,15 +107,15 @@ The `optimize_for` API takes at least 1 argument, `backend` which is a string th
 For the Gluon API, `hybridize` can be called on HybridBlocks to partition the internal CachedOp Symbol.
 
 ```python
-block.hybridize(backend=None, backend_opts=None, clear=True, **kwargs)
+block.hybridize(backend=None, clear=True, **kwargs)
 ```
 
-The `hybridize` function prepares the HybridBlock to be converted into a backend symbol. The `backend` argument is a string that identifies which backend that will partition the model. The `backend_opts` are other user-specified options (as a Python dictionary of strings mapped to strings) that will be passed to the backend partitioning APIs. The `clear` argument defaults to `True` and clears any previous optimizations done on the block. If you want to chain optimizations together, set `clear` to `False`. The actual partitioning takes place during the forward pass. If you want to use `hybridize` to chain multiple optimizations, be sure to execute a forward pass after each call to `hybridize`. 
+The `hybridize` function prepares the HybridBlock to be converted into a backend symbol. The `backend` argument is a string that identifies which backend that will partition the model. `**kwargs` are other user-specified options (as a Python dictionary of strings mapped to strings) that will be passed to the backend partitioning APIs. The `clear` argument defaults to `False`, so it will chain optimizations together. If you want to clear clear any previous optimizations done on the block, set `clear` to `True`. The actual partitioning takes place during the forward pass. If you want to use `hybridize` to chain multiple optimizations, be sure to execute a forward pass after each call to `hybridize`.
 
 If you just want to partition the HybridBlock but not run a complete forward pass, you can use the `optimize_for` API that combines the work done in the `hybridize` API with part of the work done in the forward pass.
 
 ```python
-block.optimize_for(x, backend=None, backend_opts=None, clear=True, **kwargs)
+block.optimize_for(x, backend=None, clear=False, **kwargs)
 ```
 
 When the `optimize_for` API is called on a HybridBlock it partitions immediately. This lets users export the partitioned model without running a complete forward pass. Chaining multiple optimizations is as simple as calling `optimize_for` multiple times, no need to execute a forward pass (as opposed to `hybridize`).

@@ -92,7 +92,7 @@ def test(backend):
     inputs = [a,b]
     sym_block = nn.SymbolBlock(sym, inputs)
     sym_block.initialize()
-    sym_block.hybridize(backend=backend, backend_opts={'dedup_subgraph':True})
+    sym_block.hybridize(backend=backend, dedup_subgraph=True)
     out2 = sym_block(mx.nd.ones((3,2)),mx.nd.ones((3,2)))
     print(out2)
 
@@ -103,14 +103,14 @@ def test(backend):
     sym_block2 = nn.SymbolBlock(sym, inputs)
     sym_block2.initialize()
     sym_block2.optimize_for(mx.nd.ones((3,2)), mx.nd.ones((3,2)), backend=backend,
-                            backend_opts={'dedup_subgraph':True})
+                            dedup_subgraph=True)
     sym_block2.export('partitioned')
 
     # Test with additional input to subgraph op
     print('-------------------------------')
     print('Testing %s Gluon Hybridize partitioning with extra input' % backend)
     sym_block2.optimize_for(mx.nd.ones((3,2)), mx.nd.ones((3,2)), backend="addInputPass",
-                            clear=False, backend_opts={'dedup_subgraph':True})
+                            dedup_subgraph=True)
     out3 = sym_block2(mx.nd.ones((3,2)),mx.nd.ones((3,2)))
     print(out3)
 

@@ -1059,7 +1059,13 @@ def _call_cached_op(self, *args):
             out = [out]
         return _regroup(out, self._out_format)
 
-    def optimize_for(self, x, *args, backend=None, backend_opts=None, clear=True, **kwargs):
+    def optimize_for(self, x, *args, backend=None, clear=False,
+                     static_alloc=False,
+                     static_shape=False,
+                     inline_limit=2,
+                     forward_bulk_size=None,
+                     backward_bulk_size=None,
+                     **kwargs):
         """Partitions the current HybridBlock and optimizes it for a given backend
         without executing a forward pass. Modifies the HybridBlock in-place.
 
@@ -1087,19 +1093,29 @@ def optimize_for(self, x, *args, backend=None, backend_opts=None, clear=True, **
             other inputs to model
         backend : str
             The name of backend, as registered in `SubgraphBackendRegistry`, default None
-        backend_opts : dict of user-specified options to pass to the backend for partitioning, optional
-            Passed on to `PrePartition` and `PostPartition` functions of `SubgraphProperty`
-        clear : clears any previous optimizations
+        clear : bool, default False
+            Clears any previous optimizations
         static_alloc : bool, default False
             Statically allocate memory to improve speed. Memory usage may increase.
         static_shape : bool, default False
             Optimize for invariant input shapes between iterations. Must also
             set static_alloc to True. Change of input shapes is still allowed
             but slower.
+        inline_limit : optional int, default 2
+            Maximum number of operators that can be inlined.
+        forward_bulk_size : optional int, default None
+            Segment size of bulk execution during forward pass.
+        backward_bulk_size : optional int, default None
+            Segment size of bulk execution during forward pass.
+        **kwargs: The backend options, optional
+            Passed on to `PrePartition` and `PostPartition` functions of `SubgraphProperty`
         """
+        if len(kwargs) > 0:
+            self._backend_opts = kwargs
 
-        # do hybrize API call
-        self.hybridize(True, backend, backend_opts, clear, **kwargs)
+        if clear or not self._active:
+            self.hybridize(True, backend, clear, static_alloc, static_shape,
+                           inline_limit, forward_bulk_size, backward_bulk_size, kwargs)
 
         # do part of forward API call
         has_symbol, has_ndarray, ctx_set, _ = _gather_type_ctx_info([x] + list(args))
@@ -1137,7 +1153,12 @@ def register_child(self, block, name=None):
         super(HybridBlock, self).register_child(block, name)
         self._clear_cached_op()
 
-    def hybridize(self, active=True, backend=None, backend_opts=None, clear=True, **kwargs):
+    def hybridize(self, active=True, backend=None, clear=True,
+                  static_alloc=False, static_shape=False,
+                  inline_limit=2,
+                  forward_bulk_size=None,
+                  backward_bulk_size=None,
+                  **kwargs):
         """Activates or deactivates :py:class:`HybridBlock` s recursively. Has no effect on
         non-hybrid children.
 
@@ -1147,32 +1168,47 @@ def hybridize(self, active=True, backend=None, backend_opts=None, clear=True, **
             Whether to turn hybrid on or off.
         backend : str
             The name of backend, as registered in `SubgraphBackendRegistry`, default None
-        backend_opts : dict of user-specified options to pass to the backend for partitioning, optional
-            Passed on to `PrePartition` and `PostPartition` functions of `SubgraphProperty`
-        clear : clears any previous optimizations
-        static_alloc : bool, default False
+        clear : bool, default True
+            Clears any previous optimizations
+        static_alloc : optional bool, default False
             Statically allocate memory to improve speed. Memory usage may increase.
-        static_shape : bool, default False
+        static_shape : optional bool, default False
             Optimize for invariant input shapes between iterations. Must also
             set static_alloc to True. Change of input shapes is still allowed
             but slower.
+        inline_limit : optional int, default 2
+            Maximum number of operators that can be inlined.
+        forward_bulk_size : optional int, default None
+            Segment size of bulk execution during forward pass.
+        backward_bulk_size : optional int, default None
+            Segment size of bulk execution during forward pass.
+        **kwargs:  optional
+            Backend options.
         """
+        if len(kwargs) > 0:
+            self._backend_opts = kwargs
 
         self._backend = backend
-        if backend_opts is not None:
-            assert isinstance(backend_opts, dict), \
-            "HybridBlock hybridize requires backend_opts to be a dictionary."
-            self._backend_opts = backend_opts
 
         self._active = active
-        self._flags = list(kwargs.items())
+        self._flags = [("static_alloc", static_alloc), ("static_shape", static_shape),
+                       ("inline_limit", inline_limit)]
+        if forward_bulk_size is not None:
+            self._flags.append(("forward_bulk_size", forward_bulk_size))
+        if backward_bulk_size is not None:
+            self._flags.append(("backward_bulk_size", backward_bulk_size))
         if clear:
             self._clear_cached_op()
         if active and self._forward_hooks or self._forward_pre_hooks:
             warnings.warn('"{block}" is being hybridized while still having forward hook/pre-hook. '
                           'If "{block}" is a child of HybridBlock, the hooks will not take effect.'
                           .format(block=self))
-        super(HybridBlock, self).hybridize(active, **kwargs)
+        super(HybridBlock, self).hybridize(active,
+                                           static_alloc=static_alloc,
+                                           static_shape=static_shape,
+                                           inline_limit=inline_limit,
+                                           forward_bulk_size=forward_bulk_size,
+                                           backward_bulk_size=backward_bulk_size)
 
     def cast(self, dtype):
         self._clear_cached_op()

diff --git a/tests/python/unittest/test_gluon.py b/tests/python/unittest/test_gluon.py
@@ -2020,10 +2020,6 @@ def hybrid_forward(self, F, in1):
             res = t(d1)
             assert_almost_equal(res.asnumpy(), d1.asnumpy())
 
-    param = deepcopy(params[2])
-    param['param_indices'] = (1)
-    param['data_indices'] = (0)
-    params.append(param)
     # Test the case that inputs and outputs of a backward graph share NDArrays.
     for param in params:
         t = TestIOBackward()