Sort new pool before memory matching. Remove shared pool in PlanMemory. Add test case for bucketing
eric-haibin-lin committed Feb 24, 2017
1 parent cd4da5b commit 71cd198
Showing 7 changed files with 78 additions and 21 deletions.
3 changes: 2 additions & 1 deletion example/rnn/lstm_bucketing.py
@@ -34,7 +34,8 @@
 def tokenize_text(fname, vocab=None, invalid_label=-1, start_label=0):
     lines = open(fname).readlines()
     lines = [filter(None, i.split(' ')) for i in lines]
-    sentences, vocab = mx.rnn.encode_sentences(lines, vocab=vocab, invalid_label=invalid_label, start_label=start_label)
+    sentences, vocab = mx.rnn.encode_sentences(lines, vocab=vocab, invalid_label=invalid_label,
+                                               start_label=start_label)
     return sentences, vocab


1 change: 1 addition & 0 deletions python/mxnet/module/base_module.py
@@ -133,6 +133,7 @@ def __init__(self, logger=logging):
         self.params_initialized = False
         self.optimizer_initialized = False
         self._symbol = None
+        self.total_exec_bytes = 0

 ################################################################################
 # High Level API
5 changes: 4 additions & 1 deletion python/mxnet/module/executor_group.py
@@ -122,7 +122,8 @@ def __init__(self, symbol, contexts, workload, data_shapes, label_shapes, param_
         self.inputs_need_grad = inputs_need_grad

         self.logger = logger
-
+        #TODO need a cleaner interface to profile memory per device (haibin)
+        self.total_exec_bytes = 0
         self.fixed_param_names = fixed_param_names
         if self.fixed_param_names is None:
             self.fixed_param_names = []
@@ -558,6 +559,8 @@ def _get_or_reshape(name, shared_data_arrays, arg_shape, arg_type, context, logg
         executor = self.symbol.bind(ctx=context, args=arg_arrays,
                                     args_grad=grad_arrays, aux_states=aux_arrays,
                                     grad_req=self.grad_req, shared_exec=shared_exec)
+        # Get the total bytes allocated for this executor
+        self.total_exec_bytes += int(executor.debug_str().split('\n')[-3].split()[1])
         return executor

     def _sliced_shape(self, shapes, i, major_axis):
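Note: total_exec_bytes is scraped out of the executor's debug string rather than read from an API. A minimal sketch in Python of what that parsing relies on, with a made-up debug string (the summary wording below is an assumption for illustration; the commit only depends on line [-3], token [1] of debug_str() being the byte count):

# Hypothetical tail of executor.debug_str(); only the position of the
# byte count matters here, not the exact wording.
debug_str = ("...per-op storage plan...\n"
             "Total 11264 bytes allocated\n"
             "Total 3 TempSpace resources requested\n")
# split('\n') leaves a trailing '' after the final newline, so the
# byte-count line sits at index -3, and the count is its second token.
total_bytes = int(debug_str.split('\n')[-3].split()[1])
assert total_bytes == 11264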
1 change: 1 addition & 0 deletions python/mxnet/module/module.py
@@ -343,6 +343,7 @@ def bind(self, data_shapes, label_shapes=None, for_training=True,
                                                      shared_group, logger=self.logger,
                                                      fixed_param_names=self._fixed_param_names,
                                                      grad_req=grad_req)
+        self.total_exec_bytes = self._exec_group.total_exec_bytes
         if shared_module is not None:
             self.params_initialized = True
             self._arg_params = shared_module._arg_params
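With the plumbing above, the byte count is exposed as a plain attribute once bind() has run. A hedged usage sketch (the toy symbol and shapes are made up for illustration, not taken from this commit):

import mxnet as mx

# any bindable symbol works; a one-layer net keeps the example short
data = mx.sym.Variable('data')
net = mx.sym.FullyConnected(data=data, num_hidden=64, name='fc')

mod = mx.mod.Module(symbol=net, data_names=('data',), label_names=None)
mod.bind(data_shapes=[('data', (32, 128))], for_training=False)
# populated during bind() from the executors' debug strings
print(mod.total_exec_bytes)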
31 changes: 15 additions & 16 deletions src/executor/graph_executor.cc
@@ -321,17 +321,9 @@ void GraphExecutor::Init(nnvm::Symbol symbol,
                          const std::vector<OpReqType>& grad_req_type,
                          const std::vector<NDArray>& aux_states,
                          Executor* shared_exec) {
-  std::vector<SharedStorageEntry> shared_pool;
-  if (shared_exec != nullptr) {
-    for (auto& nd : dynamic_cast<GraphExecutor*>(shared_exec)->data_pool_) {
-      size_t bytes = nd.shape().Size() * mshadow::mshadow_sizeof(nd.dtype());
-      shared_pool.emplace_back(nd.ctx().dev_id, bytes);
-    }
-  }
-
   nnvm::Graph g = InitGraph(symbol, default_ctx,
                             ctx_map, in_args, arg_grad_store,
-                            grad_req_type, aux_states, shared_pool);
+                            grad_req_type, aux_states);
   g = AttachOpExecs(g);
   g = AttachOpResources(g);
   graph_ = std::move(g);
@@ -364,11 +356,9 @@ Graph GraphExecutor::InitGraph(nnvm::Symbol symbol,
                               const std::vector<NDArray>& in_args,
                               const std::vector<NDArray>& arg_grad_store,
                               const std::vector<OpReqType>& grad_req_type,
-                              const std::vector<NDArray>& aux_states,
-                              const std::vector<SharedStorageEntry> shared_pool) {
+                              const std::vector<NDArray>& aux_states) {
   // setup gradient
   nnvm::Graph g = InitFullGraph(symbol, grad_req_type, arg_grad_store);
-  g.attrs["shared_pool"] = std::make_shared<nnvm::any>(shared_pool);
   g = AssignContext(g, default_ctx, ctx_map,
                     in_args,
                     grad_store_,
@@ -493,13 +483,23 @@ void GraphExecutor::InitDataEntryMemory(std::vector<NDArray>* shared_pool) {
   }
   // remake the data pool
   data_pool_.clear();
-  for (size_t i = 0; i < pool_info.size(); ++i) {
+  data_pool_.resize(pool_info.size());
+
+  // sort the pool info in descending order before allocating memory
+  std::vector<size_t> sorted_pool_index(pool_info.size());
+  std::iota(sorted_pool_index.begin(), sorted_pool_index.end(), 0);
+  auto pool_comparator = [&pool_info](int lhs, int rhs){
+    return pool_info[lhs].second > pool_info[rhs].second;
+  };
+  std::sort(sorted_pool_index.begin(), sorted_pool_index.end(), pool_comparator);
+
+  for (size_t i : sorted_pool_index) {
     const Context& ctx = pool_info[i].first;
     size_t bytes = pool_info[i].second;
     bool allocated = false;
     for (auto it = free_pool.lower_bound(bytes); it != free_pool.end(); ++it) {
       if (it->second.ctx() == ctx && it->first >= bytes) {
-        data_pool_.push_back(it->second);
+        data_pool_[i] = it->second;
         free_pool.erase(it);
         allocated = true;
         break;
@@ -511,12 +511,11 @@
       // allocate float arrays
       TShape shape{index_t(nword)};
       NDArray nd(shape, ctx);
-      data_pool_.push_back(nd);
+      data_pool_[i] = nd;
       // put the newly allocated arrays to the shared pool
       if (shared_pool != nullptr) {
         shared_pool->push_back(nd);
       }
-      num_new_ndarray++;
     }
   }
   CHECK_EQ(data_pool_.size(), pool_info.size());
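The rewritten loop is best-fit-decreasing matching: requests are served largest-first, and each takes the smallest free chunk that still fits (free_pool.lower_bound(bytes)). A minimal Python sketch of the idea, sizes only, ignoring the per-device Context check (names are illustrative):

import bisect

def match_pool(requests, free_sizes):
    # Serve requests in descending size order; reuse the smallest free
    # chunk that still fits, otherwise count a fresh allocation.
    free_sizes = sorted(free_sizes)
    order = sorted(range(len(requests)), key=lambda i: requests[i], reverse=True)
    fresh_bytes = 0
    for i in order:
        need = requests[i]
        pos = bisect.bisect_left(free_sizes, need)  # like lower_bound(bytes)
        if pos < len(free_sizes):
            free_sizes.pop(pos)        # request i reuses this chunk
        else:
            fresh_bytes += need        # nothing fits: allocate new memory
    return fresh_bytes

# Largest-first lets the 100-byte request reuse the 120-byte chunk; serving
# the 10-byte request first would burn that chunk and force 100 new bytes.
assert match_pool([10, 100], [120]) == 10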
4 changes: 1 addition & 3 deletions src/executor/graph_executor.h
@@ -28,7 +28,6 @@ using nnvm::Graph;
 class GraphExecutor : public Executor {
  public:
   using Executor::MonitorCallback;
-  using SharedStorageEntry = std::pair<int, size_t>;

   virtual ~GraphExecutor();
   void Forward(bool is_train) override;
@@ -68,8 +67,7 @@ class GraphExecutor : public Executor {
                   const std::vector<NDArray>& in_args,
                   const std::vector<NDArray>& arg_grad_store,
                   const std::vector<OpReqType>& grad_req_type,
-                  const std::vector<NDArray>& aux_states,
-                  const std::vector<SharedStorageEntry> shared_pool);
+                  const std::vector<NDArray>& aux_states);
   // initialize the full graph, including gradient.
   Graph InitFullGraph(nnvm::Symbol symbol,
                       const std::vector<OpReqType>& grad_req_type,
54 changes: 54 additions & 0 deletions tests/python/unittest/test_module.py
@@ -82,8 +82,62 @@ def test_module_reshape():
     assert mod.get_outputs()[0].shape == dshape
     assert (mod.get_params()[0]['fc_bias'].asnumpy() == -3).all()

+def test_module_switch_bucket():
+    vocab_dim = 5000
+    num_hidden = 100
+    num_embedding = 100
+    num_layer = 2
+    default_key = 10
+    test_key = 5
+    batch_size = 32
+    contexts = [mx.cpu(0)]
+    initializer = mx.init.Xavier(factor_type="in", magnitude=2.34)
+
+    #generate symbols for an LSTM network
+    def sym_gen(seq_len):
+        data = mx.sym.Variable('data')
+        label = mx.sym.Variable('softmax_label')
+        embed = mx.sym.Embedding(data=data, input_dim=vocab_dim,
+                                 output_dim=num_embedding, name='embed')
+        stack = mx.rnn.SequentialRNNCell()
+        for i in range(num_layer):
+            stack.add(mx.rnn.LSTMCell(num_hidden=num_hidden, prefix='lstm_l%d_'%i))
+        outputs, states = stack.unroll(seq_len, inputs=embed, merge_outputs=True)
+
+        pred = mx.sym.Reshape(outputs, shape=(-1, num_hidden))
+        pred = mx.sym.FullyConnected(data=pred, num_hidden=vocab_dim, name='pred')
+
+        label = mx.sym.Reshape(label, shape=(-1,))
+        pred = mx.sym.SoftmaxOutput(data=pred, label=label, name='softmax')
+
+        return pred, ('data',), ('softmax_label',)
+
+    def create_bucketing_module(key):
+        model = mx.mod.BucketingModule(
+            sym_gen = sym_gen,
+            default_bucket_key = key,
+            context = contexts)
+        model.bind([('data', (batch_size, key))],
+                   [('softmax_label', (batch_size, key))], True, False)
+        model.init_params(initializer=initializer)
+        return model
+    #initialize the bucketing module with the default bucket key
+    bucketing_model = create_bucketing_module(default_key)
+    #switch to test_key
+    bucketing_model.switch_bucket(test_key, [('data', (batch_size, test_key))],
+                                  [('softmax_label', (batch_size, test_key))])
+    total_bytes_before = bucketing_model._buckets[default_key].total_exec_bytes
+
+    #remove test_key and switch again
+    del bucketing_model._buckets[test_key]
+    bucketing_model.switch_bucket(test_key, [('data', (batch_size, test_key))],
+                                  [('softmax_label', (batch_size, test_key))])
+    total_bytes_after = bucketing_model._buckets[default_key].total_exec_bytes
+    #the default bucket is expected to reuse the bytes allocated
+    assert total_bytes_after == total_bytes_before

 if __name__ == '__main__':
     test_module_reshape()
     test_save_load()
     test_module_layout()
+    test_module_switch_bucket()
