This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

Add graph pass and backward option #20924

Open · wants to merge 22 commits into base: zero_sharding
9 changes: 7 additions & 2 deletions cpp-package/include/mxnet-cpp/executor.h
@@ -132,7 +132,9 @@ class Executor {
1,
nullptr,
nullptr),
0);
0,
nullptr,
nullptr);
} else {
CHECK_EQ(MXAutogradBackwardEx(out_handles.size(),
out_handles.data(),
@@ -144,7 +146,10 @@
1,
nullptr,
nullptr),
0);
0,
0,
nullptr,
nullptr);
}
grad_arrays.clear();
grad_arrays.reserve(arg_arrays.size());
2 changes: 1 addition & 1 deletion example/extensions/lib_pass/pass_lib.cc
@@ -49,4 +49,4 @@ MXReturnValue initialize(int version) {
MX_ERROR_MSG << "MXNet version " << version << " not supported" << std::endl;
return MX_FAIL;
}
}
}
2 changes: 1 addition & 1 deletion example/extensions/lib_pass/test_pass.py
@@ -67,4 +67,4 @@ def test_model(pass_name):
sym_block2.optimize_for(mx.nd.ones((3,2)), mx.nd.ones((3,2)), backend=pass_name)
sym_block2.export('modified')

test_model('myPass')
test_model('myPass')
24 changes: 24 additions & 0 deletions example/extensions/lib_reduce_gradient/Makefile
@@ -0,0 +1,24 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

all: pass_lib

pass_lib:
g++ -shared -fPIC -std=c++11 add_reduce_op.cc ../../../src/lib_api.cc -o add_reduce_op_lib.so -I ../../../include

clean:
rm -rf add_reduce_op_lib.so
76 changes: 76 additions & 0 deletions example/extensions/lib_reduce_gradient/README.md
@@ -0,0 +1,76 @@
<!--
~ Licensed to the Apache Software Foundation (ASF) under one
~ or more contributor license agreements. See the NOTICE file
~ distributed with this work for additional information
~ regarding copyright ownership. The ASF licenses this file
~ to you under the Apache License, Version 2.0 (the
~ "License"); you may not use this file except in compliance
~ with the License. You may obtain a copy of the License at
~
~ http://www.apache.org/licenses/LICENSE-2.0
~
~ Unless required by applicable law or agreed to in writing,
~ software distributed under the License is distributed on an
~ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
~ KIND, either express or implied. See the License for the
~ specific language governing permissions and limitations
~ under the License.
~
-->

Add Reduce Operation to Computation Graph
=========================================

## Introduction
This is part of the work of porting [DeepSpeed's ZeRO](https://arxiv.org/abs/1910.02054) into MXNet.
Because of the differences between the symbolic and imperative modes, we divide the whole process into two phases:

Phase 1: add reduce operations to the graph. A reduce operation does nothing in the forward
pass but reduces the gradient to the GPU that owns it (according to the POS trainer).

Phase 2: in the backward graph, delete the outputs from the arrays so the memory planner can reuse that memory.

## Getting started
### Prepare NCCL and Horovod
We use Horovod for communication, so please install Horovod first. We also use NCCL's reduce operation, so please install NCCL as well.

### Compile and load the graph pass
First compile it as in [lib_pass](../lib_pass/): running `make` generates the dynamic library
**add_reduce_op_lib.so** from `add_reduce_op.cc`. Then load the library in your Python code:
```python
import mxnet as mx
mx.library.load('add_reduce_op_lib.so')
```

### Prepare options
Next we need to know how parameters and gradients are partitioned across GPUs.
To get this, use **POS_Trainer** from `pos_trainer.py` in place of the normal MXNet trainer:
```python
from pos_trainer import POS_Trainer
trainer = POS_Trainer(params_dict, "adam", optimizer_params)
```
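For concreteness, here is a hypothetical construction of those arguments (the `net`, `params_dict`, and `optimizer_params` names are ours, and we assume `POS_Trainer` mirrors `mx.gluon.Trainer`'s signature):
```python
import mxnet as mx
from pos_trainer import POS_Trainer

# Hypothetical setup: a small Gluon model whose parameter dict is passed
# to POS_Trainer exactly as it would be to mx.gluon.Trainer.
net = mx.gluon.nn.Dense(2)
net.initialize(ctx=mx.gpu(0))
params_dict = net.collect_params()          # name -> Parameter mapping
optimizer_params = {"learning_rate": 1e-3}  # plain optimizer kwargs
trainer = POS_Trainer(params_dict, "adam", optimizer_params)
```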
The trainer can then generate the corresponding options:
```python
options = trainer.generate_graph_pass_options()
backward_options = trainer.generate_backward_options()
```
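Judging from `add_reduce_op.cc` below, the graph-pass options hold the NCCL configuration plus one entry per gradient-carrying node, mapping the node's name to the rank that owns its gradient. An illustrative, hand-written options dict (the parameter names here are hypothetical; `POS_Trainer` generates the real one):
```python
# Illustrative only -- POS_Trainer.generate_graph_pass_options() builds this.
options = {
    "rank": "0",                           # this worker's rank
    "num_gpus": "2",                       # total number of GPUs
    "nccl_unique_id": "<id from rank 0>",  # shared NCCL unique id
    # parameter name -> root rank that owns the gradient:
    "dense0_weight": "0",
    "dense0_bias": "1",
}
```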
### Modify the graph
Before the forward pass, we call
```python
model.optimize_for(x, backend = "add_reduce_op", **options)
```
to insert the reduce operations into the graph:
![example add reduce](addreduce.png)

Then we pass the backward options when calling backward:
```python
loss.backward(backward_option = backward_options)
```
### Simple Example
Please see `test_reduce.py`. A minimal end-to-end sketch is shown below.
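Here is that sketch, assembled from the snippets above (the toy Dense model and the Horovod initialization are our assumptions; `test_reduce.py` remains the authoritative example):
```python
import mxnet as mx
import horovod.mxnet as hvd
from pos_trainer import POS_Trainer

hvd.init()
ctx = mx.gpu(hvd.local_rank())
mx.library.load('add_reduce_op_lib.so')

net = mx.gluon.nn.Dense(2)
net.initialize(ctx=ctx)
net.hybridize()

trainer = POS_Trainer(net.collect_params(), "adam", {"learning_rate": 1e-3})
options = trainer.generate_graph_pass_options()
backward_options = trainer.generate_backward_options()

# Insert the reduce operations, then run one training step.
x = mx.nd.ones((3, 2), ctx=ctx)
net.optimize_for(x, backend="add_reduce_op", **options)

with mx.autograd.record():
    loss = net(x).sum()
loss.backward(backward_option=backward_options)
trainer.step(x.shape[0])  # assuming POS_Trainer keeps gluon.Trainer's step()
```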

### Current problems
1. The reduce operation can cause a deadlock (this does not happen with the NaiveEngine). Moreover, it runs into
invalid-address problems in complex models such as BERT-Base.
2. We do remove outputs from the backward graph using the backward options, but we still need to verify whether this
decreases memory consumption.
115 changes: 115 additions & 0 deletions example/extensions/lib_reduce_gradient/add_reduce_op.cc
@@ -0,0 +1,115 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

/*!
 * \file add_reduce_op.cc
 * \brief graph pass that inserts NCCL reduce operations after gradient-carrying nodes
 */

#include <cmath>
#include <iostream>
#include <algorithm>
#include <string>
#include "mxnet/lib_api.h"

using namespace mxnet::ext;



MXReturnValue add_reduce_op(mxnet::ext::Graph* g,
                            const std::unordered_map<std::string, std::string>& options) {
  std::string cur_rank;
  std::string num_gpus;
  std::string nccl_unique_id;

  // Pull the NCCL configuration out of the pass options.
  for (const auto& kv : options) {
    std::cout << "option: " << kv.first << " ==> " << kv.second << std::endl;
    if (kv.first == "rank")
      cur_rank = kv.second;
    else if (kv.first == "nccl_unique_id")
      nccl_unique_id = kv.second;
    else if (kv.first == "num_gpus")
      num_gpus = kv.second;
  }

  size_t length = g->size();
  for (size_t i = 0; i < length; i++) {
    mxnet::ext::Node* target_node = g->getNode(i);
    // Nodes without an entry in the options carry no gradient; for the rest,
    // the option value is the rank that owns this node's gradient.
    auto it = options.find(target_node->name);
    if (it == options.end())
      continue;
    const std::string& root_rank = it->second;

    // Create the reduce node that will receive this node's gradient.
    mxnet::ext::Node* new_reduce =
        g->addNode("ncclreduce_" + target_node->name, "_contrib_NCCLReduce");
    auto* new_attrs = &new_reduce->attrs;
    const auto& old_attrs = target_node->attrs;
    // Copy dtype/shape/profiler attributes so shape and type inference still work.
    for (const auto& attr : old_attrs) {
      if (attr.first == "__ext_dtype__" || attr.first == "__ext_shape__" ||
          attr.first == "__profiler_scope__") {
        new_attrs->insert({{attr.first, attr.second}});
      }
    }
    new_attrs->insert({{"nccl_unique_id", nccl_unique_id}});
    new_attrs->insert({{"num_gpus", num_gpus}});
    new_attrs->insert({{"rank", cur_rank}});
    new_attrs->insert({{"root_rank", root_rank}});

    // Rewire every consumer of the target node to read from the reduce node.
    for (size_t j = 0; j < target_node->outputs.size(); j++) {
      new_reduce->outputs.push_back(target_node->outputs[j]);
      mxnet::ext::Node* output_node = target_node->outputs[j].node;
      int entry = target_node->outputs[j].entry;
      output_node->inputs[entry].node = new_reduce;
    }
    // The target node now feeds only the reduce node.
    target_node->outputs.clear();
    target_node->outputs.push_back({new_reduce, 0});
    new_reduce->inputs.push_back({target_node, 0});
  }
  g->print();

  return MX_SUCCESS;
}



REGISTER_PASS(add_reduce_op).setBody(add_reduce_op);

MXReturnValue initialize(int version) {
if (version >= 10700) {
std::cout << "MXNet version " << version << " supported" << std::endl;
return MX_SUCCESS;
} else {
MX_ERROR_MSG << "MXNet version " << version << " not supported" << std::endl;
return MX_FAIL;
}
}