This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

GPU implementation of cast_storage (dense to csr) #7081

Merged: 8 commits, Jul 27, 2017
Changes from 5 commits
70 changes: 70 additions & 0 deletions benchmark/python/cast_storage.py
@@ -0,0 +1,70 @@
import ctypes

from mxnet.test_utils import *
import os
import time
import argparse

from mxnet.base import check_call, _LIB

parser = argparse.ArgumentParser(description="Benchmark cast storage operators",
                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--num-omp-threads', type=int, default=1, help='number of omp threads to set in MXNet')
args = parser.parse_args()

def measure_cost(repeat, f, *args, **kwargs):
    start = time.time()
    results = []
    for i in range(repeat):
        (f(*args, **kwargs)).wait_to_read()
    end = time.time()
    diff = end - start
    return diff / repeat


def run_cast_storage_synthetic():
    def dns_to_csr(m, n, density, ctx, repeat):
        set_default_context(ctx)
        data_shape = (m, n)
        dns_data = rand_ndarray(data_shape, 'csr', density).todense()
        dns_data.wait_to_read()

        # do one warm up run, verify correctness
        assert same(mx.nd.cast_storage(dns_data, stype='csr').asnumpy(), dns_data.asnumpy())

        # start benchmarking
        cost = measure_cost(repeat, mx.nd.cast_storage, dns_data, stype='csr')
        results = '{:10.1f} {:>10} {:8d} {:8d} {:10.2f}'.format(density*100, str(ctx), m, n, cost*1000)
        print(results)

    check_call(_LIB.MXSetNumOMPThreads(ctypes.c_int(args.num_omp_threads)))

    # params
    #   m           number of rows
    #   n           number of columns
    #   density     density of the matrix
    #   num_repeat  number of benchmark runs to average over
    #   contexts    mx.cpu(), mx.gpu()
    # note: benchmark different contexts separately; to benchmark cpu, compile without CUDA
    m = [ 512, 512]
    n = [50000, 100000]
    density = [1.00, 0.80, 0.60, 0.40, 0.20, 0.10, 0.05, 0.02, 0.01]
    num_repeat = 10
    contexts = [mx.gpu()]
Contributor: Can you add mx.cpu() as well?

Contributor Author: Should I add a comment, then, saying that CPU benchmark results differ significantly when compiled with CUDA vs. without, so that CPU benchmark numbers should only be taken from a build compiled without CUDA flags?

Contributor: That's a good idea.


    # run benchmark
    print("==================================================")
    print(" cast_storage benchmark: dense to csr, size m x n ")
    print("==================================================")
    headline = '{:>10} {:>10} {:>8} {:>8} {:>10}'.format('density(%)', 'context', 'm', 'n', 'time(ms)')
    print(headline)
    for i in range(len(n)):
        for ctx in contexts:
            for den in density:
                dns_to_csr(m[i], n[i], den, ctx, num_repeat)
            print("")
    print("==================================================")


if __name__ == "__main__":
    run_cast_storage_synthetic()
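
For reference, the core operation the script times can be reproduced in a few lines. This is a minimal sketch, assuming a CUDA-enabled MXNet build so that mx.gpu() is usable; it reuses the rand_ndarray and set_default_context helpers that the script imports from mxnet.test_utils:

    import mxnet as mx
    from mxnet.test_utils import rand_ndarray, set_default_context

    # build a dense 512 x 50000 matrix with ~5% nonzeros on the GPU
    set_default_context(mx.gpu())
    dns = rand_ndarray((512, 50000), 'csr', 0.05).todense()
    dns.wait_to_read()

    # the operator this PR accelerates: dense -> csr on the GPU
    csr = mx.nd.cast_storage(dns, stype='csr')
    csr.wait_to_read()

The full benchmark is launched as python benchmark/python/cast_storage.py --num-omp-threads <N>; the OMP thread count primarily matters for CPU runs.
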
6 changes: 2 additions & 4 deletions src/common/utils.cc
@@ -10,14 +10,12 @@
 namespace mxnet {
 namespace common {
 
-
 template<>
-void CastStorageDispatch<cpu>(mshadow::Stream<cpu>* s,
+void CastStorageDispatch<cpu>(const OpContext& ctx,
                               const NDArray& input,
                               const NDArray& output) {
-  mxnet::op::CastStorageComputeImpl(s, input, output);
+  mxnet::op::CastStorageComputeImpl<cpu>(ctx, input, output);
 }
 
-
 } // namespace common
 } // namespace mxnet
4 changes: 2 additions & 2 deletions src/common/utils.cu
@@ -11,10 +11,10 @@ namespace mxnet {
 namespace common {
 
 template<>
-void CastStorageDispatch<gpu>(mshadow::Stream<gpu>* s,
+void CastStorageDispatch<gpu>(const OpContext& ctx,
                               const NDArray& input,
                               const NDArray& output) {
-  mxnet::op::CastStorageComputeImpl(s, input, output);
+  mxnet::op::CastStorageComputeImpl<gpu>(ctx, input, output);
 }
 
 } // namespace common
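
The CUDA kernel itself is not part of the hunks shown here; this file only routes the gpu specialization of CastStorageDispatch to CastStorageComputeImpl<gpu>. As a conceptual reference only (not the actual GPU code), dense-to-CSR conversion amounts to counting nonzeros per row, prefix-summing those counts into the indptr array, and then writing out the column indices and values. A NumPy sketch of that logic:

    import numpy as np

    def dense_to_csr_reference(dns):
        """Reference dense -> CSR conversion; returns (values, indices, indptr)."""
        nnz_per_row = (dns != 0).sum(axis=1)                    # pass 1: nonzeros per row
        indptr = np.concatenate(([0], np.cumsum(nnz_per_row)))  # prefix sum -> row pointers
        rows, cols = np.nonzero(dns)                            # pass 2: locate nonzeros (row-major)
        values = dns[rows, cols]                                # gather the nonzero values
        return values, cols.astype(np.int64), indptr.astype(np.int64)

A parallel implementation needs scratch memory for the prefix sum, which is consistent with the kTempSpace resource requested in the ndarray.cc changes below when casting on a GPU context.
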
7 changes: 3 additions & 4 deletions src/common/utils.h
@@ -24,11 +24,10 @@
 #include <functional>
 
 namespace mxnet {
-
 namespace common {
 
 template<typename xpu>
-void CastStorageDispatch(mshadow::Stream<xpu>* s, const NDArray& input, const NDArray& output);
+void CastStorageDispatch(const OpContext& ctx, const NDArray& input, const NDArray& output);
 
 /*
  * \brief Get the corresponding tensor blobs from default storage NDArrays.
@@ -55,7 +54,7 @@ inline bool GetDefaultBlobs(const std::vector<NDArray>& nds,
           << "doesn't support NDArray inputs with non-default storage.";
       }
       NDArray temp(nd.shape(), nd.ctx(), false);
-      CastStorageDispatch<xpu>(ctx.get_stream<xpu>(), nd, temp);
+      CastStorageDispatch<xpu>(ctx, nd, temp);
       temps->push_back(temp);
       blobs->push_back(temp.data());
       casted = true;
@@ -91,7 +90,7 @@ inline void CastNonDefaultStorage(const std::vector<NDArray>& dst,
           << "You are probably executing an operator which "
           << "doesn't support NDArray inputs with non-default storage.";
       }
-      CastStorageDispatch<xpu>(ctx.get_stream<xpu>(), src[src_idx++], dst[i]);
+      CastStorageDispatch<xpu>(ctx, src[src_idx++], dst[i]);
     }
   }
   CHECK_EQ(src_idx, src.size()) << "Not all src NDArrays are casted";
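
GetDefaultBlobs and CastNonDefaultStorage implement the dense fallback: NDArray inputs with non-default storage are cast into temporary dense NDArrays before a dense-only kernel runs, and dense results are cast back to the expected storage type afterwards. The round trip can be pictured with the public cast_storage operator; this is an illustration only (the multiplication stands in for an arbitrary dense-only kernel, and the real fallback happens inside the C++ dispatch, not in user code):

    import mxnet as mx

    # sparse input that a dense-only operator cannot consume directly
    csr_in = mx.nd.cast_storage(mx.nd.ones((4, 5)), stype='csr')

    # what GetDefaultBlobs does: materialize a temporary dense copy
    dense_tmp = mx.nd.cast_storage(csr_in, stype='default')

    # the dense kernel then runs on the temporary blob
    dense_out = dense_tmp * 2

    # what CastNonDefaultStorage does: cast the dense result back to the expected stype
    csr_out = mx.nd.cast_storage(dense_out, stype='csr')
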
26 changes: 18 additions & 8 deletions src/ndarray/ndarray.cc
@@ -392,7 +392,7 @@ inline void CopyFromToDnsImpl(const NDArray from, NDArray *to, RunContext ctx) {
 
 // Make a copy of an NDArray based on storage type
 template<typename from_xpu, typename to_xpu>
-void CopyFromToImpl(const NDArray from, NDArray *to, RunContext ctx) {
+void CopyFromToImpl(const NDArray from, NDArray *to, RunContext rctx) {
   using namespace std;
   using namespace mshadow;
   // if storage type doesn't match, cast the storage first
@@ -405,10 +405,20 @@ void CopyFromToImpl(const NDArray from, NDArray *to, RunContext ctx) {
     << " to stype = " << to_stype << " is not supported";
   const auto from_ctx = from.ctx();
   const auto to_ctx = to->ctx();
-  auto s = ctx.get_stream<from_xpu>();
+  auto s = rctx.get_stream<from_xpu>();
+  bool is_train = mxnet::autograd::AutogradRuntime::Get()->IsTraining();
+  std::vector<Resource> requested;
+  if (is_same<from_xpu, mshadow::gpu>::value && from_stype != to_stype) {
+    requested.push_back(ResourceManager::Get()->Request(from_ctx,
+                        ResourceRequest(ResourceRequest::kTempSpace)));
+  }
+  OpContext opctx{is_train,
+                  rctx,
+                  engine::CallbackOnComplete(),
+                  requested};
   if (from_ctx == to_ctx && from_stype != to_stype) {
     // same ctx, different stypes, use cast op directly without copying
-    common::CastStorageDispatch<from_xpu>(s, from, *to);
+    common::CastStorageDispatch<from_xpu>(opctx, from, *to);
   } else {
     NDArray casted_nd; // an intermediate result before copying from to to
     if (from_stype == to_stype) {
@@ -421,22 +431,22 @@
         casted_nd = NDArray(to_stype, shape, from_ctx);
       }
       // convert from_nd to the same stype as to_nd
-      common::CastStorageDispatch<from_xpu>(s, from, casted_nd);
+      common::CastStorageDispatch<from_xpu>(opctx, from, casted_nd);
     }
 
     if (to_stype == kDefaultStorage) {
-      CopyFromToDnsImpl<from_xpu, to_xpu>(casted_nd, to, ctx);
+      CopyFromToDnsImpl<from_xpu, to_xpu>(casted_nd, to, rctx);
     } else if (to_stype == kRowSparseStorage) {
-      CopyFromToRspImpl<from_xpu, to_xpu>(casted_nd, to, ctx);
+      CopyFromToRspImpl<from_xpu, to_xpu>(casted_nd, to, rctx);
    } else if (to_stype == kCSRStorage) {
-      CopyFromToCsrImpl<from_xpu, to_xpu>(casted_nd, to, ctx);
+      CopyFromToCsrImpl<from_xpu, to_xpu>(casted_nd, to, rctx);
     } else {
       LOG(FATAL) << "unknown storage type" << to_stype;
     }
   }
   if (is_same<from_xpu, mshadow::gpu>::value || is_same<to_xpu, mshadow::gpu>::value) {
     // Wait GPU kernel to complete
-    ctx.get_stream<gpu>()->Wait();
+    rctx.get_stream<gpu>()->Wait();
   }
 }
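
From the frontend, the path touched here is exercised when a copy has to change storage type, for example copying a CSR array into a dense destination on another device. A sketch, assuming a CUDA build and that copyto between NDArrays of different storage types routes through CopyFromTo as in this branch:

    import mxnet as mx

    # source: CSR on the GPU; destination: dense on the CPU
    src_csr = mx.nd.cast_storage(mx.nd.ones((512, 1000), ctx=mx.gpu()), stype='csr')
    dst_dns = mx.nd.zeros((512, 1000), ctx=mx.cpu())

    # different stypes and different contexts: CopyFromToImpl first casts csr -> dense
    # on the source (GPU) context -- now with a kTempSpace resource available -- and
    # then copies the casted result across devices
    src_csr.copyto(dst_dns)
    dst_dns.wait_to_read()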
