forked from pytorch/pytorch
-
Notifications
You must be signed in to change notification settings - Fork 0
/
minmax_ops.cu
56 lines (48 loc) · 1.55 KB
/
minmax_ops.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
#include "caffe2/operators/minmax_ops.h"

#include "c10/cuda/CUDAException.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
namespace {
// Elementwise gradient selector for Min/Max backward.
//
// For each of the N elements, routes the upstream gradient dY[k] to dX[k]
// iff the corresponding input value matched the reduced output (X[k] == Y[k]);
// otherwise dX[k] is zero. Launched as a 1-D grid with exactly
// CAFFE_CUDA_NUM_THREADS threads per block, so the flat index below is
// consistent with the host-side launch configuration.
template <typename T>
__global__ void SelectGradientCUDAKernel(
    const int N,
    const T* dY,
    const T* X,
    const T* Y,
    T* dX) {
  const int index = blockIdx.x * CAFFE_CUDA_NUM_THREADS + threadIdx.x;
  // Tail guard: the grid may cover more threads than elements.
  if (index >= N) {
    return;
  }
#if __CUDA_ARCH__ >= 350 || defined(__HIP_PLATFORM_HCC__)
  // Read-only-cache loads on SM35+ / HIP for the three input streams.
  dX[index] = (__ldg(X + index) == __ldg(Y + index)) ? __ldg(dY + index) : T(0);
#else
  dX[index] = (X[index] == Y[index]) ? dY[index] : T(0);
#endif
}
} // namespace
// CUDA backward pass shared by MinGradient/MaxGradient.
//
// Inputs:  Input(0) = Y  (the elementwise min/max result),
//          Input(1) = dY (upstream gradient, same shape as Y),
//          Input(i + 2) = X_i (the i-th original forward input).
// Outputs: Output(i) = dX_i, same shape as X_i; gradient flows only to the
//          positions where X_i equals Y (i.e. where X_i "won" the reduction).
template <>
bool SelectGradientOpBase<float, CUDAContext>::RunOnDevice() {
  const auto& Y = Input(0);
  const auto& dY = Input(1);
  const int N = Y.numel();
  // Ceil-divide so the grid covers all N elements; the kernel bounds-checks
  // the tail block.
  const int M = math::DivUp(N, CAFFE_CUDA_NUM_THREADS);
  const float* dY_data = dY.data<float>();
  const float* Y_data = Y.data<float>();
  for (int i = 0; i < OutputSize(); i++) {
    const auto& Xi = Input(i + 2);
    auto* dXi = Output(i, Xi.sizes(), at::dtype<float>());
    const float* Xi_data = Xi.data<float>();
    float* dXi_data = dXi->mutable_data<float>();
    // Skip the launch entirely for empty tensors (M would be 0, which is an
    // invalid grid dimension).
    if (N > 0) {
      SelectGradientCUDAKernel<float>
          <<<M, CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>(
              N, dY_data, Xi_data, Y_data, dXi_data);
      // Kernel launches do not report errors through a return value; surface
      // bad-configuration or sticky asynchronous errors immediately instead
      // of letting a later unrelated call fail mysteriously.
      C10_CUDA_KERNEL_LAUNCH_CHECK();
    }
  }
  return true;
}
// Register the float CUDA implementations of elementwise Min/Max and their
// gradient operators with the Caffe2 operator registry.
REGISTER_CUDA_OPERATOR(Min, MinOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(MinGradient, MinGradientOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(Max, MaxOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(MaxGradient, MaxGradientOp<float, CUDAContext>);
} // namespace caffe2