forked from pytorch/pytorch
-
Notifications
You must be signed in to change notification settings - Fork 0
/
MSECriterion.cu
126 lines (101 loc) · 4.3 KB
/
MSECriterion.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "THCUNN/generic/MSECriterion.cu"
#else
// Forward pass of the MSE criterion (legacy THCUNN generic file: scalar_t,
// accreal and the THCTensor_()/THNN_() macros are supplied by the
// THC_GENERIC_FILE expansion machinery).
//
// reduction != Reduction::None:
//   output is resized to a 0-d scalar holding
//   sum_i mse_functor(input_i, target_i) — presumably (input_i - target_i)^2,
//   functor is defined elsewhere, TODO confirm — divided by the element
//   count when reduction == Reduction::Mean.
// reduction == Reduction::None:
//   output is resized like input and filled elementwise via
//   mse_updateOutput_functor.
void THNN_(MSECriterion_updateOutput)(
THCState *state,
THCTensor *input,
THCTensor *target,
THCTensor *output,
int64_t reduction)
{
// input and target must agree in shape; all three tensors must live on the
// same GPU.
THCUNN_check_shape(state, input, target);
THCUNN_assertSameGPU(state, 3, input, target, output);
if (reduction != Reduction::None) {
// Reduced result is a single scalar (0-d tensor).
THCTensor_(resize0d)(state, output);
ptrdiff_t size = THCTensor_(nElement)(state, input);
// newContiguous may return a fresh contiguous copy (or bump a refcount);
// the matching THCTensor_(free) calls below release it. Keep the
// newContiguous/free pairing intact.
input = THCTensor_(newContiguous)(state, input);
target = THCTensor_(newContiguous)(state, target);
THCThrustAllocator thrustAlloc(state);
thrust::device_ptr<scalar_t> input_data(THCTensor_(data)(state, input));
thrust::device_ptr<scalar_t> target_data(THCTensor_(data)(state, target));
// Fold mse_functor(input_i, target_i) with plus<accreal>; accumulation is
// done in accreal (the wider accumulation type) rather than scalar_t.
accreal sum = thrust::inner_product(
#if CUDA_VERSION >= 7000 || defined __HIP_PLATFORM_HCC__
// Execute on THC's current stream, using THC's allocator for thrust's
// temporary storage.
thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)),
#endif
input_data, input_data+size, target_data, (accreal) 0,
thrust::plus<accreal>(), mse_functor<scalar_t, accreal>());
if (reduction == Reduction::Mean)
sum /= size;
THCTensor_(free)(state, input);
THCTensor_(free)(state, target);
// Convert the accreal accumulator back to scalar_t and store it.
THCTensor_(set0d)(state, output, ScalarConvert<accreal, scalar_t>::to(sum));
return;
}
// reduction == None: per-element loss, output has input's shape.
THCTensor_(resizeAs)(state, output, input);
THC_pointwiseApply3<scalar_t, scalar_t, scalar_t>(
state,
input,
target,
output,
mse_updateOutput_functor<scalar_t>());
}
// Backward pass of the MSE criterion: writes d(loss)/d(input) into gradInput.
//
// reduction != Reduction::None:
//   gradOutput must be a single-element tensor (checked below). Each
//   gradInput element is produced by mse_updateGradInput_functor with
//   norm = gradOutput_scalar * 2 / size   (Reduction::Mean)
//   norm = gradOutput_scalar * 2          (otherwise, i.e. Sum).
// reduction == Reduction::None:
//   gradOutput must have input's shape. The elementwise functor is applied
//   with constant norm 2, and the result is then multiplied elementwise by
//   gradOutput.
void THNN_(MSECriterion_updateGradInput)(
THCState *state,
THCTensor *input,
THCTensor *target,
THCTensor *gradOutput,
THCTensor *gradInput,
int64_t reduction)
{
THCUNN_check_shape(state, input, target);
THCUNN_assertSameGPU(state, 4, input, target, gradInput, gradOutput);
if (reduction != Reduction::None) {
ptrdiff_t size = THCTensor_(nElement)(state, input);
// Args (ndim=1, dim index 0, expected size 1): enforce gradOutput is a
// single-element tensor — presumably shape [1]; semantics live in
// THCUNN_check_dim_size, TODO confirm.
THCUNN_check_dim_size(state, gradOutput, 1, 0, 1);
// norm carries both the d/dx factor of 2 and the 1/size of Mean reduction.
accreal norm = reduction == Reduction::Mean ? (accreal)(2)/size : (accreal)(2);
// Scale by the incoming upstream gradient (a scalar here).
norm *= ScalarConvert<scalar_t, accreal>::to(THCTensor_(get0d)(state, gradOutput));
// newContiguous/free pairing: may copy, released below.
input = THCTensor_(newContiguous)(state, input);
target = THCTensor_(newContiguous)(state, target);
THCTensor_(resizeAs)(state, gradInput, input);
THCThrustAllocator thrustAlloc(state);
thrust::device_ptr<scalar_t> input_data(THCTensor_(data)(state, input));
thrust::device_ptr<scalar_t> target_data(THCTensor_(data)(state, target));
thrust::device_ptr<scalar_t> gradInput_data(THCTensor_(data)(state, gradInput));
// gradInput_i = mse_updateGradInput_functor(norm)(input_i, target_i)
// — presumably norm * (input_i - target_i); functor defined elsewhere.
thrust::transform(
#if CUDA_VERSION >= 7000 || defined __HIP_PLATFORM_HCC__
// Execute on THC's current stream with THC's allocator.
thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)),
#endif
input_data, input_data+size, target_data, gradInput_data,
mse_updateGradInput_functor<scalar_t, accreal>(norm));
THCTensor_(free)(state, input);
THCTensor_(free)(state, target);
return;
}
// reduction == None: gradOutput must be elementwise-shaped like input.
THCUNN_check_shape(state, input, gradOutput);
ptrdiff_t size = THCTensor_(nElement)(state, input);
input = THCTensor_(newContiguous)(state, input);
target = THCTensor_(newContiguous)(state, target);
gradOutput = THCTensor_(newContiguous)(state, gradOutput);
THCTensor_(resizeAs)(state, gradInput, input);
THCThrustAllocator thrustAlloc(state);
thrust::device_ptr<scalar_t> input_data(THCTensor_(data)(state, input));
thrust::device_ptr<scalar_t> target_data(THCTensor_(data)(state, target));
thrust::device_ptr<scalar_t> gradOutput_data(THCTensor_(data)(state, gradOutput));
thrust::device_ptr<scalar_t> gradInput_data(THCTensor_(data)(state, gradInput));
// First pass: gradInput_i = functor(2)(input_i, target_i) — the unscaled
// per-element derivative.
thrust::transform(
#if CUDA_VERSION >= 7000 || defined __HIP_PLATFORM_HCC__
thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)),
#endif
input_data, input_data+size, target_data, gradInput_data,
mse_updateGradInput_functor<scalar_t, accreal>(2));
// Second pass: in-place elementwise scale by the upstream gradient
// (gradInput_data is deliberately both an input and the output iterator).
thrust::transform(
#if CUDA_VERSION >= 7000 || defined __HIP_PLATFORM_HCC__
thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)),
#endif
gradInput_data, gradInput_data+size, gradOutput_data, gradInput_data,
thrust::multiplies<scalar_t>());
THCTensor_(free)(state, input);
THCTensor_(free)(state, target);
THCTensor_(free)(state, gradOutput);
}
#endif