forked from pytorch/pytorch
-
Notifications
You must be signed in to change notification settings - Fork 0
/
reverse_packed_segs_op.cu
89 lines (69 loc) · 2.47 KB
/
reverse_packed_segs_op.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/reverse_packed_segs_op.h"
namespace caffe2 {
namespace {

// Reverses each packed segment of DATA along the leading (time) axis.
//
// Launch layout (set by the host wrapper below): one CTA per
// (segment, batch) pair, i.e. gridDim.x == max_length * batch_size,
// with blockDim.x threads cooperatively copying one block_size-wide
// slice. DATA is laid out as [max_length, batch_size, block_size].
// Rows at or past a sequence's own length are copied through unchanged.
template <typename T, typename LengthType>
__global__
void ReversePackedSegments_kernel(
    size_t max_length,
    size_t batch_size,
    size_t block_size,
    const LengthType* lengths_ptr,
    const T* data_ptr,
    T* rev_data_ptr) {
  // Decompose the CTA id into (batch, segment) coordinates.
  const int cta = blockIdx.x;
  const int batch = cta / max_length;
  const int segment = cta % max_length;

  // Guard against any grid overshoot; segment < max_length holds by
  // construction, batch is the meaningful bound.
  if (batch >= batch_size || segment >= max_length) {
    return;
  }

  const int seg_length = lengths_ptr[batch];

  // Mirror the output row inside the sequence's valid prefix; identity
  // for padding rows beyond seg_length.
  const int out_segment =
      segment < seg_length ? seg_length - 1 - segment : segment;

  // Source and destination slices handled by this CTA.
  const T* src = data_ptr + (segment * batch_size + batch) * block_size;
  T* dst = rev_data_ptr + (out_segment * batch_size + batch) * block_size;

  // Strided elementwise copy: one element per thread per iteration.
  for (int i = threadIdx.x; i < block_size; i += blockDim.x) {
    dst[i] = src[i];
  }
}

} // namespace
// specialization of DoRunWithLengthType
//
// Reverses DATA ([max_length, batch_size, block_size]) along its leading
// (time) dimension, independently per batch column: the first LENGTHS[b]
// rows of column b are mirrored; padding rows past LENGTHS[b] are copied
// through unchanged by the kernel.
template <>
template <typename T, typename LengthType>
void ReversePackedSegsOp<CUDAContext>::DoRunWithLengthType() {
  const auto& data = Input(DATA);
  const auto& lengths = Input(LENGTHS);
  CAFFE_ENFORCE(
      data.dim() == 3,
      "DATA should be 3-D tensor <lengths, "
      "segments, embeddings>");
  CAFFE_ENFORCE(lengths.dim() == 1, "LENGTH should be 1-D");

  auto* output = Output(0, data.sizes(), at::dtype<T>());

  const auto max_length = data.size(0);
  const auto batch_size = data.size(1);
  const auto block_size = data.size(2);
  CAFFE_ENFORCE(
      lengths.sizes()[0] == batch_size,
      "lengths size should be"
      " equal to batch size");

  const T* data_ptr = data.template data<T>();
  const LengthType* lengths_ptr = lengths.template data<LengthType>();
  // reversed data
  T* rev_data_ptr = output->template mutable_data<T>();

  // One CTA per (segment, batch) pair. Nothing to do for an empty
  // tensor — and a zero-sized grid would be an invalid launch
  // configuration, so bail out before launching.
  const int grid = max_length * batch_size;
  if (grid == 0) {
    return;
  }

  ReversePackedSegments_kernel<T, LengthType>
      <<<grid, 512, 0, context_.cuda_stream()>>>(
          max_length,
          batch_size,
          block_size,
          lengths_ptr,
          data_ptr,
          rev_data_ptr);
  // Kernel launches do not report errors directly; surface bad launch
  // configurations (and earlier sticky errors) here instead of letting
  // them fail mysteriously at the next synchronizing call.
  const cudaError_t err = cudaGetLastError();
  CAFFE_ENFORCE(err == cudaSuccess, cudaGetErrorString(err));
}
// Register the CUDA implementation under the "ReversePackedSegs" operator name.
REGISTER_CUDA_OPERATOR(ReversePackedSegs, ReversePackedSegsOp<CUDAContext>);
} // namespace caffe2