/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
* \file tensor.h
* \brief header file of tensor data structure and functions
* This lib requires explicit memory allocation and de-allocation.
* All the data structures, such as Tensor<cpu, 1> and Tensor<gpu, 1>, are like handles (pointers);
* no memory allocation happens during calculation.
*
* For STL style tensor, see tensor_container.h
* \author Bing Xu, Tianqi Chen
*/
#ifndef MSHADOW_TENSOR_H_
#define MSHADOW_TENSOR_H_
#include <string>
#include <iostream>
#include "./base.h"
#include "./expression.h"
namespace mshadow {
/*! \brief device name CPU */
struct cpu {
/*! \brief whether this device is CPU or not */
static const bool kDevCPU = true;
/*! \brief device flag number, identifies this device */
static const int kDevMask = 1 << 0;
};
/*! \brief device name GPU */
struct gpu {
/*! \brief whether this device is CPU or not */
static const bool kDevCPU = false;
/*! \brief device flag number, identifies this device */
static const int kDevMask = 1 << 1;
};
/*!
* \brief integer index type used when calling LAPACK-style routines on a given device
* \tparam xpu the device type
*/
template<typename xpu>
struct LapackIndex {
using IndexT = lapack_index_t;
};
/*! \brief on GPU, LAPACK-style routines take plain int indices */
template<>
struct LapackIndex<gpu> {
using IndexT = int;
};
template<int ndim>
struct Shape;
/*!
* \brief allow string printing of the shape
* \param os the output stream
* \param shape the shape
* \return the ostream
*/
template<int ndim>
inline std::ostream &operator<<(std::ostream &os, const Shape<ndim> &shape); // NOLINT(*)
/*!
* \brief shape of a tensor
* \tparam dimension dimension of tensor
*/
template<int dimension>
struct Shape {
/*! \brief dimension of current shape */
static const int kDimension = dimension;
/*! \brief dimension of current shape minus one */
static const int kSubdim = dimension - 1;
/*! \brief storing the dimension information */
index_t shape_[kDimension];
/*! \brief default constructor, do nothing */
MSHADOW_XINLINE Shape(void) {}
/*! \brief copy constructor */
MSHADOW_XINLINE Shape(const Shape<kDimension> &s) {
#pragma unroll
for (int i = 0; i < kDimension; ++i) {
this->shape_[i] = s[i];
}
}
/*!
* \brief get corresponding index
* \param idx dimension index
* \return the corresponding dimension size
*/
MSHADOW_XINLINE index_t &operator[](int idx) {
return shape_[idx];
}
/*!
* \brief get corresponding index
* \param idx dimension index
* \return the corresponding dimension size
*/
MSHADOW_XINLINE const index_t &operator[](int idx) const {
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Warray-bounds"
return shape_[idx];
#pragma GCC diagnostic pop
}
/*!
* \return whether two shape equals
* \param s the shape to compare against
*/
MSHADOW_XINLINE bool operator==(const Shape<kDimension> &s) const {
#pragma unroll
for (int i = 0; i < kDimension; ++i) {
if (s.shape_[i] != this->shape_[i]) return false;
}
return true;
}
/*!
* \return whether two shape not equal
* \param s the shape to compare against
*/
MSHADOW_XINLINE bool operator!=(const Shape<kDimension> &s) const {
return !(*this == s);
}
/*!
* flatten the tensor, return a 1D shape
* \return the flat 1d shape
*/
MSHADOW_XINLINE Shape<1> FlatTo1D(void) const {
Shape<1> s;
s[0] = this->Size();
return s;
}
/*!
* flatten the higher dimension to second dimension, return a 2D shape
* \return the flat 2d shape
*/
MSHADOW_XINLINE Shape<2> FlatTo2D(void) const {
Shape<2> s;
s.shape_[1] = this->shape_[kDimension - 1];
index_t ymax = 1;
#pragma unroll
for (int i = 0; i < kDimension - 1; ++i) {
ymax *= this->shape_[i];
}
s.shape_[0] = ymax;
return s;
}
/*! \return number of valid elements */
MSHADOW_XINLINE index_t Size(void) const {
index_t size = this->shape_[0];
#pragma unroll
for (int i = 1; i < kDimension; ++i) {
size *= this->shape_[i];
}
return size;
}
/*!
* \return product shape in [dimstart,dimend)
* \param dimstart start dimension
* \param dimend end dimension
*/
MSHADOW_XINLINE index_t ProdShape(int dimstart, int dimend) const {
index_t num = 1;
#pragma unroll
for (int i = dimstart; i < dimend; ++i) {
num *= this->shape_[i];
}
return num;
}
/*!
* \brief get subshape that takes off largest dimension
* \return subshape
*/
MSHADOW_XINLINE Shape<kSubdim> SubShape(void) const {
Shape<kSubdim> s;
// for cuda
#pragma unroll
for (int i = 0; i < kSubdim; ++i) {
s.shape_[i] = this->shape_[i + 1];
}
return s;
}
/*!
* \brief slice the shape from start to end
* \tparam dimstart start dimension
* \tparam dimend end dimension
* \return the sliced shape
*/
template<int dimstart, int dimend>
MSHADOW_XINLINE Shape<dimend - dimstart> Slice(void) const {
Shape<dimend - dimstart> s;
#pragma unroll
for (int i = dimstart; i < dimend; ++i) {
s[i - dimstart] = this->shape_[i];
}
return s;
}
//! \cond Doxygen_Suppress
template<int dim>
friend std::ostream &operator<<(std::ostream &os, const Shape<dim> &shape); // NOLINT(*)
//! \endcond
}; // Shape
//------------------------------------------------
// useful construction functions to generate shape
//-------------------------------------------------
/*!
* \brief construct a one dimension shape, stride will equal s0
* \param s0 size of dimension 0
* \return the shape construction
*/
MSHADOW_XINLINE Shape<1> Shape1(index_t s0) {
Shape<1> s; s[0] = s0;
return s;
}
/*!
* \brief construct a two dimension shape, stride will equal s1
* \param s0 size of dimension 0
* \param s1 size of dimension 1
* \return the shape construction
*/
MSHADOW_XINLINE Shape<2> Shape2(index_t s0, index_t s1) {
Shape<2> s; s[0] = s0; s[1] = s1;
return s;
}
/*!
* \brief construct a three dimension shape, stride will equal s2
* \param s0 size of dimension 0
* \param s1 size of dimension 1
* \param s2 size of dimension 2
* \return the shape construction
*/
MSHADOW_XINLINE Shape<3> Shape3(index_t s0, index_t s1, index_t s2) {
Shape<3> s;
s[0] = s0; s[1] = s1; s[2] = s2;
return s;
}
/*!
* \brief construct a four dimension shape, stride will equal s3
* \param s0 size of dimension 0
* \param s1 size of dimension 1
* \param s2 size of dimension 2
* \param s3 size of dimension 3
* \return the shape construction
*/
MSHADOW_XINLINE Shape<4> Shape4(index_t s0, index_t s1,
index_t s2, index_t s3) {
Shape<4> s;
s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3;
return s;
}
/*!
* \brief construct a five dimension shape, stride will equal s4
* \param s0 size of dimension 0
* \param s1 size of dimension 1
* \param s2 size of dimension 2
* \param s3 size of dimension 3
* \param s4 size of dimension 4
* \return the shape construction
*/
MSHADOW_XINLINE Shape<5> Shape5(index_t s0, index_t s1, index_t s2,
index_t s3, index_t s4) {
Shape<5> s;
s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; s[4] = s4;
return s;
}
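/*
* Usage sketch (illustrative, not part of the original header): how the Shape
* helpers above compose; all values follow from the definitions in this file.
*
*   Shape<3> s = Shape3(2, 3, 4);
*   index_t n = s.Size();               // 2 * 3 * 4 = 24
*   Shape<2> mat = s.FlatTo2D();        // (6, 4): higher dims collapse into dim 0
*   index_t inner = s.ProdShape(1, 3);  // 3 * 4 = 12
*   Shape<2> tail = s.Slice<1, 3>();    // (3, 4)
*   Shape<2> sub = s.SubShape();        // (3, 4): drops dimension 0
*/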
/*!
* \brief Convert shape in src_layout to shape in dst_layout
* \param src original shape
* \param src_layout layout of original shape
* \param dst_layout target layout
* \return shape in target layout
*/
inline Shape<3> ConvertLayout(const Shape<3>& src, int src_layout, int dst_layout) {
Shape<3> dst;
switch (src_layout) {
case kNCW:
dst = src;
break;
case kNWC:
dst[0] = src[0];
dst[1] = src[2];
dst[2] = src[1];
break;
default:
LOG(FATAL) << "Invalid layout for 3d shape " << src_layout;
}
switch (dst_layout) {
case kNCW:
return dst;
case kNWC:
{
index_t tmp = dst[1];
dst[1] = dst[2];
dst[2] = tmp;
}
break;
default:
LOG(FATAL) << "Invalid layout for 3d shape " << src_layout;
}
return dst;
}
/*!
* \brief Convert shape in src_layout to shape in dst_layout
* \param src original shape
* \param src_layout layout of original shape
* \param dst_layout target layout
* \return shape in target layout
*/
inline Shape<4> ConvertLayout(const Shape<4>& src, int src_layout, int dst_layout) {
Shape<4> dst;
switch (src_layout) {
case kNCHW:
dst = src;
break;
case kNHWC:
dst[0] = src[0];
dst[2] = src[1];
dst[3] = src[2];
dst[1] = src[3];
break;
default:
LOG(FATAL) << "Invalid layout for 4d shape " << src_layout;
dst = src; // fixes compiler warning
}
Shape<4> dst2;
switch (dst_layout) {
case kNCHW:
return dst;
case kNHWC:
dst2[0] = dst[0];
dst2[1] = dst[2];
dst2[2] = dst[3];
dst2[3] = dst[1];
break;
default:
LOG(FATAL) << "Invalid layout for 4d shape " << src_layout;
dst2 = src; // fixes compiler warning
}
return dst2;
}
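/*
* Usage sketch (illustrative, not part of the original header): converting a
* 4d shape between layouts, using the layout flags defined in base.h.
*
*   Shape<4> nchw = Shape4(1, 3, 32, 32);               // N, C, H, W
*   Shape<4> nhwc = ConvertLayout(nchw, kNCHW, kNHWC);  // (1, 32, 32, 3)
*   Shape<4> back = ConvertLayout(nhwc, kNHWC, kNCHW);  // (1, 3, 32, 32)
*/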
/*!
* \brief Convert shape in src_layout to shape in dst_layout
* \param src original shape
* \param src_layout layout of original shape
* \param dst_layout target layout
* \return shape in target layout
*/
inline Shape<5> ConvertLayout(const Shape<5>& src, int src_layout, int dst_layout) {
Shape<5> dst;
switch (src_layout) {
case kNCDHW:
dst = src;
break;
case kNDHWC:
dst[0] = src[0];
dst[2] = src[1];
dst[3] = src[2];
dst[4] = src[3];
dst[1] = src[4];
break;
default:
LOG(FATAL) << "Invalid layout for 5d shape " << src_layout;
}
Shape<5> dst2;
switch (dst_layout) {
case kNCDHW:
return dst;
case kNDHWC:
dst2[0] = dst[0];
dst2[1] = dst[2];
dst2[2] = dst[3];
dst2[3] = dst[4];
dst2[4] = dst[1];
break;
default:
LOG(FATAL) << "Invalid layout for 5d shape " << src_layout;
}
return dst2;
}
/*!
* \brief returns axes of transpose operation
* that needs to be performed between src layout and dst
* \param src_layout input layout
* \param dst_layout output layout
* \return vector of required type describing axes of a transpose operation
*/
template <typename dim_t>
inline std::vector<dim_t> getTranspAxes(const LayoutFlag src_layout, const LayoutFlag dst_layout) {
auto apply = [](const std::vector<dim_t>& v, const std::vector<dim_t>& op) {
CHECK_EQ(v.size(), op.size()) << "Layout ndims does not match";
std::vector<dim_t> ret(v.size());
for (size_t i = 0; i < v.size(); i++) {
ret[i] = v[op[i]];
}
return ret;
};
std::vector<dim_t> axes;
// transpose from `case` to ND?H?WC
switch (src_layout) {
case kUNKNOWN:
LOG(FATAL) << "Unknown source layout";
break;
case kNHWC:
axes = std::vector<dim_t>({0, 1, 2, 3});
break;
case kNCHW:
axes = std::vector<dim_t>({0, 2, 3, 1});
break;
case kCHWN:
axes = std::vector<dim_t>({3, 1, 2, 0});
break;
case kNWC:
axes = std::vector<dim_t>({0, 1, 2});
break;
case kNCW:
axes = std::vector<dim_t>({0, 2, 1});
break;
case kCWN:
axes = std::vector<dim_t>({2, 1, 0});
break;
case kNDHWC:
axes = std::vector<dim_t>({0, 1, 2, 3, 4});
break;
case kNCDHW:
axes = std::vector<dim_t>({0, 2, 3, 4, 1});
break;
case kCDHWN:
axes = std::vector<dim_t>({4, 1, 2, 3, 0});
break;
default:
LOG(FATAL) << "Invalid source layout " << src_layout;
}
// transpose from ND?H?WC to `case`
switch (dst_layout) {
case kUNKNOWN:
LOG(FATAL) << "Unknown destination layout";
break;
case kNHWC:
axes = apply(axes, {0, 1, 2, 3});
break;
case kNCHW:
axes = apply(axes, {0, 3, 1, 2});
break;
case kCHWN:
axes = apply(axes, {3, 1, 2, 0});
break;
case kNWC:
axes = apply(axes, {0, 1, 2});
break;
case kNCW:
axes = apply(axes, {0, 2, 1});
break;
case kCWN:
axes = apply(axes, {2, 1, 0});
break;
case kNDHWC:
axes = apply(axes, {0, 1, 2, 3, 4});
break;
case kNCDHW:
axes = apply(axes, {0, 4, 1, 2, 3});
break;
case kCDHWN:
axes = apply(axes, {4, 1, 2, 3, 0});
break;
default:
LOG(FATAL) << "Invalid destination layout " << src_layout;
}
return axes;
}
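/*
* Usage sketch (illustrative, not part of the original header): the returned
* vector is the axes argument of a transpose mapping src_layout to dst_layout,
* i.e. out[i] = in[axes[i]].
*
*   std::vector<int> axes = getTranspAxes<int>(kNCHW, kNHWC);  // {0, 2, 3, 1}
*   // transposing an NCHW array with axes (0, 2, 3, 1) yields NHWC
*/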
/*!
* \brief computation stream structure, used for asynchronous computations
*/
template<typename Device>
struct Stream {
// this is only a dummy implementation for CPU
// for GPU, the actual implementation will be specialized in tensor_gpu-inl.h
/*!
* \brief wait for all the computations associated
* with this stream to complete
*/
inline void Wait(void) {}
/*!
* \brief query whether the stream is idle
* \return true if the stream is idle and all the jobs have been completed
*/
inline bool CheckIdle(void) {
return true;
}
/*! \brief create a blas handle */
inline void CreateBlasHandle() {}
};
/*!
* \brief Tensor RValue, this is the super type of all kinds of possible tensors
* \tparam Container the tensor type
* \tparam Device which device the tensor is on
* \tparam dimension dimension of the tensor
* \tparam DType the type of elements in the tensor
*/
template<typename Container, typename Device, int dimension, typename DType>
struct TRValue: public expr::RValueExp<Container, DType> {
};
// more compact template
/*!
* \brief general tensor
* \tparam Device which device the tensor is on
* \tparam dimension dimension of the tensor
* \tparam DType the type of elements in the tensor
*/
template<typename Device, int dimension,
typename DType MSHADOW_DEFAULT_DTYPE>
struct Tensor: public TRValue<Tensor<Device, dimension, DType>,
Device, dimension, DType> {
public:
//--------------------------------
// struct members
//--------------------------------
/*! \brief whether current type lies in cpu */
static const bool kDevCPU = Device::kDevCPU;
/*! \brief dimension of subtype */
static const int kSubdim = dimension - 1;
//--------------------------------
// struct members
//--------------------------------
/*! \brief pointer to the data */
DType *dptr_ = nullptr;
/*! \brief shape of the tensor */
Shape<dimension> shape_;
/*!
* \brief storing the stride information in x dimension
* this is used to deal with pitch allocation in gpu or sse (aligning the x dimension to 64 bit) for efficiency
*/
index_t stride_;
/*!
* \brief stream where the computation lies;
* stream is a device-dependent concept on which each computation is carried out
*/
Stream<Device> *stream_;
//--------------------------------
// functions
//--------------------------------
/*! \brief default constructor */
MSHADOW_XINLINE Tensor(void) : stream_(NULL) {}
/*! \brief constructor from shape */
MSHADOW_XINLINE Tensor(const Shape<dimension> &shape)
: shape_(shape), stream_(NULL) {}
/*! \brief constructor from data pointer and shape, without stride */
MSHADOW_XINLINE Tensor(DType *dptr, const Shape<dimension> &shape)
: dptr_(dptr), shape_(shape), stride_(shape[kSubdim]), stream_(NULL) {}
/*! \brief constructor from data pointer, shape and stream, without stride */
MSHADOW_XINLINE Tensor(DType *dptr, const Shape<dimension> &shape,
Stream<Device> *stream)
: dptr_(dptr), shape_(shape), stride_(shape[kSubdim]), stream_(stream) {}
/*! \brief constructor from data pointer and shape */
MSHADOW_XINLINE Tensor(DType *dptr,
const Shape<dimension> &shape,
index_t stride, Stream<Device> *stream)
: dptr_(dptr), shape_(shape), stride_(stride), stream_(stream) {}
/*!
* \brief set the stream to do computation of current tensor
* \param stream the computation stream
*/
inline void set_stream(Stream<Device> *stream) {
this->stream_ = stream;
}
/*!
* \return memory cost of the tensor, including the aligned x dimension
* \tparam startdim the starting dimension
*/
template<int startdim>
MSHADOW_XINLINE index_t MemSize(void) const {
index_t memsz = this->stride_;
#pragma unroll
for (int i = startdim; i < kSubdim; ++i) {
memsz *= this->shape_[i];
}
return memsz;
}
/*!
* \return whether the tensor's memory is contiguous,
* i.e. the size of the lowest dimension equals stride_
*/
MSHADOW_XINLINE bool CheckContiguous(void) const {
return this->shape_[dimension - 1] == stride_;
}
/*!
* \return memory cost of the tensor, including the aligned x dimension
*/
MSHADOW_XINLINE index_t MSize(void) const {
return this->MemSize<0>();
}
/*!
* \brief return size of i-th dimension, counting from the highest dimension
* \param idx the dimension index, counted from the highest dimension
* \return the size
*/
MSHADOW_XINLINE index_t size(int idx) const {
return shape_[idx];
}
/*!
* \brief flatten the tensor to 1 dimension
* \return tensor after flatten
*/
MSHADOW_XINLINE Tensor<Device, 1, DType> FlatTo1D(void) const {
return Tensor<Device, 1, DType>(dptr_, shape_.FlatTo1D(), stride_, stream_);
}
/*!
* \brief flatten the tensor to 2 dimension, collapse the higher dimensions together
* \return tensor after flatten
*/
MSHADOW_XINLINE Tensor<Device, 2, DType> FlatTo2D(void) const {
return Tensor<Device, 2, DType>(dptr_, shape_.FlatTo2D(), stride_, stream_);
}
/*!
* \brief get a sub-tensor of dimension - 1 at the given index
* \param idx index
* \return the result tensor
*/
MSHADOW_XINLINE Tensor<Device, kSubdim, DType> operator[](index_t idx) const {
return Tensor<Device, kSubdim, DType>(dptr_ + this->MemSize<1>() * idx,
shape_.SubShape(), stride_, stream_);
}
/*!
* \brief slice the tensor in highest dimension [begin,end)
* \param begin begin position of slice
* \param end end position of slice
* \return tensor after slice
*/
MSHADOW_XINLINE Tensor<Device, dimension, DType>
Slice(index_t begin, index_t end) const {
Shape<dimension> s = this->shape_;
s[0] = end - begin;
return Tensor<Device, dimension, DType>(dptr_ + this->MemSize<1>() * begin,
s, stride_, stream_);
}
/*!\brief implement the assignment of same type */
inline Tensor<Device, dimension, DType> &
operator=(const Tensor<Device, dimension, DType> &exp) {
dptr_ = exp.dptr_;
shape_ = exp.shape_;
stride_ = exp.stride_;
stream_ = exp.stream_;
return *this;
}
/*!\brief functions to fit expression template */
template<typename E, int etype>
inline Tensor<Device, dimension, DType> &
operator=(const expr::Exp<E, DType, etype> &exp) {
return this->__assign(exp);
}
/*!\brief functions to fit expression template */
inline Tensor<Device, dimension, DType> &operator=(const DType &exp) {
return this->__assign(exp);
}
};
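/*
* Usage sketch (illustrative, not part of the original header): Tensor is a
* non-owning view over user-provided memory, so no allocation happens here.
*
*   float data[6] = {0, 1, 2, 3, 4, 5};
*   Tensor<cpu, 2, float> t(data, Shape2(2, 3));  // stride_ = 3, contiguous
*   Tensor<cpu, 1, float> row = t[1];             // view of {3, 4, 5}
*   Tensor<cpu, 2, float> top = t.Slice(0, 1);    // 1 x 3 view of the first row
*   bool ok = t.CheckContiguous();                // true: shape_[1] == stride_
*/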
/*
* specialization for 1D tensor; this is needed because operator[] returns an element instead of a sub-tensor
*/
template<typename Device, typename DType>
struct Tensor<Device, 1, DType>:
public TRValue<Tensor<Device, 1, DType>, Device, 1, DType> {
public:
DType *dptr_;
Shape<1> shape_;
index_t stride_;
Stream<Device> *stream_;
// constructor
MSHADOW_XINLINE Tensor(void) : stream_(NULL) {}
MSHADOW_XINLINE Tensor(const Shape<1> &shape)
: shape_(shape), stream_(NULL) {}
MSHADOW_XINLINE Tensor(DType *dptr, Shape<1> shape)
: dptr_(dptr), shape_(shape), stride_(shape[0]), stream_(NULL) {}
MSHADOW_XINLINE Tensor(DType *dptr, Shape<1> shape, Stream<Device> *stream)
: dptr_(dptr), shape_(shape), stride_(shape[0]), stream_(stream) {}
MSHADOW_XINLINE Tensor(DType *dptr, Shape<1> shape,
index_t stride, Stream<Device> *stream)
: dptr_(dptr), shape_(shape), stride_(stride), stream_(stream) {}
inline void set_stream(Stream<Device> *stream) {
this->stream_ = stream;
}
MSHADOW_XINLINE Tensor<Device, 1, DType> FlatTo1D(void) const {
return *this;
}
MSHADOW_XINLINE Tensor<Device, 2, DType> FlatTo2D(void) const {
return Tensor<Device, 2, DType>(dptr_, shape_.FlatTo2D(), stride_, stream_);
}
MSHADOW_XINLINE Tensor<Device, 1, DType> Slice(index_t begin, index_t end) const {
Shape<1> s;
s[0] = end - begin;
return Tensor<Device, 1, DType>(dptr_ + begin, s, s[0], stream_);
}
MSHADOW_XINLINE bool CheckContiguous(void) const {
return true;
}
MSHADOW_XINLINE index_t MSize(void) const {
return shape_[0];
}
MSHADOW_XINLINE index_t size(index_t i) const {
return shape_[0];
}
MSHADOW_XINLINE DType &operator[](index_t idx) {
return dptr_[idx];
}
MSHADOW_XINLINE const DType &operator[](index_t idx) const {
return dptr_[idx];
}
/*!\brief implement the assignment of same type */
inline Tensor<Device, 1, DType> &
operator=(const Tensor<Device, 1, DType> &exp) {
dptr_ = exp.dptr_;
shape_ = exp.shape_;
stride_ = exp.stride_;
stream_ = exp.stream_;
return *this;
}
template<typename E, int etype>
inline Tensor<Device, 1, DType> &
operator=(const expr::Exp<E, DType, etype> &exp) {
return this->__assign(exp);
}
inline Tensor<Device, 1, DType> &operator=(const DType &exp) {
return this->__assign(exp);
}
};
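/*
* Usage sketch (illustrative, not part of the original header): unlike the
* general case, operator[] of a 1D tensor returns an element reference.
*
*   float buf[4] = {1, 2, 3, 4};
*   Tensor<cpu, 1, float> v(buf, Shape1(4));
*   v[2] = 9;                                   // element access, not a sub-tensor
*   Tensor<cpu, 1, float> mid = v.Slice(1, 3);  // view of {2, 9}
*/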
//------------------------
// Function Declarations
//-----------------------
/*!
* \brief initialize tensor engine, used to call initialization functions of dependent libs;
* this function should be called before any GPU tensor operation;
* for CPU-only tensor use, this call is not needed
* \param device_id GPU device id to be chosen
* \tparam Device the device type
*/
template<typename Device>
inline void InitTensorEngine(int device_id = 0);
/*!
* \brief Shutdown tensor engine on current device
* this function should be called after all GPU tensor operations;
* for CPU-only tensor use, this call is not needed
* \tparam Device the device type
*/
template<typename Device>
inline void ShutdownTensorEngine(void);
/*!
* \brief set the device of current thread to work on
* \param devid the device id
* \tparam Device the device type
*/
template<typename Device>
inline void SetDevice(int devid);
/*!
* \brief create a new stream from system
* \param create_blas_handle whether to create blas & cusolver handles in the stream
* \param create_dnn_handle whether to create a cudnn handle in the stream
* \param dev_id device id
* \return a pointer to the created stream
* \tparam Device the device type
*/
template<typename Device>
inline Stream<Device> *NewStream(bool create_blas_handle,
bool create_dnn_handle,
int dev_id = -1);
/*! \brief default behavior: create cublas handle
* \param dev_id device id
* \return a pointer to the created stream
*/
template<typename Device>
inline Stream<Device> *NewStream(int dev_id) {
return NewStream<Device>(true, false, dev_id);
}
/*!
* \brief delete the computing stream
* \param stream the stream parameter to be deleted
*/
template<typename Device>
inline void DeleteStream(Stream<Device> *stream);
/*!
* \brief CPU/GPU: allocate space for CTensor, according to the shape in the obj
* this function is responsible for setting the stride_ in each obj.shape
* \param obj the tensor object, with shape specified
* \param pad whether to pad the lowest dimension so that the last dimension is aligned;
* padding may help improve efficiency of matrix multiplications;
* if true, will allocate space with a stride_ that may not equal the last dimension of shape;
* if false, will allocate contiguous space
* \tparam dim specify the dim of tensor
* \tparam DType type of element in tensor
*/
template<int dim, typename DType>
inline void AllocSpace(Tensor<cpu, dim, DType> *obj,
bool pad = MSHADOW_ALLOC_PAD);
/*!
* \brief CPU/GPU: allocate space for CTensor, according to the shape in the obj
* this function is responsible for setting the stride_ in each obj.shape
* \param obj the tensor object, with shape specified
* \param pad whether to pad the lowest dimension so that the last dimension is aligned;
* padding may help improve efficiency of matrix multiplications;
* if true, will allocate space with a stride_ that may not equal the last dimension of shape;
* if false, will allocate contiguous space
* \tparam dim specify the dim of tensor
* \tparam DType type of element in tensor
*/
template<int dim, typename DType>
inline void AllocSpace(Tensor<gpu, dim, DType> *obj,
bool pad = MSHADOW_ALLOC_PAD);
/*!
* \brief CPU/GPU: free the space of tensor, will set obj.dptr to NULL
* \param obj the tensor object
* \tparam dim specify the dim of tensor
* \tparam DType type of element in tensor
*/
template<int dim, typename DType>
inline void FreeSpace(Tensor<cpu, dim, DType> *obj);
/*!
* \brief CPU/GPU: free the space of tensor, will set obj.dptr to NULL
* \param obj the tensor object
* \tparam dim specify the dim of tensor
* \tparam DType type of element in tensor
*/
template<int dim, typename DType>
inline void FreeSpace(Tensor<gpu, dim, DType> *obj);
/*!
* \brief CPU/GPU: short cut to allocate and initialize a Tensor
* \param shape shape of tensor
* \param initv initialization value
* \param pad padding option
* \param stream stream of tensor
* \tparam Device device of tensor
* \tparam DType type of element in tensor
* \tparam dim dimension of tensor
* \return a new allocated tensor
* \sa AllocSpace
*/
template<typename Device, typename DType, int dim>
inline Tensor<Device, dim, DType> NewTensor(const Shape<dim> &shape,
DType initv,
bool pad = MSHADOW_ALLOC_PAD,
Stream<Device> *stream = NULL);
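/*
* Usage sketch (illustrative, not part of the original header): the
* allocate/use/free pairing implied by the declarations above; the actual
* implementations live in the device-specific headers of this library.
*
*   Tensor<cpu, 2, float> t = NewTensor<cpu>(Shape2(4, 8), 0.0f);  // allocate, fill with 0
*   // ... use t ...
*   FreeSpace(&t);  // release the memory; t.dptr_ becomes NULL
*/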
/*!
* \brief copy data from one tensor to another, with same shape
* \param dst target tensor
* \param src source tensor
* \param stream the stream; when specified, the copy can be asynchronous
* \tparam dim specify the dim of tensor
* \tparam DType type of element in tensor
*/
template<int dim, typename DType>
inline void Copy(Tensor<cpu, dim, DType> dst,
const Tensor<cpu, dim, DType> &src,
Stream<cpu> *stream = NULL);
/*!
* \brief copy data from one tensor to another, with same shape
* \param dst target tensor
* \param src source tensor
* \param stream the stream; when specified, the copy can be asynchronous
* \tparam dim specify the dim of tensor
* \tparam DType type of element in tensor
*/
template<int dim, typename DType>
inline void Copy(Tensor<cpu, dim, DType> dst,
const Tensor<gpu, dim, DType> &src,
Stream<gpu> *stream = NULL);
/*!
* \brief copy data from one tensor to another, with same shape
* \param dst target tensor
* \param src source tensor
* \param stream the stream; when specified, the copy can be asynchronous
* \tparam dim specify the dim of tensor
* \tparam DType type of element in tensor
*/
template<int dim, typename DType>
inline void Copy(Tensor<gpu, dim, DType> dst,
const Tensor<cpu, dim, DType> &src,
Stream<gpu> *stream = NULL);
/*!
* \brief copy data from one tensor to another, with same shape
* \param dst target tensor
* \param src source tensor
* \param stream the stream; when specified, the copy can be asynchronous
* \tparam dim specify the dim of tensor
* \tparam DType type of element in tensor
*/
template<int dim, typename DType>
inline void Copy(Tensor<gpu, dim, DType> dst,
const Tensor<gpu, dim, DType> &src,
Stream<gpu> *stream = NULL);
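/*
* Usage sketch (illustrative, not part of the original header): a host/device
* round trip, assuming the library is built with CUDA support and
* InitTensorEngine<gpu>() has been called.
*
*   Tensor<cpu, 2, float> host = NewTensor<cpu>(Shape2(2, 3), 1.0f);
*   Tensor<gpu, 2, float> dev  = NewTensor<gpu>(Shape2(2, 3), 0.0f);
*   Copy(dev, host);   // host -> device; synchronous when stream is NULL
*   Copy(host, dev);   // device -> host
*   FreeSpace(&dev);
*   FreeSpace(&host);
*/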
/*!
* \brief CPU/GPU: normalize softmax: dst[i][j] = exp(energy[i][j]) /(sum_j exp(energy[i][j]))
* \param dst destination
* \param energy input energy
*/
template<typename DType>
inline void Softmax(Tensor<cpu, 2, DType> dst, const Tensor<cpu, 2, DType> &energy);
/*!
* \brief CPU/GPU: normalize softmax: dst[i][j] = exp(energy[i][j]) /(sum_j exp(energy[i][j]))
* \param dst destination
* \param energy input energy
*/
template<typename DType>
inline void Softmax(Tensor<gpu, 2, DType> dst, const Tensor<gpu, 2, DType> &energy);
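/*
* Usage sketch (illustrative, not part of the original header): Softmax
* normalizes each row of a 2D tensor so that it sums to 1.
*
*   Tensor<cpu, 2, float> energy = NewTensor<cpu>(Shape2(4, 10), 0.0f);
*   Tensor<cpu, 2, float> prob   = NewTensor<cpu>(Shape2(4, 10), 0.0f);
*   Softmax(prob, energy);  // prob[i][j] = exp(energy[i][j]) / sum_j exp(energy[i][j])
*/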
/*!
* \brief CPU/GPU: softmax gradient
* \param dst destination
* \param src source output
* \param label label info
*/
template<typename DType>
inline void SoftmaxGrad(Tensor<cpu, 2, DType> dst,
const Tensor<cpu, 2, DType> &src,
const Tensor<cpu, 1, DType> &label);
/*!
* \brief CPU/GPU: softmax gradient
* \param dst destination
* \param src source output
* \param label label info
*/
template<typename DType>
inline void SoftmaxGrad(const Tensor<gpu, 2, DType> &dst,
const Tensor<gpu, 2, DType> &src,
const Tensor<gpu, 1, DType> &label);
/*!
* \brief CPU/GPU: Gradient accumulation of embedding matrix:
dst[index[i]] += src[i]
Called when the feature dim of src is much larger than the batch size
* \param dst destination
* \param index index to take
* \param src source output
*/
template<bool clip = true, typename IndexType, typename DType>
inline void AddTakeGrad(Tensor<cpu, 2, DType> dst,
const Tensor<cpu, 1, IndexType>& index,
const Tensor<cpu, 2, DType> &src);
/*!
* \brief CPU/GPU: Gradient accumulation of embedding matrix with safe accumulation:
dst[index[i]] += src[i]
Called when the feature dim of src is much larger than the batch size
* \param dst destination
* \param temp temporary storage for safe accumulation
* \param index index to take
* \param src source output
*/
template<bool clip = true, typename IndexType, typename DType, typename AType>
inline void AddTakeGrad(Tensor<cpu, 2, DType> dst,
Tensor<cpu, 2, AType> temp,
const Tensor<cpu, 1, IndexType>& index,
const Tensor<cpu, 2, DType> &src);
/*!
* \brief CPU/GPU: Gradient accumulation of embedding matrix:
dst[index[i]] += src[i]
Called when the feature dim of src is much larger than the batch size
* \param dst destination
* \param index index to take
* \param src source output
*/
template<bool clip = true, typename IndexType, typename DType>
inline void AddTakeGrad(Tensor<gpu, 2, DType> dst,
const Tensor<gpu, 1, IndexType>& index,
const Tensor<gpu, 2, DType> &src);
/*!
* \brief CPU/GPU: Gradient accumulation of embedding matrix with safe accumulation:
dst[index[i]] += src[i]
* \param dst destination
* \param temp temporary storage for safe accumulation
* \param index index to take
* \param src source output
*/
template<bool clip = true, typename IndexType, typename DType, typename AType>
inline void AddTakeGrad(Tensor<gpu, 2, DType> dst,
Tensor<gpu, 2, AType> temp,
const Tensor<gpu, 1, IndexType>& index,
const Tensor<gpu, 2, DType> &src);
/*!
* \brief CPU/GPU: Gradient accumulation of embedding matrix:
dst[sorted[i]] += src[index[i]]
Called when the batch size of src is larger than the feature dim
* \param dst destination
* \param sorted the sorted indices
* \param index original index of the sorted indices
* \param src source output
*/
template<typename IndexType, typename DType>
inline void AddTakeGradLargeBatch(Tensor<cpu, 2, DType> dst,
const Tensor<cpu, 1, IndexType>& sorted,
const Tensor<cpu, 1, IndexType>& index,
const Tensor<cpu, 2, DType> &src);
/*!
* \brief CPU/GPU: Gradient accumulation of embedding matrix.