Revert PR 17767 for fixing GPU memory usage regression (#18283) (#18309)

* Revert "Fix and optimize handling of vectorized memory accesses (#17767)"

  This reverts commit 5542d03.

* add license to reverted file
apache · May 29, 2020 · d621e50 · d621e50
1 parent ac3e71b
commit d621e50
Show file tree

Hide file tree

Showing 19 changed files with 464 additions and 1,344 deletions.
diff --git a/3rdparty/mshadow/mshadow/base.h b/3rdparty/mshadow/mshadow/base.h
@@ -295,6 +295,7 @@ extern "C" {
   }
 
 #include "./half.h"
+#include "./half2.h"
 #include "./bfloat.h"
 #define MSHADOW_HALF_BF_OPERATOR(RTYPE, OP)                                               \
   MSHADOW_XINLINE RTYPE operator OP(mshadow::half::half_t a, mshadow::bfloat::bf16_t b) { \
@@ -409,6 +410,11 @@ struct DataType<half::half_t> {
 #endif
 };
 template<>
+struct DataType<half::half2_t> {  // type traits for the two-element half2_t vector type
+  static const int kFlag = kFloat16;  // half2_t reports the float16 type flag
+  static const int kLanes = 2;  // one half2_t element carries two values
+};
+template<>
 struct DataType<bfloat::bf16_t> {
   static const int kFlag = kBfloat16;
   static const int kLanes = 1;
@@ -1161,6 +1167,48 @@ struct minimum {
   }
 #endif
 
+#define MSHADOW_TYPE_SWITCH_WITH_HALF2(type, DType, ...)  /* runs __VA_ARGS__ with DType bound */ \
+  switch (type) {  /* `type` is matched against the mshadow::k* type flags below */ \
+  case mshadow::kFloat32:                                 \
+    {                                                     \
+      typedef float DType;                                \
+      {__VA_ARGS__}                                       \
+    }                                                     \
+    break;                                                \
+  case mshadow::kFloat64:                                 \
+    {                                                     \
+      typedef double DType;                               \
+      {__VA_ARGS__}                                       \
+    }                                                     \
+    break;                                                \
+  case mshadow::kFloat16:  /* float16 selects the packed half2_t, not half_t */ \
+    {                                                     \
+      typedef mshadow::half::half2_t DType;               \
+      {__VA_ARGS__}                                       \
+    }                                                     \
+    break;                                                \
+  case mshadow::kUint8:                                   \
+    {                                                     \
+      typedef uint8_t DType;                              \
+      {__VA_ARGS__}                                       \
+    }                                                     \
+    break;                                                \
+  case mshadow::kInt32:                                   \
+    {                                                     \
+      typedef int32_t DType;                              \
+      {__VA_ARGS__}                                       \
+    }                                                     \
+    break;                                                \
+  case mshadow::kInt64:                                   \
+    {                                                     \
+      typedef int64_t DType;                              \
+      {__VA_ARGS__}                                       \
+    }                                                     \
+    break;                                                \
+  default:  /* flags without a case above are reported through LOG(FATAL) */ \
+    LOG(FATAL) << "Unknown type enum " << type;           \
+  }
+
 #define MSHADOW_SGL_DBL_TYPE_SWITCH(type, DType, ...)  \
   switch (type) {                                      \
   case mshadow::kFloat32:                              \

diff --git a/3rdparty/mshadow/mshadow/half2.h b/3rdparty/mshadow/mshadow/half2.h
@@ -0,0 +1,162 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ *  Copyright (c) 2017 by Contributors
+ * \file half2.h
+ * \brief definition of vector float16, half2 type.
+ *
+ * \author Antti-Pekka Hynninen
+ */
+#ifndef MSHADOW_HALF2_H_
+#define MSHADOW_HALF2_H_
+
+#if (defined(__CUDACC__) && __CUDA_ARCH__ >= 530 && MSHADOW_USE_CUDA && CUDA_VERSION >= 7050)
+  #define MSHADOW_CUDA_HALF2 1  // half2_t wraps the CUDA half2 type and uses its intrinsics
+  #include <cuda_fp16.h>
+#else
+  #define MSHADOW_CUDA_HALF2 0  // half2_t falls back to an array of two half_t values
+#endif
+
+#include<math.h>
+
+/*! \brief namespace for mshadow */
+namespace mshadow {
+/*! \brief namespace for host/device portable half-precision floats */
+namespace half {
+
+#define MSHADOW_HALF2_ASSIGNOP(AOP, OP)  /* member `operator AOP` built on binary `OP` */ \
+  template<typename T>                                                    \
+  MSHADOW_XINLINE half2_t operator AOP (const T& a) {                     \
+    return *this = half2_t(*this OP a);  /* NOLINT(*)*/                   \
+  }                                                                       \
+
+class MSHADOW_ALIGNED(4) half2_t {  // pair of half-precision values operated on lane by lane
+ public:
+#if MSHADOW_CUDA_HALF2
+  half2 half2_;  // storage when the CUDA half2 type is available
+#else
+  half_t half_t2[2];  // fallback storage: element 0 and element 1
+#endif
+
+  MSHADOW_XINLINE half2_t() {}  // performs no explicit initialization of the stored pair
+
+#if MSHADOW_CUDA_HALF2
+  MSHADOW_XINLINE explicit half2_t(half2 a) : half2_(a) {}  // wraps an existing CUDA half2
+#else
+  MSHADOW_XINLINE explicit half2_t(half_t a, half_t b) {  // a -> element 0, b -> element 1
+    half_t2[0] = a;
+    half_t2[1] = b;
+  }
+#endif
+
+  MSHADOW_XINLINE explicit half2_t(int a) {  // stores the same integer value in both lanes
+#if MSHADOW_CUDA_HALF2
+    half2_ = __half2half2(__int2half_rz(a));  // int -> half, then duplicated into both lanes
+#else
+    half_t2[0] = (half_t)a;
+    half_t2[1] = (half_t)a;
+#endif
+  }
+
+  MSHADOW_XINLINE half2_t operator+() {  // unary plus: returns a copy of this value
+    return *this;
+  }
+
+  MSHADOW_XINLINE half2_t operator-() {  // unary minus: negates both lanes
+#if MSHADOW_CUDA_HALF2
+    return half2_t(__hneg2(half2_));
+#else
+    return half2_t(-half_t2[0], -half_t2[1]);
+#endif
+  }
+
+  MSHADOW_XINLINE half2_t& operator=(const half2_t& a) {  // copies both lanes from a
+#if MSHADOW_CUDA_HALF2
+    half2_ = a.half2_;
+#else
+    half_t2[0] = a.half_t2[0];
+    half_t2[1] = a.half_t2[1];
+#endif
+    return *this;  // yield the assigned-to object by reference, not a by-value copy of a
+  }
+
+  MSHADOW_HALF2_ASSIGNOP(+=, +)
+  MSHADOW_HALF2_ASSIGNOP(-=, -)
+  MSHADOW_HALF2_ASSIGNOP(*=, *)
+  MSHADOW_HALF2_ASSIGNOP(/=, /)
+};
+
+/*! \brief lane-wise addition of two half2_t values */
+MSHADOW_XINLINE half2_t operator+(half2_t a, half2_t b) {
+#if MSHADOW_CUDA_HALF2  // CUDA path: add each lane as float, repack with __floats2half2_rn
+  return half2_t(__floats2half2_rn(__low2float(a.half2_) + __low2float(b.half2_),
+                                   __high2float(a.half2_) + __high2float(b.half2_)));
+#else  // fallback: add the two half_t elements independently
+  return half2_t(a.half_t2[0] + b.half_t2[0], a.half_t2[1] + b.half_t2[1]);
+#endif
+}
+/*! \brief lane-wise subtraction (a - b) of two half2_t values */
+MSHADOW_XINLINE half2_t operator-(half2_t a, half2_t b) {
+#if MSHADOW_CUDA_HALF2  // CUDA path: subtract each lane as float, repack with __floats2half2_rn
+  return half2_t(__floats2half2_rn(__low2float(a.half2_) - __low2float(b.half2_),
+                                   __high2float(a.half2_) - __high2float(b.half2_)));
+#else  // fallback: subtract the two half_t elements independently
+  return half2_t(a.half_t2[0] - b.half_t2[0], a.half_t2[1] - b.half_t2[1]);
+#endif
+}
+/*! \brief lane-wise multiplication of two half2_t values */
+MSHADOW_XINLINE half2_t operator*(half2_t a, half2_t b) {
+#if MSHADOW_CUDA_HALF2  // CUDA path: multiply each lane as float, repack with __floats2half2_rn
+  return half2_t(__floats2half2_rn(__low2float(a.half2_) * __low2float(b.half2_),
+                                   __high2float(a.half2_) * __high2float(b.half2_)));
+#else  // fallback: multiply the two half_t elements independently
+  return half2_t(a.half_t2[0] * b.half_t2[0], a.half_t2[1] * b.half_t2[1]);
+#endif
+}
+/*! \brief lane-wise division (a / b) of two half2_t values */
+MSHADOW_XINLINE half2_t operator/(half2_t a, half2_t b) {
+#if MSHADOW_CUDA_HALF2  // CUDA path: divide each lane as float, repack with __floats2half2_rn
+  return half2_t(__floats2half2_rn(__low2float(a.half2_) / __low2float(b.half2_),
+                                   __high2float(a.half2_) / __high2float(b.half2_)));
+#else  // fallback: divide the two half_t elements independently
+  return half2_t(a.half_t2[0] / b.half_t2[0], a.half_t2[1] / b.half_t2[1]);
+#endif
+}
+/*! \brief lane-wise remainder of a by b for half2_t, computed with ::fmod */
+MSHADOW_XINLINE half2_t operator%(half2_t a, half2_t b) {
+#if MSHADOW_CUDA_HALF2  // CUDA path: ::fmod on each lane as float, repack with __floats2half2_rn
+  return half2_t(__floats2half2_rn(::fmod(__low2float(a.half2_), __low2float(b.half2_)),
+                                   ::fmod(__high2float(a.half2_), __high2float(b.half2_))));
+#else  // fallback: ::fmod applied to each pair of half_t elements
+  return half2_t(::fmod(a.half_t2[0], b.half_t2[0]), ::fmod(a.half_t2[1], b.half_t2[1]));
+#endif
+}
+/*! \brief equality test for half2_t; the non-CUDA path requires both elements to be equal */
+MSHADOW_XINLINE bool operator==(half2_t a, half2_t b) {
+#if MSHADOW_CUDA_HALF2  // CUDA path: the comparison is delegated to the __hbeq2 intrinsic
+  return __hbeq2(a.half2_, b.half2_);
+#else  // fallback: compare element 0 and element 1 separately and AND the results
+  return (a.half_t2[0] == b.half_t2[0] && a.half_t2[1] == b.half_t2[1]);
+#endif
+}
+
+}  // namespace half
+}  // namespace mshadow
+#endif  // MSHADOW_HALF2_H_