Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 12 additions & 3 deletions python/singa/tensor.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,14 +233,23 @@ def reset_like(self, t):
self.device = t.device
self.dtype = t.dtype

def as_type(self, dtype):
    '''Change the data type of this tensor in place.

    Args:
        dtype: the target type, either a SINGA type constant
            (singa.kInt or singa.kFloat32) or the string 'int' or
            'float'.

    Raises:
        TypeError: if dtype is none of the accepted values.
    '''
    if dtype == singa.kInt:
        pass
    elif dtype == singa.kFloat32:
        pass
    elif dtype == 'int':
        dtype = singa.kInt
    elif dtype == 'float':
        dtype = singa.kFloat32
    else:
        raise TypeError("invalid data type %s" % dtype)
    self.data.AsType(dtype)
    # Keep the Python-level dtype attribute in sync with the underlying
    # C++ tensor (reset_like() maintains the same invariant).
    self.dtype = dtype

def to_device(self, device):
'''Move the tensor data onto a given device.
Expand Down
22 changes: 22 additions & 0 deletions src/core/tensor/math_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,20 @@ __global__ void KernelAbs(const size_t n, const float *in, float *out) {
}
}

// Element-wise cast of `n` floats in `in` to ints in `out` (C-style
// truncation toward zero). Grid-stride loop, so any launch configuration
// is correct; size_t indices avoid the signed/unsigned mismatch and
// overflow of the previous `int` counter when n > INT_MAX.
__global__ void KernelCastFloat2Int(const size_t n, const float *in, int *out) {
  for (size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
       i < n; i += static_cast<size_t>(blockDim.x) * gridDim.x) {
    out[i] = int(in[i]);
  }
}

// Element-wise cast of `n` ints in `in` to floats in `out`.
// Grid-stride loop, so any launch configuration is correct; size_t
// indices avoid the signed/unsigned mismatch and overflow of the
// previous `int` counter when n > INT_MAX.
__global__ void KernelCastInt2Float(const size_t n, const int *in, float *out) {
  for (size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
       i < n; i += static_cast<size_t>(blockDim.x) * gridDim.x) {
    out[i] = float(in[i]);
  }
}

__global__ void KernelSoftplus(const size_t n, const float *in, float *out) {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
i += blockDim.x * gridDim.x) {
Expand Down Expand Up @@ -509,6 +523,14 @@ void abs(const size_t n, const float *in, float *out, cudaStream_t s) {
KernelAbs <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF, 0, s>>> (n, in, out);
}

// Host wrapper: launches KernelCastFloat2Int on stream `s` to cast `n`
// floats from `src` into ints in `dst`. Launch shape follows the file's
// convention (ceil(n / CU1DBLOCKF) blocks of CU1DBLOCKF threads).
void cast_float_2_int(const size_t n, const float *src, int *dst, cudaStream_t s) {
  KernelCastFloat2Int <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF, 0, s>>> (n, src, dst);
}

// Host wrapper: launches KernelCastInt2Float on stream `s` to cast `n`
// ints from `src` into floats in `dst`. Launch shape follows the file's
// convention (ceil(n / CU1DBLOCKF) blocks of CU1DBLOCKF threads).
void cast_int_2_float(const size_t n, const int *src, float *dst, cudaStream_t s) {
  KernelCastInt2Float <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF, 0, s>>> (n, src, dst);
}

void sign(const size_t n, const float *in, float *out, cudaStream_t s) {
KernelSign <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF, 0, s>>> (n, in, out);
}
Expand Down
4 changes: 4 additions & 0 deletions src/core/tensor/math_kernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,10 @@ void abs(const size_t n, const float *in, float *out, cudaStream_t s);
void sign(const size_t n, const float *in, float *out, cudaStream_t s);
void exp(const size_t n, const float *in, float *out, cudaStream_t s);
void ceil2(const size_t n, const float *in, float *out, cudaStream_t s);
void cast_float_2_int(const size_t n, const float *src, int *dst,
cudaStream_t s);
void cast_int_2_float(const size_t n, const int *src, float *dst,
cudaStream_t s);
void log(const size_t n, const float *in, float *out, cudaStream_t s);
void sqrt(const size_t n, const float *in, float *out, cudaStream_t s);
void square(const size_t n, const float *in, float *out, cudaStream_t s);
Expand Down
66 changes: 64 additions & 2 deletions src/core/tensor/tensor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -105,11 +105,73 @@ Tensor Resize(const Tensor &in, const Shape &shape) {
return out;
}

// Dispatches on the triple (left dtype, right dtype, language) and binds the
// typedefs LDType, RDType and Lang for the code passed in __VA_ARGS__.
// The three enum values are packed into one switch key by shifting each by
// _SwitchShift bits.  Only float<->int on kCuda and kCpp are handled; every
// other combination aborts via LOG(FATAL).
#define TYPE_TYPE_LANG_SWITCH(ldtype, LDType, rdtype, RDType, ltype, Lang, \
                              ...)                                         \
  do {                                                                     \
    const int _SwitchShift = 3;                                            \
    int _SwitchHash =                                                      \
        ((ldtype) << _SwitchShift * 2) + ((rdtype) << _SwitchShift) + (ltype); \
    switch (_SwitchHash) {                                                 \
      case (((kFloat32) << _SwitchShift * 2) + (kInt << _SwitchShift) +    \
            kCuda): {                                                      \
        typedef float LDType;                                              \
        typedef int RDType;                                                \
        typedef lang::Cuda Lang;                                           \
        { __VA_ARGS__ }                                                    \
        break;                                                             \
      }                                                                    \
      case (((kInt) << _SwitchShift * 2) + (kFloat32 << _SwitchShift) +    \
            kCuda): {                                                      \
        typedef int LDType;                                                \
        typedef float RDType;                                              \
        typedef lang::Cuda Lang;                                           \
        { __VA_ARGS__ }                                                    \
        break;                                                             \
      }                                                                    \
      case (((kFloat32) << _SwitchShift * 2) + (kInt << _SwitchShift) +    \
            kCpp): {                                                       \
        typedef float LDType;                                              \
        typedef int RDType;                                                \
        typedef lang::Cpp Lang;                                            \
        { __VA_ARGS__ }                                                    \
        break;                                                             \
      }                                                                    \
      case (((kInt) << _SwitchShift * 2) + (kFloat32 << _SwitchShift) +    \
            kCpp): {                                                       \
        typedef int LDType;                                                \
        typedef float RDType;                                              \
        typedef lang::Cpp Lang;                                            \
        { __VA_ARGS__ }                                                    \
        break;                                                             \
      }                                                                    \
      default:                                                             \
        LOG(FATAL) << "Unknown combination of left data type "             \
                   << DataType_Name(ldtype) << " and right data type "     \
                   << DataType_Name(rdtype) << " and language "            \
                   << LangType_Name(ltype);                                \
    }                                                                      \
  } while (0)

Tensor &Tensor::AsType(const DataType type) {
if (data_type_ != type) {
if (block_ != nullptr && block_->DecRefCount() == 0)
if (block_ != nullptr && block_->DecRefCount() == 0) {
auto offset = Product(shape_);
auto new_block_ =
device_->NewBlock((int)(Product(shape_) * SizeOf(type)));
TYPE_TYPE_LANG_SWITCH(
data_type_, LDType, type, RDType, device_->lang(), Lang, {
device_->Exec(
[this, new_block_, offset, type](Context *ctx) {
CastAsType<LDType, RDType, Lang>(this, new_block_, offset,
ctx);
},
{}, {});
});
device_->FreeBlock(block_);
block_ = device_->NewBlock((int)(Product(shape_) * SizeOf(type)));
block_ = new_block_;
} else {
block_ = device_->NewBlock((int)(Product(shape_) * SizeOf(type)));
}
data_type_ = type;
}
return *this;
Expand Down
5 changes: 5 additions & 0 deletions src/core/tensor/tensor_math.h
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,11 @@ void Abs(const Tensor &in, Tensor *out, Context *ctx) {
LOG(FATAL) << "Abs Not Implemented";
}

// Fallback for CastAsType: concrete (source type, destination type, language)
// combinations are supplied as template specializations elsewhere; any
// combination without a specialization aborts at runtime.
template <typename DTypeSrc, typename DTypeDst, typename Lang>
void CastAsType(const Tensor *src, Block *dst, int offset, Context *ctx) {
  LOG(FATAL) << "CastAsType Not Implemented";
}

template <typename DType, typename Lang>
void Ceil(const Tensor &in, Tensor *out, Context *ctx) {
LOG(FATAL) << "Ceil Not Implemented";
Expand Down
16 changes: 16 additions & 0 deletions src/core/tensor/tensor_math_cpp.h
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,22 @@ void Abs<float, lang::Cpp>(const Tensor &in, Tensor *out, Context *ctx) {
traverse_unary<float>(in, out, [](float x) { return fabs(x); });
}

// CPU cast: copies `offset` float elements from src into dst as ints,
// truncating toward zero.
template <>
void CastAsType<float, int, lang::Cpp>(const Tensor *src, Block *dst,
                                       int offset, Context *ctx) {
  const float *in = static_cast<const float *>(src->block()->data());
  int *out = static_cast<int *>(dst->mutable_data());
  for (int idx = 0; idx < offset; ++idx) {
    out[idx] = static_cast<int>(in[idx]);
  }
}

// CPU cast: copies `offset` int elements from src into dst as floats.
template <>
void CastAsType<int, float, lang::Cpp>(const Tensor *src, Block *dst,
                                       int offset, Context *ctx) {
  const int *in = static_cast<const int *>(src->block()->data());
  float *out = static_cast<float *>(dst->mutable_data());
  for (int idx = 0; idx < offset; ++idx) {
    out[idx] = static_cast<float>(in[idx]);
  }
}

template <>
void Ceil<float, lang::Cpp>(const Tensor &in, Tensor *out, Context *ctx) {
traverse_unary<float>(in, out, [](float x) { return std::ceil(x); });
Expand Down
17 changes: 17 additions & 0 deletions src/core/tensor/tensor_math_cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,23 @@ void Abs<float, lang::Cuda>(const Tensor& in, Tensor* out, Context* ctx) {
cudnnDestroyTensorDescriptor(in_desc);
}

// GPU cast: converts `offset` float elements of src into int elements of
// dst, asynchronously on ctx->stream.
template <>
void CastAsType<float, int, lang::Cuda>(const Tensor* src, Block* dst,
                                        int offset, Context* ctx) {
  const float* srcPtr = static_cast<const float*>(src->block()->data());
  int* dstPtr = static_cast<int*>(dst->mutable_data());
  // Use the `offset` element count like the int->float specialization does,
  // rather than re-deriving it from src->Size(); the caller passes the total
  // element count for both.
  cuda::cast_float_2_int(offset, srcPtr, dstPtr, ctx->stream);
}

// GPU cast: converts `offset` int elements of src into float elements of
// dst, asynchronously on ctx->stream.
template <>
void CastAsType<int, float, lang::Cuda>(const Tensor* src, Block* dst,
                                        int offset, Context* ctx) {
  const int* srcPtr = static_cast<const int*>(src->block()->data());
  float* dstPtr = static_cast<float*>(dst->mutable_data());
  cuda::cast_int_2_float(offset, srcPtr, dstPtr, ctx->stream);
}

template <>
void Set<float, lang::Cuda>(const float x, Tensor* out, Context* ctx) {
float* outPtr = static_cast<float*>(out->block()->mutable_data());
Expand Down
27 changes: 27 additions & 0 deletions test/python/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -605,6 +605,33 @@ def test_ceil(self):
np.testing.assert_array_almost_equal(
tensor.to_numpy(_cTensor_to_pyTensor(t2_ct)), np2)

def test_as_type(self):
    # Round trip float32 -> int32 -> float32 through the C tensor AsType API
    # and compare against numpy's astype at each step.
    src = np.random.random([3]).astype(np.float32) * 10 - 5
    as_int = src.astype(np.int32)
    back_to_float = as_int.astype(np.float32)

    for dev in [cpu_dev, gpu_dev]:
        ct = tensor.Tensor(device=dev, data=src).data
        self.assertEqual(ct.data_type(), singa_api.kFloat32)

        ct.AsType(singa_api.kInt)
        self.assertEqual(ct.data_type(), singa_api.kInt)
        np.testing.assert_array_almost_equal(
            tensor.to_numpy(_cTensor_to_pyTensor(ct)), as_int)

        ct.AsType(singa_api.kFloat32)
        self.assertEqual(ct.data_type(), singa_api.kFloat32)
        np.testing.assert_array_almost_equal(
            tensor.to_numpy(_cTensor_to_pyTensor(ct)), back_to_float)


if __name__ == '__main__':
unittest.main()
15 changes: 15 additions & 0 deletions test/python/test_tensor.py
Original file line number Diff line number Diff line change
Expand Up @@ -338,6 +338,21 @@ def test_ceil(self):

np.testing.assert_array_almost_equal(tensor.to_numpy(t2), np2)

def test_astype(self):
    # Round trip float32 -> int32 -> float32 through the string-based
    # as_type API and compare against numpy's astype at each step.
    for dev in [cpu_dev, gpu_dev]:
        shifted = np.random.random([5, 6, 7, 8]).astype(np.float32) * 10 - 5
        truncated = shifted.astype(np.int32)
        restored = truncated.astype(np.float32)

        t = tensor.Tensor(device=dev, data=shifted)
        t.as_type('int')
        np.testing.assert_array_almost_equal(tensor.to_numpy(t), truncated)
        t.as_type('float')
        np.testing.assert_array_almost_equal(tensor.to_numpy(t), restored)


if __name__ == '__main__':
unittest.main()
78 changes: 74 additions & 4 deletions test/singa/test_tensor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -68,11 +68,81 @@ TEST(TensorClass, Reshape) {
EXPECT_TRUE(o.shape() != t.shape());
}

TEST(TensorClass, AsType) {
Tensor t;
#ifdef USE_CUDA

TEST(TensorClass, FloatAsTypeIntCuda) {
  // A float32 tensor cast to kInt on the GPU must hold the truncated values.
  auto dev = std::make_shared<singa::CudaGPU>();
  Tensor t(Shape{3}, dev);
  float values[] = {1.0f, 2.0f, 3.0f};
  t.CopyDataFromHostPtr(values, 3);
  EXPECT_EQ(singa::kFloat32, t.data_type());

  t.AsType(singa::kInt);
  EXPECT_EQ(singa::kInt, t.data_type());

  // Bring the data back to the host before reading the raw block.
  t.ToHost();
  const int* out = static_cast<const int*>(t.block()->data());
  EXPECT_EQ(1, out[0]);
  EXPECT_EQ(2, out[1]);
  EXPECT_EQ(3, out[2]);
}

TEST(TensorClass, IntAsTypeFloatCuda) {
  // An int tensor cast to kFloat32 on the GPU must keep the same values.
  auto dev = std::make_shared<singa::CudaGPU>();
  Tensor t(Shape{3}, dev, singa::kInt);
  int values[] = {1, 2, 3};
  t.CopyDataFromHostPtr(values, 3);
  EXPECT_EQ(singa::kInt, t.data_type());

  t.AsType(singa::kFloat32);
  EXPECT_EQ(singa::kFloat32, t.data_type());

  // Bring the data back to the host before reading the raw block.
  t.ToHost();
  const float* out = static_cast<const float*>(t.block()->data());
  EXPECT_EQ(1.0f, out[0]);
  EXPECT_EQ(2.0f, out[1]);
  EXPECT_EQ(3.0f, out[2]);
}

#endif // USE_CUDA

TEST(TensorClass, FloatAsTypeIntCPU) {
  // A float32 host tensor cast to kInt must hold the truncated values.
  Tensor t(Shape{3});
  float values[] = {1.0f, 2.0f, 3.0f};
  t.CopyDataFromHostPtr(values, 3);
  EXPECT_EQ(singa::kFloat32, t.data_type());

  // Sanity-check the original float contents before the cast.
  const float* before = static_cast<const float*>(t.block()->data());
  EXPECT_FLOAT_EQ(1.0f, before[0]);
  EXPECT_FLOAT_EQ(2.0f, before[1]);
  EXPECT_FLOAT_EQ(3.0f, before[2]);

  t.AsType(singa::kInt);
  EXPECT_EQ(singa::kInt, t.data_type());

  const int* after = static_cast<const int*>(t.block()->data());
  EXPECT_EQ(1, after[0]);
  EXPECT_EQ(2, after[1]);
  EXPECT_EQ(3, after[2]);
}

TEST(TensorClass, IntAsTypeFloatCPU) {
  // An int tensor cast to kFloat32 on the CPU must keep the same values.
  Tensor t(Shape{3}, singa::kInt);
  int data[] = {1, 2, 3};
  t.CopyDataFromHostPtr(data, 3);
  EXPECT_EQ(singa::kInt, t.data_type());

  t.AsType(singa::kFloat32);
  EXPECT_EQ(singa::kFloat32, t.data_type());

  // Removed the kFloat16 detour: the int/float16 combination has no case in
  // TYPE_TYPE_LANG_SWITCH (it would LOG(FATAL)), and reading the block as
  // float* after a kFloat16 conversion would misinterpret the bytes.
  const float* dptr2 = static_cast<const float*>(t.block()->data());
  EXPECT_FLOAT_EQ(1.0f, dptr2[0]);
  EXPECT_FLOAT_EQ(2.0f, dptr2[1]);
  EXPECT_FLOAT_EQ(3.0f, dptr2[2]);
}

TEST(TensorClass, ToDevice) {
Expand Down