This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

Integrating the MKL VML functions into MXNet to speed up (element-wise) mathematical computation #14893

Merged
merged 43 commits on May 22, 2019
Changes from 36 commits

Commits (43)
f0c7264
mkl_func test with erf&log op, build success~
pengxin99 Jan 25, 2019
9311777
fix lint and build issues
TaoLv Jan 25, 2019
a79f7db
Try to add support to sparse array
juliusshufan Feb 22, 2019
015fd0a
fix build
TaoLv Mar 3, 2019
495ce36
Merge branch 'master' of https://github.com/apache/incubator-mxnet in…
TaoLv Apr 17, 2019
672be6a
add functions
TaoLv Apr 18, 2019
c69a25c
Fix review comments
juliusshufan Mar 9, 2019
2c5c20c
remove unecessary code
juliusshufan Mar 9, 2019
b1b6355
Update test case
juliusshufan Mar 10, 2019
f96c34a
minor fix
juliusshufan Mar 11, 2019
06c51e9
move the position of MKL_Compute
juliusshufan Apr 18, 2019
acd7b56
Merge pull request #6 from juliusshufan/erf
TaoLv Apr 25, 2019
dc0086f
mkl_func test with erf&log op, build success~
pengxin99 Jan 25, 2019
1758e91
fix lint and build issues
TaoLv Jan 25, 2019
4461f62
Try to add support to sparse array
juliusshufan Feb 22, 2019
a3efd02
fix build
TaoLv Mar 3, 2019
64d01a4
add functions
TaoLv Apr 18, 2019
7edca49
Fix review comments
juliusshufan Mar 9, 2019
d6139fc
remove unecessary code
juliusshufan Mar 9, 2019
46a49d6
Update test case
juliusshufan Mar 10, 2019
0e36f93
minor fix
juliusshufan Mar 11, 2019
c6e2518
move the position of MKL_Compute
juliusshufan Apr 18, 2019
1153479
Merge branch 'vml' of https://github.com/juliusshufan/incubator-mxnet…
juliusshufan May 6, 2019
e60493c
fix cpplint
juliusshufan May 7, 2019
f360320
cpp lint
juliusshufan May 8, 2019
15f2f20
trigger ci
juliusshufan May 8, 2019
01d3f7e
Merge branch 'master' of https://github.com/apache/incubator-mxnet in…
TaoLv May 14, 2019
7a360e8
address comments
TaoLv May 14, 2019
22a9c4c
coding style
TaoLv May 14, 2019
2b9eca4
Merge branch 'master' of https://github.com/apache/incubator-mxnet in…
TaoLv May 16, 2019
56384df
enable layernorm
TaoLv May 17, 2019
8d1dfee
Merge branch 'master' of https://github.com/apache/incubator-mxnet in…
TaoLv May 17, 2019
c557788
fix windows build
TaoLv May 17, 2019
7e99f3e
revert changes to FComputeEx
TaoLv May 17, 2019
94bafb0
int -> index_t
TaoLv May 17, 2019
a3e07c5
Merge branch 'master' of https://github.com/apache/incubator-mxnet in…
TaoLv May 17, 2019
e275daa
Merge branch 'master' of https://github.com/apache/incubator-mxnet in…
TaoLv May 21, 2019
a383f46
remove workspace
TaoLv May 21, 2019
ff76244
Merge branch 'master' of https://github.com/apache/incubator-mxnet in…
TaoLv May 21, 2019
b13d6ef
fix lint
TaoLv May 21, 2019
eb4c82b
clean code
TaoLv May 22, 2019
0cb1120
Merge branch 'master' of https://github.com/apache/incubator-mxnet in…
TaoLv May 22, 2019
fc51292
Merge branch 'master' of https://github.com/apache/incubator-mxnet in…
TaoLv May 22, 2019
202 changes: 202 additions & 0 deletions src/operator/mkl_functions-inl.h
@@ -0,0 +1,202 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

/*!
* Copyright (c) 2019 by Contributors
* \file mkl_functions-inl.h
* \brief Wrapper for MKL VML functions
* \author Tao Lv, Shufan Wu
*/
#ifndef MXNET_OPERATOR_MKL_FUNCTIONS_INL_H_
#define MXNET_OPERATOR_MKL_FUNCTIONS_INL_H_

#if MSHADOW_USE_MKL == 1
#include "mkl_vml.h"

namespace mxnet {
namespace op {
namespace mkl_func {

MSHADOW_XINLINE
static bool check_size(const size_t n) {
const size_t MKL_INT_MAX = (sizeof(MKL_INT) == sizeof(int)) ? INT_MAX : LLONG_MAX;
return (n <= MKL_INT_MAX);
}

MSHADOW_XINLINE
static bool check_type(const int t) {
return (t == mshadow::kFloat32 || t == mshadow::kFloat64);
}

#define MXNET_MKL_UNARY_MATH_FUNC(name, func) \
struct name { \
MSHADOW_XINLINE static void Vectorize(const index_t n, const float *src, float *dst) { \
vs##func(static_cast<MKL_INT>(n), src, dst); \
} \
MSHADOW_XINLINE static void Vectorize(const index_t n, const double *src, double *dst) { \
vd##func(static_cast<MKL_INT>(n), src, dst); \
} \
};

#define MXNET_MKL_BINARY_MATH_FUNC(name, func) \
struct name { \
MSHADOW_XINLINE static void Vectorize(const index_t n, \
const float *a, \
const float *b, \
float *c) { \
vs##func(static_cast<MKL_INT>(n), a, b, c); \
} \
MSHADOW_XINLINE static void Vectorize(const index_t n, \
const double *a, \
const double *b, \
double *c) { \
vd##func(static_cast<MKL_INT>(n), a, b, c); \
} \
};

MXNET_MKL_UNARY_MATH_FUNC(erf, Erf);
MXNET_MKL_UNARY_MATH_FUNC(exp, Exp);
MXNET_MKL_UNARY_MATH_FUNC(exp2, Exp2);
MXNET_MKL_UNARY_MATH_FUNC(exp10, Exp10);
MXNET_MKL_UNARY_MATH_FUNC(expm1, Expm1);
MXNET_MKL_UNARY_MATH_FUNC(log, Ln);
MXNET_MKL_UNARY_MATH_FUNC(log2, Log2);
MXNET_MKL_UNARY_MATH_FUNC(log10, Log10);
MXNET_MKL_UNARY_MATH_FUNC(log1p, Log1p);

MXNET_MKL_UNARY_MATH_FUNC(sin, Sin);
MXNET_MKL_UNARY_MATH_FUNC(cos, Cos);
MXNET_MKL_UNARY_MATH_FUNC(tan, Tan);
MXNET_MKL_UNARY_MATH_FUNC(asin, Asin);
MXNET_MKL_UNARY_MATH_FUNC(acos, Acos);
MXNET_MKL_UNARY_MATH_FUNC(atan, Atan);

MXNET_MKL_UNARY_MATH_FUNC(sinh, Sinh);
MXNET_MKL_UNARY_MATH_FUNC(cosh, Cosh);
MXNET_MKL_UNARY_MATH_FUNC(tanh, Tanh);
MXNET_MKL_UNARY_MATH_FUNC(asinh, Asinh);
MXNET_MKL_UNARY_MATH_FUNC(acosh, Acosh);
MXNET_MKL_UNARY_MATH_FUNC(atanh, Atanh);

MXNET_MKL_UNARY_MATH_FUNC(sqrt, Sqrt);
MXNET_MKL_UNARY_MATH_FUNC(abs, Abs);
MXNET_MKL_UNARY_MATH_FUNC(cbrt, Cbrt);
MXNET_MKL_UNARY_MATH_FUNC(round, Round);
MXNET_MKL_UNARY_MATH_FUNC(ceil, Ceil);
MXNET_MKL_UNARY_MATH_FUNC(floor, Floor);
MXNET_MKL_UNARY_MATH_FUNC(trunc, Trunc);

MXNET_MKL_UNARY_MATH_FUNC(lgamma, LGamma);
MXNET_MKL_UNARY_MATH_FUNC(tgamma, TGamma);
MXNET_MKL_UNARY_MATH_FUNC(square, Sqr);

MXNET_MKL_BINARY_MATH_FUNC(add, Add);
MXNET_MKL_BINARY_MATH_FUNC(sub, Sub);
MXNET_MKL_BINARY_MATH_FUNC(mul, Mul);
MXNET_MKL_BINARY_MATH_FUNC(pow, Pow);
MXNET_MKL_BINARY_MATH_FUNC(hypot, Hypot);
Contributor:

Will all of these functions be mapped automatically when MKL is enabled?

Member:

No. We just put all the VML functions here. We think these functions can be leveraged by MXNet in the future, but currently the registration of each operator needs to be changed to use them. In this PR we only optimized some operators used in BERT; we propose to optimize the others when we face performance problems with them.
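For illustration, a minimal sketch of what such a per-operator registration change could look like, modelled on the LayerNorm dispatch later in this PR. The function name ErfComputeMKL and the commented fallback call are hypothetical, not code from this PR:

#if MSHADOW_USE_MKL == 1
static void ErfComputeMKL(const nnvm::NodeAttrs& attrs,
                          const OpContext& ctx,
                          const std::vector<TBlob>& inputs,
                          const std::vector<OpReqType>& req,
                          const std::vector<TBlob>& outputs) {
  if (req[0] == kNullOp) return;
  const TBlob& in = inputs[0];
  const TBlob& out = outputs[0];
  // Take the VML path only for fp32/fp64 tensors whose length fits in MKL_INT.
  if (mkl_func::check_type(in.type_flag_) && mkl_func::check_size(in.Size())) {
    MSHADOW_SGL_DBL_TYPE_SWITCH(in.type_flag_, DType, {
      mkl_func::erf::Vectorize(in.Size(), in.dptr<DType>(), out.dptr<DType>());
    });
  } else {
    // Fall back to the existing element-wise kernel, e.g.:
    // UnaryOp::Compute<cpu, mshadow_op::erf>(attrs, ctx, inputs, req, outputs);
  }
}
#endif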

Contributor:

Thanks for the explanation. We can add it back when we use it; otherwise, it is a little confusing for other developers.



template <typename DType>
MSHADOW_XINLINE static void sub_(index_t n, DType *in, DType b, DType *dst) {
for (index_t i = 0; i < n; i++)
dst[i] = in[i] - b;
}

template <typename DType>
MSHADOW_XINLINE static void div_(index_t n, DType *in, DType b, DType *dst) {
for (index_t i = 0; i < n; i++)
dst[i] = in[i] / b;
}

template <typename DType>
MSHADOW_XINLINE static void sum_(index_t n, DType *in, DType *dst) {
// dst[0] = cblas_sasum(n, in, 1);
DType sum = 0.0f;
for (index_t i = 0; i < n; i++)
sum += in[i];

dst[0] = sum;
}

template <typename DType>
MSHADOW_XINLINE static void max_(index_t n, DType *in, DType *dst) {
dst[0] = in[0];
for (index_t i = 1; i < n; i++)
dst[0] = (dst[0] < in[i]) ? in[i] : dst[0];
}

// LayerNorm on the last dimension
template <typename DType>
MSHADOW_XINLINE static void LayerNormLastDim(index_t m,
index_t n,
DType *a,
DType *b,
DType *ws,
DType *gamma,
DType *beta,
DType *mean,
DType *var,
DType eps) {
auto nthreads = engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
#pragma omp parallel for num_threads(nthreads)
for (index_t i = 0; i < m; i++) {
DType* in_offset = a + i * n;
DType* out_offset = b + i * n;
DType* ws_offset = ws + i * n;

sum_(n, in_offset, &(mean[i]));
mean[i] /= n;
sub_(n, in_offset, mean[i], out_offset);
square::Vectorize(n, out_offset, ws_offset);
sum_(n, ws_offset, &(var[i]));
var[i] = math::sqrt(var[i] / n + eps);

mul::Vectorize(n, out_offset, gamma, out_offset);
div_(n, out_offset, var[i], out_offset);
add::Vectorize(n, out_offset, beta, out_offset);
Contributor:

Any chance to fuse some of these operations to reduce the memory bandwidth?

Member:

How much faster is this version compared to the mshadow one?

Member:

After reading the code, I think the current implementation, which relies on the vectorized operations, should be fast at scaling and shifting the data (data * gamma and data + beta). One possible improvement is to use Welford's online algorithm (https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance) to calculate the mean and variance in one pass; the code would look like this:

template <typename DType>
MSHADOW_XINLINE static void mean_var_(index_t n, DType *in, DType *mean, DType* variance) {
  DType sigma2 = 0;
  DType mean_v = 0;
  DType old_mean_v = 0;
  for (index_t i = 0; i < n; i++) {
    DType x = in[i];
    old_mean_v = mean_v;
    mean_v += (x - old_mean_v) / (i + 1);
    sigma2 += (x - old_mean_v) * (x - mean_v);
  }    
  mean[0] = mean_v;
  variance[0] = sigma2 / n;
}


template <typename DType>
MSHADOW_XINLINE static void LayerNormLastDim(index_t m,
                                             index_t n,
                                             DType *a,
                                             DType *b,
                                             DType *ws,
                                             DType *gamma,
                                             DType *beta,
                                             DType *mean,
                                             DType *var,
                                             DType eps) {
  auto nthreads = engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
#pragma omp parallel for num_threads(nthreads)
  for (index_t i = 0; i < m; i++) {
    DType ele_mean, ele_var;
    DType* in_offset = a + i * n;
    DType* out_offset = b + i * n;
    mean_var_(n, in_offset, &ele_mean, &ele_var);
    sub_(n, in_offset, ele_mean, out_offset);
    ele_var = math::sqrt(ele_var + eps);
    mul::Vectorize(n, out_offset, gamma, out_offset);
    div_(n, out_offset, ele_var, out_offset);
    add::Vectorize(n, out_offset, beta, out_offset);
    mean[i] = ele_mean;
    var[i] = ele_var;
  }
}

Member:

@pengzhao-intel @sxjscience Loops are fused in the latest commit. I also removed the required workspace, but that means we cannot leverage VML functions and need to rely on the compiler for vectorization.
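For readers following along, a rough sketch of what a fused, workspace-free per-row loop of that kind looks like when vectorization is left to the compiler. This is an illustration only, not the exact code of that commit:

template <typename DType>
static void FusedLayerNormRow(index_t n, const DType* x,
                              const DType* gamma, const DType* beta,
                              DType eps, DType* y,
                              DType* out_mean, DType* out_std) {
  // Pass 1: mean and variance of the row, with no temporary workspace buffer.
  DType mean = 0, var = 0;
  for (index_t i = 0; i < n; ++i) mean += x[i];
  mean /= n;
  for (index_t i = 0; i < n; ++i) var += (x[i] - mean) * (x[i] - mean);
  const DType stdev = math::sqrt(var / n + eps);
  // Pass 2: normalize, scale and shift in one fused loop; the compiler is
  // expected to auto-vectorize this instead of calling VML.
  for (index_t i = 0; i < n; ++i)
    y[i] = (x[i] - mean) / stdev * gamma[i] + beta[i];
  *out_mean = mean;
  *out_std = stdev;  // as in this PR, the "std" output stores the standard deviation
}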

}
}

template <typename DType>
MSHADOW_XINLINE static void LogSoftmaxLastDim(index_t m,
index_t n,
DType *a,
DType *b) {
auto nthreads = engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
#pragma omp parallel for num_threads(nthreads)
for (index_t i = 0; i < m; i++) {
DType* in_offset = a + i * n;
DType* out_offset = b + i * n;

DType b, logsum;
max_(n, in_offset, &b);
sub_(n, in_offset, b, out_offset);
exp::Vectorize(n, out_offset, out_offset);
sum_(n, out_offset, &logsum);
logsum = b + logf(logsum);
sub_(n, in_offset, logsum, out_offset);
}
}

} // namespace mkl_func
} // namespace op
} // namespace mxnet
#endif // MSHADOW_USE_MKL == 1
#endif // MXNET_OPERATOR_MKL_FUNCTIONS_INL_H_
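As a reading aid (not part of the diff): each MXNET_MKL_UNARY_MATH_FUNC invocation above generates a small struct with two Vectorize overloads; for example, MXNET_MKL_UNARY_MATH_FUNC(log, Ln) produces wrappers around vsLn/vdLn. A hypothetical call site, with the function and buffer names invented for the example and guarded by the helpers from this header, would look like:

#include <vector>

void log_example(const std::vector<float>& src, std::vector<float>* dst) {
  const size_t n = src.size();
  dst->resize(n);
  // Dispatch to VML only when the size fits in MKL_INT and the type is supported.
  if (mkl_func::check_size(n) && mkl_func::check_type(mshadow::kFloat32)) {
    // For float inputs this expands to vsLn(static_cast<MKL_INT>(n), src, dst).
    mkl_func::log::Vectorize(static_cast<index_t>(n), src.data(), dst->data());
  }
}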
8 changes: 4 additions & 4 deletions src/operator/nn/layer_norm-inl.h
@@ -63,6 +63,9 @@ struct LayerNormParam : public dmlc::Parameter<LayerNormParam> {
}
};

static int GetRealAxis(int axis, int ndim) {
return axis < 0 ? (axis + ndim) : axis;
}

template<typename xpu>
void LayerNormCompute(const nnvm::NodeAttrs& attrs,
@@ -74,10 +77,7 @@ void LayerNormCompute(const nnvm::NodeAttrs& attrs,
const LayerNormParam& param = nnvm::get<LayerNormParam>(attrs.parsed);
if (req[0] == kNullOp) return;
CHECK_NE(req[0], kAddTo);
int axis = param.axis;
if (axis < 0) {
axis += static_cast<int>(inputs[0].ndim());
}
int axis = GetRealAxis(param.axis, inputs[0].ndim());
CHECK(axis >= 0 && axis < inputs[0].ndim()) << "Channel axis out of range: " << param.axis;
CHECK_EQ(inputs.size(), 3U);
Stream<xpu> *s = ctx.get_stream<xpu>();
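Aside (not in the diff): the new GetRealAxis helper only canonicalizes a possibly negative axis, so for a 3-D input the default axis=-1 resolves to the last dimension:

GetRealAxis(-1, 3);  // == 2, the last dimension
GetRealAxis(1, 3);   // == 1, non-negative axes pass through unchanged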
65 changes: 61 additions & 4 deletions src/operator/nn/layer_norm.cc
@@ -27,6 +27,10 @@
#include <nnvm/op_attr_types.h>
#include "../elemwise_op_common.h"

#if MSHADOW_USE_MKL == 1
#include "../mkl_functions-inl.h"
#endif

namespace mxnet {
namespace op {

@@ -39,10 +43,7 @@ static bool LayerNormShape(const nnvm::NodeAttrs& attrs,
using namespace mshadow;
CHECK_EQ(in_shape->size(), 3U) << "Input:[data, gamma, beta]";
const mxnet::TShape &dshape = in_shape->at(layernorm::kData);
int axis = param.axis;
if (axis < 0) {
axis += dshape.ndim();
}
int axis = GetRealAxis(param.axis, dshape.ndim());
CHECK(axis >= 0 && axis < dshape.ndim())
<< "Channel axis out of range: axis=" << param.axis;

@@ -64,6 +65,58 @@ static bool LayerNormShape(const nnvm::NodeAttrs& attrs,
return true;
}

#if MSHADOW_USE_MKL == 1
void LayerNormComputeMKL(const nnvm::NodeAttrs& attrs,
const OpContext& ctx,
const std::vector<TBlob>& inputs,
const std::vector<OpReqType>& req,
const std::vector<TBlob>& outputs) {
using namespace mshadow;
const LayerNormParam& param = nnvm::get<LayerNormParam>(attrs.parsed);
if (req[0] == kNullOp) return;
CHECK_NE(req[0], kAddTo);
CHECK_EQ(inputs.size(), 3U);
int axis = GetRealAxis(param.axis, inputs[0].ndim());

if (axis == (inputs[layernorm::kData].ndim() - 1) ||
(inputs[0].type_flag_ != kFloat32 && inputs[0].type_flag_ != kFloat64)) {
Stream<cpu> *s = ctx.get_stream<cpu>();
// Compute necessary data for the reduce operation.
mxnet::TShape red_src_shape, red_dst_shape;
BroadcastReduceShapeCompact(inputs[layernorm::kData].shape_, outputs[layernorm::kMean].shape_,
&red_src_shape, &red_dst_shape);
const TBlob in_data = inputs[layernorm::kData].reshape(red_src_shape);
const TBlob mean_data = outputs[layernorm::kMean].reshape(red_dst_shape);
const TBlob std_data = outputs[layernorm::kStd].reshape(red_dst_shape);
const int outter_size = red_dst_shape.Size();
const int channel_size = red_src_shape.Size() / red_dst_shape.Size();

// Initialize the workspace
Tensor<cpu, 1, char> workspace;
size_t workspace_size = 0;
MSHADOW_SGL_DBL_TYPE_SWITCH(in_data.type_flag_, DType, {
workspace_size = in_data.Size() * sizeof (DType);
});
workspace = ctx.requested[0].get_space_typed<cpu, 1, char>(Shape1(workspace_size), s);

// call
MSHADOW_SGL_DBL_TYPE_SWITCH(in_data.type_flag_, DType, {
mkl_func::LayerNormLastDim(outter_size, channel_size,
in_data.dptr<DType>(),
outputs[layernorm::kOut].dptr<DType>(),
reinterpret_cast<DType*>(workspace.dptr_),
inputs[layernorm::kGamma].dptr<DType>(),
inputs[layernorm::kBeta].dptr<DType>(),
outputs[layernorm::kMean].dptr<DType>(),
outputs[layernorm::kStd].dptr<DType>(),
static_cast<DType>(param.eps));
});
} else {
// fallback
LayerNormCompute<cpu>(attrs, ctx, inputs, req, outputs);
}
}
#endif

NNVM_REGISTER_OP(LayerNorm)
.describe(R"code(Layer normalization.
@@ -110,7 +163,11 @@ axis to be the last item in the input shape.
})
.set_attr<mxnet::FInferShape>("FInferShape", LayerNormShape)
.set_attr<nnvm::FInferType>("FInferType", ElemwiseType<3, 3>)
#if MSHADOW_USE_MKL == 1
.set_attr<FCompute>("FCompute<cpu>", LayerNormComputeMKL)
#else
.set_attr<FCompute>("FCompute<cpu>", LayerNormCompute<cpu>)
#endif
.set_attr<nnvm::FGradient>("FGradient", [](const nnvm::NodePtr& n,
const std::vector<nnvm::NodeEntry>& ograds) {
std::vector<nnvm::NodeEntry> heads;