Skip to content

Commit

Permalink
Merge pull request #4 from alibaba/master
Browse files Browse the repository at this point in the history
merge MNN
  • Loading branch information
jokerz0624 committed Mar 3, 2021
2 parents 097b0c8 + 6bc0a0c commit 6ac98b5
Show file tree
Hide file tree
Showing 15 changed files with 349 additions and 118 deletions.
36 changes: 36 additions & 0 deletions 3rd_party/OpenCLHeaders/CL/cl2.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2485,6 +2485,42 @@ class Platform : public detail::Wrapper<cl_platform_id>
return detail::errHandler(err, __GET_PLATFORM_IDS_ERR);
}

// More than one OpenCL platform is present: prefer the discrete GPU.
if (n > 1) {
    // On x86_64 machines with several platforms, promote the NVIDIA
    // platform (assumed to be the discrete card) to ids[0] so it becomes
    // the default selection.
    //const char* integrate_gpu = "Intel";
    const char* discrete_gpu = "NVIDIA";
    for (cl_uint i = 0; i < n; ++i) {
        // Query the length of the platform name (includes NUL terminator).
        size_t platform_name_length = 0;
        err = clGetPlatformInfo(ids[i], CL_PLATFORM_NAME, 0, 0, &platform_name_length);
        if (err != CL_SUCCESS) {
            return detail::errHandler(err, __GET_PLATFORM_INFO_ERR);
        }
        // Fetch the platform name into an RAII buffer; no manual
        // delete[] needed on any of the exit paths below.
        std::vector<char> platform_name(platform_name_length);
        err = clGetPlatformInfo(ids[i], CL_PLATFORM_NAME, platform_name_length, platform_name.data(), 0);
        if (err != CL_SUCCESS) {
            return detail::errHandler(err, __GET_PLATFORM_INFO_ERR);
        }
        // If the NVIDIA platform is found, move it to the front (unless it
        // already is the front) and stop scanning.
        if (strstr(platform_name.data(), discrete_gpu)) {
            if (i != 0) {
                cl_platform_id tmp = ids[0];
                ids[0] = ids[i];
                ids[i] = tmp;
            }
            break;
        }
    }
}

if (platforms) {
platforms->resize(ids.size());

Expand Down
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ option(MNN_USE_SSE "Use SSE optimization for x86 if possible" ON)
option(MNN_BUILD_CODEGEN "Build with codegen" OFF)
option(MNN_ENABLE_COVERAGE "Build with coverage enable" OFF)

IF(NOT MNN_BUILD_SHARED_LIBS)
IF(NOT MNN_BUILD_SHARED_LIBS AND MNN_SEP_BUILD)
message(WARNING "Close MNN_SEP_BUILD for static library")
SET(MNN_SEP_BUILD OFF CACHE BOOL "<docstring>" FORCE)
ENDIF()
Expand Down
2 changes: 1 addition & 1 deletion project/android/testCommon.sh
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
#!/bin/bash
adb shell "cd /data/local/tmp/MNN&&export LD_LIBRARY_PATH=.:$LD_LIBRARY_PATH&& $1 $2 $3 $4 $5 $6 $7"
adb shell "cd /data/local/tmp/MNN&&export LD_LIBRARY_PATH=.:$LD_LIBRARY_PATH&& $1 $2 $3 $4 $5 $6 $7 $8 $9"
37 changes: 28 additions & 9 deletions schema/current/CaffeOp_generated.h
Original file line number Diff line number Diff line change
Expand Up @@ -436,6 +436,7 @@ struct Convolution2DCommonT : public flatbuffers::NativeTable {
bool relu6;
std::vector<int32_t> pads;
std::vector<int32_t> outPads;
bool hasOutputShape;
Convolution2DCommonT()
: padX(0),
padY(0),
Expand All @@ -450,7 +451,8 @@ struct Convolution2DCommonT : public flatbuffers::NativeTable {
outputCount(0),
inputCount(0),
relu(false),
relu6(false) {
relu6(false),
hasOutputShape(false) {
}
};

Expand All @@ -475,7 +477,8 @@ struct Convolution2DCommon FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table
VT_RELU = 28,
VT_RELU6 = 30,
VT_PADS = 32,
VT_OUTPADS = 34
VT_OUTPADS = 34,
VT_HASOUTPUTSHAPE = 36
};
int32_t padX() const {
return GetField<int32_t>(VT_PADX, 0);
Expand Down Expand Up @@ -525,6 +528,9 @@ struct Convolution2DCommon FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table
const flatbuffers::Vector<int32_t> *outPads() const {
return GetPointer<const flatbuffers::Vector<int32_t> *>(VT_OUTPADS);
}
bool hasOutputShape() const {
return GetField<uint8_t>(VT_HASOUTPUTSHAPE, 0) != 0;
}
bool Verify(flatbuffers::Verifier &verifier) const {
return VerifyTableStart(verifier) &&
VerifyField<int32_t>(verifier, VT_PADX) &&
Expand All @@ -545,6 +551,7 @@ struct Convolution2DCommon FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table
verifier.VerifyVector(pads()) &&
VerifyOffset(verifier, VT_OUTPADS) &&
verifier.VerifyVector(outPads()) &&
VerifyField<uint8_t>(verifier, VT_HASOUTPUTSHAPE) &&
verifier.EndTable();
}
Convolution2DCommonT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
Expand Down Expand Up @@ -603,6 +610,9 @@ struct Convolution2DCommonBuilder {
void add_outPads(flatbuffers::Offset<flatbuffers::Vector<int32_t>> outPads) {
fbb_.AddOffset(Convolution2DCommon::VT_OUTPADS, outPads);
}
void add_hasOutputShape(bool hasOutputShape) {
fbb_.AddElement<uint8_t>(Convolution2DCommon::VT_HASOUTPUTSHAPE, static_cast<uint8_t>(hasOutputShape), 0);
}
explicit Convolution2DCommonBuilder(flatbuffers::FlatBufferBuilder &_fbb)
: fbb_(_fbb) {
start_ = fbb_.StartTable();
Expand Down Expand Up @@ -632,7 +642,8 @@ inline flatbuffers::Offset<Convolution2DCommon> CreateConvolution2DCommon(
bool relu = false,
bool relu6 = false,
flatbuffers::Offset<flatbuffers::Vector<int32_t>> pads = 0,
flatbuffers::Offset<flatbuffers::Vector<int32_t>> outPads = 0) {
flatbuffers::Offset<flatbuffers::Vector<int32_t>> outPads = 0,
bool hasOutputShape = false) {
Convolution2DCommonBuilder builder_(_fbb);
builder_.add_outPads(outPads);
builder_.add_pads(pads);
Expand All @@ -647,6 +658,7 @@ inline flatbuffers::Offset<Convolution2DCommon> CreateConvolution2DCommon(
builder_.add_kernelX(kernelX);
builder_.add_padY(padY);
builder_.add_padX(padX);
builder_.add_hasOutputShape(hasOutputShape);
builder_.add_relu6(relu6);
builder_.add_relu(relu);
builder_.add_padMode(padMode);
Expand All @@ -670,7 +682,8 @@ inline flatbuffers::Offset<Convolution2DCommon> CreateConvolution2DCommonDirect(
bool relu = false,
bool relu6 = false,
const std::vector<int32_t> *pads = nullptr,
const std::vector<int32_t> *outPads = nullptr) {
const std::vector<int32_t> *outPads = nullptr,
bool hasOutputShape = false) {
auto pads__ = pads ? _fbb.CreateVector<int32_t>(*pads) : 0;
auto outPads__ = outPads ? _fbb.CreateVector<int32_t>(*outPads) : 0;
return MNN::CreateConvolution2DCommon(
Expand All @@ -690,7 +703,8 @@ inline flatbuffers::Offset<Convolution2DCommon> CreateConvolution2DCommonDirect(
relu,
relu6,
pads__,
outPads__);
outPads__,
hasOutputShape);
}

flatbuffers::Offset<Convolution2DCommon> CreateConvolution2DCommon(flatbuffers::FlatBufferBuilder &_fbb, const Convolution2DCommonT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
Expand Down Expand Up @@ -4505,6 +4519,7 @@ inline void Convolution2DCommon::UnPackTo(Convolution2DCommonT *_o, const flatbu
{ auto _e = relu6(); _o->relu6 = _e; };
{ auto _e = pads(); if (_e) { _o->pads.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->pads[_i] = _e->Get(_i); } } };
{ auto _e = outPads(); if (_e) { _o->outPads.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->outPads[_i] = _e->Get(_i); } } };
{ auto _e = hasOutputShape(); _o->hasOutputShape = _e; };
}

inline flatbuffers::Offset<Convolution2DCommon> Convolution2DCommon::Pack(flatbuffers::FlatBufferBuilder &_fbb, const Convolution2DCommonT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
Expand All @@ -4531,6 +4546,7 @@ inline flatbuffers::Offset<Convolution2DCommon> CreateConvolution2DCommon(flatbu
auto _relu6 = _o->relu6;
auto _pads = _o->pads.size() ? _fbb.CreateVector(_o->pads) : 0;
auto _outPads = _o->outPads.size() ? _fbb.CreateVector(_o->outPads) : 0;
auto _hasOutputShape = _o->hasOutputShape;
return MNN::CreateConvolution2DCommon(
_fbb,
_padX,
Expand All @@ -4548,7 +4564,8 @@ inline flatbuffers::Offset<Convolution2DCommon> CreateConvolution2DCommon(flatbu
_relu,
_relu6,
_pads,
_outPads);
_outPads,
_hasOutputShape);
}

inline Convolution3DCommonT *Convolution3DCommon::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
Expand Down Expand Up @@ -5950,7 +5967,8 @@ inline const flatbuffers::TypeTable *Convolution2DCommonTypeTable() {
{ flatbuffers::ET_BOOL, 0, -1 },
{ flatbuffers::ET_BOOL, 0, -1 },
{ flatbuffers::ET_INT, 1, -1 },
{ flatbuffers::ET_INT, 1, -1 }
{ flatbuffers::ET_INT, 1, -1 },
{ flatbuffers::ET_BOOL, 0, -1 }
};
static const flatbuffers::TypeFunction type_refs[] = {
PadModeTypeTable
Expand All @@ -5971,10 +5989,11 @@ inline const flatbuffers::TypeTable *Convolution2DCommonTypeTable() {
"relu",
"relu6",
"pads",
"outPads"
"outPads",
"hasOutputShape"
};
static const flatbuffers::TypeTable tt = {
flatbuffers::ST_TABLE, 16, type_codes, type_refs, nullptr, names
flatbuffers::ST_TABLE, 17, type_codes, type_refs, nullptr, names
};
return &tt;
}
Expand Down
1 change: 1 addition & 0 deletions schema/default/CaffeOp.fbs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ table Convolution2DCommon {
relu6:bool=false;
pads:[int];
outPads:[int];
hasOutputShape:bool = false;
}

table Convolution3DCommon {
Expand Down
60 changes: 49 additions & 11 deletions source/backend/arm82/Arm82Relu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifdef __aarch64__
#include <limits>

#include "backend/arm82/Arm82Relu.hpp"
#include "MNN_generated.h"
#include "backend/arm82/Arm82Backend.hpp"
Expand All @@ -20,7 +22,7 @@

namespace MNN {

static void _MNNArm82ReluWithChannel(FLOAT16 *dst, const FLOAT16 *src, const FLOAT16 *slope, size_t length) {
static void _MNNArm82PReluWithChannel(FLOAT16 *dst, const FLOAT16 *src, const FLOAT16 *slope, size_t length) {
#ifdef MNN_USE_NEON
float16x8_t value_0 = vmovq_n_f16(0);
float16x8_t slopeV = vld1q_f16(slope);
Expand Down Expand Up @@ -69,7 +71,30 @@ static void _MNNArm82LeakyReluWithChannel(FLOAT16 *dst, const FLOAT16 *src, cons
dst[index] = src[index];
}
}
#endif
}
}

// Elementwise ReLU (dst = max(src, 0)) over `length` channel-unit groups of
// FLOAT16 values. `src` and `dst` each hold length * ARMV82_CHANNEL_UNIT
// elements; each element is read before it is written, so src == dst
// (in-place) is safe.
static void _MNNArm82ReluWithChannel(FLOAT16 *dst, const FLOAT16 *src, size_t length) {
#ifdef MNN_USE_NEON
    float16x8_t value_0 = vmovq_n_f16(0);
#endif

    // Use size_t for the index to match `length` exactly: avoids a
    // signed/unsigned comparison and overflow on very large planes.
    for (size_t i = 0; i < length; ++i) {
#ifdef MNN_USE_NEON
        // Vector path: one 8-lane fp16 register per channel unit.
        float16x8_t value = vld1q_f16(src + i * ARMV82_CHANNEL_UNIT);
        float16x8_t lessThanZero = vcleq_f16(value, value_0);

        // Select 0 where value <= 0, otherwise keep the input value.
        vst1q_f16(dst + i * ARMV82_CHANNEL_UNIT, vbslq_f16(lessThanZero, value_0, value));
#else
        // Scalar fallback when NEON fp16 intrinsics are unavailable.
        for (int j = 0; j < ARMV82_CHANNEL_UNIT; ++j) {
            size_t index = i * ARMV82_CHANNEL_UNIT + j;
            if (src[index] < 0) {
                dst[index] = 0;
            } else {
                dst[index] = src[index];
            }
        }
#endif
    }
}
Expand All @@ -93,17 +118,30 @@ ErrorCode Arm82Relu::onExecute(const std::vector<Tensor *> &inputs, const std::v

const auto src = input->host<FLOAT16>();
auto dst = output->host<FLOAT16>();
FLOAT16 slopeHalf = half_float::half(mSlope);

mThreadNumbers = static_cast<Arm82Backend *>(backend())->numberThread();
MNN_CONCURRENCY_BEGIN(tId, mThreadNumbers)
for (int b = (int)tId; b < batchAndChannel; b += mThreadNumbers) {
_MNNArm82LeakyReluWithChannel(dst + b * plane * ARMV82_CHANNEL_UNIT,
src + b * plane * ARMV82_CHANNEL_UNIT,
slopeHalf,
plane);
if (abs(mSlope) < std::numeric_limits<float>::epsilon()) {
// relu
mThreadNumbers = static_cast<Arm82Backend *>(backend())->numberThread();
MNN_CONCURRENCY_BEGIN(tId, mThreadNumbers)
for (int b = (int)tId; b < batchAndChannel; b += mThreadNumbers) {
_MNNArm82ReluWithChannel(dst + b * plane * ARMV82_CHANNEL_UNIT,
src + b * plane * ARMV82_CHANNEL_UNIT,
plane);
}
MNN_CONCURRENCY_END();
} else {
// leakyrelu
FLOAT16 slopeHalf = half_float::half(mSlope);
mThreadNumbers = static_cast<Arm82Backend *>(backend())->numberThread();
MNN_CONCURRENCY_BEGIN(tId, mThreadNumbers)
for (int b = (int)tId; b < batchAndChannel; b += mThreadNumbers) {
_MNNArm82LeakyReluWithChannel(dst + b * plane * ARMV82_CHANNEL_UNIT,
src + b * plane * ARMV82_CHANNEL_UNIT,
slopeHalf,
plane);
}
MNN_CONCURRENCY_END();
}
MNN_CONCURRENCY_END();

return NO_ERROR;
}
Expand Down Expand Up @@ -144,7 +182,7 @@ ErrorCode Arm82PRelu::onExecute(const std::vector<Tensor *> &inputs, const std::
MNN_CONCURRENCY_BEGIN(tId, mThreadNumbers)
for (int b = tId; b < batchAndChannel; ++b) {
auto curChannel = b % channelDivUnit;
_MNNArm82ReluWithChannel(dstPtr + b * plane * ARMV82_CHANNEL_UNIT, srcPtr + b * plane * ARMV82_CHANNEL_UNIT,
_MNNArm82PReluWithChannel(dstPtr + b * plane * ARMV82_CHANNEL_UNIT, srcPtr + b * plane * ARMV82_CHANNEL_UNIT,
slopePtr + curChannel * ARMV82_CHANNEL_UNIT, plane);
}
MNN_CONCURRENCY_END();
Expand Down
1 change: 1 addition & 0 deletions source/backend/vulkan/execution/VulkanConvolutionImpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,7 @@ class VulkanConvolutionIm2Col : public VulkanBasicExecution {
mIm2Col->bind(cmdBuffer->get(), mIm2ColSet[index]->get());
vkCmdDispatch(cmdBuffer->get(), UP_DIV(totalNumberInput, VulkanConvolutionCommon::gImage2ColLocal),
1, 1);
cmdBuffer->barrierImageIfNeeded(colImage, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL);
}
mMultilers[index]->compute(cmdBuffer);
if (true) {
Expand Down
17 changes: 13 additions & 4 deletions source/geometry/GeometryConv2D.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -296,11 +296,20 @@ class GeometryConvTranspose2D : public GeometryConv2D {
}
virtual bool onCompute(const Op* op, const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
Context& context, CommandBuffer& res) const override {
if (inputs.size() == 1) {
// Origin convolution with format converter
return GeometryConvUtils::computeSingle(op, inputs, outputs, context, res);
if (op->main_as_Convolution2D()->common()->hasOutputShape()) {
const std::vector<Tensor*> newInputs(inputs.begin(), inputs.end() - 1);
if (newInputs.size() == 1) {
// Origin convolution with format converter
return GeometryConvUtils::computeSingle(op, newInputs, outputs, context, res);
}
return computeGEMM_Col2Im(op, newInputs, outputs, context, res);
} else {
if (inputs.size() == 1) {
// Origin convolution with format converter
return GeometryConvUtils::computeSingle(op, inputs, outputs, context, res);
}
return computeGEMM_Col2Im(op, inputs, outputs, context, res);
}
return computeGEMM_Col2Im(op, inputs, outputs, context, res);
}
};
static void _create() {
Expand Down
4 changes: 4 additions & 0 deletions source/shape/ShapeConvolution.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,10 @@ class ConvolutionSizeComputer : public SizeComputer {
int output_height = 1;

auto input = inputs[0];
if (input->dimensions() <= 1) {
// Convolution is not valid for dimension <= 1
return false;
}
// For Tensorflow Group Convolution, the inputCount is the size of filter's input count
if (layer->inputCount() > 0 && input->channel() % layer->inputCount() != 0 && OpType_Convolution == op->type()) {
MNN_ERROR("Error for compute convolution shape, need channel = %d, input channel = %d\n", layer->inputCount(), input->channel());
Expand Down
21 changes: 18 additions & 3 deletions source/shape/ShapeDeconvolution.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,18 @@ class DeconvolutionSizeComputer : public SizeComputer {
auto layer = op->main_as_Convolution2D()->common();

auto inputTensor = inputs[0];
int outputHeight = 0, outputWidth = 0;
if (layer->hasOutputShape()) {
MNN_ASSERT(inputs.size() >= 2);
auto outputShape = inputs.back();
if (outputShape->length(0) > 2) {
outputHeight = outputShape->host<int>()[1];
outputWidth = outputShape->host<int>()[2];
} else {
outputHeight = outputShape->host<int>()[0];
outputWidth = outputShape->host<int>()[1];
}
}

int input_width = inputTensor->width();
int input_height = inputTensor->height();
Expand All @@ -29,9 +41,12 @@ class DeconvolutionSizeComputer : public SizeComputer {
int dW = layer->dilateX();
int output_width;
int output_height;
auto format = TensorUtils::getDescribe(inputs[0])->dimensionFormat;
auto format = TensorUtils::getDescribe(inputTensor)->dimensionFormat;

if (layer->padMode() == PadMode_SAME) { // Tensorflow support
if (outputHeight > 0 && outputWidth > 0) {
output_width = outputWidth;
output_height = outputHeight;
} else if (layer->padMode() == PadMode_SAME) { // Tensorflow support
output_width = input_width * sW;
output_height = input_height * sH;
} else {
Expand All @@ -50,7 +65,7 @@ class DeconvolutionSizeComputer : public SizeComputer {
}

auto& outputBuffer = outputs[0]->buffer();
outputBuffer.type = inputs[0]->getType();
outputBuffer.type = inputTensor->getType();
outputBuffer.dimensions = inputTensor->buffer().dimensions;
outputBuffer.dim[0].extent = inputTensor->buffer().dim[0].extent;
if (MNN_DATA_FORMAT_NHWC == format) {
Expand Down

0 comments on commit 6ac98b5

Please sign in to comment.