Updates in preparation for LIBXSMM 1.7 (tensorflow#8)
* Fixed libxsmm_config_arguments: corrected the value supposed to trigger auto-prefetch, and fixed the 0-threshold, which is now accounted for in LIBXSMM (by populating the default threshold). The problem arose from the assumption "threshold: fall back to BLAS if n*m*k is above this", which is wrong (the threshold is an upper bound up to which JIT code is generated). The previous configuration may have caused all sorts of issues due to other values derived from the 0-threshold. Note that explicitly JIT'ted code is and was never subject to a threshold; see the sketch below.
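
  A minimal illustrative sketch of the dispatch rule the threshold governs (this is not LIBXSMM source; the function and variable names are hypothetical):

    /* Hypothetical illustration of the corrected semantics: the threshold is an
     * upper bound on n*m*k up to which JIT code is generated; larger problems
     * fall back to BLAS. Explicit JIT requests bypass this check entirely. */
    static int use_jit_smm(long long m, long long n, long long k, long long threshold) {
      return m * n * k <= threshold; /* 1: JIT small-matrix kernel, 0: BLAS fallback */
    }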

* Upgraded to libxsmm 1.6.5

* Enabled the use of libxsmm for matrix multiplications

* Enabled the use of libxsmm to speed up 1x1 convolutions (which are
computed using matrix multiplications)

* Made use of TensorFlow's allocation infrastructure even when using LIBXSMM allocation functions. In particular, the (cached) libxsmm_spmdm_init now relies on TF's cpu_allocator().

For C++ code, one can use a libxsmm_scoped_allocator<kind> in order to (temporarily) set up a different allocation mechanism. For instance, using libxsmm_tf_allocator<libxsmm_scratch_allocator> changes LIBXSMM's scratch allocator to rely on TensorFlow. The libxsmm_tf_allocator provides two kinds of constructors: (1) the no-argument variant adopts TF's cpu_allocator(), whereas the one-argument form (2) adopts the allocator from the given OpKernelContext. Changing the allocator in LIBXSMM while buffers (from different allocators) are still pending is valid, and all other services in LIBXSMM's "malloc domain" work regardless of the allocation mechanism (e.g., libxsmm_malloc_size). A short usage sketch follows below.
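
  Both constructor forms, condensed from the diff below (M, N, K, max_threads, handle, output_csr, and ctx stand in for the caller's variables; error handling omitted):

    {
      // (1) no-argument form: adopts TensorFlow's cpu_allocator() for this scope
      const libxsmm_tf_allocator<libxsmm_scratch_allocator> tf_allocator;
      libxsmm_spmdm_init(M, N, K, max_threads, &handle, &output_csr);
    }  // the previous LIBXSMM scratch allocator is restored on scope exit
    {
      // (2) one-argument form: adopts the allocator of the given OpKernelContext* ctx
      const libxsmm_tf_allocator<libxsmm_scratch_allocator> tf_allocator(*ctx);
      // ... LIBXSMM scratch allocations within this scope go through ctx's allocator ...
    }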

* Simply renamed API items to follow changes in LIBXSMM 1.7. This is incomplete, as more changes/adjustments are needed (a condensed sketch of the renamed call sequence follows after this list).

* Account for removed non-check API.

* Include libxsmm_malloc.h now that libxsmm_tf_allocator is used.

* Renamed libxsmm_dnn_create_conv_handle to libxsmm_dnn_create_conv_layer.

* Renamed LIBXSMM_DNN_CONV_FORMAT_* to LIBXSMM_DNN_TENSOR_FORMAT_*.

* Renamed libxsmm_dnn_destroy_conv_handle to libxsmm_dnn_destroy_conv_layer.

* Include missing header file (libxsmm_malloc.h).

* Renamed LIBXSMM_DNN_CONV_KIND_* to LIBXSMM_DNN_COMPUTE_KIND_*.

* Account for the fact that datatype_in/datatype_out have been merged into a single datatype field (libxsmm_dnn_conv_desc structure).

* Updated to use the new libxsmm_dnn_link_* functions.

* Updated to use new libxsmm_dnn_bind_* functions.

* Fixed the call to libxsmm_dnn_transpose_filter (it now takes LIBXSMM_DNN_FILTER as an additional argument).
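
  Taken together, a condensed sketch of the renamed LIBXSMM 1.7 call sequence as used in xsmm_conv2d.cc (see the diff below). Error checking via chk_libxsmm_err, the handle cache, and the threaded execution loop are omitted; desc, input, output, native_filter, and tid come from the surrounding function, and the buffer/filter pointer types are assumed from the 1.7 API since their declarations are not visible in the hunks:

    libxsmm_dnn_err_t status;
    // was libxsmm_dnn_create_conv_handle_check / libxsmm_dnn_destroy_conv_handle
    libxsmm_dnn_layer* handle = libxsmm_dnn_create_conv_layer(desc, &status);
    // was libxsmm_dnn_link_{input,output}_buffer_check / libxsmm_dnn_link_filter_check
    libxsmm_dnn_buffer* in  = libxsmm_dnn_link_buffer(
        handle, LIBXSMM_DNN_INPUT, input, LIBXSMM_DNN_TENSOR_FORMAT_NHWC_PTR, &status);
    libxsmm_dnn_buffer* out = libxsmm_dnn_link_buffer(
        handle, LIBXSMM_DNN_OUTPUT, output, LIBXSMM_DNN_TENSOR_FORMAT_NHWC_PTR, &status);
    libxsmm_dnn_filter* flt = libxsmm_dnn_link_filter(
        handle, LIBXSMM_DNN_FILTER, native_filter, LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM_PTR, &status);
    // the bind_* functions now take the tensor kind as an extra argument
    libxsmm_dnn_bind_buffer(handle, in, LIBXSMM_DNN_INPUT);
    libxsmm_dnn_bind_buffer(handle, out, LIBXSMM_DNN_OUTPUT);
    libxsmm_dnn_bind_filter(handle, flt, LIBXSMM_DNN_FILTER);
    // was libxsmm_dnn_convolve_st; LIBXSMM_DNN_CONV_KIND_* became LIBXSMM_DNN_COMPUTE_KIND_*
    libxsmm_dnn_execute_st(handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid);
    libxsmm_dnn_destroy_conv_layer(handle);
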
hfp authored and alheinecke committed Jan 26, 2017
1 parent 2f01129 commit 640efd8
Showing 5 changed files with 49 additions and 44 deletions.
7 changes: 3 additions & 4 deletions tensorflow/core/kernels/conv_grad_input_ops.cc
@@ -170,12 +170,11 @@ struct LaunchXsmmBackwardInputConvolution<CPUDevice, float> {
  desc.pad_w_out = 0;
  desc.threads = num_threads;
  desc.algo = LIBXSMM_DNN_CONV_ALGO_DIRECT;
- desc.buffer_format = LIBXSMM_DNN_CONV_FORMAT_NHWC;
- desc.filter_format = LIBXSMM_DNN_CONV_FORMAT_LIBXSMM;//LIBXSMM_DNN_CONV_FORMAT_RSCK;
+ desc.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_NHWC;
+ desc.filter_format = LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM;//LIBXSMM_DNN_TENSOR_FORMAT_RSCK;
  desc.fuse_ops = LIBXSMM_DNN_CONV_FUSE_NONE;
  desc.options = LIBXSMM_DNN_CONV_OPTION_NONE;
- desc.datatype_in = LIBXSMM_DNN_DATATYPE_F32;
- desc.datatype_out = LIBXSMM_DNN_DATATYPE_F32;
+ desc.datatype = LIBXSMM_DNN_DATATYPE_F32;

  auto input_ptr = input_backward.data();
  auto filter_ptr = kernel.data();
7 changes: 3 additions & 4 deletions tensorflow/core/kernels/conv_ops.cc
@@ -204,12 +204,11 @@ class LaunchXsmmConvOp<CPUDevice, float> {
  desc.pad_w_out = 0;
  desc.threads = num_threads;
  desc.algo = LIBXSMM_DNN_CONV_ALGO_DIRECT;
- desc.buffer_format = LIBXSMM_DNN_CONV_FORMAT_NHWC;
- desc.filter_format = LIBXSMM_DNN_CONV_FORMAT_LIBXSMM;
+ desc.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_NHWC;
+ desc.filter_format = LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM;
  desc.fuse_ops = LIBXSMM_DNN_CONV_FUSE_NONE;
  desc.options = LIBXSMM_DNN_CONV_OPTION_NONE;
- desc.datatype_in = LIBXSMM_DNN_DATATYPE_F32;
- desc.datatype_out = LIBXSMM_DNN_DATATYPE_F32;
+ desc.datatype = LIBXSMM_DNN_DATATYPE_F32;

  if (!CanUseXsmmConv2D(desc, data_format)) {
    return false;
3 changes: 3 additions & 0 deletions tensorflow/core/kernels/sparse_matmul_op.cc
@@ -37,6 +37,7 @@ limitations under the License.
  #include "tensorflow/core/platform/types.h"
  #ifdef TENSORFLOW_USE_LIBXSMM
  #include "include/libxsmm_intrinsics_x86.h"
+ #include "include/libxsmm_malloc.h"
  #include "include/libxsmm_spmdm.h"
  #endif

@@ -896,6 +897,8 @@ class LibxsmmSparseMatMul {
  } else {
    std::unique_ptr<TensorInfoCacheEntry> e{
        new TensorInfoCacheEntry{M, K, N, max_threads, {}, nullptr}};
+   // setup scoped allocator, which uses cpu_allocator() for this scope
+   const libxsmm_tf_allocator<libxsmm_scratch_allocator> tf_allocator;
    libxsmm_spmdm_init(M, N, K, max_threads, &e->handle, &e->output_csr);
    return e;
  }
61 changes: 32 additions & 29 deletions tensorflow/core/kernels/xsmm_conv2d.cc
@@ -33,6 +33,7 @@ void dummy_xsmm_conv2d_ensure_file_is_not_empty(void);

  #include "include/libxsmm_cpuid.h"
  #include "libxsmm_dnn_handle.h"
+ #include "libxsmm_malloc.h"

  namespace tensorflow {

@@ -158,11 +159,11 @@ struct HashFunction{

  class handles{
   public:
-  libxsmm_dnn_conv_handle* find( const libxsmm_dnn_conv_desc_wrap &w) {
-    std::unordered_map<libxsmm_dnn_conv_desc_wrap , libxsmm_dnn_conv_handle*, HashFunction>::iterator i = libxsmm_handles.find(w);
+  libxsmm_dnn_layer* find( const libxsmm_dnn_conv_desc_wrap &w) {
+    std::unordered_map<libxsmm_dnn_conv_desc_wrap , libxsmm_dnn_layer*, HashFunction>::iterator i = libxsmm_handles.find(w);
    if (i == libxsmm_handles.end()){
      libxsmm_dnn_err_t status;
-     libxsmm_dnn_conv_handle* libxsmm_handle = libxsmm_dnn_create_conv_handle_check(w.d, &status);
+     libxsmm_dnn_layer* libxsmm_handle = libxsmm_dnn_create_conv_layer(w.d, &status);
      chk_libxsmm_err(status, "Create handle");
      libxsmm_handles.insert(std::make_pair(w, libxsmm_handle));
      return libxsmm_handle;
@@ -171,14 +172,14 @@ class handles{
      return i->second;
  }
  ~handles(){
-   std::unordered_map<libxsmm_dnn_conv_desc_wrap , libxsmm_dnn_conv_handle*, HashFunction>::iterator i;
+   std::unordered_map<libxsmm_dnn_conv_desc_wrap , libxsmm_dnn_layer*, HashFunction>::iterator i;
    for (i= libxsmm_handles.begin(); i != libxsmm_handles.end(); i++)
-     chk_libxsmm_err(libxsmm_dnn_destroy_conv_handle(i->second),
+     chk_libxsmm_err(libxsmm_dnn_destroy_conv_layer(i->second),
                      "Destroy handle");
  }
  private:

- std::unordered_map<libxsmm_dnn_conv_desc_wrap , libxsmm_dnn_conv_handle*, HashFunction> libxsmm_handles;
+ std::unordered_map<libxsmm_dnn_conv_desc_wrap , libxsmm_dnn_layer*, HashFunction> libxsmm_handles;

  };

@@ -187,22 +188,24 @@ static handles libxsmm_handles;
  template <typename InputPtr, typename FilterPtr, typename OutputPtr>
  static bool CallLibxsmmConvGeneric(OpKernelContext* ctx,
                                     const libxsmm_dnn_conv_desc& desc,
-                                    libxsmm_dnn_conv_kind kind, InputPtr input,
+                                    libxsmm_dnn_compute_kind kind, InputPtr input,
                                     FilterPtr filter, OutputPtr output) {
+   // setup scoped allocator, which adopts the allocator from the context
+   const libxsmm_tf_allocator<libxsmm_scratch_allocator> tf_allocator(*ctx);
    libxsmm_dnn_err_t status;
-   libxsmm_dnn_conv_handle* libxsmm_handle;
+   libxsmm_dnn_layer* libxsmm_handle;
    libxsmm_dnn_conv_desc_wrap w(desc);

-   if(kind == LIBXSMM_DNN_CONV_KIND_FWD)
+   if(kind == LIBXSMM_DNN_COMPUTE_KIND_FWD)
      libxsmm_handle = libxsmm_handles.find(w);
    else{
-     libxsmm_handle = libxsmm_dnn_create_conv_handle_check(desc, &status);
+     libxsmm_handle = libxsmm_dnn_create_conv_layer(desc, &status);
      chk_libxsmm_err(status, "Create handle");
    }

    status = libxsmm_dnn_get_codegen_success(libxsmm_handle, kind);
    if (status == LIBXSMM_DNN_WARN_FALLBACK) {
-     chk_libxsmm_err(libxsmm_dnn_destroy_conv_handle(libxsmm_handle),
+     chk_libxsmm_err(libxsmm_dnn_destroy_conv_layer(libxsmm_handle),
                      "Destroy handle");
      return false;  // Use non-libxsmm code
    }
@@ -224,7 +227,7 @@ static bool CallLibxsmmConvGeneric(OpKernelContext* ctx,

  int blocksifm = desc.C%ifmblock ==0 ? desc.C/ifmblock :desc.C/ifmblock + 1;
  int blocksofm = desc.K%ofmblock ==0 ? desc.K/ofmblock :desc.K/ofmblock + 1;
- float *native_filter = (float*)libxsmm_aligned_malloc( blocksofm*blocksifm*desc.R*desc.S*ifmblock*ofmblock*sizeof(float), 2097152);
+ float *native_filter = (float*)libxsmm_aligned_scratch( blocksofm*blocksifm*desc.R*desc.S*ifmblock*ofmblock*sizeof(float), 2097152);



@@ -264,28 +267,28 @@ static bool CallLibxsmmConvGeneric(OpKernelContext* ctx,
    count.Wait();
  }

- libxsmm_input = libxsmm_dnn_link_input_buffer_check(
-     libxsmm_handle, input, LIBXSMM_DNN_CONV_FORMAT_NHWC_PTR, &status);
+ libxsmm_input = libxsmm_dnn_link_buffer(
+     libxsmm_handle, LIBXSMM_DNN_INPUT, input, LIBXSMM_DNN_TENSOR_FORMAT_NHWC_PTR, &status);
  chk_libxsmm_err(status, "Link input buffer");
- libxsmm_output = libxsmm_dnn_link_output_buffer_check(
-     libxsmm_handle, output, LIBXSMM_DNN_CONV_FORMAT_NHWC_PTR, &status);
+ libxsmm_output = libxsmm_dnn_link_buffer(
+     libxsmm_handle, LIBXSMM_DNN_OUTPUT, output, LIBXSMM_DNN_TENSOR_FORMAT_NHWC_PTR, &status);
  chk_libxsmm_err(status, "Link output buffer");
- libxsmm_filter = libxsmm_dnn_link_filter_check(
-     libxsmm_handle, native_filter, LIBXSMM_DNN_CONV_FORMAT_LIBXSMM_PTR, &status);
+ libxsmm_filter = libxsmm_dnn_link_filter(
+     libxsmm_handle, LIBXSMM_DNN_FILTER, native_filter, LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM_PTR, &status);
  chk_libxsmm_err(status, "Link filter");

  chk_libxsmm_err(libxsmm_dnn_zero_buffer(libxsmm_output), "Zero output");

- chk_libxsmm_err(libxsmm_dnn_bind_input_buffer(libxsmm_handle, libxsmm_input),
+ chk_libxsmm_err(libxsmm_dnn_bind_buffer(libxsmm_handle, libxsmm_input, LIBXSMM_DNN_INPUT),
                  "Bind input");
  chk_libxsmm_err(
-     libxsmm_dnn_bind_output_buffer(libxsmm_handle, libxsmm_output),
+     libxsmm_dnn_bind_buffer(libxsmm_handle, libxsmm_output, LIBXSMM_DNN_OUTPUT),
                  "Bind output");
- chk_libxsmm_err(libxsmm_dnn_bind_filter(libxsmm_handle, libxsmm_filter),
+ chk_libxsmm_err(libxsmm_dnn_bind_filter(libxsmm_handle, libxsmm_filter, LIBXSMM_DNN_FILTER),
                  "Bind filter");

- if (kind == LIBXSMM_DNN_CONV_KIND_BWD) {
-   libxsmm_dnn_transpose_filter(libxsmm_handle);
+ if (kind == LIBXSMM_DNN_COMPUTE_KIND_BWD) {
+   libxsmm_dnn_transpose_filter(libxsmm_handle, LIBXSMM_DNN_FILTER);
  }

  BlockingCounter counter(num_threads);
@@ -294,7 +297,7 @@ static bool CallLibxsmmConvGeneric(OpKernelContext* ctx,

  for (int i = 0; i < num_threads; ++i) {
    worker_threads->workers->Schedule([=, &counter]() {
-     chk_libxsmm_err(libxsmm_dnn_convolve_st(libxsmm_handle, kind, 0, i),
+     chk_libxsmm_err(libxsmm_dnn_execute_st(libxsmm_handle, kind, 0, i),
                      "Worker");
      counter.DecrementCount();
    });
@@ -304,8 +307,8 @@ static bool CallLibxsmmConvGeneric(OpKernelContext* ctx,
  chk_libxsmm_err(libxsmm_dnn_destroy_buffer(libxsmm_output), "Destroy output");
  chk_libxsmm_err(libxsmm_dnn_destroy_filter(libxsmm_filter), "Destroy filter");

- if(kind != LIBXSMM_DNN_CONV_KIND_FWD)
-   chk_libxsmm_err(libxsmm_dnn_destroy_conv_handle(libxsmm_handle),
+ if(kind != LIBXSMM_DNN_COMPUTE_KIND_FWD)
+   chk_libxsmm_err(libxsmm_dnn_destroy_conv_layer(libxsmm_handle),
                    "Destroy handle");
  libxsmm_free(native_filter);
  return true;  // Succeeded
@@ -315,7 +318,7 @@ template <typename T>
  struct XsmmFwdConv2D<CPUDevice, T> {
    bool operator()(OpKernelContext* ctx, const libxsmm_dnn_conv_desc& desc,
                    const T* input, const T* filter, T* output) {
-     return CallLibxsmmConvGeneric(ctx, desc, LIBXSMM_DNN_CONV_KIND_FWD, input,
+     return CallLibxsmmConvGeneric(ctx, desc, LIBXSMM_DNN_COMPUTE_KIND_FWD, input,
                                    filter, output);
    }
  };
@@ -324,7 +327,7 @@ template <typename T>
  struct XsmmBkwInputConv2D<CPUDevice, T> {
    bool operator()(OpKernelContext* ctx, const libxsmm_dnn_conv_desc& desc,
                    T* input, const T* filter, const T* output) {
-     return CallLibxsmmConvGeneric(ctx, desc, LIBXSMM_DNN_CONV_KIND_BWD, input,
+     return CallLibxsmmConvGeneric(ctx, desc, LIBXSMM_DNN_COMPUTE_KIND_BWD, input,
                                    filter, output);
    }
  };
@@ -333,7 +336,7 @@ template <typename T>
  struct XsmmBkwFilterConv2D<CPUDevice, T> {
    bool operator()(OpKernelContext* ctx, const libxsmm_dnn_conv_desc& desc,
                    const T* input, T* filter, const T* output) {
-     return CallLibxsmmConvGeneric(ctx, desc, LIBXSMM_DNN_CONV_KIND_UPD, input,
+     return CallLibxsmmConvGeneric(ctx, desc, LIBXSMM_DNN_COMPUTE_KIND_UPD, input,
                                    filter, output);
    }
  };
15 changes: 8 additions & 7 deletions tensorflow/core/kernels/xsmm_conv2d_test.cc
@@ -188,6 +188,8 @@ class XsmmConv2DTest : public OpsTestBase {
  TEST_F(XsmmConv2DTest, Basic) {
    MakeOp(1);

+   // setup scoped allocator, which uses cpu_allocator() for this scope
+   const libxsmm_tf_allocator<libxsmm_scratch_allocator> tf_allocator;

    int ifw = 14; /* input width, "W" */
    int ifh = 14; /* input height, "H" */
@@ -223,9 +225,9 @@ TEST_F(XsmmConv2DTest, Basic) {
  //Initialization of Filter and Image

  /* allocate data */
- float *naive_input = (float*)libxsmm_aligned_malloc( nImg*nIfm*ifhp*ifwp*sizeof(float), 2097152);
- float *naive_output = (float*)libxsmm_aligned_malloc( nImg*nOfm*ofhp*ofwp*sizeof(float), 2097152);
- float *naive_filter = (float*)libxsmm_aligned_malloc( nOfm*nIfm*kh*kw* sizeof(float), 2097152);
+ float *naive_input = (float*)libxsmm_aligned_scratch( nImg*nIfm*ifhp*ifwp*sizeof(float), 2097152);
+ float *naive_output = (float*)libxsmm_aligned_scratch( nImg*nOfm*ofhp*ofwp*sizeof(float), 2097152);
+ float *naive_filter = (float*)libxsmm_aligned_scratch( nOfm*nIfm*kh*kw* sizeof(float), 2097152);
  /* initialize data */
  init_buf(naive_input, nImg*nIfm*ifhp*ifwp, 0, 0);
  zero_buf(naive_output, nImg*nOfm*ofhp*ofwp);
@@ -322,12 +324,11 @@ TEST(XsmmConv2DTest, Basic) {
  desc.pad_w_out = 0;
  desc.threads = num_threads;
  desc.algo = LIBXSMM_DNN_CONV_ALGO_DIRECT;
- desc.buffer_format = LIBXSMM_DNN_CONV_FORMAT_NHWC;
- desc.filter_format = LIBXSMM_DNN_CONV_FORMAT_LIBXSMM;//LIBXSMM_DNN_CONV_FORMAT_RSCK;
+ desc.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_NHWC;
+ desc.filter_format = LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM;//LIBXSMM_DNN_TENSOR_FORMAT_RSCK;
  desc.fuse_ops = LIBXSMM_DNN_CONV_FUSE_NONE;
  desc.options = LIBXSMM_DNN_CONV_OPTION_NONE;
- desc.datatype_in = LIBXSMM_DNN_DATATYPE_F32;
- desc.datatype_out = LIBXSMM_DNN_DATATYPE_F32;
+ desc.datatype = LIBXSMM_DNN_DATATYPE_F32;
  if (!CanUseXsmmConv2D(desc, data_format)) {
    return false;
