From 640efd87becf8b8f7ed4daadbc3af9164c158eff Mon Sep 17 00:00:00 2001 From: Hans Pabst Date: Thu, 26 Jan 2017 16:26:46 +0100 Subject: [PATCH] Updates in preparation of LIBXSMM 1.7 (#8) * Fixed libxsmm_config_arguments: Fixed the incorrect value supposed to trigger auto-prefetch. Fixed the 0-threshold, which is now accounted for in LIBXSMM (by just populating the default threshold). The problem arose from the assumption "threshold: fallback to BLAS if n*m*k above this", which is wrong (the threshold populates an upper bound until which JIT code is generated). The previous configuration perhaps caused all sorts of issues due to other values derived from the 0-threshold. Note, explicitly JIT'ting code is/was never subject to a threshold. * Upgraded to libxsmm 1.6.5 * Enable the use of libxsmm for matrix multiplications * Enable the use of libxsmm to speed up 1x1 convolutions (which are computed using matrix multiplications) * Make use of TensorFlow's allocation infrastructure even when using LIBXSMM allocation functions. In particular, the (cached) libxsmm_spmdm_init now relies on TF's cpu_allocator(). For C++ code, one can use a libxsmm_scoped_allocator in order to (temporarily) set up a different allocation mechanism. For instance, using libxsmm_tf_allocator changes LIBXSMM's scratch allocator to rely on TensorFlow. The libxsmm_tf_allocator provides two kinds of c'tors: (1) the no-argument variant adopts TF's cpu_allocator(), whereas the one-argument form (2) adopts the allocator from the given OpKernelContext. Changing the allocator in LIBXSMM with pending buffers (from different allocators) is valid, and all other services in LIBXSMM's "malloc domain" work regardless of the allocation mechanism (e.g., libxsmm_malloc_size). * Simply renamed API items in order to follow changes in LIBXSMM 1.7. This is incomplete as more changes/adjustments are needed. * Account for removed non-check API. * Include libxsmm_malloc.h now that libxsmm_tf_allocator is used. 
* Renamed libxsmm_dnn_create_conv_handle to libxsmm_dnn_create_conv_layer. * Renamed LIBXSMM_DNN_CONV_FORMAT_* to LIBXSMM_DNN_TENSOR_FORMAT_*. * Renamed libxsmm_dnn_destroy_conv_handle to libxsmm_dnn_destroy_conv_layer. * Include missing header file (libxsmm_malloc.h). * Renamed LIBXSMM_DNN_CONV_KIND_* to LIBXSMM_DNN_COMPUTE_KIND_*. * Account for the fact that datatype_in/out is now only datatype (libxsmm_dnn_conv_desc structure). * Updated to new libxsmm_dnn_link_* functions. * Updated to use new libxsmm_dnn_bind_* functions. * Fixed calling libxsmm_dnn_transpose_filter. --- .../core/kernels/conv_grad_input_ops.cc | 7 +-- tensorflow/core/kernels/conv_ops.cc | 7 +-- tensorflow/core/kernels/sparse_matmul_op.cc | 3 + tensorflow/core/kernels/xsmm_conv2d.cc | 61 ++++++++++--------- tensorflow/core/kernels/xsmm_conv2d_test.cc | 15 ++--- 5 files changed, 49 insertions(+), 44 deletions(-) diff --git a/tensorflow/core/kernels/conv_grad_input_ops.cc b/tensorflow/core/kernels/conv_grad_input_ops.cc index 139fb605dfb13c..52710cd9dc3011 100644 --- a/tensorflow/core/kernels/conv_grad_input_ops.cc +++ b/tensorflow/core/kernels/conv_grad_input_ops.cc @@ -170,12 +170,11 @@ struct LaunchXsmmBackwardInputConvolution { desc.pad_w_out = 0; desc.threads = num_threads; desc.algo = LIBXSMM_DNN_CONV_ALGO_DIRECT; - desc.buffer_format = LIBXSMM_DNN_CONV_FORMAT_NHWC; - desc.filter_format = LIBXSMM_DNN_CONV_FORMAT_LIBXSMM;//LIBXSMM_DNN_CONV_FORMAT_RSCK; + desc.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_NHWC; + desc.filter_format = LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM;//LIBXSMM_DNN_TENSOR_FORMAT_RSCK; desc.fuse_ops = LIBXSMM_DNN_CONV_FUSE_NONE; desc.options = LIBXSMM_DNN_CONV_OPTION_NONE; - desc.datatype_in = LIBXSMM_DNN_DATATYPE_F32; - desc.datatype_out = LIBXSMM_DNN_DATATYPE_F32; + desc.datatype = LIBXSMM_DNN_DATATYPE_F32; auto input_ptr = input_backward.data(); auto filter_ptr = kernel.data(); diff --git a/tensorflow/core/kernels/conv_ops.cc b/tensorflow/core/kernels/conv_ops.cc index 
22e48e84d8f169..0518445a9f26ee 100644 --- a/tensorflow/core/kernels/conv_ops.cc +++ b/tensorflow/core/kernels/conv_ops.cc @@ -204,12 +204,11 @@ class LaunchXsmmConvOp { desc.pad_w_out = 0; desc.threads = num_threads; desc.algo = LIBXSMM_DNN_CONV_ALGO_DIRECT; - desc.buffer_format = LIBXSMM_DNN_CONV_FORMAT_NHWC; - desc.filter_format = LIBXSMM_DNN_CONV_FORMAT_LIBXSMM; + desc.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_NHWC; + desc.filter_format = LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM; desc.fuse_ops = LIBXSMM_DNN_CONV_FUSE_NONE; desc.options = LIBXSMM_DNN_CONV_OPTION_NONE; - desc.datatype_in = LIBXSMM_DNN_DATATYPE_F32; - desc.datatype_out = LIBXSMM_DNN_DATATYPE_F32; + desc.datatype = LIBXSMM_DNN_DATATYPE_F32; if (!CanUseXsmmConv2D(desc, data_format)) { return false; diff --git a/tensorflow/core/kernels/sparse_matmul_op.cc b/tensorflow/core/kernels/sparse_matmul_op.cc index 6c4f20a23a7947..5d20b859a82a88 100644 --- a/tensorflow/core/kernels/sparse_matmul_op.cc +++ b/tensorflow/core/kernels/sparse_matmul_op.cc @@ -37,6 +37,7 @@ limitations under the License. 
#include "tensorflow/core/platform/types.h" #ifdef TENSORFLOW_USE_LIBXSMM #include "include/libxsmm_intrinsics_x86.h" +#include "include/libxsmm_malloc.h" #include "include/libxsmm_spmdm.h" #endif @@ -896,6 +897,8 @@ class LibxsmmSparseMatMul { } else { std::unique_ptr e{ new TensorInfoCacheEntry{M, K, N, max_threads, {}, nullptr}}; + // setup scoped allocator, which uses cpu_allocator() for this scope + const libxsmm_tf_allocator tf_allocator; libxsmm_spmdm_init(M, N, K, max_threads, &e->handle, &e->output_csr); return e; } diff --git a/tensorflow/core/kernels/xsmm_conv2d.cc b/tensorflow/core/kernels/xsmm_conv2d.cc index 0301ad49e7c0d3..010d3b699a6f7f 100644 --- a/tensorflow/core/kernels/xsmm_conv2d.cc +++ b/tensorflow/core/kernels/xsmm_conv2d.cc @@ -33,6 +33,7 @@ void dummy_xsmm_conv2d_ensure_file_is_not_empty(void); #include "include/libxsmm_cpuid.h" #include "libxsmm_dnn_handle.h" +#include "libxsmm_malloc.h" namespace tensorflow { @@ -158,11 +159,11 @@ struct HashFunction{ class handles{ public: - libxsmm_dnn_conv_handle* find( const libxsmm_dnn_conv_desc_wrap &w) { - std::unordered_map::iterator i = libxsmm_handles.find(w); + libxsmm_dnn_layer* find( const libxsmm_dnn_conv_desc_wrap &w) { + std::unordered_map::iterator i = libxsmm_handles.find(w); if (i == libxsmm_handles.end()){ libxsmm_dnn_err_t status; - libxsmm_dnn_conv_handle* libxsmm_handle = libxsmm_dnn_create_conv_handle_check(w.d, &status); + libxsmm_dnn_layer* libxsmm_handle = libxsmm_dnn_create_conv_layer(w.d, &status); chk_libxsmm_err(status, "Create handle"); libxsmm_handles.insert(std::make_pair(w, libxsmm_handle)); return libxsmm_handle; @@ -171,14 +172,14 @@ class handles{ return i->second; } ~handles(){ - std::unordered_map::iterator i; + std::unordered_map::iterator i; for (i= libxsmm_handles.begin(); i != libxsmm_handles.end(); i++) - chk_libxsmm_err(libxsmm_dnn_destroy_conv_handle(i->second), + chk_libxsmm_err(libxsmm_dnn_destroy_conv_layer(i->second), "Destroy handle"); } private: - 
std::unordered_map libxsmm_handles; + std::unordered_map libxsmm_handles; }; @@ -187,22 +188,24 @@ static handles libxsmm_handles; template static bool CallLibxsmmConvGeneric(OpKernelContext* ctx, const libxsmm_dnn_conv_desc& desc, - libxsmm_dnn_conv_kind kind, InputPtr input, + libxsmm_dnn_compute_kind kind, InputPtr input, FilterPtr filter, OutputPtr output) { + // setup scoped allocator, which adopts the allocator from the context + const libxsmm_tf_allocator tf_allocator(*ctx); libxsmm_dnn_err_t status; - libxsmm_dnn_conv_handle* libxsmm_handle; + libxsmm_dnn_layer* libxsmm_handle; libxsmm_dnn_conv_desc_wrap w(desc); - if(kind == LIBXSMM_DNN_CONV_KIND_FWD) + if(kind == LIBXSMM_DNN_COMPUTE_KIND_FWD) libxsmm_handle = libxsmm_handles.find(w); else{ - libxsmm_handle = libxsmm_dnn_create_conv_handle_check(desc, &status); + libxsmm_handle = libxsmm_dnn_create_conv_layer(desc, &status); chk_libxsmm_err(status, "Create handle"); } status = libxsmm_dnn_get_codegen_success(libxsmm_handle, kind); if (status == LIBXSMM_DNN_WARN_FALLBACK) { - chk_libxsmm_err(libxsmm_dnn_destroy_conv_handle(libxsmm_handle), + chk_libxsmm_err(libxsmm_dnn_destroy_conv_layer(libxsmm_handle), "Destroy handle"); return false; // Use non-libxsmm code } @@ -224,7 +227,7 @@ static bool CallLibxsmmConvGeneric(OpKernelContext* ctx, int blocksifm = desc.C%ifmblock ==0 ? desc.C/ifmblock :desc.C/ifmblock + 1; int blocksofm = desc.K%ofmblock ==0 ? 
desc.K/ofmblock :desc.K/ofmblock + 1; - float *native_filter = (float*)libxsmm_aligned_malloc( blocksofm*blocksifm*desc.R*desc.S*ifmblock*ofmblock*sizeof(float), 2097152); + float *native_filter = (float*)libxsmm_aligned_scratch( blocksofm*blocksifm*desc.R*desc.S*ifmblock*ofmblock*sizeof(float), 2097152); @@ -264,28 +267,28 @@ static bool CallLibxsmmConvGeneric(OpKernelContext* ctx, count.Wait(); } - libxsmm_input = libxsmm_dnn_link_input_buffer_check( - libxsmm_handle, input, LIBXSMM_DNN_CONV_FORMAT_NHWC_PTR, &status); + libxsmm_input = libxsmm_dnn_link_buffer( + libxsmm_handle, LIBXSMM_DNN_INPUT, input, LIBXSMM_DNN_TENSOR_FORMAT_NHWC_PTR, &status); chk_libxsmm_err(status, "Link input buffer"); - libxsmm_output = libxsmm_dnn_link_output_buffer_check( - libxsmm_handle, output, LIBXSMM_DNN_CONV_FORMAT_NHWC_PTR, &status); + libxsmm_output = libxsmm_dnn_link_buffer( + libxsmm_handle, LIBXSMM_DNN_OUTPUT, output, LIBXSMM_DNN_TENSOR_FORMAT_NHWC_PTR, &status); chk_libxsmm_err(status, "Link output buffer"); - libxsmm_filter = libxsmm_dnn_link_filter_check( - libxsmm_handle, native_filter, LIBXSMM_DNN_CONV_FORMAT_LIBXSMM_PTR, &status); + libxsmm_filter = libxsmm_dnn_link_filter( + libxsmm_handle, LIBXSMM_DNN_FILTER, native_filter, LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM_PTR, &status); chk_libxsmm_err(status, "Link filter"); chk_libxsmm_err(libxsmm_dnn_zero_buffer(libxsmm_output), "Zero output"); - chk_libxsmm_err(libxsmm_dnn_bind_input_buffer(libxsmm_handle, libxsmm_input), + chk_libxsmm_err(libxsmm_dnn_bind_buffer(libxsmm_handle, libxsmm_input, LIBXSMM_DNN_INPUT), "Bind input"); chk_libxsmm_err( - libxsmm_dnn_bind_output_buffer(libxsmm_handle, libxsmm_output), + libxsmm_dnn_bind_buffer(libxsmm_handle, libxsmm_output, LIBXSMM_DNN_OUTPUT), "Bind output"); - chk_libxsmm_err(libxsmm_dnn_bind_filter(libxsmm_handle, libxsmm_filter), + chk_libxsmm_err(libxsmm_dnn_bind_filter(libxsmm_handle, libxsmm_filter, LIBXSMM_DNN_FILTER), "Bind filter"); - if (kind == LIBXSMM_DNN_CONV_KIND_BWD) { 
- libxsmm_dnn_transpose_filter(libxsmm_handle); + if (kind == LIBXSMM_DNN_COMPUTE_KIND_BWD) { + libxsmm_dnn_transpose_filter(libxsmm_handle, LIBXSMM_DNN_FILTER); } BlockingCounter counter(num_threads); @@ -294,7 +297,7 @@ static bool CallLibxsmmConvGeneric(OpKernelContext* ctx, for (int i = 0; i < num_threads; ++i) { worker_threads->workers->Schedule([=, &counter]() { - chk_libxsmm_err(libxsmm_dnn_convolve_st(libxsmm_handle, kind, 0, i), + chk_libxsmm_err(libxsmm_dnn_execute_st(libxsmm_handle, kind, 0, i), "Worker"); counter.DecrementCount(); }); @@ -304,8 +307,8 @@ static bool CallLibxsmmConvGeneric(OpKernelContext* ctx, chk_libxsmm_err(libxsmm_dnn_destroy_buffer(libxsmm_output), "Destroy output"); chk_libxsmm_err(libxsmm_dnn_destroy_filter(libxsmm_filter), "Destroy filter"); - if(kind != LIBXSMM_DNN_CONV_KIND_FWD) - chk_libxsmm_err(libxsmm_dnn_destroy_conv_handle(libxsmm_handle), + if(kind != LIBXSMM_DNN_COMPUTE_KIND_FWD) + chk_libxsmm_err(libxsmm_dnn_destroy_conv_layer(libxsmm_handle), "Destroy handle"); libxsmm_free(native_filter); return true; // Succeeded @@ -315,7 +318,7 @@ template struct XsmmFwdConv2D { bool operator()(OpKernelContext* ctx, const libxsmm_dnn_conv_desc& desc, const T* input, const T* filter, T* output) { - return CallLibxsmmConvGeneric(ctx, desc, LIBXSMM_DNN_CONV_KIND_FWD, input, + return CallLibxsmmConvGeneric(ctx, desc, LIBXSMM_DNN_COMPUTE_KIND_FWD, input, filter, output); } }; @@ -324,7 +327,7 @@ template struct XsmmBkwInputConv2D { bool operator()(OpKernelContext* ctx, const libxsmm_dnn_conv_desc& desc, T* input, const T* filter, const T* output) { - return CallLibxsmmConvGeneric(ctx, desc, LIBXSMM_DNN_CONV_KIND_BWD, input, + return CallLibxsmmConvGeneric(ctx, desc, LIBXSMM_DNN_COMPUTE_KIND_BWD, input, filter, output); } }; @@ -333,7 +336,7 @@ template struct XsmmBkwFilterConv2D { bool operator()(OpKernelContext* ctx, const libxsmm_dnn_conv_desc& desc, const T* input, T* filter, const T* output) { - return CallLibxsmmConvGeneric(ctx, 
desc, LIBXSMM_DNN_CONV_KIND_UPD, input, + return CallLibxsmmConvGeneric(ctx, desc, LIBXSMM_DNN_COMPUTE_KIND_UPD, input, filter, output); } }; diff --git a/tensorflow/core/kernels/xsmm_conv2d_test.cc b/tensorflow/core/kernels/xsmm_conv2d_test.cc index f4ab6896ae4ed1..381ea39b77c26e 100644 --- a/tensorflow/core/kernels/xsmm_conv2d_test.cc +++ b/tensorflow/core/kernels/xsmm_conv2d_test.cc @@ -188,6 +188,8 @@ class XsmmConv2DTest : public OpsTestBase { TEST_F(XsmmConv2DTest, Basic) { MakeOp(1); + // setup scoped allocator, which uses cpu_allocator() for this scope + const libxsmm_tf_allocator tf_allocator; int ifw = 14; /* input width, "W" */ int ifh = 14; /* input height, "H" */ @@ -223,9 +225,9 @@ TEST_F(XsmmConv2DTest, Basic) { //Initialization of Filter and Image /* allocate data */ - float *naive_input = (float*)libxsmm_aligned_malloc( nImg*nIfm*ifhp*ifwp*sizeof(float), 2097152); - float *naive_output = (float*)libxsmm_aligned_malloc( nImg*nOfm*ofhp*ofwp*sizeof(float), 2097152); - float *naive_filter = (float*)libxsmm_aligned_malloc( nOfm*nIfm*kh*kw* sizeof(float), 2097152); + float *naive_input = (float*)libxsmm_aligned_scratch( nImg*nIfm*ifhp*ifwp*sizeof(float), 2097152); + float *naive_output = (float*)libxsmm_aligned_scratch( nImg*nOfm*ofhp*ofwp*sizeof(float), 2097152); + float *naive_filter = (float*)libxsmm_aligned_scratch( nOfm*nIfm*kh*kw* sizeof(float), 2097152); /* initialize data */ init_buf(naive_input, nImg*nIfm*ifhp*ifwp, 0, 0); zero_buf(naive_output, nImg*nOfm*ofhp*ofwp); @@ -322,12 +324,11 @@ TEST(XsmmConv2DTest, Basic) { desc.pad_w_out = 0; desc.threads = num_threads; desc.algo = LIBXSMM_DNN_CONV_ALGO_DIRECT; - desc.buffer_format = LIBXSMM_DNN_CONV_FORMAT_NHWC; - desc.filter_format = LIBXSMM_DNN_CONV_FORMAT_LIBXSMM;//LIBXSMM_DNN_CONV_FORMAT_RSCK; + desc.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_NHWC; + desc.filter_format = LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM;//LIBXSMM_DNN_TENSOR_FORMAT_RSCK; desc.fuse_ops = LIBXSMM_DNN_CONV_FUSE_NONE; desc.options = 
LIBXSMM_DNN_CONV_OPTION_NONE; - desc.datatype_in = LIBXSMM_DNN_DATATYPE_F32; - desc.datatype_out = LIBXSMM_DNN_DATATYPE_F32; + desc.datatype = LIBXSMM_DNN_DATATYPE_F32; if (!CanUseXsmmConv2D(desc, data_format)) { return false;