Updates in preparation for LIBXSMM 1.7 (tensorflow#8)
* Fixed libxsmm_config_arguments: corrected the value supposed to trigger auto-prefetch, and fixed the 0-threshold, which is now accounted for in LIBXSMM (by populating the default threshold). The problem arose from the assumption "threshold: fall back to BLAS if n*m*k is above this", which is wrong (the threshold is an upper bound up to which JIT code is generated). The previous configuration may have caused all sorts of issues due to other values derived from the 0-threshold. Note that explicitly JIT'ted code is and was never subject to a threshold; see the sketch below.
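
  A minimal illustrative sketch of the dispatch rule the threshold governs (this is not LIBXSMM source; the function and variable names are hypothetical):

    /* Hypothetical illustration of the corrected semantics: the threshold is an
     * upper bound on n*m*k up to which JIT code is generated; larger problems
     * fall back to BLAS. Explicit JIT requests bypass this check entirely. */
    static int use_jit_smm(long long m, long long n, long long k, long long threshold) {
      return m * n * k <= threshold; /* 1: JIT small-matrix kernel, 0: BLAS fallback */
    }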

* Upgraded to libxsmm 1.6.5

* Enabled the use of libxsmm for matrix multiplications

* Enabled the use of libxsmm to speed up 1x1 convolutions (which are
computed using matrix multiplications)

* Made use of TensorFlow's allocation infrastructure even when using LIBXSMM allocation functions. In particular, the (cached) libxsmm_spmdm_init now relies on TF's cpu_allocator().

For C++ code, one can use a libxsmm_scoped_allocator<kind> in order to (temporarily) set up a different allocation mechanism. For instance, using libxsmm_tf_allocator<libxsmm_scratch_allocator> changes LIBXSMM's scratch allocator to rely on TensorFlow. The libxsmm_tf_allocator provides two kinds of constructors: (1) the no-argument variant adopts TF's cpu_allocator(), whereas the one-argument form (2) adopts the allocator from the given OpKernelContext. Changing the allocator in LIBXSMM while buffers (from different allocators) are still pending is valid, and all other services in LIBXSMM's "malloc domain" work regardless of the allocation mechanism (e.g., libxsmm_malloc_size). A short usage sketch follows below.
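
  Both constructor forms, condensed from the diff below (M, N, K, max_threads, handle, output_csr, and ctx stand in for the caller's variables; error handling omitted):

    {
      // (1) no-argument form: adopts TensorFlow's cpu_allocator() for this scope
      const libxsmm_tf_allocator<libxsmm_scratch_allocator> tf_allocator;
      libxsmm_spmdm_init(M, N, K, max_threads, &handle, &output_csr);
    }  // the previous LIBXSMM scratch allocator is restored on scope exit
    {
      // (2) one-argument form: adopts the allocator of the given OpKernelContext* ctx
      const libxsmm_tf_allocator<libxsmm_scratch_allocator> tf_allocator(*ctx);
      // ... LIBXSMM scratch allocations within this scope go through ctx's allocator ...
    }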

* Simply renamed API items to follow changes in LIBXSMM 1.7. This is incomplete, as more changes/adjustments are needed (a condensed sketch of the renamed call sequence follows after this list).

* Account for removed non-check API.

* Include libxsmm_malloc.h now that libxsmm_tf_allocator is used.

* Renamed libxsmm_dnn_create_conv_handle to libxsmm_dnn_create_conv_layer.

* Renamed LIBXSMM_DNN_CONV_FORMAT_* to LIBXSMM_DNN_TENSOR_FORMAT_*.

* Renamed libxsmm_dnn_destroy_conv_handle to libxsmm_dnn_destroy_conv_layer.

* Include missing header file (libxsmm_malloc.h).

* Renamed LIBXSMM_DNN_CONV_KIND_* to LIBXSMM_DNN_COMPUTE_KIND_*.

* Account for the fact that datatype_in/datatype_out have been merged into a single datatype field (libxsmm_dnn_conv_desc structure).

* Updated to use the new libxsmm_dnn_link_* functions.

* Updated to use new libxsmm_dnn_bind_* functions.

* Fixed the call to libxsmm_dnn_transpose_filter (it now takes LIBXSMM_DNN_FILTER as an additional argument).
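
  Taken together, a condensed sketch of the renamed LIBXSMM 1.7 call sequence as used in xsmm_conv2d.cc (see the diff below). Error checking via chk_libxsmm_err, the handle cache, and the threaded execution loop are omitted; desc, input, output, native_filter, and tid come from the surrounding function, and the buffer/filter pointer types are assumed from the 1.7 API since their declarations are not visible in the hunks:

    libxsmm_dnn_err_t status;
    // was libxsmm_dnn_create_conv_handle_check / libxsmm_dnn_destroy_conv_handle
    libxsmm_dnn_layer* handle = libxsmm_dnn_create_conv_layer(desc, &status);
    // was libxsmm_dnn_link_{input,output}_buffer_check / libxsmm_dnn_link_filter_check
    libxsmm_dnn_buffer* in  = libxsmm_dnn_link_buffer(
        handle, LIBXSMM_DNN_INPUT, input, LIBXSMM_DNN_TENSOR_FORMAT_NHWC_PTR, &status);
    libxsmm_dnn_buffer* out = libxsmm_dnn_link_buffer(
        handle, LIBXSMM_DNN_OUTPUT, output, LIBXSMM_DNN_TENSOR_FORMAT_NHWC_PTR, &status);
    libxsmm_dnn_filter* flt = libxsmm_dnn_link_filter(
        handle, LIBXSMM_DNN_FILTER, native_filter, LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM_PTR, &status);
    // the bind_* functions now take the tensor kind as an extra argument
    libxsmm_dnn_bind_buffer(handle, in, LIBXSMM_DNN_INPUT);
    libxsmm_dnn_bind_buffer(handle, out, LIBXSMM_DNN_OUTPUT);
    libxsmm_dnn_bind_filter(handle, flt, LIBXSMM_DNN_FILTER);
    // was libxsmm_dnn_convolve_st; LIBXSMM_DNN_CONV_KIND_* became LIBXSMM_DNN_COMPUTE_KIND_*
    libxsmm_dnn_execute_st(handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid);
    libxsmm_dnn_destroy_conv_layer(handle);
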
hfp authored and alheinecke committed Jan 26, 2017
1 parent 2f01129 commit 640efd8
Showing 5 changed files with 49 additions and 44 deletions.
7 changes: 3 additions & 4 deletions tensorflow/core/kernels/conv_grad_input_ops.cc
@@ -170,12 +170,11 @@ struct LaunchXsmmBackwardInputConvolution<CPUDevice, float> {
  desc.pad_w_out = 0;
  desc.threads = num_threads;
  desc.algo = LIBXSMM_DNN_CONV_ALGO_DIRECT;
- desc.buffer_format = LIBXSMM_DNN_CONV_FORMAT_NHWC;
- desc.filter_format = LIBXSMM_DNN_CONV_FORMAT_LIBXSMM;//LIBXSMM_DNN_CONV_FORMAT_RSCK;
+ desc.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_NHWC;
+ desc.filter_format = LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM;//LIBXSMM_DNN_TENSOR_FORMAT_RSCK;
  desc.fuse_ops = LIBXSMM_DNN_CONV_FUSE_NONE;
  desc.options = LIBXSMM_DNN_CONV_OPTION_NONE;
- desc.datatype_in = LIBXSMM_DNN_DATATYPE_F32;
- desc.datatype_out = LIBXSMM_DNN_DATATYPE_F32;
+ desc.datatype = LIBXSMM_DNN_DATATYPE_F32;

  auto input_ptr = input_backward.data();
  auto filter_ptr = kernel.data();
7 changes: 3 additions & 4 deletions tensorflow/core/kernels/conv_ops.cc
@@ -204,12 +204,11 @@ class LaunchXsmmConvOp<CPUDevice, float> {
  desc.pad_w_out = 0;
  desc.threads = num_threads;
  desc.algo = LIBXSMM_DNN_CONV_ALGO_DIRECT;
- desc.buffer_format = LIBXSMM_DNN_CONV_FORMAT_NHWC;
- desc.filter_format = LIBXSMM_DNN_CONV_FORMAT_LIBXSMM;
+ desc.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_NHWC;
+ desc.filter_format = LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM;
  desc.fuse_ops = LIBXSMM_DNN_CONV_FUSE_NONE;
  desc.options = LIBXSMM_DNN_CONV_OPTION_NONE;
- desc.datatype_in = LIBXSMM_DNN_DATATYPE_F32;
- desc.datatype_out = LIBXSMM_DNN_DATATYPE_F32;
+ desc.datatype = LIBXSMM_DNN_DATATYPE_F32;

  if (!CanUseXsmmConv2D(desc, data_format)) {
    return false;
3 changes: 3 additions & 0 deletions tensorflow/core/kernels/sparse_matmul_op.cc
@@ -37,6 +37,7 @@ limitations under the License.
  #include "tensorflow/core/platform/types.h"
  #ifdef TENSORFLOW_USE_LIBXSMM
  #include "include/libxsmm_intrinsics_x86.h"
+ #include "include/libxsmm_malloc.h"
  #include "include/libxsmm_spmdm.h"
  #endif

@@ -896,6 +897,8 @@ class LibxsmmSparseMatMul {
  } else {
    std::unique_ptr<TensorInfoCacheEntry> e{
        new TensorInfoCacheEntry{M, K, N, max_threads, {}, nullptr}};
+   // setup scoped allocator, which uses cpu_allocator() for this scope
+   const libxsmm_tf_allocator<libxsmm_scratch_allocator> tf_allocator;
    libxsmm_spmdm_init(M, N, K, max_threads, &e->handle, &e->output_csr);
    return e;
  }
61 changes: 32 additions & 29 deletions tensorflow/core/kernels/xsmm_conv2d.cc
@@ -33,6 +33,7 @@ void dummy_xsmm_conv2d_ensure_file_is_not_empty(void);

  #include "include/libxsmm_cpuid.h"
  #include "libxsmm_dnn_handle.h"
+ #include "libxsmm_malloc.h"

  namespace tensorflow {

@@ -158,11 +159,11 @@ struct HashFunction{

  class handles{
   public:
-  libxsmm_dnn_conv_handle* find( const libxsmm_dnn_conv_desc_wrap &w) {
-    std::unordered_map<libxsmm_dnn_conv_desc_wrap , libxsmm_dnn_conv_handle*, HashFunction>::iterator i = libxsmm_handles.find(w);
+  libxsmm_dnn_layer* find( const libxsmm_dnn_conv_desc_wrap &w) {
+    std::unordered_map<libxsmm_dnn_conv_desc_wrap , libxsmm_dnn_layer*, HashFunction>::iterator i = libxsmm_handles.find(w);
    if (i == libxsmm_handles.end()){
      libxsmm_dnn_err_t status;
-     libxsmm_dnn_conv_handle* libxsmm_handle = libxsmm_dnn_create_conv_handle_check(w.d, &status);
+     libxsmm_dnn_layer* libxsmm_handle = libxsmm_dnn_create_conv_layer(w.d, &status);
      chk_libxsmm_err(status, "Create handle");
      libxsmm_handles.insert(std::make_pair(w, libxsmm_handle));
      return libxsmm_handle;
@@ -171,14 +172,14 @@ class handles{
      return i->second;
  }
  ~handles(){
-   std::unordered_map<libxsmm_dnn_conv_desc_wrap , libxsmm_dnn_conv_handle*, HashFunction>::iterator i;
+   std::unordered_map<libxsmm_dnn_conv_desc_wrap , libxsmm_dnn_layer*, HashFunction>::iterator i;
    for (i= libxsmm_handles.begin(); i != libxsmm_handles.end(); i++)
-     chk_libxsmm_err(libxsmm_dnn_destroy_conv_handle(i->second),
+     chk_libxsmm_err(libxsmm_dnn_destroy_conv_layer(i->second),
                      "Destroy handle");
  }
  private:

- std::unordered_map<libxsmm_dnn_conv_desc_wrap , libxsmm_dnn_conv_handle*, HashFunction> libxsmm_handles;
+ std::unordered_map<libxsmm_dnn_conv_desc_wrap , libxsmm_dnn_layer*, HashFunction> libxsmm_handles;

  };

@@ -187,22 +188,24 @@ static handles libxsmm_handles;
  template <typename InputPtr, typename FilterPtr, typename OutputPtr>
  static bool CallLibxsmmConvGeneric(OpKernelContext* ctx,
                                     const libxsmm_dnn_conv_desc& desc,
-                                    libxsmm_dnn_conv_kind kind, InputPtr input,
+                                    libxsmm_dnn_compute_kind kind, InputPtr input,
                                     FilterPtr filter, OutputPtr output) {
+   // setup scoped allocator, which adopts the allocator from the context
+   const libxsmm_tf_allocator<libxsmm_scratch_allocator> tf_allocator(*ctx);
    libxsmm_dnn_err_t status;
-   libxsmm_dnn_conv_handle* libxsmm_handle;
+   libxsmm_dnn_layer* libxsmm_handle;
    libxsmm_dnn_conv_desc_wrap w(desc);

-   if(kind == LIBXSMM_DNN_CONV_KIND_FWD)
+   if(kind == LIBXSMM_DNN_COMPUTE_KIND_FWD)
      libxsmm_handle = libxsmm_handles.find(w);
    else{
-     libxsmm_handle = libxsmm_dnn_create_conv_handle_check(desc, &status);
+     libxsmm_handle = libxsmm_dnn_create_conv_layer(desc, &status);
      chk_libxsmm_err(status, "Create handle");
    }

    status = libxsmm_dnn_get_codegen_success(libxsmm_handle, kind);
    if (status == LIBXSMM_DNN_WARN_FALLBACK) {
-     chk_libxsmm_err(libxsmm_dnn_destroy_conv_handle(libxsmm_handle),
+     chk_libxsmm_err(libxsmm_dnn_destroy_conv_layer(libxsmm_handle),
                      "Destroy handle");
      return false;  // Use non-libxsmm code
    }
@@ -224,7 +227,7 @@ static bool CallLibxsmmConvGeneric(OpKernelContext* ctx,

  int blocksifm = desc.C%ifmblock ==0 ? desc.C/ifmblock :desc.C/ifmblock + 1;
  int blocksofm = desc.K%ofmblock ==0 ? desc.K/ofmblock :desc.K/ofmblock + 1;
- float *native_filter = (float*)libxsmm_aligned_malloc( blocksofm*blocksifm*desc.R*desc.S*ifmblock*ofmblock*sizeof(float), 2097152);
+ float *native_filter = (float*)libxsmm_aligned_scratch( blocksofm*blocksifm*desc.R*desc.S*ifmblock*ofmblock*sizeof(float), 2097152);



@@ -264,28 +267,28 @@ static bool CallLibxsmmConvGeneric(OpKernelContext* ctx,
    count.Wait();
  }

- libxsmm_input = libxsmm_dnn_link_input_buffer_check(
-     libxsmm_handle, input, LIBXSMM_DNN_CONV_FORMAT_NHWC_PTR, &status);
+ libxsmm_input = libxsmm_dnn_link_buffer(
+     libxsmm_handle, LIBXSMM_DNN_INPUT, input, LIBXSMM_DNN_TENSOR_FORMAT_NHWC_PTR, &status);
  chk_libxsmm_err(status, "Link input buffer");
- libxsmm_output = libxsmm_dnn_link_output_buffer_check(
-     libxsmm_handle, output, LIBXSMM_DNN_CONV_FORMAT_NHWC_PTR, &status);
+ libxsmm_output = libxsmm_dnn_link_buffer(
+     libxsmm_handle, LIBXSMM_DNN_OUTPUT, output, LIBXSMM_DNN_TENSOR_FORMAT_NHWC_PTR, &status);
  chk_libxsmm_err(status, "Link output buffer");
- libxsmm_filter = libxsmm_dnn_link_filter_check(
-     libxsmm_handle, native_filter, LIBXSMM_DNN_CONV_FORMAT_LIBXSMM_PTR, &status);
+ libxsmm_filter = libxsmm_dnn_link_filter(
+     libxsmm_handle, LIBXSMM_DNN_FILTER, native_filter, LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM_PTR, &status);
  chk_libxsmm_err(status, "Link filter");

  chk_libxsmm_err(libxsmm_dnn_zero_buffer(libxsmm_output), "Zero output");

- chk_libxsmm_err(libxsmm_dnn_bind_input_buffer(libxsmm_handle, libxsmm_input),
+ chk_libxsmm_err(libxsmm_dnn_bind_buffer(libxsmm_handle, libxsmm_input, LIBXSMM_DNN_INPUT),
                  "Bind input");
  chk_libxsmm_err(
-     libxsmm_dnn_bind_output_buffer(libxsmm_handle, libxsmm_output),
+     libxsmm_dnn_bind_buffer(libxsmm_handle, libxsmm_output, LIBXSMM_DNN_OUTPUT),
                  "Bind output");
- chk_libxsmm_err(libxsmm_dnn_bind_filter(libxsmm_handle, libxsmm_filter),
+ chk_libxsmm_err(libxsmm_dnn_bind_filter(libxsmm_handle, libxsmm_filter, LIBXSMM_DNN_FILTER),
                  "Bind filter");

- if (kind == LIBXSMM_DNN_CONV_KIND_BWD) {
-   libxsmm_dnn_transpose_filter(libxsmm_handle);
+ if (kind == LIBXSMM_DNN_COMPUTE_KIND_BWD) {
+   libxsmm_dnn_transpose_filter(libxsmm_handle, LIBXSMM_DNN_FILTER);
  }

  BlockingCounter counter(num_threads);
@@ -294,7 +297,7 @@ static bool CallLibxsmmConvGeneric(OpKernelContext* ctx,

  for (int i = 0; i < num_threads; ++i) {
    worker_threads->workers->Schedule([=, &counter]() {
-     chk_libxsmm_err(libxsmm_dnn_convolve_st(libxsmm_handle, kind, 0, i),
+     chk_libxsmm_err(libxsmm_dnn_execute_st(libxsmm_handle, kind, 0, i),
                      "Worker");
      counter.DecrementCount();
    });
@@ -304,8 +307,8 @@ static bool CallLibxsmmConvGeneric(OpKernelContext* ctx,
  chk_libxsmm_err(libxsmm_dnn_destroy_buffer(libxsmm_output), "Destroy output");
  chk_libxsmm_err(libxsmm_dnn_destroy_filter(libxsmm_filter), "Destroy filter");

- if(kind != LIBXSMM_DNN_CONV_KIND_FWD)
-   chk_libxsmm_err(libxsmm_dnn_destroy_conv_handle(libxsmm_handle),
+ if(kind != LIBXSMM_DNN_COMPUTE_KIND_FWD)
+   chk_libxsmm_err(libxsmm_dnn_destroy_conv_layer(libxsmm_handle),
                    "Destroy handle");
  libxsmm_free(native_filter);
  return true;  // Succeeded
@@ -315,7 +318,7 @@ template <typename T>
  struct XsmmFwdConv2D<CPUDevice, T> {
    bool operator()(OpKernelContext* ctx, const libxsmm_dnn_conv_desc& desc,
                    const T* input, const T* filter, T* output) {
-     return CallLibxsmmConvGeneric(ctx, desc, LIBXSMM_DNN_CONV_KIND_FWD, input,
+     return CallLibxsmmConvGeneric(ctx, desc, LIBXSMM_DNN_COMPUTE_KIND_FWD, input,
                                    filter, output);
    }
  };
@@ -324,7 +327,7 @@ template <typename T>
  struct XsmmBkwInputConv2D<CPUDevice, T> {
    bool operator()(OpKernelContext* ctx, const libxsmm_dnn_conv_desc& desc,
                    T* input, const T* filter, const T* output) {
-     return CallLibxsmmConvGeneric(ctx, desc, LIBXSMM_DNN_CONV_KIND_BWD, input,
+     return CallLibxsmmConvGeneric(ctx, desc, LIBXSMM_DNN_COMPUTE_KIND_BWD, input,
                                    filter, output);
    }
  };
@@ -333,7 +336,7 @@ template <typename T>
  struct XsmmBkwFilterConv2D<CPUDevice, T> {
    bool operator()(OpKernelContext* ctx, const libxsmm_dnn_conv_desc& desc,
                    const T* input, T* filter, const T* output) {
-     return CallLibxsmmConvGeneric(ctx, desc, LIBXSMM_DNN_CONV_KIND_UPD, input,
+     return CallLibxsmmConvGeneric(ctx, desc, LIBXSMM_DNN_COMPUTE_KIND_UPD, input,
                                    filter, output);
    }
  };
15 changes: 8 additions & 7 deletions tensorflow/core/kernels/xsmm_conv2d_test.cc
@@ -188,6 +188,8 @@ class XsmmConv2DTest : public OpsTestBase {
  TEST_F(XsmmConv2DTest, Basic) {
    MakeOp(1);

+   // setup scoped allocator, which uses cpu_allocator() for this scope
+   const libxsmm_tf_allocator<libxsmm_scratch_allocator> tf_allocator;

    int ifw = 14; /* input width, "W" */
    int ifh = 14; /* input height, "H" */
@@ -223,9 +225,9 @@ TEST_F(XsmmConv2DTest, Basic) {
  //Initialization of Filter and Image

  /* allocate data */
- float *naive_input = (float*)libxsmm_aligned_malloc( nImg*nIfm*ifhp*ifwp*sizeof(float), 2097152);
- float *naive_output = (float*)libxsmm_aligned_malloc( nImg*nOfm*ofhp*ofwp*sizeof(float), 2097152);
- float *naive_filter = (float*)libxsmm_aligned_malloc( nOfm*nIfm*kh*kw* sizeof(float), 2097152);
+ float *naive_input = (float*)libxsmm_aligned_scratch( nImg*nIfm*ifhp*ifwp*sizeof(float), 2097152);
+ float *naive_output = (float*)libxsmm_aligned_scratch( nImg*nOfm*ofhp*ofwp*sizeof(float), 2097152);
+ float *naive_filter = (float*)libxsmm_aligned_scratch( nOfm*nIfm*kh*kw* sizeof(float), 2097152);
  /* initialize data */
  init_buf(naive_input, nImg*nIfm*ifhp*ifwp, 0, 0);
  zero_buf(naive_output, nImg*nOfm*ofhp*ofwp);
@@ -322,12 +324,11 @@ TEST(XsmmConv2DTest, Basic) {
  desc.pad_w_out = 0;
  desc.threads = num_threads;
  desc.algo = LIBXSMM_DNN_CONV_ALGO_DIRECT;
- desc.buffer_format = LIBXSMM_DNN_CONV_FORMAT_NHWC;
- desc.filter_format = LIBXSMM_DNN_CONV_FORMAT_LIBXSMM;//LIBXSMM_DNN_CONV_FORMAT_RSCK;
+ desc.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_NHWC;
+ desc.filter_format = LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM;//LIBXSMM_DNN_TENSOR_FORMAT_RSCK;
  desc.fuse_ops = LIBXSMM_DNN_CONV_FUSE_NONE;
  desc.options = LIBXSMM_DNN_CONV_OPTION_NONE;
- desc.datatype_in = LIBXSMM_DNN_DATATYPE_F32;
- desc.datatype_out = LIBXSMM_DNN_DATATYPE_F32;
+ desc.datatype = LIBXSMM_DNN_DATATYPE_F32;
  if (!CanUseXsmmConv2D(desc, data_format)) {
    return false;
