forked from pytorch/pytorch
-
Notifications
You must be signed in to change notification settings - Fork 0
/
tt_linear_op.h
195 lines (169 loc) · 6.3 KB
/
tt_linear_op.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
#ifndef CAFFE2_OPERATORS_TT_LINEAR_OP_H_
#define CAFFE2_OPERATORS_TT_LINEAR_OP_H_
#ifdef CAFFE2_USE_MKL
#include <mkl.h>
#endif // CAFFE2_USE_MKL
#include "Eigen/Core"
#include "Eigen/Dense"
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/eigen_utils.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
// Tensor-Train (TT) decomposed fully-connected layer.
//
// The weight matrix of a linear layer is represented as a train of d small
// cores instead of one dense matrix. Inputs:
//   Input(0): X      - input activations, first dim is the batch size
//   Input(1): b      - 1-D bias vector
//   Input(2): cores  - all TT-cores flattened into a single 1-D array
// Operator arguments:
//   inp_sizes - factorization of the input feature dimension   (length d)
//   out_sizes - factorization of the output feature dimension  (length d)
//   tt_ranks  - TT-ranks; must have length d + 1 (core i has shape
//               inp_sizes[i] * tt_ranks[i+1]  x  tt_ranks[i] * out_sizes[i])
// Output(0): Y = TT_matrix(cores) applied to X, plus bias b.
template <typename T, class Context, class Engine = DefaultEngine>
class TTLinearOp final : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  template <class... Args>
  explicit TTLinearOp(Args&&... args)
      : Operator<Context>(std::forward<Args>(args)...),
        inp_sizes_(this->template GetRepeatedArgument<int>("inp_sizes")),
        out_sizes_(this->template GetRepeatedArgument<int>("out_sizes")),
        tt_ranks_(this->template GetRepeatedArgument<int>("tt_ranks")),
        Y_temp_(unique_ptr<Blob>(new Blob())) {}
  ~TTLinearOp() {}

  bool RunOnDevice() override {
    const auto& X = Input(0); // Input array
    const auto& b = Input(1); // Bias array
    const auto& cores = Input(2); // 1D array containing the TT-cores
    CAFFE_ENFORCE(X.dim() > 1, "Number of dimensions in X: ", X.dim());
    CAFFE_ENFORCE(b.dim() == 1, "Number of dimensions in b: ", b.dim());
    CAFFE_ENFORCE(
        inp_sizes_.size() == out_sizes_.size(),
        "inp_sizes has size: ",
        inp_sizes_.size(),
        ", out_sizes has size: ",
        out_sizes_.size());
    // The loop below reads tt_ranks_[i] and tt_ranks_[i + 1] for each of the
    // d cores, so tt_ranks_ must hold exactly d + 1 entries. An empty
    // factorization would also leave Y unset below, so require d >= 1.
    CAFFE_ENFORCE(
        !inp_sizes_.empty(), "inp_sizes must contain at least one factor");
    CAFFE_ENFORCE(
        tt_ranks_.size() == inp_sizes_.size() + 1,
        "tt_ranks has size: ",
        tt_ranks_.size(),
        ", expected: ",
        inp_sizes_.size() + 1);
    CAFFE_ENFORCE(
        cores.dim() == 1, "Number of dimensions in cores: ", cores.dim());
    // batch size
    const int batch_size = X.dim() > 1 ? X.dim32(0) : 1;
    // dimension d of tensors
    const int d = inp_sizes_.size();
    // Keep track of index of current core in multiplication
    int cores_idx = 0;
    // Temporary buffer to facilitate multiplication of TT-cores with input
    auto Y_buf = BlobGetMutableTensor(Y_temp_.get(), Context::GetDeviceType());
    Y_buf->ResizeLike(X);
    Y_buf->CopyFrom(X);
    // Pointer to the output tensor; assigned on every loop iteration. The
    // d >= 1 enforcement above guarantees it is set before the final use.
    Tensor* Y = nullptr;
    // The overall forward pass involves multiplication with each core, where
    // each core has sizes dictated by inp_sizes_ and out_sizes_. Each core thus
    // has size inp_sizes_[i] * tt_ranks_[i] * tt_ranks_[i + 1] * out_sizes_[i].
    for (int i = (d - 1); i >= 0; --i) {
      int curr_rows = inp_sizes_[i] * tt_ranks_[i + 1];
      int curr_cols = tt_ranks_[i] * out_sizes_[i];
      // TODO Replace by Reshape(), once wrappers are written
      Y_buf->Resize(Y_buf->numel() / curr_rows, curr_rows);
      Y = Output(
          0, {Y_buf->numel() / curr_rows, curr_cols}, at::dtype<float>());
      // Defensive checks
      CAFFE_ENFORCE(Y_buf->numel() % curr_rows == 0, Y_buf->numel(), curr_rows);
      CAFFE_ENFORCE(
          cores_idx + curr_rows * curr_cols <= cores.numel(),
          cores_idx + curr_rows * curr_cols,
          cores.numel());
      // Multiply ith core with the intermediate output
      math::Gemm<float, Context, Engine>(
          CblasNoTrans,
          CblasNoTrans,
          Y_buf->numel() / curr_rows,
          curr_cols,
          curr_rows,
          1,
          Y_buf->template data<float>(),
          cores.template data<float>() + cores_idx,
          0,
          Y->template mutable_data<float>(),
          &context_);
      CAFFE_ENFORCE(Y->numel() % out_sizes_[i] == 0, Y->numel(), out_sizes_[i]);
      // Transpose so that the next core multiplies against the correct
      // layout; done on CPU via Eigen.
      // TODO Add GPU support by writing a generic wrapper.
      auto Y_mat = EigenMatrixMap<float>(
          Y->template mutable_data<float>(),
          Y->numel() / out_sizes_[i],
          out_sizes_[i]);
      Y_mat = ConstEigenMatrixMap<float>(
          Y->template data<float>(),
          out_sizes_[i],
          Y->numel() / out_sizes_[i])
          .transpose()
          .eval();
      // Resize operation
      Y_buf->Resize(Y->dim32(0), Y->dim32(1));
      context_.template CopyFromCPU<float>(
          Y->numel(),
          Y->template data<float>(),
          Y_buf->template mutable_data<float>());
      cores_idx += curr_rows * curr_cols;
    }
    // Final transpose back to (batch_size, features) layout.
    // TODO Add GPU support by writing a generic wrapper.
    auto Y_mat = EigenMatrixMap<float>(
        Y->template mutable_data<float>(), batch_size, Y->numel() / batch_size);
    Y_mat = ConstEigenMatrixMap<float>(
        Y->template data<float>(), Y->numel() / batch_size, batch_size)
        .transpose()
        .eval();
    // TODO Replace by Reshape(), once wrappers are written
    Y = Output(0, {batch_size, Y->numel() / batch_size}, at::dtype<float>());
    // Check that output size of Y is the element-wise product of out_sizes
    int prod_out_sizes = 1;
    for (int i = 0; i < out_sizes_.size(); i++) {
      prod_out_sizes *= out_sizes_[i];
    }
    CAFFE_ENFORCE(
        Y->dim32(1) == prod_out_sizes,
        "Output dimension of Y: ",
        Y->dim32(1),
        ", product of out_sizes: ",
        prod_out_sizes);
    // Add bias term
    if (bias_multiplier_.numel() != batch_size) {
      // If the helper bias multiplier is not M, reshape and fill it with one.
      ReinitializeTensor(
          &bias_multiplier_,
          {batch_size},
          at::dtype<T>().device(Context::GetDeviceType()));
      math::Set<T, Context>(
          batch_size,
          static_cast<T>(1),
          bias_multiplier_.template mutable_data<T>(),
          &context_);
    }
    // Y += bias_multiplier (batch_size x 1) * b (1 x out_features)
    math::Gemm<T, Context, Engine>(
        CblasNoTrans,
        CblasNoTrans,
        Y->dim32(0),
        Y->dim32(1),
        1,
        1,
        bias_multiplier_.template data<T>(),
        b.template data<T>(),
        1,
        Y->template mutable_data<T>(),
        &context_);
    return true;
  }

 protected:
  Tensor bias_multiplier_;       // cached (batch_size) vector of ones for bias Gemm
  std::vector<int> inp_sizes_;   // input-dimension factorization, length d
  std::vector<int> out_sizes_;   // output-dimension factorization, length d
  std::vector<int> tt_ranks_;    // TT-ranks, length d + 1
  std::unique_ptr<Blob> Y_temp_; // scratch blob reused across calls
};
// Backward pass for TTLinearOp.
// TODO: Complete after verifying utility of TT-layer's forward pass.
template <typename T, class Context, class Engine = DefaultEngine>
class TTLinearGradientOp : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;

  template <class... Args>
  explicit TTLinearGradientOp(Args&&... args)
      : Operator<Context>(std::forward<Args>(args)...) {}

  ~TTLinearGradientOp() {}

  // Not implemented: unconditionally reports failure.
  bool RunOnDevice() override {
    return false;
  }

 protected:
  // Reserved for the eventual bias-gradient reduction.
  Tensor bias_multiplier_{Context::GetDeviceType()};
};
} // namespace caffe2
#endif // CAFFE2_OPERATORS_TT_LINEAR_OP_H_