feat: Implement blocked matrix multiplication (#1184)

Eigen uses a built-in cost model to determine how it should perform certain matrix operations. In general, this works quite well, but there are some edge cases where Eigen can struggle to output efficient code. An especially apparent instance of this is when multiplying 8x8 matrices, which we do very commonly in operations such as stepping. In these cases, we observe that Eigen emits code for generalized matrix-matrix multiplication, rather than specialized code for small matrices. This adds significant overhead to the matrix multiplication, and it slows down some parts of our code a lot. Our proposed solution is to implement a wrapper around Eigen's matrix multiplication code that breaks a multiplication down into smaller operations through a technique called blocking. A matrix of size 2ix2j can be split into four matrices of sizes ixj, which can then be multiplied and added to provide a correct multiplication of the parent matrices. In this way, we can encourage Eigen to emit code for its hand-optimized small-matrix operations, which should give speed-up in some cases. The operation is also supported for odd-sized matrices.
acts-project · Mar 15, 2022 · 7034c5a · 7034c5a
1 parent b804b32
commit 7034c5a
Show file tree

Hide file tree

Showing 2 changed files with 144 additions and 0 deletions.
diff --git a/Core/include/Acts/Utilities/Helpers.hpp b/Core/include/Acts/Utilities/Helpers.hpp
@@ -434,4 +434,106 @@ auto matrixToBitset(const Eigen::PlainObjectBase<Derived>& m) {
   return res;
 }
 
+/// @brief Perform a blocked matrix multiplication, avoiding Eigen GEMM
+/// methods.
+///
+/// @tparam A The type of the first matrix, which should be MxN
+/// @tparam B The type of the second matrix, which should be NxP
+///
+/// @param[in] a An MxN matrix of type A
+/// @param[in] b An NxP matrix of type P
+///
+/// @returns The product ab
+template <typename A, typename B>
+inline ActsMatrix<A::RowsAtCompileTime, B::ColsAtCompileTime> blockedMult(
+    const A& a, const B& b) {
+  // Extract the sizes of the matrix types that we receive as template
+  // parameters.
+  constexpr int M = A::RowsAtCompileTime;
+  constexpr int N = A::ColsAtCompileTime;
+  constexpr int P = B::ColsAtCompileTime;
+
+  // Ensure that the second dimension of our first matrix equals the first
+  // dimension of the second matrix, otherwise we cannot multiply.
+  static_assert(N == B::RowsAtCompileTime);
+
+  if constexpr (M <= 4 && N <= 4 && P <= 4) {
+    // In cases where the matrices being multiplied are small, we can rely on
+    // Eigen do to a good job, and we don't really need to do any blocking.
+    return a * b;
+  } else {
+    // Here, we want to calculate the expression: C = AB, Eigen, natively,
+    // doesn't do a great job at this if the matrices A and B are large
+    // (roughly M >= 8, N >= 8, or P >= 8), and applies a slow GEMM operation.
+    // We apply a blocked matrix multiplication operation to decompose the
+    // multiplication into smaller operations, relying on the fact that:
+    //
+    // ┌         ┐   ┌         ┐ ┌         ┐
+    // │ C₁₁ C₁₂ │ = │ A₁₁ A₁₂ │ │ B₁₁ B₁₂ │
+    // │ C₂₁ C₂₂ │ = │ A₂₁ A₂₂ │ │ B₂₁ B₂₂ │
+    // └         ┘   └         ┘ └         ┘
+    //
+    // where:
+    //
+    // C₁₁ = A₁₁ * B₁₁ + A₁₂ * B₂₁
+    // C₁₂ = A₁₁ * B₁₂ + A₁₂ * B₂₂
+    // C₂₁ = A₂₁ * B₁₁ + A₂₂ * B₂₁
+    // C₂₂ = A₂₁ * B₁₂ + A₂₂ * B₂₂
+    //
+    // The sizes of these submatrices are roughly half (in each dimension) that
+    // of the parent matrix. If the size of the parent matrix is even, we can
+    // divide it exactly, If the size of the parent matrix is odd, then some
+    // of the submatrices will be one larger than the others. In general, for
+    // any matrix Q, the sizes of the submatrices are (where / denotes integer
+    // division):
+    //
+    // Q₁₁ : M / 2 × P / 2
+    // Q₁₂ : M / 2 × (P + 1) / 2
+    // Q₂₁ : (M + 1) / 2 × P / 2
+    // Q₂₂ : (M + 1) / 2 × (P + 1) / 2
+    //
+    // See https://csapp.cs.cmu.edu/public/waside/waside-blocking.pdf for a
+    // more in-depth explanation of blocked matrix multiplication.
+    constexpr int M1 = M / 2;
+    constexpr int M2 = (M + 1) / 2;
+    constexpr int N1 = N / 2;
+    constexpr int N2 = (N + 1) / 2;
+    constexpr int P1 = P / 2;
+    constexpr int P2 = (P + 1) / 2;
+
+    // Construct the end result in this matrix, which destroys a few of Eigen's
+    // built-in optimization techniques, but sadly this is necessary.
+    ActsMatrix<M, P> r;
+
+    // C₁₁ = A₁₁ * B₁₁ + A₁₂ * B₂₁
+    r.template topLeftCorner<M1, P1>().noalias() =
+        a.template topLeftCorner<M1, N1>() *
+            b.template topLeftCorner<N1, P1>() +
+        a.template topRightCorner<M1, N2>() *
+            b.template bottomLeftCorner<N2, P1>();
+
+    // C₁₂ = A₁₁ * B₁₂ + A₁₂ * B₂₂
+    r.template topRightCorner<M1, P2>().noalias() =
+        a.template topLeftCorner<M1, N1>() *
+            b.template topRightCorner<N1, P2>() +
+        a.template topRightCorner<M1, N2>() *
+            b.template bottomRightCorner<N2, P2>();
+
+    // C₂₁ = A₂₁ * B₁₁ + A₂₂ * B₂₁
+    r.template bottomLeftCorner<M2, P1>().noalias() =
+        a.template bottomLeftCorner<M2, N1>() *
+            b.template topLeftCorner<N1, P1>() +
+        a.template bottomRightCorner<M2, N2>() *
+            b.template bottomLeftCorner<N2, P1>();
+
+    // C₂₂ = A₂₁ * B₁₂ + A₂₂ * B₂₂
+    r.template bottomRightCorner<M2, P2>().noalias() =
+        a.template bottomLeftCorner<M2, N1>() *
+            b.template topRightCorner<N1, P2>() +
+        a.template bottomRightCorner<M2, N2>() *
+            b.template bottomRightCorner<N2, P2>();
+
+    return r;
+  }
+}
 }  // namespace Acts
diff --git a/Tests/UnitTests/Core/Utilities/HelpersTests.cpp b/Tests/UnitTests/Core/Utilities/HelpersTests.cpp
@@ -155,6 +155,48 @@ BOOST_AUTO_TEST_CASE(test_matrix_dimension_switch) {
   }
 }
 
+typedef std::tuple<
+    // Square matrix tests
+    std::pair<ActsMatrix<4, 4>, ActsMatrix<4, 4>>,
+    std::pair<ActsMatrix<8, 8>, ActsMatrix<8, 8>>,
+
+    // Square odd matrix tests
+    std::pair<ActsMatrix<3, 3>, ActsMatrix<3, 3>>,
+    std::pair<ActsMatrix<7, 7>, ActsMatrix<7, 7>>,
+
+    // Non-square matrix tests
+    std::pair<ActsMatrix<3, 4>, ActsMatrix<4, 3>>,
+    std::pair<ActsMatrix<3, 8>, ActsMatrix<8, 4>>,
+    std::pair<ActsMatrix<4, 3>, ActsMatrix<3, 8>>,
+    std::pair<ActsMatrix<7, 3>, ActsMatrix<3, 7>>,
+    std::pair<ActsMatrix<8, 3>, ActsMatrix<3, 7>>,
+    std::pair<ActsMatrix<8, 7>, ActsMatrix<7, 4>>,
+
+    // Very large matrix tests for sanity
+    std::pair<ActsMatrix<37, 81>, ActsMatrix<81, 59>>,
+    std::pair<ActsMatrix<38, 82>, ActsMatrix<82, 60>>,
+    std::pair<ActsMatrix<37, 82>, ActsMatrix<82, 59>>,
+    std::pair<ActsMatrix<38, 81>, ActsMatrix<81, 60>>>
+    MatrixProductTypes;
+
+BOOST_AUTO_TEST_CASE_TEMPLATE(BlockedMatrixMultiplication, Matrices,
+                              MatrixProductTypes) {
+  using A = typename Matrices::first_type;
+  using B = typename Matrices::second_type;
+  using C = ActsMatrix<A::RowsAtCompileTime, B::ColsAtCompileTime>;
+
+  for (std::size_t i = 0; i < 100; ++i) {
+    A a = A::Random();
+    B b = B::Random();
+
+    C ref = a * b;
+    C res = blockedMult(a, b);
+
+    BOOST_CHECK(ref.isApprox(res));
+    BOOST_CHECK(res.isApprox(ref));
+  }
+}
+
 BOOST_AUTO_TEST_SUITE_END()
 }  // namespace Test
 }  // namespace Acts