Fix aarch64 debug build with GCC (pytorch#126290)

Fixes the build by working around GCC's quirks in instantiating templates that require immediate values: provide an alternative implementation for scaling the output when compiled without any optimizations (both GCC and Clang define __OPTIMIZE__ if invoked with anything but -O0). Fixes pytorch#126283. Pull Request resolved: pytorch#126290. Approved by: https://github.com/atalman, https://github.com/seemethere
ZelboK · May 15, 2024 · a961e1a · a961e1a
1 parent 1966612
commit a961e1a
Showing 1 changed file with 8 additions and 0 deletions.
diff --git a/aten/src/ATen/native/cpu/int8mm_kernel.cpp b/aten/src/ATen/native/cpu/int8mm_kernel.cpp
@@ -250,11 +250,19 @@ inline void tinygemm_kernel_(
       });
     }
 
+#if __OPTIMIZE__      
     float32x4_t scale_val = load_as_float32x4(scales);
     c10::ForcedUnroll<BLOCK_N>{}([&](auto i) {
       C[m * ldc + i] = reduce(c_val[i]) * vgetq_lane_f32(scale_val, i);
     });
   }
+#else
+    // Workaround GCCs inability to infer lane index at compile time
+    // See https://github.com/pytorch/pytorch/issues/126283
+    c10::ForcedUnroll<BLOCK_N>{}([&](auto i) {
+      C[m * ldc + i] = reduce(c_val[i]) * float(scales[i]);
+    });    
+#endif
 }
 
 template <int BLOCK_M, int BLOCK_N>