Adding aarch64 neon support.

bitshifter · Mar 23, 2024 · 9f26d6f · 9f26d6f
1 parent 9f694bc
commit 9f26d6f
Show file tree

Hide file tree

Showing 60 changed files with 11,256 additions and 5,152 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -86,3 +86,25 @@ jobs:
 
       - run: ./build_and_test_wasm32_firefox.sh
       - run: ./build_and_test_wasm32_chrome.sh
+
+  test-arm:
+    name: Test Arm
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        target:
+          - aarch64-unknown-linux-gnu
+          - arm-unknown-linux-gnueabi
+    steps:
+      - uses: actions/checkout@v2
+      - uses: actions-rs/toolchain@v1
+        with:
+          toolchain: stable
+          target: ${{matrix.target}}
+          override: true
+      - uses: actions-rs/cargo@v1
+        with:
+          use-cross: true
+          command: test
+          args: --target ${{matrix.target}}
diff --git a/Cargo.toml b/Cargo.toml
@@ -53,6 +53,11 @@ libm = { version = "0.2", optional = true, default-features = false}
 
 [dev-dependencies]
 # rand_xoshiro is required for tests if rand is enabled
+#rand_xoshiro = "0.6"
+#serde_json = "1.0"
+
+[target.'cfg(target_arch = "x86_64")'.dev-dependencies]
+criterion = { version = "0.4", features = ["html_reports"] }
 rand_xoshiro = "0.6"
 # Set a size_xx feature so that this crate compiles properly with --all-targets --all-features
 rkyv = { version = "0.7", default-features = false, features = ["size_32"] }

diff --git a/codegen/src/main.rs b/codegen/src/main.rs
@@ -109,6 +109,9 @@ fn main() -> anyhow::Result<()> {
 
         let full_output_path = workdir.join(output_path);
 
+        let output_dir = full_output_path.parent().unwrap();
+        std::fs::create_dir_all(output_dir)?;
+
         if check {
             match std::fs::read_to_string(&full_output_path) {
                 Ok(original_str) => {

diff --git a/codegen/src/outputs.rs b/codegen/src/outputs.rs
@@ -8,6 +8,7 @@ enum Target {
     Scalar,
     Sse2,
     Wasm32,
+    Neon,
     CoreSimd,
 }
 
@@ -357,6 +358,7 @@ impl ContextBuilder {
         self.0.insert("is_sse2", &(target == Target::Sse2));
         self.0.insert("is_coresimd", &(target == Target::CoreSimd));
         self.0.insert("is_wasm32", &(target == Target::Wasm32));
+        self.0.insert("is_neon", &(target == Target::Neon));
         self.0.insert("is_scalar", &(target == Target::Scalar));
         self
     }
@@ -365,6 +367,10 @@ impl ContextBuilder {
         self.with_target(Target::Sse2)
     }
 
+    pub fn target_neon(self) -> Self {
+        self.with_target(Target::Neon)
+    }
+
     pub fn target_wasm32(self) -> Self {
         self.with_target(Target::Wasm32)
     }
@@ -422,6 +428,10 @@ pub fn build_output_pairs() -> HashMap<&'static str, tera::Context> {
             "src/swizzles/scalar/vec3a_impl.rs",
             ContextBuilder::new_vec3a_swizzle_impl().build(),
         ),
+        (
+            "src/swizzles/neon/vec3a_impl.rs",
+            ContextBuilder::new_vec3a_swizzle_impl().build(),
+        ),
         (
             "src/swizzles/sse2/vec3a_impl.rs",
             ContextBuilder::new_vec3a_swizzle_impl()
@@ -444,6 +454,10 @@ pub fn build_output_pairs() -> HashMap<&'static str, tera::Context> {
             "src/swizzles/scalar/vec4_impl.rs",
             ContextBuilder::new_vec4_swizzle_impl().build(),
         ),
+        (
+            "src/swizzles/neon/vec4_impl.rs",
+            ContextBuilder::new_vec4_swizzle_impl().build(),
+        ),
         (
             "src/swizzles/sse2/vec4_impl.rs",
             ContextBuilder::new_vec4_swizzle_impl()
@@ -574,6 +588,10 @@ pub fn build_output_pairs() -> HashMap<&'static str, tera::Context> {
             "src/bool/wasm32/bvec3a.rs",
             ContextBuilder::new_bvec3a().target_wasm32().build(),
         ),
+        (
+            "src/bool/neon/bvec3a.rs",
+            ContextBuilder::new_bvec3a().target_neon().build(),
+        ),
         (
             "src/bool/coresimd/bvec3a.rs",
             ContextBuilder::new_bvec3a().target_coresimd().build(),
@@ -590,6 +608,10 @@ pub fn build_output_pairs() -> HashMap<&'static str, tera::Context> {
             "src/bool/wasm32/bvec4a.rs",
             ContextBuilder::new_bvec4a().target_wasm32().build(),
         ),
+        (
+            "src/bool/neon/bvec4a.rs",
+            ContextBuilder::new_bvec4a().target_neon().build(),
+        ),
         (
             "src/bool/coresimd/bvec4a.rs",
             ContextBuilder::new_bvec4a().target_coresimd().build(),
@@ -600,6 +622,10 @@ pub fn build_output_pairs() -> HashMap<&'static str, tera::Context> {
             "src/f32/scalar/vec3a.rs",
             ContextBuilder::new_vec3a().build(),
         ),
+        (
+            "src/f32/neon/vec3a.rs",
+            ContextBuilder::new_vec3a().target_neon().build(),
+        ),
         (
             "src/f32/sse2/vec3a.rs",
             ContextBuilder::new_vec3a().target_sse2().build(),
@@ -613,6 +639,10 @@ pub fn build_output_pairs() -> HashMap<&'static str, tera::Context> {
             ContextBuilder::new_vec3a().target_coresimd().build(),
         ),
         ("src/f32/scalar/vec4.rs", ContextBuilder::new_vec4().build()),
+        (
+            "src/f32/neon/vec4.rs",
+            ContextBuilder::new_vec4().target_neon().build(),
+        ),
         (
             "src/f32/sse2/vec4.rs",
             ContextBuilder::new_vec4().target_sse2().build(),
@@ -647,6 +677,10 @@ pub fn build_output_pairs() -> HashMap<&'static str, tera::Context> {
         ("src/u64/u64vec3.rs", ContextBuilder::new_u64vec3().build()),
         ("src/u64/u64vec4.rs", ContextBuilder::new_u64vec4().build()),
         ("src/f32/scalar/quat.rs", ContextBuilder::new_quat().build()),
+        (
+            "src/f32/neon/quat.rs",
+            ContextBuilder::new_quat().target_neon().build(),
+        ),
         (
             "src/f32/sse2/quat.rs",
             ContextBuilder::new_quat().target_sse2().build(),
@@ -661,6 +695,7 @@ pub fn build_output_pairs() -> HashMap<&'static str, tera::Context> {
         ),
         ("src/f64/dquat.rs", ContextBuilder::new_dquat().build()),
         ("src/f32/scalar/mat2.rs", ContextBuilder::new_mat2().build()),
+        ("src/f32/neon/mat2.rs", ContextBuilder::new_mat2().build()),
         (
             "src/f32/sse2/mat2.rs",
             ContextBuilder::new_mat2().target_sse2().build(),
@@ -679,6 +714,10 @@ pub fn build_output_pairs() -> HashMap<&'static str, tera::Context> {
             "src/f32/scalar/mat3a.rs",
             ContextBuilder::new_mat3a().build(),
         ),
+        (
+            "src/f32/neon/mat3a.rs",
+            ContextBuilder::new_mat3a().target_neon().build(),
+        ),
         (
             "src/f32/sse2/mat3a.rs",
             ContextBuilder::new_mat3a().target_sse2().build(),
@@ -692,6 +731,10 @@ pub fn build_output_pairs() -> HashMap<&'static str, tera::Context> {
             ContextBuilder::new_mat3a().target_coresimd().build(),
         ),
         ("src/f32/scalar/mat4.rs", ContextBuilder::new_mat4().build()),
+        (
+            "src/f32/neon/mat4.rs",
+            ContextBuilder::new_mat4().target_neon().build(),
+        ),
         (
             "src/f32/sse2/mat4.rs",
             ContextBuilder::new_mat4().target_sse2().build(),

diff --git a/codegen/templates/quat.rs.tera b/codegen/templates/quat.rs.tera
@@ -8,6 +8,8 @@
         {% set simd_t = "v128" %}
     {% elif is_coresimd %}
         {% set simd_t = "f32x4" %}
+    {% elif is_neon %}
+        {% set simd_t = "float32x4_t" %}
     {% endif %}
 {% endif %}
 
@@ -43,6 +45,8 @@ use crate::{
         wasm32::*,
     {% elif is_coresimd %}
         coresimd::*,
+    {% elif is_neon %}
+        neon::*,
     {% endif %}
 };
 
@@ -55,6 +59,8 @@ use core::arch::x86_64::*;
 use core::arch::wasm32::*;
 {% elif is_coresimd %}
 use core::simd::*;
+{% elif is_neon %}
+use core::arch::aarch64::*;
 {% endif %}
 
 #[cfg(not(target_arch = "spirv"))]
@@ -67,7 +73,7 @@ use core::ops::{
     Add, Div, Mul, MulAssign, Neg, Sub
 };
 
-{% if is_sse2 %}
+{% if is_sse2 or is_neon %}
 #[repr(C)]
 union UnionCast {
     a: [f32; 4],
@@ -146,6 +152,8 @@ impl {{ self_t }} {
             Self(f32x4(x, y, z, w))
         {% elif is_coresimd %}
             Self(f32x4::from_array([x, y, z, w]))
+        {% elif is_neon %}
+            unsafe { UnionCast { a: [x, y, z, w] }.v }
         {% endif %}
     }
 
@@ -197,6 +205,9 @@ impl {{ self_t }} {
         {% if is_sse2 %}
             assert!(slice.len() >= 4);
             Self(unsafe { _mm_loadu_ps(slice.as_ptr()) })
+        {% elif is_neon %}
+            assert!(slice.len() >= 4);
+            Self(unsafe { vld1q_f32(slice.as_ptr()) })
         {% else %}
             Self::from_xyzw(slice[0], slice[1], slice[2], slice[3])
         {% endif %}
@@ -212,6 +223,9 @@ impl {{ self_t }} {
         {% if is_sse2 %}
             assert!(slice.len() >= 4);
             unsafe { _mm_storeu_ps(slice.as_mut_ptr(), self.0) }
+        {% elif is_neon %}
+            assert!(slice.len() >= 4);
+            unsafe { vst1q_f32(slice.as_mut_ptr(), self.0) }
         {% else %}
             slice[0] = self.x;
             slice[1] = self.y;
@@ -533,6 +547,9 @@ impl {{ self_t }} {
         {% elif is_coresimd %}
             const SIGN: f32x4 = f32x4::from_array([-1.0, -1.0, -1.0, 1.0]);
             Self(self.0.mul(SIGN))
+        {% elif is_neon %}
+            const SIGN: float32x4_t = f32x4_from_array([-1.0, -1.0, -1.0, 1.0]);
+            Self(unsafe { vmulq_f32(self.0, SIGN) })
         {% endif %}
     }
 
@@ -736,6 +753,27 @@ impl {{ self_t }} {
             let bias = f32x4_bitand(dot, NEG_ZERO);
             let interpolated = start + ((f32x4_bitxor(end, bias) - start) * f32x4::splat(s));
             {{ self_t }}(interpolated).normalize()
+        {% elif is_neon %}
+            const NEG_ZERO: float32x4_t = f32x4_from_array([-0.0; 4]);
+            let start = self.0;
+            let end = end.0;
+            unsafe {
+                let dot = dot4_into_f32x4(start, end);
+                // Calculate the bias, if the dot product is positive or zero, there is no bias
+                // but if it is negative, we want to flip the 'end' rotation XYZW components
+                let bias = vandq_u32(vreinterpretq_u32_f32(dot), vreinterpretq_u32_f32(NEG_ZERO));
+                let interpolated = vaddq_f32(
+                    vmulq_f32(
+                        vsubq_f32(
+                            vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(end), bias)),
+                            start,
+                        ),
+                        vld1q_dup_f32(&s),
+                    ),
+                    start,
+                );
+                {{ self_t }}(interpolated).normalize()
+            }
         {% endif %}
     }
 
@@ -834,6 +872,23 @@ impl {{ self_t }} {
                 let theta_sin = simd_swizzle!(tmp, [2, 2, 2, 2]);
 
                 Self(self.0.mul(scale1).add(end.0.mul(scale2)).div(theta_sin))
+            {% elif is_neon %}
+                let x = (theta * (1.0 - s)).sin();
+                let y = (theta * s).sin();
+                let z = theta.sin();
+                let w = 0.0;
+                unsafe {
+                    let tmp = vld1q_f32([x, y, z, w].as_ptr());
+
+                    let scale1 = vdupq_laneq_f32(tmp, 0);
+                    let scale2 = vdupq_laneq_f32(tmp, 1);
+                    let theta_sin = vdupq_laneq_f32(tmp, 2);
+
+                    Self(vdivq_f32(
+                        vaddq_f32(vmulq_f32(self.0, scale1), vmulq_f32(end.0, scale2)),
+                        theta_sin,
+                    ))
+                }
             {% endif %}
         }
     }
@@ -874,7 +929,7 @@ impl {{ self_t }} {
         glam_assert!(self.is_normalized());
         glam_assert!(rhs.is_normalized());
 
-        {% if is_scalar %}
+        {% if is_scalar or is_neon %}
             let (x0, y0, z0, w0) = self.into();
             let (x1, y1, z1, w1) = rhs.into();
             Self::from_xyzw(
@@ -1043,6 +1098,20 @@ impl {{ self_t }} {
                     .add(b.mul(dot3_into_f32x4(rhs.0, b).mul(TWO)))
                     .add(Vec3A(b).cross(rhs).0.mul(w.mul(TWO))),
             )
+        {% elif is_neon %}
+            unsafe {
+                const TWO: float32x4_t = f32x4_from_array([2.0; 4]);
+                let w = vdupq_laneq_f32(self.0, 3);
+                let b = self.0;
+                let b2 = dot3_into_f32x4(b, b);
+                Vec3A(vaddq_f32(
+                    vaddq_f32(
+                        vmulq_f32(rhs.0, vsubq_f32(vmulq_f32(w, w), b2)),
+                        vmulq_f32(b, vmulq_f32(dot3_into_f32x4(rhs.0, b), TWO)),
+                    ),
+                    vmulq_f32(Vec3A(b).cross(rhs).into(), vmulq_f32(w, TWO)),
+                ))
+            }
         {% endif %}
     }
 

diff --git a/codegen/templates/swizzle_impl.rs.tera b/codegen/templates/swizzle_impl.rs.tera
@@ -76,7 +76,7 @@ impl Vec{{ dim }}Swizzles for {{ self_t }} {
             {% elif vec3_t == "Vec3A" and is_coresimd %}
                 {{ vec3_t }}(simd_swizzle!(self.0, [{{ l[j0] }}, {{ l[j1] }}, {{ l[j2] }}, {{ l[0] }}]).into())
             {% else %}
-                {{ vec3_t }} { x: self.{{ e[j0] }}, y: self.{{ e[j1] }}, z: self.{{ e[j2] }} }
+                {{ vec3_t }}::new(self.{{ e[j0] }}, self.{{ e[j1] }}, self.{{ e[j2] }})
             {% endif %}
     }
                             {% endif %}