Skip to content

Commit

Permalink
Adding aarch64 neon support.
Browse files Browse the repository at this point in the history
  • Loading branch information
bitshifter committed Mar 23, 2024
1 parent 9f694bc commit 9f26d6f
Show file tree
Hide file tree
Showing 60 changed files with 11,256 additions and 5,152 deletions.
22 changes: 22 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -86,3 +86,25 @@ jobs:

- run: ./build_and_test_wasm32_firefox.sh
- run: ./build_and_test_wasm32_chrome.sh

test-arm:
name: Test Arm
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
target:
- aarch64-unknown-linux-gnu
- arm-unknown-linux-gnueabi
steps:
- uses: actions/checkout@v2
- uses: actions-rs/toolchain@v1
with:
toolchain: stable
target: ${{matrix.target}}
override: true
- uses: actions-rs/cargo@v1
with:
use-cross: true
command: test
args: --target ${{matrix.target}}
5 changes: 5 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,11 @@ libm = { version = "0.2", optional = true, default-features = false}

[dev-dependencies]
# rand_xoshiro is required for tests if rand is enabled
#rand_xoshiro = "0.6"
#serde_json = "1.0"

[target.'cfg(target_arch = "x86_64")'.dev-dependencies]
criterion = { version = "0.4", features = ["html_reports"] }
rand_xoshiro = "0.6"
# Set a size_xx feature so that this crate compiles properly with --all-targets --all-features
rkyv = { version = "0.7", default-features = false, features = ["size_32"] }
Expand Down
3 changes: 3 additions & 0 deletions codegen/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,9 @@ fn main() -> anyhow::Result<()> {

let full_output_path = workdir.join(output_path);

let output_dir = full_output_path.parent().unwrap();
std::fs::create_dir_all(output_dir)?;

if check {
match std::fs::read_to_string(&full_output_path) {
Ok(original_str) => {
Expand Down
43 changes: 43 additions & 0 deletions codegen/src/outputs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ enum Target {
Scalar,
Sse2,
Wasm32,
Neon,
CoreSimd,
}

Expand Down Expand Up @@ -357,6 +358,7 @@ impl ContextBuilder {
self.0.insert("is_sse2", &(target == Target::Sse2));
self.0.insert("is_coresimd", &(target == Target::CoreSimd));
self.0.insert("is_wasm32", &(target == Target::Wasm32));
self.0.insert("is_neon", &(target == Target::Neon));
self.0.insert("is_scalar", &(target == Target::Scalar));
self
}
Expand All @@ -365,6 +367,10 @@ impl ContextBuilder {
self.with_target(Target::Sse2)
}

pub fn target_neon(self) -> Self {
self.with_target(Target::Neon)
}

pub fn target_wasm32(self) -> Self {
self.with_target(Target::Wasm32)
}
Expand Down Expand Up @@ -422,6 +428,10 @@ pub fn build_output_pairs() -> HashMap<&'static str, tera::Context> {
"src/swizzles/scalar/vec3a_impl.rs",
ContextBuilder::new_vec3a_swizzle_impl().build(),
),
(
"src/swizzles/neon/vec3a_impl.rs",
ContextBuilder::new_vec3a_swizzle_impl().build(),
),
(
"src/swizzles/sse2/vec3a_impl.rs",
ContextBuilder::new_vec3a_swizzle_impl()
Expand All @@ -444,6 +454,10 @@ pub fn build_output_pairs() -> HashMap<&'static str, tera::Context> {
"src/swizzles/scalar/vec4_impl.rs",
ContextBuilder::new_vec4_swizzle_impl().build(),
),
(
"src/swizzles/neon/vec4_impl.rs",
ContextBuilder::new_vec4_swizzle_impl().build(),
),
(
"src/swizzles/sse2/vec4_impl.rs",
ContextBuilder::new_vec4_swizzle_impl()
Expand Down Expand Up @@ -574,6 +588,10 @@ pub fn build_output_pairs() -> HashMap<&'static str, tera::Context> {
"src/bool/wasm32/bvec3a.rs",
ContextBuilder::new_bvec3a().target_wasm32().build(),
),
(
"src/bool/neon/bvec3a.rs",
ContextBuilder::new_bvec3a().target_neon().build(),
),
(
"src/bool/coresimd/bvec3a.rs",
ContextBuilder::new_bvec3a().target_coresimd().build(),
Expand All @@ -590,6 +608,10 @@ pub fn build_output_pairs() -> HashMap<&'static str, tera::Context> {
"src/bool/wasm32/bvec4a.rs",
ContextBuilder::new_bvec4a().target_wasm32().build(),
),
(
"src/bool/neon/bvec4a.rs",
ContextBuilder::new_bvec4a().target_neon().build(),
),
(
"src/bool/coresimd/bvec4a.rs",
ContextBuilder::new_bvec4a().target_coresimd().build(),
Expand All @@ -600,6 +622,10 @@ pub fn build_output_pairs() -> HashMap<&'static str, tera::Context> {
"src/f32/scalar/vec3a.rs",
ContextBuilder::new_vec3a().build(),
),
(
"src/f32/neon/vec3a.rs",
ContextBuilder::new_vec3a().target_neon().build(),
),
(
"src/f32/sse2/vec3a.rs",
ContextBuilder::new_vec3a().target_sse2().build(),
Expand All @@ -613,6 +639,10 @@ pub fn build_output_pairs() -> HashMap<&'static str, tera::Context> {
ContextBuilder::new_vec3a().target_coresimd().build(),
),
("src/f32/scalar/vec4.rs", ContextBuilder::new_vec4().build()),
(
"src/f32/neon/vec4.rs",
ContextBuilder::new_vec4().target_neon().build(),
),
(
"src/f32/sse2/vec4.rs",
ContextBuilder::new_vec4().target_sse2().build(),
Expand Down Expand Up @@ -647,6 +677,10 @@ pub fn build_output_pairs() -> HashMap<&'static str, tera::Context> {
("src/u64/u64vec3.rs", ContextBuilder::new_u64vec3().build()),
("src/u64/u64vec4.rs", ContextBuilder::new_u64vec4().build()),
("src/f32/scalar/quat.rs", ContextBuilder::new_quat().build()),
(
"src/f32/neon/quat.rs",
ContextBuilder::new_quat().target_neon().build(),
),
(
"src/f32/sse2/quat.rs",
ContextBuilder::new_quat().target_sse2().build(),
Expand All @@ -661,6 +695,7 @@ pub fn build_output_pairs() -> HashMap<&'static str, tera::Context> {
),
("src/f64/dquat.rs", ContextBuilder::new_dquat().build()),
("src/f32/scalar/mat2.rs", ContextBuilder::new_mat2().build()),
("src/f32/neon/mat2.rs", ContextBuilder::new_mat2().build()),
(
"src/f32/sse2/mat2.rs",
ContextBuilder::new_mat2().target_sse2().build(),
Expand All @@ -679,6 +714,10 @@ pub fn build_output_pairs() -> HashMap<&'static str, tera::Context> {
"src/f32/scalar/mat3a.rs",
ContextBuilder::new_mat3a().build(),
),
(
"src/f32/neon/mat3a.rs",
ContextBuilder::new_mat3a().target_neon().build(),
),
(
"src/f32/sse2/mat3a.rs",
ContextBuilder::new_mat3a().target_sse2().build(),
Expand All @@ -692,6 +731,10 @@ pub fn build_output_pairs() -> HashMap<&'static str, tera::Context> {
ContextBuilder::new_mat3a().target_coresimd().build(),
),
("src/f32/scalar/mat4.rs", ContextBuilder::new_mat4().build()),
(
"src/f32/neon/mat4.rs",
ContextBuilder::new_mat4().target_neon().build(),
),
(
"src/f32/sse2/mat4.rs",
ContextBuilder::new_mat4().target_sse2().build(),
Expand Down
73 changes: 71 additions & 2 deletions codegen/templates/quat.rs.tera
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
{% set simd_t = "v128" %}
{% elif is_coresimd %}
{% set simd_t = "f32x4" %}
{% elif is_neon %}
{% set simd_t = "float32x4_t" %}
{% endif %}
{% endif %}

Expand Down Expand Up @@ -43,6 +45,8 @@ use crate::{
wasm32::*,
{% elif is_coresimd %}
coresimd::*,
{% elif is_neon %}
neon::*,
{% endif %}
};

Expand All @@ -55,6 +59,8 @@ use core::arch::x86_64::*;
use core::arch::wasm32::*;
{% elif is_coresimd %}
use core::simd::*;
{% elif is_neon %}
use core::arch::aarch64::*;
{% endif %}

#[cfg(not(target_arch = "spirv"))]
Expand All @@ -67,7 +73,7 @@ use core::ops::{
Add, Div, Mul, MulAssign, Neg, Sub
};

{% if is_sse2 %}
{% if is_sse2 or is_neon %}
#[repr(C)]
union UnionCast {
a: [f32; 4],
Expand Down Expand Up @@ -146,6 +152,8 @@ impl {{ self_t }} {
Self(f32x4(x, y, z, w))
{% elif is_coresimd %}
Self(f32x4::from_array([x, y, z, w]))
{% elif is_neon %}
unsafe { UnionCast { a: [x, y, z, w] }.v }
{% endif %}
}

Expand Down Expand Up @@ -197,6 +205,9 @@ impl {{ self_t }} {
{% if is_sse2 %}
assert!(slice.len() >= 4);
Self(unsafe { _mm_loadu_ps(slice.as_ptr()) })
{% elif is_neon %}
assert!(slice.len() >= 4);
Self(unsafe { vld1q_f32(slice.as_ptr()) })
{% else %}
Self::from_xyzw(slice[0], slice[1], slice[2], slice[3])
{% endif %}
Expand All @@ -212,6 +223,9 @@ impl {{ self_t }} {
{% if is_sse2 %}
assert!(slice.len() >= 4);
unsafe { _mm_storeu_ps(slice.as_mut_ptr(), self.0) }
{% elif is_neon %}
assert!(slice.len() >= 4);
unsafe { vst1q_f32(slice.as_mut_ptr(), self.0) }
{% else %}
slice[0] = self.x;
slice[1] = self.y;
Expand Down Expand Up @@ -533,6 +547,9 @@ impl {{ self_t }} {
{% elif is_coresimd %}
const SIGN: f32x4 = f32x4::from_array([-1.0, -1.0, -1.0, 1.0]);
Self(self.0.mul(SIGN))
{% elif is_neon %}
const SIGN: float32x4_t = f32x4_from_array([-1.0, -1.0, -1.0, 1.0]);
Self(unsafe { vmulq_f32(self.0, SIGN) })
{% endif %}
}

Expand Down Expand Up @@ -736,6 +753,27 @@ impl {{ self_t }} {
let bias = f32x4_bitand(dot, NEG_ZERO);
let interpolated = start + ((f32x4_bitxor(end, bias) - start) * f32x4::splat(s));
{{ self_t }}(interpolated).normalize()
{% elif is_neon %}
const NEG_ZERO: float32x4_t = f32x4_from_array([-0.0; 4]);
let start = self.0;
let end = end.0;
unsafe {
let dot = dot4_into_f32x4(start, end);
// Calculate the bias, if the dot product is positive or zero, there is no bias
// but if it is negative, we want to flip the 'end' rotation XYZW components
let bias = vandq_u32(vreinterpretq_u32_f32(dot), vreinterpretq_u32_f32(NEG_ZERO));
let interpolated = vaddq_f32(
vmulq_f32(
vsubq_f32(
vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(end), bias)),
start,
),
vld1q_dup_f32(&s),
),
start,
);
{{ self_t }}(interpolated).normalize()
}
{% endif %}
}

Expand Down Expand Up @@ -834,6 +872,23 @@ impl {{ self_t }} {
let theta_sin = simd_swizzle!(tmp, [2, 2, 2, 2]);

Self(self.0.mul(scale1).add(end.0.mul(scale2)).div(theta_sin))
{% elif is_neon %}
let x = (theta * (1.0 - s)).sin();
let y = (theta * s).sin();
let z = theta.sin();
let w = 0.0;
unsafe {
let tmp = vld1q_f32([x, y, z, w].as_ptr());

let scale1 = vdupq_laneq_f32(tmp, 0);
let scale2 = vdupq_laneq_f32(tmp, 1);
let theta_sin = vdupq_laneq_f32(tmp, 2);

Self(vdivq_f32(
vaddq_f32(vmulq_f32(self.0, scale1), vmulq_f32(end.0, scale2)),
theta_sin,
))
}
{% endif %}
}
}
Expand Down Expand Up @@ -874,7 +929,7 @@ impl {{ self_t }} {
glam_assert!(self.is_normalized());
glam_assert!(rhs.is_normalized());

{% if is_scalar %}
{% if is_scalar or is_neon %}
let (x0, y0, z0, w0) = self.into();
let (x1, y1, z1, w1) = rhs.into();
Self::from_xyzw(
Expand Down Expand Up @@ -1043,6 +1098,20 @@ impl {{ self_t }} {
.add(b.mul(dot3_into_f32x4(rhs.0, b).mul(TWO)))
.add(Vec3A(b).cross(rhs).0.mul(w.mul(TWO))),
)
{% elif is_neon %}
unsafe {
const TWO: float32x4_t = f32x4_from_array([2.0; 4]);
let w = vdupq_laneq_f32(self.0, 3);
let b = self.0;
let b2 = dot3_into_f32x4(b, b);
Vec3A(vaddq_f32(
vaddq_f32(
vmulq_f32(rhs.0, vsubq_f32(vmulq_f32(w, w), b2)),
vmulq_f32(b, vmulq_f32(dot3_into_f32x4(rhs.0, b), TWO)),
),
vmulq_f32(Vec3A(b).cross(rhs).into(), vmulq_f32(w, TWO)),
))
}
{% endif %}
}

Expand Down
2 changes: 1 addition & 1 deletion codegen/templates/swizzle_impl.rs.tera
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ impl Vec{{ dim }}Swizzles for {{ self_t }} {
{% elif vec3_t == "Vec3A" and is_coresimd %}
{{ vec3_t }}(simd_swizzle!(self.0, [{{ l[j0] }}, {{ l[j1] }}, {{ l[j2] }}, {{ l[0] }}]).into())
{% else %}
{{ vec3_t }} { x: self.{{ e[j0] }}, y: self.{{ e[j1] }}, z: self.{{ e[j2] }} }
{{ vec3_t }}::new(self.{{ e[j0] }}, self.{{ e[j1] }}, self.{{ e[j2] }})
{% endif %}
}
{% endif %}
Expand Down

0 comments on commit 9f26d6f

Please sign in to comment.