ios: add NEON implementation of HitSpheres & float4, 5.8 -> 8.5 Mray/s
aras-p committed May 30, 2018
1 parent b55d8fa commit 217352f
Showing 3 changed files with 133 additions and 10 deletions.
6 changes: 3 additions & 3 deletions Cpp/Source/Config.h
@@ -18,8 +18,8 @@
#define kCSGroupSizeY 8
#define kCSMaxObjects 64

// Should float3 struct use SSE/NEON?
#define DO_FLOAT3_WITH_SIMD (!(DO_COMPUTE_GPU) && 1)

// Should HitSpheres function use SSE/NEON?
#define DO_HIT_SPHERES_SIMD 1
105 changes: 102 additions & 3 deletions Cpp/Source/MathSimd.h
@@ -8,7 +8,9 @@

#define kSimdWidth 4

#if !defined(__arm__) && !defined(__arm64__)

// ---- SSE implementation

#include <xmmintrin.h>
#include <emmintrin.h>
@@ -87,7 +89,104 @@ VM_INLINE __m128i select(__m128i a, __m128i b, bool4 cond)
#endif
}

VM_INLINE float4 sqrtf(float4 v) { return float4(_mm_sqrt_ps(v.m)); }

#else

// ---- NEON implementation

#define USE_NEON 1
#include <arm_neon.h>

struct float4
{
VM_INLINE float4() {}
VM_INLINE explicit float4(const float *p) { m = vld1q_f32(p); }
VM_INLINE explicit float4(float x, float y, float z, float w) { float v[4] = {x, y, z, w}; m = vld1q_f32(v); }
VM_INLINE explicit float4(float v) { m = vdupq_n_f32(v); }
VM_INLINE explicit float4(float32x4_t v) { m = v; }

VM_INLINE float getX() const { return vgetq_lane_f32(m, 0); }
VM_INLINE float getY() const { return vgetq_lane_f32(m, 1); }
VM_INLINE float getZ() const { return vgetq_lane_f32(m, 2); }
VM_INLINE float getW() const { return vgetq_lane_f32(m, 3); }

float32x4_t m;
};

typedef float4 bool4;

VM_INLINE float4 operator+ (float4 a, float4 b) { a.m = vaddq_f32(a.m, b.m); return a; }
VM_INLINE float4 operator- (float4 a, float4 b) { a.m = vsubq_f32(a.m, b.m); return a; }
VM_INLINE float4 operator* (float4 a, float4 b) { a.m = vmulq_f32(a.m, b.m); return a; }
VM_INLINE bool4 operator==(float4 a, float4 b) { a.m = vreinterpretq_f32_u32(vceqq_f32(a.m, b.m)); return a; }
VM_INLINE bool4 operator!=(float4 a, float4 b) { a.m = vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(a.m, b.m))); return a; }
VM_INLINE bool4 operator< (float4 a, float4 b) { a.m = vreinterpretq_f32_u32(vcltq_f32(a.m, b.m)); return a; }
VM_INLINE bool4 operator> (float4 a, float4 b) { a.m = vreinterpretq_f32_u32(vcgtq_f32(a.m, b.m)); return a; }
VM_INLINE bool4 operator<=(float4 a, float4 b) { a.m = vreinterpretq_f32_u32(vcleq_f32(a.m, b.m)); return a; }
VM_INLINE bool4 operator>=(float4 a, float4 b) { a.m = vreinterpretq_f32_u32(vcgeq_f32(a.m, b.m)); return a; }
VM_INLINE bool4 operator&(bool4 a, bool4 b) { a.m = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a.m), vreinterpretq_u32_f32(b.m))); return a; }
VM_INLINE bool4 operator|(bool4 a, bool4 b) { a.m = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a.m), vreinterpretq_u32_f32(b.m))); return a; }
VM_INLINE float4 operator- (float4 a) { a.m = vnegq_f32(a.m); return a; }
VM_INLINE float4 min(float4 a, float4 b) { a.m = vminq_f32(a.m, b.m); return a; }
VM_INLINE float4 max(float4 a, float4 b) { a.m = vmaxq_f32(a.m, b.m); return a; }

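// Horizontal minimum: vpmin_f32 takes pairwise minima, so the first round
// reduces {v0,v1,v2,v3} to {min(v0,v1), min(v2,v3)} and the second round
// leaves the overall minimum in lane 0.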
VM_INLINE float hmin(float4 v)
{
float32x2_t minOfHalfs = vpmin_f32(vget_low_f32(v.m), vget_high_f32(v.m));
float32x2_t minOfMinOfHalfs = vpmin_f32(minOfHalfs, minOfHalfs);
return vget_lane_f32(minOfMinOfHalfs, 0);
}

// Returns a 4-bit code where bit0..bit3 is X..W
VM_INLINE unsigned mask(float4 v)
{
static const uint32x4_t movemask = { 1, 2, 4, 8 };
static const uint32x4_t highbit = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
uint32x4_t t0 = vreinterpretq_u32_f32(v.m);
uint32x4_t t1 = vtstq_u32(t0, highbit);
uint32x4_t t2 = vandq_u32(t1, movemask);
uint32x2_t t3 = vorr_u32(vget_low_u32(t2), vget_high_u32(t2));
return vget_lane_u32(t3, 0) | vget_lane_u32(t3, 1);
}
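// Example: for v = float4(-1, 2, -3, 4) the sign bit is set in lanes 0 and 2,
// so mask(v) == 0b0101 == 5 -- the same convention as SSE's _mm_movemask_ps.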
// Once we have a comparison, we can branch based on its results:
VM_INLINE bool any(bool4 v) { return mask(v) != 0; }
VM_INLINE bool all(bool4 v) { return mask(v) == 15; }

// "select", i.e. hibit(cond) ? b : a
// on SSE4.1 and up this can be done easily via "blend" instruction;
// on older SSEs has to do a bunch of hoops, see
// https://fgiesen.wordpress.com/2016/04/03/sse-mind-the-gap/

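// On NEON this is a single "bit select" instruction: vbslq takes bits from
// its second operand where the mask bit is set, and from the third where it
// is clear, which matches hibit(cond) ? b : a for all-ones comparison masks.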
VM_INLINE float4 select(float4 a, float4 b, bool4 cond)
{
a.m = vbslq_f32(vreinterpretq_u32_f32(cond.m), b.m, a.m);
return a;
}
VM_INLINE int32x4_t select(int32x4_t a, int32x4_t b, bool4 cond)
{
return vbslq_s32(vreinterpretq_u32_f32(cond.m), b, a);
}

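// ARMv7 NEON has no vector square-root instruction (AArch64 adds one), so
// sqrt(v) is computed as v * rsqrt(v): vrsqrteq_f32 yields a low-precision
// reciprocal square root estimate, and each vrsqrtsq_f32 step is one
// Newton-Raphson refinement that roughly doubles the correct bits.
// Caveat: for a zero input the estimate is +inf and the result is NaN, not 0.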
VM_INLINE float4 sqrtf(float4 v)
{
float32x4_t V = v.m;
float32x4_t S0 = vrsqrteq_f32(V);
float32x4_t P0 = vmulq_f32( V, S0 );
float32x4_t R0 = vrsqrtsq_f32( P0, S0 );
float32x4_t S1 = vmulq_f32( S0, R0 );
float32x4_t P1 = vmulq_f32( V, S1 );
float32x4_t R1 = vrsqrtsq_f32( P1, S1 );
float32x4_t S2 = vmulq_f32( S1, R1 );
float32x4_t P2 = vmulq_f32( V, S2 );
float32x4_t R2 = vrsqrtsq_f32( P2, S2 );
float32x4_t S3 = vmulq_f32( S2, R2 );
return float4(vmulq_f32(V, S3));
}

VM_INLINE float4 splatX(float32x4_t v) { return float4(vdupq_lane_f32(vget_low_f32(v), 0)); }
VM_INLINE float4 splatY(float32x4_t v) { return float4(vdupq_lane_f32(vget_low_f32(v), 1)); }
VM_INLINE float4 splatZ(float32x4_t v) { return float4(vdupq_lane_f32(vget_high_f32(v), 0)); }
VM_INLINE float4 splatW(float32x4_t v) { return float4(vdupq_lane_f32(vget_high_f32(v), 1)); }
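// These splats stand in for the SSE SHUFFLE4 broadcasts: NEON has no general
// 4-lane shuffle, but vdupq_lane_f32 can broadcast either lane of one
// 64-bit half of the vector.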

#endif // !defined(__arm__) && !defined(__arm64__)
32 changes: 28 additions & 4 deletions Cpp/Source/Maths.cpp
@@ -49,17 +49,28 @@ float3 RandomUnitVector(uint32_t& state)

int HitSpheres(const Ray& r, const SpheresSoA& spheres, float tMin, float tMax, Hit& outHit)
{
#if DO_HIT_SPHERES_SIMD
float4 hitT = float4(tMax);
#if USE_NEON
int32x4_t id = vdupq_n_s32(-1);
#else
__m128i id = _mm_set1_epi32(-1);
#endif

#if DO_FLOAT3_WITH_SIMD && !USE_NEON
float4 rOrigX = SHUFFLE4(r.orig, 0, 0, 0, 0);
float4 rOrigY = SHUFFLE4(r.orig, 1, 1, 1, 1);
float4 rOrigZ = SHUFFLE4(r.orig, 2, 2, 2, 2);
float4 rDirX = SHUFFLE4(r.dir, 0, 0, 0, 0);
float4 rDirY = SHUFFLE4(r.dir, 1, 1, 1, 1);
float4 rDirZ = SHUFFLE4(r.dir, 2, 2, 2, 2);
#elif DO_FLOAT3_WITH_SIMD
float4 rOrigX = splatX(r.orig.m);
float4 rOrigY = splatY(r.orig.m);
float4 rOrigZ = splatZ(r.orig.m);
float4 rDirX = splatX(r.dir.m);
float4 rDirY = splatY(r.dir.m);
float4 rDirZ = splatZ(r.dir.m);
#else
float4 rOrigX = float4(r.orig.x);
float4 rOrigY = float4(r.orig.y);
@@ -69,7 +80,11 @@ int HitSpheres(const Ray& r, const SpheresSoA& spheres, float tMin, float tMax,
float4 rDirZ = float4(r.dir.z);
#endif
float4 tMin4 = float4(tMin);
#if USE_NEON
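// vcreate_s32 reinterprets a 64-bit integer as two 32-bit lanes, so this
// builds the lane indices {0, 1, 2, 3} without a memory load.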
int32x4_t curId = vcombine_s32(vcreate_s32(0ULL | (1ULL<<32)), vcreate_s32(2ULL | (3ULL<<32)));
#else
__m128i curId = _mm_set_epi32(3, 2, 1, 0);
#endif
// process 4 spheres at once
for (int i = 0; i < spheres.simdCount; i += kSimdWidth)
{
@@ -101,7 +116,11 @@
id = select(id, curId, msk);
hitT = select(hitT, t, msk);
}
#if USE_NEON
curId = vaddq_s32(curId, vdupq_n_s32(kSimdWidth));
#else
curId = _mm_add_epi32(curId, _mm_set1_epi32(kSimdWidth));
#endif
}
// now we have up to 4 hits, find and return closest one
float minT = hmin(hitT);
@@ -112,8 +131,13 @@
{
int id_scalar[4];
float hitT_scalar[4];
#if USE_NEON
vst1q_s32(id_scalar, id);
vst1q_f32(hitT_scalar, hitT.m);
#else
_mm_storeu_si128((__m128i *)id_scalar, id);
_mm_storeu_ps(hitT_scalar, hitT.m);
#endif

// In general, you would do this with a bit scan (first set/trailing zero count).
// But who cares, it's only 16 options.
@@ -138,7 +162,7 @@ int HitSpheres(const Ray& r, const SpheresSoA& spheres, float tMin, float tMax,

return -1;

#else // #if DO_HIT_SPHERES_SIMD

float hitT = tMax;
int id = -1;
@@ -175,5 +199,5 @@ int HitSpheres(const Ray& r, const SpheresSoA& spheres, float tMin, float tMax,
}
else
return -1;
#endif // #else of #if DO_HIT_SPHERES_SIMD
}
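A minimal call-site sketch for the HitSpheres above. The Ray, SpheresSoA and Hit types come from this repository; the ray/spheres variables and the kMinT/kMaxT bounds here are illustrative assumptions:

    Hit hit;
    int id = HitSpheres(ray, spheres, kMinT, kMaxT, hit);   // hypothetical call site
    if (id != -1)
    {
        // hit.pos, hit.normal and hit.t describe the closest intersection,
        // and id is the index of the sphere that was hit.
    }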
