Skip to content
Permalink
Browse files

ios: add NEON implementation of HitSpheres & float4, 5.8 -> 8.5 Mray/s

  • Loading branch information...
aras-p committed May 30, 2018
1 parent b55d8fa commit 217352fee6470e7f3a946a5a3fba1be4d9cbc7b7
Showing with 133 additions and 10 deletions.
  1. +3 −3 Cpp/Source/Config.h
  2. +102 −3 Cpp/Source/MathSimd.h
  3. +28 −4 Cpp/Source/Maths.cpp
@@ -18,8 +18,8 @@
#define kCSGroupSizeY 8
#define kCSMaxObjects 64

// Should float3 struct use SSE?
// Should float3 struct use SSE/NEON?
#define DO_FLOAT3_WITH_SIMD (!(DO_COMPUTE_GPU) && 1)

// Should HitSpheres function use SSE?
#define DO_HIT_SPHERES_SSE (!TARGET_OS_IPHONE)
// Should HitSpheres function use SSE/NEON?
#define DO_HIT_SPHERES_SIMD 1
@@ -8,7 +8,9 @@

#define kSimdWidth 4

#if !TARGET_OS_IPHONE
#if !defined(__arm__) && !defined(__arm64__)

// ---- SSE implementation

#include <xmmintrin.h>
#include <emmintrin.h>
@@ -87,7 +89,104 @@ VM_INLINE __m128i select(__m128i a, __m128i b, bool4 cond)
#endif
}

VM_INLINE float4 sqrtf(float4 v) { return float4(_mm_sqrt_ps(v.m)); }
VM_INLINE float4 sqrtf(float4 v) { return float4(_mm_sqrt_ps(v.m)); }

#else

// ---- NEON implementation

#define USE_NEON 1
#include <arm_neon.h>

struct float4
{
VM_INLINE float4() {}
VM_INLINE explicit float4(const float *p) { m = vld1q_f32(p); }
VM_INLINE explicit float4(float x, float y, float z, float w) { float v[4] = {x, y, z, w}; m = vld1q_f32(v); }
VM_INLINE explicit float4(float v) { m = vdupq_n_f32(v); }
VM_INLINE explicit float4(float32x4_t v) { m = v; }

VM_INLINE float getX() const { return vgetq_lane_f32(m, 0); }
VM_INLINE float getY() const { return vgetq_lane_f32(m, 1); }
VM_INLINE float getZ() const { return vgetq_lane_f32(m, 2); }
VM_INLINE float getW() const { return vgetq_lane_f32(m, 3); }

float32x4_t m;
};

typedef float4 bool4;

VM_INLINE float4 operator+ (float4 a, float4 b) { a.m = vaddq_f32(a.m, b.m); return a; }
VM_INLINE float4 operator- (float4 a, float4 b) { a.m = vsubq_f32(a.m, b.m); return a; }
VM_INLINE float4 operator* (float4 a, float4 b) { a.m = vmulq_f32(a.m, b.m); return a; }
VM_INLINE bool4 operator==(float4 a, float4 b) { a.m = vceqq_f32(a.m, b.m); return a; }
VM_INLINE bool4 operator!=(float4 a, float4 b) { a.m = a.m = vmvnq_u32(vceqq_f32(a.m, b.m)); return a; }
VM_INLINE bool4 operator< (float4 a, float4 b) { a.m = vcltq_f32(a.m, b.m); return a; }
VM_INLINE bool4 operator> (float4 a, float4 b) { a.m = vcgtq_f32(a.m, b.m); return a; }
VM_INLINE bool4 operator<=(float4 a, float4 b) { a.m = vcleq_f32(a.m, b.m); return a; }
VM_INLINE bool4 operator>=(float4 a, float4 b) { a.m = vcgeq_f32(a.m, b.m); return a; }
VM_INLINE bool4 operator&(bool4 a, bool4 b) { a.m = vandq_u32(a.m, b.m); return a; }
VM_INLINE bool4 operator|(bool4 a, bool4 b) { a.m = vorrq_u32(a.m, b.m); return a; }
VM_INLINE float4 operator- (float4 a) { a.m = vnegq_f32(a.m); return a; }
VM_INLINE float4 min(float4 a, float4 b) { a.m = vminq_f32(a.m, b.m); return a; }
VM_INLINE float4 max(float4 a, float4 b) { a.m = vmaxq_f32(a.m, b.m); return a; }

VM_INLINE float hmin(float4 v)
{
float32x2_t minOfHalfs = vpmin_f32(vget_low_f32(v.m), vget_high_f32(v.m));
float32x2_t minOfMinOfHalfs = vpmin_f32(minOfHalfs, minOfHalfs);
return vget_lane_f32(minOfMinOfHalfs, 0);
}

// Returns a 4-bit code where bit0..bit3 is X..W
VM_INLINE unsigned mask(float4 v)
{
static const uint32x4_t movemask = { 1, 2, 4, 8 };
static const uint32x4_t highbit = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
uint32x4_t t0 = vreinterpretq_u32_f32(v.m);
uint32x4_t t1 = vtstq_u32(t0, highbit);
uint32x4_t t2 = vandq_u32(t1, movemask);
uint32x2_t t3 = vorr_u32(vget_low_u32(t2), vget_high_u32(t2));
return vget_lane_u32(t3, 0) | vget_lane_u32(t3, 1);
}
// Once we have a comparison, we can branch based on its results:
VM_INLINE bool any(bool4 v) { return mask(v) != 0; }
VM_INLINE bool all(bool4 v) { return mask(v) == 15; }

// "select", i.e. hibit(cond) ? b : a
// on SSE4.1 and up this can be done easily via "blend" instruction;
// on older SSEs has to do a bunch of hoops, see
// https://fgiesen.wordpress.com/2016/04/03/sse-mind-the-gap/

#endif // #if !TARGET_OS_IPHONE
VM_INLINE float4 select(float4 a, float4 b, bool4 cond)
{
a.m = vbslq_f32(cond.m, b.m, a.m);
return a;
}
VM_INLINE int32x4_t select(int32x4_t a, int32x4_t b, bool4 cond)
{
return vbslq_f32(cond.m, b, a);
}

VM_INLINE float4 sqrtf(float4 v)
{
float32x4_t V = v.m;
float32x4_t S0 = vrsqrteq_f32(V);
float32x4_t P0 = vmulq_f32( V, S0 );
float32x4_t R0 = vrsqrtsq_f32( P0, S0 );
float32x4_t S1 = vmulq_f32( S0, R0 );
float32x4_t P1 = vmulq_f32( V, S1 );
float32x4_t R1 = vrsqrtsq_f32( P1, S1 );
float32x4_t S2 = vmulq_f32( S1, R1 );
float32x4_t P2 = vmulq_f32( V, S2 );
float32x4_t R2 = vrsqrtsq_f32( P2, S2 );
float32x4_t S3 = vmulq_f32( S2, R2 );
return float4(vmulq_f32(V, S3));
}

VM_INLINE float4 splatX(float32x4_t v) { return float4(vdupq_lane_f32(vget_low_f32(v), 0)); }
VM_INLINE float4 splatY(float32x4_t v) { return float4(vdupq_lane_f32(vget_low_f32(v), 1)); }
VM_INLINE float4 splatZ(float32x4_t v) { return float4(vdupq_lane_f32(vget_high_f32(v), 0)); }
VM_INLINE float4 splatW(float32x4_t v) { return float4(vdupq_lane_f32(vget_high_f32(v), 1)); }

#endif
@@ -49,17 +49,28 @@ float3 RandomUnitVector(uint32_t& state)

int HitSpheres(const Ray& r, const SpheresSoA& spheres, float tMin, float tMax, Hit& outHit)
{
#if DO_HIT_SPHERES_SSE
#if DO_HIT_SPHERES_SIMD
float4 hitT = float4(tMax);
#if USE_NEON
int32x4_t id = vdupq_n_s32(-1);
#else
__m128i id = _mm_set1_epi32(-1);
#endif

#if DO_FLOAT3_WITH_SIMD
#if DO_FLOAT3_WITH_SIMD && !USE_NEON
float4 rOrigX = SHUFFLE4(r.orig, 0, 0, 0, 0);
float4 rOrigY = SHUFFLE4(r.orig, 1, 1, 1, 1);
float4 rOrigZ = SHUFFLE4(r.orig, 2, 2, 2, 2);
float4 rDirX = SHUFFLE4(r.dir, 0, 0, 0, 0);
float4 rDirY = SHUFFLE4(r.dir, 1, 1, 1, 1);
float4 rDirZ = SHUFFLE4(r.dir, 2, 2, 2, 2);
#elif DO_FLOAT3_WITH_SIMD
float4 rOrigX = splatX(r.orig.m);
float4 rOrigY = splatY(r.orig.m);
float4 rOrigZ = splatZ(r.orig.m);
float4 rDirX = splatX(r.dir.m);
float4 rDirY = splatY(r.dir.m);
float4 rDirZ = splatZ(r.dir.m);
#else
float4 rOrigX = float4(r.orig.x);
float4 rOrigY = float4(r.orig.y);
@@ -69,7 +80,11 @@ int HitSpheres(const Ray& r, const SpheresSoA& spheres, float tMin, float tMax,
float4 rDirZ = float4(r.dir.z);
#endif
float4 tMin4 = float4(tMin);
#if USE_NEON
int32x4_t curId = vcombine_u32(vcreate_u32(0ULL | (1ULL<<32)), vcreate_u32(2ULL | (3ULL<<32)));
#else
__m128i curId = _mm_set_epi32(3, 2, 1, 0);
#endif
// process 4 spheres at once
for (int i = 0; i < spheres.simdCount; i += kSimdWidth)
{
@@ -101,7 +116,11 @@ int HitSpheres(const Ray& r, const SpheresSoA& spheres, float tMin, float tMax,
id = select(id, curId, msk);
hitT = select(hitT, t, msk);
}
#if USE_NEON
curId = vaddq_s32(curId, vdupq_n_s32(kSimdWidth));
#else
curId = _mm_add_epi32(curId, _mm_set1_epi32(kSimdWidth));
#endif
}
// now we have up to 4 hits, find and return closest one
float minT = hmin(hitT);
@@ -112,8 +131,13 @@ int HitSpheres(const Ray& r, const SpheresSoA& spheres, float tMin, float tMax,
{
int id_scalar[4];
float hitT_scalar[4];
#if USE_NEON
vst1q_s32(id_scalar, id);
vst1q_f32(hitT_scalar, hitT.m);
#else
_mm_storeu_si128((__m128i *)id_scalar, id);
_mm_storeu_ps(hitT_scalar, hitT.m);
#endif

// In general, you would do this with a bit scan (first set/trailing zero count).
// But who cares, it's only 16 options.
@@ -138,7 +162,7 @@ int HitSpheres(const Ray& r, const SpheresSoA& spheres, float tMin, float tMax,

return -1;

#else // #if DO_HIT_SPHERES_SSE
#else // #if DO_HIT_SPHERES_SIMD

float hitT = tMax;
int id = -1;
@@ -175,5 +199,5 @@ int HitSpheres(const Ray& r, const SpheresSoA& spheres, float tMin, float tMax,
}
else
return -1;
#endif // #else of #if DO_HIT_SPHERES_SSE
#endif // #else of #if DO_HIT_SPHERES_SIMD
}

0 comments on commit 217352f

Please sign in to comment.
You can’t perform that action at this time.