Permalink
Browse files

simd: "simplistic" approach to SIMD by having the float3 use SSE, lar…

…gely based on http://www.codersnotes.com/notes/maths-lib-2016/

- PC 135 -> 134 Mray/s
- Mac 38.7 -> 44.2 Mray/s
  • Loading branch information...
aras-p committed Apr 10, 2018
1 parent 86dcf23 commit 69df050ca37d079b2447e344029fe218d7c4c7cd
Showing with 149 additions and 29 deletions.
  1. +1 −1 Cpp/Mac/Renderer.mm
  2. +1 −1 Cpp/Source/Maths.cpp
  3. +140 −18 Cpp/Source/Maths.h
  4. +6 −8 Cpp/Source/Test.cpp
  5. +1 −1 Cpp/Windows/TestWin.cpp
@@ -6,7 +6,7 @@
#include "../Source/Maths.h"
#include "../Source/Test.h"
#define DO_COMPUTE 1
#define DO_COMPUTE 0
static const NSUInteger kMaxBuffersInFlight = 3;
@@ -32,7 +32,7 @@ float3 RandomInUnitSphere(uint32_t& state)
float3 p;
do {
p = 2.0*float3(RandomFloat01(state),RandomFloat01(state),RandomFloat01(state)) - float3(1,1,1);
} while (p.sqLength() >= 1.0);
} while (sqLength(p) >= 1.0);
return p;
}
@@ -6,48 +6,170 @@
#define kPI 3.1415926f
#if defined(_MSC_VER)
#define VM_INLINE __forceinline
#else
#define VM_INLINE __attribute__((unused, always_inline, nodebug)) inline
#endif
// SSE/SIMD vector largely based on http://www.codersnotes.com/notes/maths-lib-2016/
#define FLOAT3_USE_SSE 1
#if FLOAT3_USE_SSE
#include <xmmintrin.h>
// SHUFFLE3(v, 0,1,2) leaves the vector unchanged (v.xyz).
// SHUFFLE3(v, 0,0,0) splats the X (v.xxx).
#define SHUFFLE3(V, X,Y,Z) float3(_mm_shuffle_ps((V).m, (V).m, _MM_SHUFFLE(Z,Z,Y,X)))
struct float3
{
VM_INLINE float3() {}
VM_INLINE explicit float3(const float *p) { m = _mm_set_ps(p[2], p[2], p[1], p[0]); }
VM_INLINE explicit float3(float x, float y, float z) { m = _mm_set_ps(z, z, y, x); }
VM_INLINE explicit float3(__m128 v) { m = v; }
VM_INLINE float getX() const { return _mm_cvtss_f32(m); }
VM_INLINE float getY() const { return _mm_cvtss_f32(_mm_shuffle_ps(m, m, _MM_SHUFFLE(1, 1, 1, 1))); }
VM_INLINE float getZ() const { return _mm_cvtss_f32(_mm_shuffle_ps(m, m, _MM_SHUFFLE(2, 2, 2, 2))); }
VM_INLINE float3 yzx() const { return SHUFFLE3(*this, 1, 2, 0); }
VM_INLINE float3 zxy() const { return SHUFFLE3(*this, 2, 0, 1); }
VM_INLINE void store(float *p) const { p[0] = getX(); p[1] = getY(); p[2] = getZ(); }
void setX(float x)
{
m = _mm_move_ss(m, _mm_set_ss(x));
}
void setY(float y)
{
__m128 t = _mm_move_ss(m, _mm_set_ss(y));
t = _mm_shuffle_ps(t, t, _MM_SHUFFLE(3, 2, 0, 0));
m = _mm_move_ss(t, m);
}
void setZ(float z)
{
__m128 t = _mm_move_ss(m, _mm_set_ss(z));
t = _mm_shuffle_ps(t, t, _MM_SHUFFLE(3, 0, 1, 0));
m = _mm_move_ss(t, m);
}
__m128 m;
};
typedef float3 bool3;
VM_INLINE float3 operator+ (float3 a, float3 b) { a.m = _mm_add_ps(a.m, b.m); return a; }
VM_INLINE float3 operator- (float3 a, float3 b) { a.m = _mm_sub_ps(a.m, b.m); return a; }
VM_INLINE float3 operator* (float3 a, float3 b) { a.m = _mm_mul_ps(a.m, b.m); return a; }
VM_INLINE float3 operator/ (float3 a, float3 b) { a.m = _mm_div_ps(a.m, b.m); return a; }
VM_INLINE float3 operator* (float3 a, float b) { a.m = _mm_mul_ps(a.m, _mm_set1_ps(b)); return a; }
VM_INLINE float3 operator/ (float3 a, float b) { a.m = _mm_div_ps(a.m, _mm_set1_ps(b)); return a; }
VM_INLINE float3 operator* (float a, float3 b) { b.m = _mm_mul_ps(_mm_set1_ps(a), b.m); return b; }
VM_INLINE float3 operator/ (float a, float3 b) { b.m = _mm_div_ps(_mm_set1_ps(a), b.m); return b; }
VM_INLINE float3& operator+= (float3 &a, float3 b) { a = a + b; return a; }
VM_INLINE float3& operator-= (float3 &a, float3 b) { a = a - b; return a; }
VM_INLINE float3& operator*= (float3 &a, float3 b) { a = a * b; return a; }
VM_INLINE float3& operator/= (float3 &a, float3 b) { a = a / b; return a; }
VM_INLINE float3& operator*= (float3 &a, float b) { a = a * b; return a; }
VM_INLINE float3& operator/= (float3 &a, float b) { a = a / b; return a; }
VM_INLINE bool3 operator==(float3 a, float3 b) { a.m = _mm_cmpeq_ps(a.m, b.m); return a; }
VM_INLINE bool3 operator!=(float3 a, float3 b) { a.m = _mm_cmpneq_ps(a.m, b.m); return a; }
VM_INLINE bool3 operator< (float3 a, float3 b) { a.m = _mm_cmplt_ps(a.m, b.m); return a; }
VM_INLINE bool3 operator> (float3 a, float3 b) { a.m = _mm_cmpgt_ps(a.m, b.m); return a; }
VM_INLINE bool3 operator<=(float3 a, float3 b) { a.m = _mm_cmple_ps(a.m, b.m); return a; }
VM_INLINE bool3 operator>=(float3 a, float3 b) { a.m = _mm_cmpge_ps(a.m, b.m); return a; }
VM_INLINE float3 min(float3 a, float3 b) { a.m = _mm_min_ps(a.m, b.m); return a; }
VM_INLINE float3 max(float3 a, float3 b) { a.m = _mm_max_ps(a.m, b.m); return a; }
VM_INLINE float3 operator- (float3 a) { return float3(_mm_setzero_ps()) - a; }
VM_INLINE float hmin(float3 v)
{
v = min(v, SHUFFLE3(v, 1, 0, 2));
return min(v, SHUFFLE3(v, 2, 0, 1)).getX();
}
VM_INLINE float hmax(float3 v)
{
v = max(v, SHUFFLE3(v, 1, 0, 2));
return max(v, SHUFFLE3(v, 2, 0, 1)).getX();
}
VM_INLINE float3 cross(float3 a, float3 b)
{
// x <- a.y*b.z - a.z*b.y
// y <- a.z*b.x - a.x*b.z
// z <- a.x*b.y - a.y*b.x
// We can save a shuffle by grouping it in this wacky order:
return (a.zxy()*b - a*b.zxy()).zxy();
}
// Returns a 3-bit code where bit0..bit2 is X..Z
VM_INLINE unsigned mask(float3 v) { return _mm_movemask_ps(v.m) & 7; }
// Once we have a comparison, we can branch based on its results:
VM_INLINE bool any(bool3 v) { return mask(v) != 0; }
VM_INLINE bool all(bool3 v) { return mask(v) == 7; }
VM_INLINE float3 clamp(float3 t, float3 a, float3 b) { return min(max(t, a), b); }
VM_INLINE float sum(float3 v) { return v.getX() + v.getY() + v.getZ(); }
VM_INLINE float dot(float3 a, float3 b) { return sum(a*b); }
#else // #if FLOAT3_USE_SSE
struct float3
{
float3() : x(0), y(0), z(0) {}
float3(float x_, float y_, float z_) : x(x_), y(y_), z(z_) {}
float sqLength() const { return x*x+y*y+z*z; }
float length() const { return sqrtf(x*x+y*y+z*z); }
void normalize() { float k = 1.0f / length(); x *= k; y *= k; z *= k; }
float3 operator-() const { return float3(-x, -y, -z); }
float3& operator+=(const float3& o) { x+=o.x; y+=o.y; z+=o.z; return *this; }
float3& operator-=(const float3& o) { x-=o.x; y-=o.y; z-=o.z; return *this; }
float3& operator*=(const float3& o) { x*=o.x; y*=o.y; z*=o.z; return *this; }
float3& operator*=(float o) { x*=o; y*=o; z*=o; return *this; }
VM_INLINE float getX() const { return x; }
VM_INLINE float getY() const { return y; }
VM_INLINE float getZ() const { return z; }
VM_INLINE void store(float *p) const { p[0] = getX(); p[1] = getY(); p[2] = getZ(); }
float x, y, z;
};
inline void AssertUnit(const float3& v)
{
assert(fabsf(v.sqLength() - 1.0f) < 0.01f);
}
inline float3 operator+(const float3& a, const float3& b) { return float3(a.x+b.x,a.y+b.y,a.z+b.z); }
inline float3 operator-(const float3& a, const float3& b) { return float3(a.x-b.x,a.y-b.y,a.z-b.z); }
inline float3 operator*(const float3& a, const float3& b) { return float3(a.x*b.x,a.y*b.y,a.z*b.z); }
inline float3 operator*(const float3& a, float b) { return float3(a.x*b,a.y*b,a.z*b); }
inline float3 operator*(float a, const float3& b) { return float3(a*b.x,a*b.y,a*b.z); }
inline float dot(const float3& a, const float3& b) { return a.x*b.x+a.y*b.y+a.z*b.z; }
inline float3 cross(const float3& a, const float3& b)
VM_INLINE float3 operator+(const float3& a, const float3& b) { return float3(a.x+b.x,a.y+b.y,a.z+b.z); }
VM_INLINE float3 operator-(const float3& a, const float3& b) { return float3(a.x-b.x,a.y-b.y,a.z-b.z); }
VM_INLINE float3 operator*(const float3& a, const float3& b) { return float3(a.x*b.x,a.y*b.y,a.z*b.z); }
VM_INLINE float3 operator*(const float3& a, float b) { return float3(a.x*b,a.y*b,a.z*b); }
VM_INLINE float3 operator*(float a, const float3& b) { return float3(a*b.x,a*b.y,a*b.z); }
VM_INLINE float dot(const float3& a, const float3& b) { return a.x*b.x+a.y*b.y+a.z*b.z; }
VM_INLINE float3 cross(const float3& a, const float3& b)
{
return float3(
a.y*b.z - a.z*b.y,
-(a.x*b.z - a.z*b.x),
a.x*b.y - a.y*b.x
);
}
inline float3 normalize(const float3& v) { float k = 1.0f / v.length(); return float3(v.x*k, v.y*k, v.z*k); }
#endif // #else of #if FLOAT3_USE_SSE
VM_INLINE float length(float3 v) { return sqrtf(dot(v, v)); }
VM_INLINE float sqLength(float3 v) { return dot(v, v); }
VM_INLINE float3 normalize(float3 v) { return v * (1.0f / length(v)); }
VM_INLINE float3 lerp(float3 a, float3 b, float t) { return a + (b-a)*t; }
inline void AssertUnit(const float3& v)
{
assert(fabsf(sqLength(v) - 1.0f) < 0.01f);
}
inline float3 reflect(const float3& v, const float3& n)
{
return v - 2*dot(v,n)*n;
}
inline bool refract(const float3& v, const float3& n, float nint, float3& outRefracted)
{
AssertUnit(v);
@@ -129,7 +251,7 @@ struct Camera
Ray GetRay(float s, float t, uint32_t& state) const
{
float3 rd = lensRadius * RandomInUnitDisk(state);
float3 offset = u * rd.x + v * rd.y;
float3 offset = u * rd.getX() + v * rd.getY();
return Ray(origin + offset, normalize(lowerLeftCorner + s*horizontal + t*vertical - origin - offset));
}
@@ -83,7 +83,7 @@ static bool Scatter(const Material& mat, const Ray& r_in, const Hit& rec, float3
for (int i = 0; i < kSphereCount; ++i)
{
const Material& smat = s_SphereMats[i];
if (smat.emissive.x <= 0 && smat.emissive.y <= 0 && smat.emissive.z <= 0)
if (smat.emissive.getX() <= 0 && smat.emissive.getY() <= 0 && smat.emissive.getZ() <= 0)
continue; // skip non-emissive
if (&mat == &smat)
continue; // skip self
@@ -92,16 +92,16 @@ static bool Scatter(const Material& mat, const Ray& r_in, const Hit& rec, float3
// create a random direction towards sphere
// coord system for sampling: sw, su, sv
float3 sw = normalize(s.center - rec.pos);
float3 su = normalize(cross(fabs(sw.x)>0.01f ? float3(0,1,0):float3(1,0,0), sw));
float3 su = normalize(cross(fabs(sw.getX())>0.01f ? float3(0,1,0):float3(1,0,0), sw));
float3 sv = cross(sw, su);
// sample sphere by solid angle
float cosAMax = sqrtf(1.0f - s.radius*s.radius / (rec.pos-s.center).sqLength());
float cosAMax = sqrtf(1.0f - s.radius*s.radius / sqLength(rec.pos-s.center));
float eps1 = RandomFloat01(state), eps2 = RandomFloat01(state);
float cosA = 1.0f - eps1 + eps1 * cosAMax;
float sinA = sqrtf(1.0f - cosA*cosA);
float phi = 2 * kPI * eps2;
float3 l = su * cosf(phi) * sinA + sv * sin(phi) * sinA + sw * cosA;
l.normalize();
l = normalize(l);
// shoot shadow ray
Hit lightHit;
@@ -212,7 +212,7 @@ static float3 Trace(const Ray& r, int depth, int& inoutRayCount, uint32_t& state
return float3(0.15f,0.21f,0.3f); // easier compare with Mitsuba's constant environment light
#else
float3 unitDir = r.dir;
float t = 0.5f*(unitDir.y + 1.0f);
float t = 0.5f*(unitDir.getY() + 1.0f);
return ((1.0f-t)*float3(1.0f, 1.0f, 1.0f) + t*float3(0.5f, 0.7f, 1.0f)) * 0.3f;
#endif
}
@@ -272,9 +272,7 @@ static void TraceRowJob(uint32_t start, uint32_t end, uint32_t threadnum, void*
float3 prev(backbuffer[0], backbuffer[1], backbuffer[2]);
col = prev * lerpFac + col * (1-lerpFac);
backbuffer[0] = col.x;
backbuffer[1] = col.y;
backbuffer[2] = col.z;
col.store(backbuffer);
backbuffer += 4;
}
}
@@ -4,7 +4,7 @@
#include <windows.h>
#include <d3d11_1.h>
#define DO_COMPUTE 1
#define DO_COMPUTE 0
#include <stdio.h>

0 comments on commit 69df050

Please sign in to comment.