Permalink
Browse files

C++ buffer oriented: more compact RayData struct (FP16 for direction/color), 1125->787MB ray buffers (40->28 bytes/ray).

- PC 133 -> 161 Mray/s
- Mac 39.5 -> 40.5 Mray/s
  • Loading branch information...
aras-p committed Apr 19, 2018
1 parent c6b84a1 commit ada1ff9fdecacc91e201eea4d796b48425f7a0c4
Showing with 55 additions and 7 deletions.
  1. +2 −0 Cpp/Mac/Test.xcodeproj/project.pbxproj
  2. +42 −0 Cpp/Source/Maths.h
  3. +11 −7 Cpp/Source/Test.cpp
@@ -330,6 +330,7 @@
GCC_FAST_MATH = YES;
INFOPLIST_FILE = "$(SRCROOT)/Info.plist";
LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/../Frameworks";
OTHER_CFLAGS = "-mf16c";
PRODUCT_BUNDLE_IDENTIFIER = com.aras.Test;
PRODUCT_NAME = "$(TARGET_NAME)";
};
@@ -344,6 +345,7 @@
GCC_FAST_MATH = YES;
INFOPLIST_FILE = "$(SRCROOT)/Info.plist";
LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/../Frameworks";
OTHER_CFLAGS = "-mf16c";
PRODUCT_BUNDLE_IDENTIFIER = com.aras.Test;
PRODUCT_NAME = "$(TARGET_NAME)";
};
View
@@ -296,3 +296,45 @@ struct Camera
float lensRadius;
};
// Use F16C instructions for float<->half conversions. Intel CPUs have had this since Ivy Bridge (2012),
// and AMD since Piledriver (2012).
#include <immintrin.h>
inline int16_t FloatToHalf(float val)
{
__m128 f = _mm_set1_ps(val);
__m128i h = _mm_cvtps_ph(f, _MM_FROUND_CUR_DIRECTION);
return _mm_extract_epi16(h, 0);
}
inline float HalfToFloat(int16_t val_)
{
__m128i h = _mm_set1_epi16(val_);
__m128 f = _mm_cvtph_ps(h);
return _mm_cvtss_f32(f);
}
// Converts the three components of `val` to half precision and writes the
// resulting 16-bit patterns to dst[0], dst[1] and dst[2].
inline void Float3ToHalf3(float3 val, int16_t* dst)
{
#if DO_FLOAT3_WITH_SSE
    // Convert the whole register at once, spill the low four halves to a
    // scratch array and copy out the three that are wanted.
    int16_t lanes[4];
    _mm_storel_epi64((__m128i*)lanes, _mm_cvtps_ph(val.m, _MM_FROUND_CUR_DIRECTION));
    dst[0] = lanes[0];
    dst[1] = lanes[1];
    dst[2] = lanes[2];
#else
    // Scalar fallback: one conversion per component.
    const int16_t hx = FloatToHalf(val.getX());
    dst[0] = hx;
    const int16_t hy = FloatToHalf(val.getY());
    dst[1] = hy;
    const int16_t hz = FloatToHalf(val.getZ());
    dst[2] = hz;
#endif
}
// Reads three half-precision bit patterns from src[0..2] and returns them
// expanded to a float3.
inline float3 Half3ToFloat3(const int16_t* src)
{
#if DO_FLOAT3_WITH_SSE
    // Pack the three halves into the low lanes (upper lanes zero), then
    // widen them all with a single conversion.
    const __m128i packed = _mm_setr_epi16(src[0], src[1], src[2], 0, 0, 0, 0, 0);
    return float3(_mm_cvtph_ps(packed));
#else
    // Scalar fallback: one conversion per component.
    const float x = HalfToFloat(src[0]);
    const float y = HalfToFloat(src[1]);
    const float z = HalfToFloat(src[2]);
    return float3(x, y, z);
#endif
}
View
@@ -75,15 +75,19 @@ struct RayData
RayData() {}
RayData(const Ray& r, const float3& atten, uint32_t pixelIndex_, uint32_t lightID_, bool shadow_, bool skipEmission_)
: origX(r.orig.getX()), origY(r.orig.getY()), origZ(r.orig.getZ())
, dirX(r.dir.getX()), dirY(r.dir.getY()), dirZ(r.dir.getZ())
, attenX(atten.getX()), attenY(atten.getY()), attenZ(atten.getZ())
, pixelIndex(pixelIndex_), lightID(lightID_), shadow(shadow_), skipEmission(skipEmission_) {}
, pixelIndex(pixelIndex_), lightID(lightID_), shadow(shadow_), skipEmission(skipEmission_)
{
Float3ToHalf3(r.dir, &dirX);
Float3ToHalf3(atten, &attenX);
}
Ray GetRay() const { return Ray(float3(origX,origY,origZ), Half3ToFloat3(&dirX)); }
float3 GetAtten() const { return Half3ToFloat3(&attenX); }
Ray GetRay() const { return Ray(float3(origX,origY,origZ), float3(dirX,dirY,dirZ)); }
float3 GetAtten() const { return float3(attenX,attenY,attenZ); }
float origX, origY, origZ;
float dirX, dirY, dirZ;
float attenX, attenY, attenZ;
// Store direction & attenuation in FP16 ("half"), to save on memory size for ray buffers
int16_t dirX, dirY, dirZ;
int16_t attenX, attenY, attenZ;
uint32_t pixelIndex : 22; // 1280x720x4 can fit into 22 bits; we use much less since it's job-local
uint32_t lightID : 8;
uint32_t shadow : 1;

0 comments on commit ada1ff9

Please sign in to comment.