Permalink
Browse files

GPU buffer oriented: append global rays in parallel instead of only on thread 0 of the group; 619 -> 638 Mray/s
  • Loading branch information...
aras-p committed Apr 25, 2018
1 parent 30a62d2 commit 88e3e9ff5dfd4acee33cb14b35c09fbce4990bdb
Showing with 29 additions and 14 deletions.
  1. +12 −7 Cpp/Windows/ComputeShader.hlsl
  2. +7 −3 Cpp/Windows/ComputeShaderBounce.hlsl
  3. +10 −4 Cpp/Windows/ComputeShaderCameraRays.hlsl
@@ -233,6 +233,7 @@ RWStructuredBuffer<RayData> g_RayBufferDst : register(u2);
// a CS group execution is appended into the global one.
groupshared uint s_GroupRayCounter;
groupshared uint s_GroupRayGlobalStart;
#define kMaxGroupRays 768
groupshared RayData s_GroupRays[kMaxGroupRays];
@@ -243,15 +244,19 @@ void PushRayData(RayData rd)
s_GroupRays[index] = rd;
}
// Reserves room for this group's rays in the global ray buffer.
// Adds rayCount to the counter at offset 4 of g_OutCounts via InterlockedAdd and
// receives the third (output) argument into the groupshared s_GroupRayGlobalStart,
// which PushGlobalRayData then uses as the base index of its writes.
// rayCount: number of rays to reserve; the visible callers pass
//           min(s_GroupRayCounter, kMaxGroupRays).
// The visible call sites invoke this only when threadID == 0 and issue
// GroupMemoryBarrierWithGroupSync() before calling PushGlobalRayData.
// NOTE(review): presumably g_OutCounts is a RWByteAddressBuffer, so the output
// argument is the counter value before the add -- confirm against its declaration.
// NOTE(review): this listing is a diff whose +/- markers were lost. The old
// signature on the next line, and the rayCount/rayBufferStart/for lines in the
// body, look like the removed side of the change -- confirm against the repository.
void PushGlobalRayData()
void GetGlobalRayDataOffset(uint rayCount)
{
// append new rays into global buffer
uint rayCount = min(s_GroupRayCounter, kMaxGroupRays);
uint rayBufferStart;
g_OutCounts.InterlockedAdd(4, rayCount, rayBufferStart);
for (uint ir = 0; ir < rayCount; ++ir)
// single atomic add; the output lands in groupshared memory so other threads can read it
g_OutCounts.InterlockedAdd(4, rayCount, s_GroupRayGlobalStart);
}
// Copies rays from the groupshared s_GroupRays array into g_RayBufferDst,
// starting at index s_GroupRayGlobalStart, with the copy split across callers.
// threadID:  index of the calling thread; the visible callers pass tid.x or
//            tid.x + tid.y * kCSGroupSizeX.
// rayCount:  number of rays to copy; the visible callers pass
//            min(s_GroupRayCounter, kMaxGroupRays).
// groupSize: number of slices the copy is divided into; the visible callers pass
//            kCSRayBatchSize or kCSGroupSizeX*kCSGroupSizeY.
void PushGlobalRayData(uint threadID, uint rayCount, uint groupSize)
{
// rays per thread, rounded up so that groupSize slices cover all rayCount rays
uint myRayCount = (rayCount + groupSize - 1) / groupSize;
// each thread handles one contiguous slice of myRayCount indices
uint myRayStart = threadID * myRayCount;
for (uint ir = myRayStart; ir < myRayStart + myRayCount; ++ir)
{
// NOTE(review): rayBufferStart is not declared in this function; this line looks
// like the removed side of the diff (markers lost) -- confirm against the repository.
g_RayBufferDst[rayBufferStart + ir] = s_GroupRays[ir];
// because of the round-up, trailing slices can run past rayCount; skip those indices
if (ir < rayCount)
g_RayBufferDst[s_GroupRayGlobalStart + ir] = s_GroupRays[ir];
}
}
@@ -37,7 +37,8 @@ RWStructuredBuffer<SplatData> g_SplatBufferDst : register(u3);
[numthreads(kCSRayBatchSize, 1, 1)]
void main(uint3 gid : SV_DispatchThreadID, uint3 tid : SV_GroupThreadID)
{
if (tid.x == 0)
uint threadID = tid.x;
if (threadID == 0)
{
s_GroupRayCounter = 0;
s_GroupSplatCounter = 0;
@@ -91,12 +92,13 @@ void main(uint3 gid : SV_DispatchThreadID, uint3 tid : SV_GroupThreadID)
//if (s_GroupRayCounter > kMaxGroupRays)
// PushSplat(float3(0, 2, 0), pixelIndex);
if (tid.x == 0)
uint rayCount = min(s_GroupRayCounter, kMaxGroupRays);
if (threadID == 0)
{
// total ray counts (for perf indicator)
g_OutCounts.InterlockedAdd(0, kCSRayBatchSize);
PushGlobalRayData();
GetGlobalRayDataOffset(rayCount);
// append new splats into global buffer
uint splatBufferStart;
@@ -106,4 +108,6 @@ void main(uint3 gid : SV_DispatchThreadID, uint3 tid : SV_GroupThreadID)
g_SplatBufferDst[splatBufferStart + is] = s_GroupSplats[is];
}
}
GroupMemoryBarrierWithGroupSync();
PushGlobalRayData(threadID, rayCount, kCSRayBatchSize);
}
@@ -1,9 +1,12 @@
#include "ComputeShader.hlsl"
groupshared uint s_RayBufferStart;
[numthreads(kCSGroupSizeX, kCSGroupSizeY, 1)]
void main(uint3 gid : SV_DispatchThreadID, uint3 tid : SV_GroupThreadID)
{
if (tid.x == 0)
uint threadID = tid.x + tid.y * kCSGroupSizeX;
if (threadID == 0)
{
s_GroupRayCounter = 0;
}
@@ -44,10 +47,13 @@ void main(uint3 gid : SV_DispatchThreadID, uint3 tid : SV_GroupThreadID)
//if (s_GroupRayCounter > kMaxGroupRays)
// dstImage[gid.xy] += float4(2,0,0,0);
if (tid.x == 0 && tid.y == 0)
uint rayCount = min(s_GroupRayCounter, kMaxGroupRays);
if (threadID == 0)
{
g_OutCounts.InterlockedAdd(0, DO_SAMPLES_PER_PIXEL * kCSGroupSizeX * kCSGroupSizeY);
PushGlobalRayData();
GetGlobalRayDataOffset(rayCount);
}
GroupMemoryBarrierWithGroupSync();
PushGlobalRayData(threadID, rayCount, kCSGroupSizeX*kCSGroupSizeY);
}

0 comments on commit 88e3e9f

Please sign in to comment.