Skip to content

Commit

Permalink
GPU buffer oriented: append global rays in parallel instead of only on thread 0 of the group; 619 -> 638 Mray/s
Browse files Browse the repository at this point in the history
  • Loading branch information
aras-p committed Apr 25, 2018
1 parent 30a62d2 commit 88e3e9f
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 14 deletions.
19 changes: 12 additions & 7 deletions Cpp/Windows/ComputeShader.hlsl
Expand Up @@ -233,6 +233,7 @@ RWStructuredBuffer<RayData> g_RayBufferDst : register(u2);
// a CS group execution is appended into the global one.

groupshared uint s_GroupRayCounter;
groupshared uint s_GroupRayGlobalStart;
#define kMaxGroupRays 768
groupshared RayData s_GroupRays[kMaxGroupRays];

Expand All @@ -243,15 +244,19 @@ void PushRayData(RayData rd)
s_GroupRays[index] = rd;
}

void PushGlobalRayData()
void GetGlobalRayDataOffset(uint rayCount)
{
// append new rays into global buffer
uint rayCount = min(s_GroupRayCounter, kMaxGroupRays);
uint rayBufferStart;
g_OutCounts.InterlockedAdd(4, rayCount, rayBufferStart);
for (uint ir = 0; ir < rayCount; ++ir)
g_OutCounts.InterlockedAdd(4, rayCount, s_GroupRayGlobalStart);
}

void PushGlobalRayData(uint threadID, uint rayCount, uint groupSize)
{
uint myRayCount = (rayCount + groupSize - 1) / groupSize;
uint myRayStart = threadID * myRayCount;
for (uint ir = myRayStart; ir < myRayStart + myRayCount; ++ir)
{
g_RayBufferDst[rayBufferStart + ir] = s_GroupRays[ir];
if (ir < rayCount)
g_RayBufferDst[s_GroupRayGlobalStart + ir] = s_GroupRays[ir];
}
}

Expand Down
10 changes: 7 additions & 3 deletions Cpp/Windows/ComputeShaderBounce.hlsl
Expand Up @@ -37,7 +37,8 @@ RWStructuredBuffer<SplatData> g_SplatBufferDst : register(u3);
[numthreads(kCSRayBatchSize, 1, 1)]
void main(uint3 gid : SV_DispatchThreadID, uint3 tid : SV_GroupThreadID)
{
if (tid.x == 0)
uint threadID = tid.x;
if (threadID == 0)
{
s_GroupRayCounter = 0;
s_GroupSplatCounter = 0;
Expand Down Expand Up @@ -91,12 +92,13 @@ void main(uint3 gid : SV_DispatchThreadID, uint3 tid : SV_GroupThreadID)
//if (s_GroupRayCounter > kMaxGroupRays)
// PushSplat(float3(0, 2, 0), pixelIndex);

if (tid.x == 0)
uint rayCount = min(s_GroupRayCounter, kMaxGroupRays);
if (threadID == 0)
{
// total ray counts (for perf indicator)
g_OutCounts.InterlockedAdd(0, kCSRayBatchSize);

PushGlobalRayData();
GetGlobalRayDataOffset(rayCount);

// append new splats into global buffer
uint splatBufferStart;
Expand All @@ -106,4 +108,6 @@ void main(uint3 gid : SV_DispatchThreadID, uint3 tid : SV_GroupThreadID)
g_SplatBufferDst[splatBufferStart + is] = s_GroupSplats[is];
}
}
GroupMemoryBarrierWithGroupSync();
PushGlobalRayData(threadID, rayCount, kCSRayBatchSize);
}
14 changes: 10 additions & 4 deletions Cpp/Windows/ComputeShaderCameraRays.hlsl
@@ -1,9 +1,12 @@
#include "ComputeShader.hlsl"

groupshared uint s_RayBufferStart;

[numthreads(kCSGroupSizeX, kCSGroupSizeY, 1)]
void main(uint3 gid : SV_DispatchThreadID, uint3 tid : SV_GroupThreadID)
{
if (tid.x == 0)
uint threadID = tid.x + tid.y * kCSGroupSizeX;
if (threadID == 0)
{
s_GroupRayCounter = 0;
}
Expand Down Expand Up @@ -44,10 +47,13 @@ void main(uint3 gid : SV_DispatchThreadID, uint3 tid : SV_GroupThreadID)
//if (s_GroupRayCounter > kMaxGroupRays)
// dstImage[gid.xy] += float4(2,0,0,0);

if (tid.x == 0 && tid.y == 0)
uint rayCount = min(s_GroupRayCounter, kMaxGroupRays);
if (threadID == 0)
{
g_OutCounts.InterlockedAdd(0, DO_SAMPLES_PER_PIXEL * kCSGroupSizeX * kCSGroupSizeY);

PushGlobalRayData();
GetGlobalRayDataOffset(rayCount);
}
GroupMemoryBarrierWithGroupSync();

PushGlobalRayData(threadID, rayCount, kCSGroupSizeX*kCSGroupSizeY);
}

0 comments on commit 88e3e9f

Please sign in to comment.