Volta Tuning
minor tuning to make perft run faster on Volta
ankan-ban committed May 19, 2017
1 parent 4db3f1d commit 7b61a1b
Showing 4 changed files with 39 additions and 29 deletions.
14 changes: 10 additions & 4 deletions perft.cu
@@ -1041,6 +1041,7 @@ void removeNewLine(char *str)
}
}

#if PERFT_RECORDS_MODE == 1
void processPerftRecords(int argc, char *argv[])
{
int depth = 7;
@@ -1157,7 +1158,7 @@ void processPerftRecords(int argc, char *argv[])

printf("Retry launches: %d\n", numRetryLaunches);
}

#endif


int main(int argc, char *argv[])
@@ -1254,8 +1255,10 @@ int main(int argc, char *argv[])
uint32 launchDepth = estimateLaunchDepth(&testBB);
launchDepth = min(launchDepth, 11); // don't go too high

#if USE_TRANSPOSITION_TABLE == 0
// for best performance without GPU hash (also set PREALLOCATED_MEMORY_SIZE to 3 x 768MB)
// launchDepth = 6; // ankan - test!
launchDepth = 6; // ankan - test!
#endif

if (argc >= 5)
{
@@ -1272,19 +1275,22 @@ int main(int argc, char *argv[])
perftLauncher(&testBB, depth, launchDepth);
}


#if USE_TRANSPOSITION_TABLE == 1
freeHashTables();
#endif

for (int g = 0; g < numGPUs; g++)
{
cudaFree(preAllocatedBufferHost[g]);
cudaDeviceReset();
}


#if USE_TRANSPOSITION_TABLE == 1
printf("\nComplete hash sysmem memory usage: %llu bytes\n", ((uint64) chainIndex) * sizeof(CompleteHashEntry));
printf("\nMax tree storage GPU memory usage: %llu bytes\n", maxMemoryUsage);
printf("Regular depth %d Launches: %d\n", GPU_LAUNCH_DEPTH, numRegularLaunches);
printf("Retry launches: %d\n", numRetryLaunches);
#endif

return 0;
}
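The net effect of the perft.cu changes is that non-hash builds now hard-wire a shallow launch depth instead of trusting the estimate. A condensed, hedged sketch of the resulting selection logic, reusing names that appear in the diff (the PREALLOCATED_MEMORY_SIZE definition shown is an assumed format, not copied from switches.h):

```cuda
// Hedged sketch of the no-hash configuration suggested by the comment in main() above.
// The exact PREALLOCATED_MEMORY_SIZE definition is an assumption for illustration.
#define USE_TRANSPOSITION_TABLE  0
#define PREALLOCATED_MEMORY_SIZE (3ull * 768 * 1024 * 1024)   // 3 x 768 MB for the perft tree

uint32 launchDepth = estimateLaunchDepth(&testBB);            // heuristic estimate from the position
launchDepth = min(launchDepth, 11);                           // don't go too high

#if USE_TRANSPOSITION_TABLE == 0
launchDepth = 6;                                              // fixed shallow depth runs faster without the GPU hash
#endif
```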
21 changes: 7 additions & 14 deletions perft_bb.h
@@ -101,13 +101,6 @@ uint64 perft_bb(HexaBitBoardPosition *pos, uint32 depth)
}




// can be tuned as per need
// 256 works best for Maxwell
// (also make sure max registers used is set to 47)
#define BLOCK_SIZE 256

// fixed
#define WARP_SIZE 32

@@ -332,7 +325,7 @@ union sharedMemAllocs
};

#if LIMIT_REGISTER_USE == 1
__launch_bounds__( BLOCK_SIZE, 4 )
__launch_bounds__( BLOCK_SIZE, MIN_BLOCKS_PER_MP)
#endif
__global__ void perft_bb_gpu_single_level(HexaBitBoardPosition **positions, CMove *moves, uint64 *globalPerftCounter, int nThreads)
{
@@ -374,7 +367,7 @@ __global__ void perft_bb_gpu_single_level(HexaBitBoardPosition **positions, CMov
// positions - array of pointers to old boards
// generatedMoves - moves to be made
#if LIMIT_REGISTER_USE == 1
__launch_bounds__( BLOCK_SIZE, 4)
__launch_bounds__( BLOCK_SIZE, MIN_BLOCKS_PER_MP)
#endif
__global__ void makeMove_and_perft_single_level(HexaBitBoardPosition **positions, CMove *generatedMoves, uint64 *globalPerftCounter, int nThreads)
{
@@ -414,7 +407,7 @@ __global__ void makeMove_and_perft_single_level(HexaBitBoardPosition **positions

// same as above function but works with indices
#if LIMIT_REGISTER_USE == 1
__launch_bounds__(BLOCK_SIZE, 4)
__launch_bounds__(BLOCK_SIZE, MIN_BLOCKS_PER_MP)
#endif
__global__ void makeMove_and_perft_single_level_indices(HexaBitBoardPosition *positions, int *indices, CMove *moves, uint64 *globalPerftCounter, int nThreads)
{
@@ -456,7 +449,7 @@ __global__ void makeMove_and_perft_single_level_indices(HexaBitBoardPosition *po
// this version gets a separate perft counter per thread
// perftCounters[] is array of pointers to perft counters - where each thread should atomically add the computed perft
#if LIMIT_REGISTER_USE == 1
__launch_bounds__( BLOCK_SIZE, 4)
__launch_bounds__( BLOCK_SIZE, MIN_BLOCKS_PER_MP)
#endif
__global__ void makeMove_and_perft_single_level(HexaBitBoardPosition **positions, CMove *generatedMoves, uint64 **perftCounters, int nThreads)
{
@@ -511,7 +504,7 @@ __global__ void makeMove_and_perft_single_level(HexaBitBoardPosition **positions

// this version uses the indices[] array to index into parentPositions[] and parentCounters[] arrays
#if LIMIT_REGISTER_USE == 1
__launch_bounds__( BLOCK_SIZE, 4)
__launch_bounds__( BLOCK_SIZE, MIN_BLOCKS_PER_MP)
#endif
__global__ void makeMove_and_perft_single_level_indices(HexaBitBoardPosition *parentBoards, uint32 *parentCounters,
int *indices, CMove *moves, int nThreads)
@@ -569,7 +562,7 @@ __global__ void makeMove_and_perft_single_level_indices(HexaBitBoardPosition *pa
// the move counts are returned in moveCounts[] array
template <bool genBoard>
#if LIMIT_REGISTER_USE == 1
__launch_bounds__( BLOCK_SIZE, 4 )
__launch_bounds__( BLOCK_SIZE, MIN_BLOCKS_PER_MP)
#endif
__global__ void makemove_and_count_moves_single_level(HexaBitBoardPosition **positions, CMove *moves, HexaBitBoardPosition *outPositions, uint32 *moveCounts, int nThreads)
{
@@ -603,7 +596,7 @@ __global__ void makemove_and_count_moves_single_level(HexaBitBoardPosition **pos
// 2. makes the move on parent board to produce current board. Writes it to outPositions[].
// 3. Counts moves at current board position and writes it to moveCounts[].
#if LIMIT_REGISTER_USE == 1
__launch_bounds__(BLOCK_SIZE, 4)
__launch_bounds__(BLOCK_SIZE, MIN_BLOCKS_PER_MP)
#endif
__global__ void makemove_and_count_moves_single_level(HexaBitBoardPosition *parentBoards, int *indices, CMove *moves,
HexaBitBoardPosition *outPositions, int *moveCounts, int nThreads)
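Every kernel in perft_bb.h now takes its occupancy hint from the switches.h tunables instead of a hard-coded minimum of 4 blocks per SM. With BLOCK_SIZE = 384 and MIN_BLOCKS_PER_MP = 3 the compiler has to keep 3 x 384 = 1152 threads resident per SM, so on a 65,536-register Volta SM it caps each thread at roughly 65536 / 1152, about 56 registers. A minimal sketch of the pattern, mirroring the repository's placement of the attribute (the kernel body is a placeholder, not taken from the source):

```cuda
// Minimal sketch of the launch-bounds pattern used by the kernels above.
// Tunable values mirror switches.h; the kernel body is a placeholder.
#define BLOCK_SIZE         384
#define MIN_BLOCKS_PER_MP  3
#define LIMIT_REGISTER_USE 1

#if LIMIT_REGISTER_USE == 1
__launch_bounds__(BLOCK_SIZE, MIN_BLOCKS_PER_MP)    // request >= 3 resident blocks of 384 threads per SM
#endif
__global__ void exampleKernel(const int *in, int *out, int nThreads)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index >= nThreads)
        return;
    out[index] = in[index] * 2;                     // stand-in for the real per-move work
}
```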
8 changes: 4 additions & 4 deletions perft_gpu.vcxproj
@@ -35,11 +35,11 @@
</ItemGroup>
<ItemGroup>
<CudaCompile Include="perft.cu">
<CodeGeneration Condition="'$(Configuration)|$(Platform)'=='Release|x64'">compute_50,sm_50</CodeGeneration>
<GenerateRelocatableDeviceCode Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</GenerateRelocatableDeviceCode>
<GenerateRelocatableDeviceCode Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</GenerateRelocatableDeviceCode>
<CodeGeneration Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">compute_50,sm_50</CodeGeneration>
<GenerateRelocatableDeviceCode Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</GenerateRelocatableDeviceCode>
<MaxRegCount Condition="'$(Configuration)|$(Platform)'=='Release|x64'">0</MaxRegCount>
</CudaCompile>
</ItemGroup>
<PropertyGroup Label="Globals">
@@ -75,7 +75,7 @@
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 8.0.props" />
<Import Project="$(VCTargetsPath14)\BuildCustomizations\CUDA 8.0.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
@@ -182,11 +182,11 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
</PostBuildEvent>
<CudaCompile>
<TargetMachinePlatform>64</TargetMachinePlatform>
<CodeGeneration>compute_50,sm_50</CodeGeneration>
<CodeGeneration>compute_60,sm_60</CodeGeneration>
</CudaCompile>
</ItemDefinitionGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 8.0.targets" />
<Import Project="$(VCTargetsPath14)\BuildCustomizations\CUDA 8.0.targets" />
</ImportGroup>
</Project>
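For reference, the Release-configuration settings above map roughly onto the following nvcc options; this is a hedged reading of the project file, not a captured build command. Note that CUDA 8.0 has no sm_70 target, so Volta runs the compute_60 PTX through the driver's JIT.

```cuda
// Approximate nvcc equivalent of the Release settings above (assumption, not a captured build log):
//   -gencode arch=compute_60,code=sm_60   <- CodeGeneration: compute_60,sm_60
//   -rdc=true                             <- GenerateRelocatableDeviceCode: true
//   -maxrregcount=0                       <- MaxRegCount: 0 (no file-wide register cap;
//                                            per-kernel limits come from __launch_bounds__ instead)
```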
25 changes: 18 additions & 7 deletions switches.h
@@ -7,9 +7,18 @@
#endif


// can be tuned as per need
// 256 works best for Maxwell
// 384 works best for newer chips
// (also make sure max registers used is set to 47 on Maxwell and 64 on newer chips, or set max registers to 0 and enable LIMIT_REGISTER_USE)
#define BLOCK_SIZE 384

// limit max used registers to 64 for some kernels
// improves occupancy and performance (but doesn't work with debug info or debug builds)
#define LIMIT_REGISTER_USE 0
#define LIMIT_REGISTER_USE 1

// 3 works best with 384 block size on new chips
#define MIN_BLOCKS_PER_MP 3

// 768 MB preallocated memory size (for holding the perft tree in GPU memory)
// on systems with more video memory (like Titan X), we can use 3x of this to hold bigger trees
@@ -23,19 +32,21 @@
// the default is to use texture cache via __ldg instruction
// (doesn't really affect performance either way. Maybe just a tiny bit slower with fancy magics)
// Ankan - improves performance on Maxwell a LOT!
#define USE_CONSTANT_MEMORY_FOR_LUT 1
// - but hurts performance on newer hardware
#define USE_CONSTANT_MEMORY_FOR_LUT 0

// make use of a hash table to avoid duplicate calculations due to transpositions
#define USE_TRANSPOSITION_TABLE 0

#if USE_TRANSPOSITION_TABLE == 1

// flag en-passant capture only when it's possible (in the next move)
// default is to flag en-passant on every double pawn push
// This switch works only when using makeMove()
// This helps A LOT when using transposition tables (> 50% improvement in perft(12) time)!
// very slight regression without transposition tables, so enable it by default only when the TT is enabled
#define EXACT_EN_PASSENT_FLAGGING 1

// make use of a hash table to avoid duplicate calculations due to transpositions
#define USE_TRANSPOSITION_TABLE 1

#if USE_TRANSPOSITION_TABLE == 1

// find duplicates in the each level of the parallel breadth first search
// and make sure we explore them only once
#define FIND_DUPLICATES_IN_BFS 1
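With USE_CONSTANT_MEMORY_FOR_LUT back to 0, the lookup tables are read from global memory through the read-only (texture) cache via __ldg rather than from constant memory. A minimal sketch of the two access paths the switch selects between, with an assumed table name, size, and typedef purely for illustration (the real tables live elsewhere in the source):

```cuda
// Minimal sketch of the two lookup paths USE_CONSTANT_MEMORY_FOR_LUT chooses between.
// "lutTable", its 64-entry size, and the uint64 typedef are assumptions for illustration.
typedef unsigned long long uint64;

#if USE_CONSTANT_MEMORY_FOR_LUT == 1
__constant__ uint64 lutTable[64];                   // table resident in constant memory
__device__ __forceinline__ uint64 lutLookup(int sq)
{
    return lutTable[sq];                            // served by the constant cache
}
#else
__device__ uint64 lutTable[64];                     // table in global memory
__device__ __forceinline__ uint64 lutLookup(int sq)
{
    return __ldg(&lutTable[sq]);                    // route the read through the read-only/texture cache
}
#endif
```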
