Volta Tuning
minor tuning to make perft run faster on Volta
ankan-ban committed May 19, 2017
1 parent 4db3f1d commit 7b61a1b
Showing 4 changed files with 39 additions and 29 deletions.
14 changes: 10 additions & 4 deletions perft.cu
@@ -1041,6 +1041,7 @@ void removeNewLine(char *str)
}
}

#if PERFT_RECORDS_MODE == 1
void processPerftRecords(int argc, char *argv[])
{
int depth = 7;
@@ -1157,7 +1158,7 @@ void processPerftRecords(int argc, char *argv[])

printf("Retry launches: %d\n", numRetryLaunches);
}

#endif


int main(int argc, char *argv[])
@@ -1254,8 +1255,10 @@ int main(int argc, char *argv[])
uint32 launchDepth = estimateLaunchDepth(&testBB);
launchDepth = min(launchDepth, 11); // don't go too high

#if USE_TRANSPOSITION_TABLE == 0
// for best performance without GPU hash (also set PREALLOCATED_MEMORY_SIZE to 3 x 768MB)
// launchDepth = 6; // ankan - test!
launchDepth = 6; // ankan - test!
#endif

if (argc >= 5)
{
@@ -1272,19 +1275,22 @@ int main(int argc, char *argv[])
perftLauncher(&testBB, depth, launchDepth);
}


#if USE_TRANSPOSITION_TABLE == 1
freeHashTables();
#endif

for (int g = 0; g < numGPUs; g++)
{
cudaFree(preAllocatedBufferHost[g]);
cudaDeviceReset();
}


#if USE_TRANSPOSITION_TABLE == 1
printf("\nComplete hash sysmem memory usage: %llu bytes\n", ((uint64) chainIndex) * sizeof(CompleteHashEntry));
printf("\nMax tree storage GPU memory usage: %llu bytes\n", maxMemoryUsage);
printf("Regular depth %d Launches: %d\n", GPU_LAUNCH_DEPTH, numRegularLaunches);
printf("Retry launches: %d\n", numRetryLaunches);
#endif

return 0;
}
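The net effect of the perft.cu changes is that non-hash builds now hard-wire a shallow launch depth instead of trusting the estimate. A condensed, hedged sketch of the resulting selection logic, reusing names that appear in the diff (the PREALLOCATED_MEMORY_SIZE definition shown is an assumed format, not copied from switches.h):

```cuda
// Hedged sketch of the no-hash configuration suggested by the comment in main() above.
// The exact PREALLOCATED_MEMORY_SIZE definition is an assumption for illustration.
#define USE_TRANSPOSITION_TABLE  0
#define PREALLOCATED_MEMORY_SIZE (3ull * 768 * 1024 * 1024)   // 3 x 768 MB for the perft tree

uint32 launchDepth = estimateLaunchDepth(&testBB);            // heuristic estimate from the position
launchDepth = min(launchDepth, 11);                           // don't go too high

#if USE_TRANSPOSITION_TABLE == 0
launchDepth = 6;                                              // fixed shallow depth runs faster without the GPU hash
#endif
```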
21 changes: 7 additions & 14 deletions perft_bb.h
@@ -101,13 +101,6 @@ uint64 perft_bb(HexaBitBoardPosition *pos, uint32 depth)
}




// can be tuned as per need
// 256 works best for Maxwell
// (also make sure max registers used is set to 47)
#define BLOCK_SIZE 256

// fixed
#define WARP_SIZE 32

@@ -332,7 +325,7 @@ union sharedMemAllocs
};

#if LIMIT_REGISTER_USE == 1
__launch_bounds__( BLOCK_SIZE, 4 )
__launch_bounds__( BLOCK_SIZE, MIN_BLOCKS_PER_MP)
#endif
__global__ void perft_bb_gpu_single_level(HexaBitBoardPosition **positions, CMove *moves, uint64 *globalPerftCounter, int nThreads)
{
@@ -374,7 +367,7 @@ __global__ void perft_bb_gpu_single_level(HexaBitBoardPosition **positions, CMov
// positions - array of pointers to old boards
// generatedMoves - moves to be made
#if LIMIT_REGISTER_USE == 1
__launch_bounds__( BLOCK_SIZE, 4)
__launch_bounds__( BLOCK_SIZE, MIN_BLOCKS_PER_MP)
#endif
__global__ void makeMove_and_perft_single_level(HexaBitBoardPosition **positions, CMove *generatedMoves, uint64 *globalPerftCounter, int nThreads)
{
@@ -414,7 +407,7 @@ __global__ void makeMove_and_perft_single_level(HexaBitBoardPosition **positions

// same as above function but works with indices
#if LIMIT_REGISTER_USE == 1
__launch_bounds__(BLOCK_SIZE, 4)
__launch_bounds__(BLOCK_SIZE, MIN_BLOCKS_PER_MP)
#endif
__global__ void makeMove_and_perft_single_level_indices(HexaBitBoardPosition *positions, int *indices, CMove *moves, uint64 *globalPerftCounter, int nThreads)
{
@@ -456,7 +449,7 @@ __global__ void makeMove_and_perft_single_level_indices(HexaBitBoardPosition *po
// this version gets a separate perft counter per thread
// perftCounters[] is array of pointers to perft counters - where each thread should atomically add the computed perft
#if LIMIT_REGISTER_USE == 1
__launch_bounds__( BLOCK_SIZE, 4)
__launch_bounds__( BLOCK_SIZE, MIN_BLOCKS_PER_MP)
#endif
__global__ void makeMove_and_perft_single_level(HexaBitBoardPosition **positions, CMove *generatedMoves, uint64 **perftCounters, int nThreads)
{
@@ -511,7 +504,7 @@ __global__ void makeMove_and_perft_single_level(HexaBitBoardPosition **positions

// this version uses the indices[] array to index into parentPositions[] and parentCounters[] arrays
#if LIMIT_REGISTER_USE == 1
__launch_bounds__( BLOCK_SIZE, 4)
__launch_bounds__( BLOCK_SIZE, MIN_BLOCKS_PER_MP)
#endif
__global__ void makeMove_and_perft_single_level_indices(HexaBitBoardPosition *parentBoards, uint32 *parentCounters,
int *indices, CMove *moves, int nThreads)
@@ -569,7 +562,7 @@ __global__ void makeMove_and_perft_single_level_indices(HexaBitBoardPosition *pa
// the move counts are returned in moveCounts[] array
template <bool genBoard>
#if LIMIT_REGISTER_USE == 1
__launch_bounds__( BLOCK_SIZE, 4 )
__launch_bounds__( BLOCK_SIZE, MIN_BLOCKS_PER_MP)
#endif
__global__ void makemove_and_count_moves_single_level(HexaBitBoardPosition **positions, CMove *moves, HexaBitBoardPosition *outPositions, uint32 *moveCounts, int nThreads)
{
@@ -603,7 +596,7 @@ __global__ void makemove_and_count_moves_single_level(HexaBitBoardPosition **pos
// 2. makes the move on parent board to produce current board. Writes it to outPositions[].
// 3. Counts moves at current board position and writes it to moveCounts[].
#if LIMIT_REGISTER_USE == 1
__launch_bounds__(BLOCK_SIZE, 4)
__launch_bounds__(BLOCK_SIZE, MIN_BLOCKS_PER_MP)
#endif
__global__ void makemove_and_count_moves_single_level(HexaBitBoardPosition *parentBoards, int *indices, CMove *moves,
HexaBitBoardPosition *outPositions, int *moveCounts, int nThreads)
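Every kernel in perft_bb.h now takes its occupancy hint from the switches.h tunables instead of a hard-coded minimum of 4 blocks per SM. With BLOCK_SIZE = 384 and MIN_BLOCKS_PER_MP = 3 the compiler has to keep 3 x 384 = 1152 threads resident per SM, so on a 65,536-register Volta SM it caps each thread at roughly 65536 / 1152, about 56 registers. A minimal sketch of the pattern, mirroring the repository's placement of the attribute (the kernel body is a placeholder, not taken from the source):

```cuda
// Minimal sketch of the launch-bounds pattern used by the kernels above.
// Tunable values mirror switches.h; the kernel body is a placeholder.
#define BLOCK_SIZE         384
#define MIN_BLOCKS_PER_MP  3
#define LIMIT_REGISTER_USE 1

#if LIMIT_REGISTER_USE == 1
__launch_bounds__(BLOCK_SIZE, MIN_BLOCKS_PER_MP)    // request >= 3 resident blocks of 384 threads per SM
#endif
__global__ void exampleKernel(const int *in, int *out, int nThreads)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index >= nThreads)
        return;
    out[index] = in[index] * 2;                     // stand-in for the real per-move work
}
```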
8 changes: 4 additions & 4 deletions perft_gpu.vcxproj
@@ -35,11 +35,11 @@
</ItemGroup>
<ItemGroup>
<CudaCompile Include="perft.cu">
<CodeGeneration Condition="'$(Configuration)|$(Platform)'=='Release|x64'">compute_50,sm_50</CodeGeneration>
<GenerateRelocatableDeviceCode Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</GenerateRelocatableDeviceCode>
<GenerateRelocatableDeviceCode Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</GenerateRelocatableDeviceCode>
<CodeGeneration Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">compute_50,sm_50</CodeGeneration>
<GenerateRelocatableDeviceCode Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</GenerateRelocatableDeviceCode>
<MaxRegCount Condition="'$(Configuration)|$(Platform)'=='Release|x64'">0</MaxRegCount>
</CudaCompile>
</ItemGroup>
<PropertyGroup Label="Globals">
@@ -75,7 +75,7 @@
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 8.0.props" />
<Import Project="$(VCTargetsPath14)\BuildCustomizations\CUDA 8.0.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
@@ -182,11 +182,11 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
</PostBuildEvent>
<CudaCompile>
<TargetMachinePlatform>64</TargetMachinePlatform>
<CodeGeneration>compute_50,sm_50</CodeGeneration>
<CodeGeneration>compute_60,sm_60</CodeGeneration>
</CudaCompile>
</ItemDefinitionGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 8.0.targets" />
<Import Project="$(VCTargetsPath14)\BuildCustomizations\CUDA 8.0.targets" />
</ImportGroup>
</Project>
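For reference, the Release-configuration settings above map roughly onto the following nvcc options; this is a hedged reading of the project file, not a captured build command. Note that CUDA 8.0 has no sm_70 target, so Volta runs the compute_60 PTX through the driver's JIT.

```cuda
// Approximate nvcc equivalent of the Release settings above (assumption, not a captured build log):
//   -gencode arch=compute_60,code=sm_60   <- CodeGeneration: compute_60,sm_60
//   -rdc=true                             <- GenerateRelocatableDeviceCode: true
//   -maxrregcount=0                       <- MaxRegCount: 0 (no file-wide register cap;
//                                            per-kernel limits come from __launch_bounds__ instead)
```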
25 changes: 18 additions & 7 deletions switches.h
@@ -7,9 +7,18 @@
#endif


// can be tuned as per need
// 256 works best for Maxwell
// 384 works best for newer chips
// (also make sure max registers used is set to 47 on Maxwell and 64 on newer chips, or set max registers to 0 and enable LIMIT_REGISTER_USE)
#define BLOCK_SIZE 384

// limit max used registers to 64 for some kernels
// improves occupancy and performance (but doesn't work with debug info or debug builds)
#define LIMIT_REGISTER_USE 0
#define LIMIT_REGISTER_USE 1

// 3 works best with 384 block size on new chips
#define MIN_BLOCKS_PER_MP 3

// 768 MB preallocated memory size (for holding the perft tree in GPU memory)
// on systems with more video memory (like Titan X), we can use 3x of this to hold bigger trees
@@ -23,19 +32,21 @@
// the default is to use texture cache via __ldg instruction
// (doesn't really affect performance either way. Maybe just a tiny bit slower with fancy magics)
// Ankan - improves performance on Maxwell a LOT!
#define USE_CONSTANT_MEMORY_FOR_LUT 1
// - but hurts performance on newer hardware
#define USE_CONSTANT_MEMORY_FOR_LUT 0

// make use of a hash table to avoid duplicate calculations due to transpositions
#define USE_TRANSPOSITION_TABLE 0

#if USE_TRANSPOSITION_TABLE == 1

// flag en-passant capture only when it's possible (in the next move)
// default is to flag en-passant on every double pawn push
// This switch works only when using makeMove()
// This helps A LOT when using transposition tables (> 50% improvement in perft(12) time)!
// very slight regression without transposition tables, so enable it by default only when the TT is enabled
#define EXACT_EN_PASSENT_FLAGGING 1

// make use of a hash table to avoid duplicate calculations due to transpositions
#define USE_TRANSPOSITION_TABLE 1

#if USE_TRANSPOSITION_TABLE == 1

// find duplicates in the each level of the parallel breadth first search
// and make sure we explore them only once
#define FIND_DUPLICATES_IN_BFS 1
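With USE_CONSTANT_MEMORY_FOR_LUT back to 0, the lookup tables are read from global memory through the read-only (texture) cache via __ldg rather than from constant memory. A minimal sketch of the two access paths the switch selects between, with an assumed table name, size, and typedef purely for illustration (the real tables live elsewhere in the source):

```cuda
// Minimal sketch of the two lookup paths USE_CONSTANT_MEMORY_FOR_LUT chooses between.
// "lutTable", its 64-entry size, and the uint64 typedef are assumptions for illustration.
typedef unsigned long long uint64;

#if USE_CONSTANT_MEMORY_FOR_LUT == 1
__constant__ uint64 lutTable[64];                   // table resident in constant memory
__device__ __forceinline__ uint64 lutLookup(int sq)
{
    return lutTable[sq];                            // served by the constant cache
}
#else
__device__ uint64 lutTable[64];                     // table in global memory
__device__ __forceinline__ uint64 lutLookup(int sq)
{
    return __ldg(&lutTable[sq]);                    // route the read through the read-only/texture cache
}
#endif
```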
