diff --git a/.clang-format b/.clang-format new file mode 100644 index 00000000..5708a4fb --- /dev/null +++ b/.clang-format @@ -0,0 +1,77 @@ +--- +AccessModifierOffset: -4 +AlignAfterOpenBracket: AlwaysBreak +AlignConsecutiveAssignments: false +AlignConsecutiveDeclarations: false +AlignEscapedNewlines: DontAlign +AlignOperands: false +AlignTrailingComments: false +AllowAllParametersOfDeclarationOnNextLine: false +AllowShortBlocksOnASingleLine: false +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: Empty +AllowShortIfStatementsOnASingleLine: false +AllowShortLoopsOnASingleLine: true +AlwaysBreakAfterReturnType: None +AlwaysBreakBeforeMultilineStrings: true +AlwaysBreakTemplateDeclarations: Yes +BinPackArguments: false +BinPackParameters: false +BreakBeforeBraces: Custom +BraceWrapping: + AfterClass: true + AfterControlStatement: true + AfterEnum: true + AfterFunction: true + AfterNamespace: true + AfterStruct: true + AfterUnion: true + AfterExternBlock: true + BeforeCatch: true + BeforeElse: true + IndentBraces: false + SplitEmptyFunction: false + SplitEmptyRecord: false + SplitEmptyNamespace: false +BreakBeforeBinaryOperators: All +BreakBeforeTernaryOperators: true +BreakConstructorInitializers: AfterColon +BreakInheritanceList: AfterColon +BreakStringLiterals: true +ColumnLimit: 80 +CompactNamespaces: false +ConstructorInitializerAllOnOneLineOrOnePerLine: true +ConstructorInitializerIndentWidth: 8 +ContinuationIndentWidth: 4 +Cpp11BracedListStyle: true +DerivePointerAlignment: false +FixNamespaceComments: false +IncludeBlocks: Regroup +IndentCaseLabels: false +IndentPPDirectives: None +IndentWidth: 4 +IndentWrappedFunctionNames: false +KeepEmptyLinesAtTheStartOfBlocks: false +Language: Cpp +NamespaceIndentation: All +PointerAlignment: Middle +ReflowComments: true +SortIncludes: true +SortUsingDeclarations: true +SpaceAfterCStyleCast: false +SpaceAfterTemplateKeyword: false +SpaceBeforeAssignmentOperators: true +SpaceBeforeCpp11BracedList: false +SpaceBeforeCtorInitializerColon: true +SpaceBeforeInheritanceColon: true +SpaceBeforeParens: Never +SpaceBeforeRangeBasedForLoopColon: true +SpaceInEmptyParentheses: false +SpacesInAngles: false +SpacesInCStyleCastParentheses: false +SpacesInContainerLiterals: false +SpacesInParentheses: false +SpacesInSquareBrackets: false +Standard: Cpp11 +UseTab: Never +... 
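As a rough illustration only (not part of the patch, and using invented names), the snippet below sketches the layout this .clang-format configuration produces: 4-space indentation with fully indented namespaces, opening braces on their own lines, access specifiers outdented by four columns, no space before parentheses, and an 80-column limit — the same style visible in the reformatted headers further down in this diff.

```c++
// Invented example, formatted in the style configured above
namespace mallocMC
{
    namespace ExamplePolicies
    {
        class ExamplePadder
        {
            static constexpr unsigned dataAlignment = 16;

        public:
            // round a request up to the next multiple of dataAlignment
            // (which must be a power of two for this bit trick to work)
            static auto applyPadding(unsigned bytes) -> unsigned
            {
                return (bytes + dataAlignment - 1) & ~(dataAlignment - 1);
            }
        };
    } // namespace ExamplePolicies
} // namespace mallocMC
```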
diff --git a/.clang-tidy b/.clang-tidy new file mode 100644 index 00000000..3292a65b --- /dev/null +++ b/.clang-tidy @@ -0,0 +1,3 @@ +--- +Checks: '*,-llvm-header-guard,-fuchsia-default-arguments-declarations,-cppcoreguidelines-no-malloc,-cppcoreguidelines-owning-memory,-misc-non-private-member-variables-in-classes' +HeaderFilterRegex: '.*' diff --git a/.gitignore b/.gitignore index b8056488..9a640b01 100644 --- a/.gitignore +++ b/.gitignore @@ -14,3 +14,5 @@ *~ /nbproject +/.vs +/build diff --git a/.travis.yml b/.travis.yml index 8bf68ced..4a169b7a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,7 +2,7 @@ language: cpp sudo: required -dist: trusty +dist: bionic compiler: - gcc @@ -14,6 +14,7 @@ env: script: - mkdir build_tmp && cd build_tmp + - CXX=g++-5 && CC=gcc-5 - cmake -DCMAKE_INSTALL_PREFIX=$INSTALL_DIR $TRAVIS_BUILD_DIR - make - make install @@ -28,12 +29,12 @@ before_script: - sudo apt-get install -f -qq - sudo dpkg --get-selections | grep hold || { echo "All packages OK."; } - sudo apt-get install -q -y cmake-data cmake - - sudo apt-get install -qq build-essential - - gcc --version && g++ --version # 4.8 + - sudo apt-get install -qq build-essential g++-5 + - gcc-5 --version && g++-5 --version # 5.5.0 - apt-cache search nvidia-* - sudo apt-get install -qq nvidia-common - - sudo apt-get install -qq nvidia-cuda-dev nvidia-cuda-toolkit # 5.5 - - sudo apt-get install -qq libboost-dev # 1.54.0 + - sudo apt-get install -qq nvidia-cuda-dev nvidia-cuda-toolkit # 9.1.85 + - sudo apt-get install -qq libboost-dev # 1.65.1 - sudo find /usr/ -name libcuda*.so after_script: diff --git a/CHANGELOG.md b/CHANGELOG.md index 928f3015..3b3e5025 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,27 @@ Change Log / Release Log for mallocMC ================================================================ +2.4.0crp +-------- +**Date:** 2020-05-28 + +This release removes the Boost dependency and switches to C++11. + +### Changes to mallocMC 2.3.1crp + +**Features** + - Cleaning, remove Boost dependency & C++11 Migration #169 + +**Bug fixes** + - Choose the value for the -arch nvcc flag depending on CUDA version #164 #165 + +**Misc:** + - Travis CI: GCC 5.5.0 + CUDA 9.1.85 #170 + - Adding headers to projects and applied clang-tidy #171 + - clang-format #172 + +Thanks to Sergei Bastrakov, Bernhard Manfred Gruber and Axel Huebl for contributing to this release! 
+ 2.3.1crp -------- **Date:** 2019-02-14 diff --git a/CMakeLists.txt b/CMakeLists.txt index 4376bd13..ded0f630 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,7 +1,12 @@ -project(mallocMC) -cmake_minimum_required(VERSION 2.8.12.2) +project(mallocMC LANGUAGES CUDA CXX) +cmake_minimum_required(VERSION 3.8) + +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD_REQUIRED ON) # helper for libs and packages +set(CMAKE_CUDA_STANDARD 11) +set(CMAKE_CUDA_STANDARD_REQUIRED ON) set(CMAKE_PREFIX_PATH "/usr/lib/x86_64-linux-gnu/" "$ENV{CUDA_ROOT}" "$ENV{BOOST_ROOT}") @@ -14,64 +19,37 @@ set(CMAKE_PREFIX_PATH "/usr/lib/x86_64-linux-gnu/" ################################################################################ if(POLICY CMP0074) - cmake_policy(SET CMP0074 NEW) + cmake_policy(SET CMP0074 NEW) endif() ############################################################################### # CUDA ############################################################################### -find_package(CUDA REQUIRED) -set(CUDA_NVCC_FLAGS "-arch=sm_20;-use_fast_math;") -set(CUDA_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}) -include_directories(${CUDA_INCLUDE_DIRS}) -cuda_include_directories(${CUDA_INCLUDE_DIRS}) +if(NOT DEFINED COMPUTE_CAPABILITY) + set(COMPUTE_CAPABILITY "30") +endif() +set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -arch=sm_${COMPUTE_CAPABILITY} -use_fast_math") OPTION(CUDA_OUTPUT_INTERMEDIATE_CODE "Output ptx code" OFF) if(CUDA_OUTPUT_INTERMEDIATE_CODE) -set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-Xptxas;-v;--keep") -endif(CUDA_OUTPUT_INTERMEDIATE_CODE) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xptxas -v --keep") +endif() SET(CUDA_OPTIMIZATION_TYPE "unset" CACHE STRING "CUDA Optimization") set_property(CACHE CUDA_OPTIMIZATION_TYPE PROPERTY STRINGS "unset;-G0;-O0;-O1;-O2;-O3") -if(NOT ${CUDA_OPTIMIZATION_TYPE} STREQUAL "unset") - set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};${CUDA_OPTIMIZATION_TYPE}") +if(NOT ${CUDA_OPTIMIZATION_TYPE} STREQUAL "unset") + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${CUDA_OPTIMIZATION_TYPE}") endif() -############################################################################### -# Boost -############################################################################### -find_package(Boost 1.48.0 REQUIRED) -include_directories(SYSTEM ${Boost_INCLUDE_DIRS}) -set(LIBS ${LIBS} ${Boost_LIBRARIES}) - -# nvcc + boost 1.55 work around -if(Boost_VERSION EQUAL 105500) - set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} \"-DBOOST_NOINLINE=__attribute__((noinline))\" ") -endif(Boost_VERSION EQUAL 105500) - - ################################################################################ # Warnings ################################################################################ -# GNU if(CMAKE_COMPILER_IS_GNUCXX) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wshadow") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unknown-pragmas") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wextra") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-parameter") - # new warning in gcc 4.8 (flag ignored in previous version) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-local-typedefs") -# ICC + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wshadow -Wno-unknown-pragmas -Wextra -Wno-unused-parameter -Wno-unused-local-typedefs") elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wshadow") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DBOOST_NO_VARIADIC_TEMPLATES") - 
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DBOOST_NO_CXX11_VARIADIC_TEMPLATES") -    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DBOOST_NO_FENV_H") -# PGI +    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wshadow") elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "PGI")     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Minform=inform") endif() @@ -87,28 +65,19 @@ INSTALL(   DESTINATION include   PATTERN ".git" EXCLUDE   PATTERN "mallocMC_config.hpp" EXCLUDE -  ) +) ############################################################################### # Executables ############################################################################### +file(GLOB_RECURSE headers src/include/**) +add_custom_target(mallocMC SOURCES ${headers}) # create a target with the header files for IDE projects +source_group(TREE ${CMAKE_CURRENT_LIST_DIR}/src/include FILES ${headers}) + +include_directories(${CMAKE_CURRENT_LIST_DIR}/src/include) +add_executable(mallocMC_Example01 EXCLUDE_FROM_ALL examples/mallocMC_example01.cu examples/mallocMC_example01_config.hpp) +add_executable(mallocMC_Example02 EXCLUDE_FROM_ALL examples/mallocMC_example02.cu) +add_executable(mallocMC_Example03 EXCLUDE_FROM_ALL examples/mallocMC_example03.cu) +add_executable(VerifyHeap EXCLUDE_FROM_ALL tests/verify_heap.cu tests/verify_heap_config.hpp) add_custom_target(examples DEPENDS mallocMC_Example01 mallocMC_Example02 mallocMC_Example03 VerifyHeap) - -cuda_add_executable(mallocMC_Example01 -  EXCLUDE_FROM_ALL -  examples/mallocMC_example01.cu ) -cuda_add_executable(mallocMC_Example02 -  EXCLUDE_FROM_ALL -  examples/mallocMC_example02.cu ) -cuda_add_executable(mallocMC_Example03 -  EXCLUDE_FROM_ALL -  examples/mallocMC_example03.cu ) -cuda_add_executable(VerifyHeap -  EXCLUDE_FROM_ALL -  tests/verify_heap.cu ) - -target_link_libraries(mallocMC_Example01 ${LIBS}) -target_link_libraries(mallocMC_Example02 ${LIBS}) -target_link_libraries(mallocMC_Example03 ${LIBS}) -target_link_libraries(VerifyHeap ${LIBS}) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 00000000..84b2d7d5 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,15 @@ +# Contributing + +## Formatting + +Please format your code before opening pull requests using clang-format and the .clang-format file placed in the repository root. + +### Visual Studio and CLion +Support for clang-format is built in since Visual Studio 2017 15.7 and CLion 2019.1. +The .clang-format file in the repository will be automatically detected and formatting is done as you type, or triggered when pressing the format hotkey. + +### Bash +First install clang-format; installation instructions can be found on the web. To format, you can run this command in bash: +``` +find -iname "*.cu" -o -iname "*.hpp" | xargs clang-format-10 -i +``` diff --git a/README.md b/README.md index d1c3afac..ab984f72 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,11 @@ mallocMC is header-only, but requires a few other C++ libraries to be available. Our installation notes can be found in [INSTALL.md](INSTALL.md). +Contributing +------------ + +Rules for contributions are found in [CONTRIBUTING.md](CONTRIBUTING.md). 
+ On the ScatterAlloc Algorithm ----------------------------- diff --git a/Usage.md b/Usage.md index 8d25057a..3f8049ea 100644 --- a/Usage.md +++ b/Usage.md @@ -19,15 +19,15 @@ Currently, there are the following policy classes available: |Policy | Policy Classes (implementations) | description | |------- |----------------------------------| ----------- | -|**CreationPolicy** | Scatter`` | A scattered allocation to tradeoff fragmentation for allocation time, as proposed in [ScatterAlloc](http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=6339604). `conf1` configures the heap layout, `conf2` determines the hashing parameters| +|**CreationPolicy** | Scatter`` | A scattered allocation to tradeoff fragmentation for allocation time, as proposed in [ScatterAlloc](http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=6339604). `conf1` configures the heap layout, `conf2` determines the hashing parameters| | | OldMalloc | device-side malloc/new and free/delete syscalls as implemented on NVidia CUDA graphics cards with compute capability sm_20 and higher | -|**DistributionPolicy** | XMallocSIMD`` | SIMD optimization for warp-wide allocation on NVIDIA CUDA accelerators, as proposed by [XMalloc](http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=5577907). `conf` is used to determine the pagesize. If used in combination with *Scatter*, the pagesizes must match | +|**DistributionPolicy** | XMallocSIMD`` | SIMD optimization for warp-wide allocation on NVIDIA CUDA accelerators, as proposed by [XMalloc](http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=5577907). `conf` is used to determine the pagesize. If used in combination with *Scatter*, the pagesizes must match | | | Noop | no workload distribution at all | -|**OOMPolicy** | ReturnNull | pointers will be *NULL*, if the request could not be fulfilled | +|**OOMPolicy** | ReturnNull | pointers will be *nullptr*, if the request could not be fulfilled | | | ~~BadAllocException~~ | will throw a `std::bad_alloc` exception. The accelerator has to support exceptions | |**ReservePoolPolicy** | SimpleCudaMalloc | allocate a fixed heap with `CudaMalloc` | | | CudaSetLimits | call to `CudaSetLimits` to increase the available Heap (e.g. when using *OldMalloc*) | -|**AlignmentPolicy** | Shrink`` | shrinks the pool so that the starting pointer is well aligned, applies padding to requested memory chunks. `conf` is used to determine the alignment| +|**AlignmentPolicy** | Shrink`` | shrinks the pool so that the starting pointer is well aligned, applies padding to requested memory chunks. `conf` is used to determine the alignment| | | Noop | no alignment at all | The user has to choose one of each policy that will form a useful allocator @@ -45,7 +45,7 @@ to the policy class: ```c++ // configure the AlignmentPolicy "Shrink" struct ShrinkConfig : mallocMC::AlignmentPolicies::Shrink<>::Properties { - typedef boost::mpl::int_<16> dataAlignment; + static constexpr auto dataAlignment = 16; }; ``` @@ -57,29 +57,29 @@ parameters to create the desired allocator type: ```c++ using namespace mallocMC; -typedef mallocMC::Allocator< +using Allocator1 = mallocMC::Allocator< CreationPolicy::OldMalloc, DistributionPolicy::Noop, OOMPolicy::ReturnNull, ReservePoolPolicy::CudaSetLimits, AlignmentPolicy::Noop -> Allocator1; +>; ``` `Allocator1` will resemble the behaviour of classical device-side allocation known from NVIDIA CUDA since compute capability sm_20. 
To get a more novel allocator, one -could create the following typedef instead: +could create the following alias instead: ```c++ using namespace mallocMC; -typedef mallocMC::Allocator< +using ScatterAllocator = mallocMC::Allocator< CreationPolicies::Scatter<>, DistributionPolicies::XMallocSIMD<>, OOMPolicies::ReturnNull, ReservePoolPolicies::SimpleCudaMalloc, AlignmentPolicies::Shrink -> ScatterAllocator; +>; ``` Notice, how the policy classes `Scatter` and `XMallocSIMD` are instantiated without @@ -122,13 +122,13 @@ A simplistic example would look like this: namespace mallocMC = MC; -typedef MC::Allocator< +using ScatterAllocator = MC::Allocator< MC::CreationPolicies::Scatter<>, MC::DistributionPolicies::XMallocSIMD<>, MC::OOMPolicies::ReturnNull, MC::ReservePoolPolicies::SimpleCudaMalloc, MC::AlignmentPolicies::Shrink - > ScatterAllocator; +>; __global__ exampleKernel(ScatterAllocator::AllocatorHandle sah) { diff --git a/examples/mallocMC_example01.cu b/examples/mallocMC_example01.cu index 7ac9c4ef..d0b8c898 100644 --- a/examples/mallocMC_example01.cu +++ b/examples/mallocMC_example01.cu @@ -26,131 +26,140 @@ THE SOFTWARE. */ -#include -#include -#include -#include +#include "mallocMC_example01_config.hpp" +#include #include -#include "mallocMC_example01_config.cu" +#include +#include +#include void run(); -int main() +auto main() -> int { - cudaDeviceProp deviceProp; - cudaGetDeviceProperties(&deviceProp, 0); - - if( deviceProp.major < 2 ) { - std::cerr << "Error: Compute Capability >= 2.0 required. (is "; - std::cerr << deviceProp.major << "."<< deviceProp.minor << ")" << std::endl; - return 1; - } - - cudaSetDevice(0); - run(); - cudaDeviceReset(); - - return 0; + int computeCapabilityMajor = 0; + cudaDeviceGetAttribute( + &computeCapabilityMajor, cudaDevAttrComputeCapabilityMajor, 0); + int computeCapabilityMinor = 0; + cudaDeviceGetAttribute( + &computeCapabilityMinor, cudaDevAttrComputeCapabilityMinor, 0); + + if(computeCapabilityMajor < 2) + { + std::cerr << "Error: Compute Capability >= 2.0 required. (is "; + std::cerr << computeCapabilityMajor << "." 
<< computeCapabilityMinor + << ")" << std::endl; + return 1; + } + + cudaSetDevice(0); + run(); + cudaDeviceReset(); + + return 0; } +__device__ int ** arA; +__device__ int ** arB; +__device__ int ** arC; -__device__ int** arA; -__device__ int** arB; -__device__ int** arC; - - -__global__ void createArrayPointers(int x, int y, ScatterAllocator::AllocatorHandle mMC){ - arA = (int**) mMC.malloc(sizeof(int*) * x*y); - arB = (int**) mMC.malloc(sizeof(int*) * x*y); - arC = (int**) mMC.malloc(sizeof(int*) * x*y); +__global__ void +createArrayPointers(int x, int y, ScatterAllocator::AllocatorHandle mMC) +{ + arA = (int **)mMC.malloc(sizeof(int *) * x * y); + arB = (int **)mMC.malloc(sizeof(int *) * x * y); + arC = (int **)mMC.malloc(sizeof(int *) * x * y); } +__global__ void +fillArrays(int length, int * d, ScatterAllocator::AllocatorHandle mMC) +{ + int id = threadIdx.x + blockIdx.x * blockDim.x; -__global__ void fillArrays(int length, int* d, ScatterAllocator::AllocatorHandle mMC){ - int id = threadIdx.x + blockIdx.x*blockDim.x; - - arA[id] = (int*) mMC.malloc(length*sizeof(int)); - arB[id] = (int*) mMC.malloc(length*sizeof(int)); - arC[id] = (int*) mMC.malloc(sizeof(int)*length); + arA[id] = (int *)mMC.malloc(length * sizeof(int)); + arB[id] = (int *)mMC.malloc(length * sizeof(int)); + arC[id] = (int *)mMC.malloc(sizeof(int) * length); - for(int i=0 ; i array_sums(block*grid,0); + // device-side pointers + int * d; + cudaMalloc((void **)&d, sizeof(int) * block * grid); - // create arrays of arrays on the device - createArrayPointers<<<1,1>>>(grid,block, mMC ); + // host-side pointers + std::vector array_sums(block * grid, 0); - // fill 2 of them all with ascending values - fillArrays<<>>(length, d, mMC ); + // create arrays of arrays on the device + createArrayPointers<<<1, 1>>>(grid, block, mMC); - // add the 2 arrays (vector addition within each thread) - // and do a thread-wise reduce to d - addArrays<<>>(length, d); + // fill 2 of them all with ascending values + fillArrays<<>>(length, d, mMC); - cudaMemcpy(&array_sums[0],d,sizeof(int)*block*grid,cudaMemcpyDeviceToHost); + // add the 2 arrays (vector addition within each thread) + // and do a thread-wise reduce to d + addArrays<<>>(length, d); - mMC.getAvailableSlots(1024U*1024U); //get available megabyte-sized slots + cudaMemcpy( + &array_sums[0], d, sizeof(int) * block * grid, cudaMemcpyDeviceToHost); - int sum = std::accumulate(array_sums.begin(),array_sums.end(),0); - std::cout << "The sum of the arrays on GPU is " << sum << std::endl; + mMC.getAvailableSlots(1024U * 1024U); // get available megabyte-sized slots - int n = block*grid*length; - int gaussian = n*(n-1); - std::cout << "The gaussian sum as comparison: " << gaussian << std::endl; + int sum = std::accumulate(array_sums.begin(), array_sums.end(), 0); + std::cout << "The sum of the arrays on GPU is " << sum << std::endl; - freeArrays<<>>( mMC ); - freeArrayPointers<<<1,1>>>( mMC ); - cudaFree(d); + int n = block * grid * length; + int gaussian = n * (n - 1); + std::cout << "The gaussian sum as comparison: " << gaussian << std::endl; + freeArrays<<>>(mMC); + freeArrayPointers<<<1, 1>>>(mMC); + cudaFree(d); } diff --git a/examples/mallocMC_example01_config.cu b/examples/mallocMC_example01_config.hpp similarity index 57% rename from examples/mallocMC_example01_config.cu rename to examples/mallocMC_example01_config.hpp index 14ebf743..7b424b0b 100644 --- a/examples/mallocMC_example01_config.cu +++ b/examples/mallocMC_example01_config.hpp @@ -28,53 +28,52 @@ #pragma once -#include 
-#include - // basic files for mallocMC -#include "src/include/mallocMC/mallocMC_hostclass.hpp" +#include // Load all available policies for mallocMC -#include "src/include/mallocMC/CreationPolicies.hpp" -#include "src/include/mallocMC/DistributionPolicies.hpp" -#include "src/include/mallocMC/OOMPolicies.hpp" -#include "src/include/mallocMC/ReservePoolPolicies.hpp" -#include "src/include/mallocMC/AlignmentPolicies.hpp" - - +#include +#include +#include +#include +#include // configurate the CreationPolicy "Scatter" to modify the default behaviour -struct ScatterHeapConfig : mallocMC::CreationPolicies::Scatter<>::HeapProperties{ - typedef boost::mpl::int_<4096> pagesize; - typedef boost::mpl::int_<8> accessblocks; - typedef boost::mpl::int_<16> regionsize; - typedef boost::mpl::int_<2> wastefactor; - typedef boost::mpl::bool_ resetfreedpages; +struct ScatterHeapConfig : mallocMC::CreationPolicies::Scatter<>::HeapProperties +{ + static constexpr auto pagesize = 4096; + static constexpr auto accessblocks = 8; + static constexpr auto regionsize = 16; + static constexpr auto wastefactor = 2; + static constexpr auto resetfreedpages = false; }; -struct ScatterHashConfig : mallocMC::CreationPolicies::Scatter<>::HashingProperties{ - typedef boost::mpl::int_<38183> hashingK; - typedef boost::mpl::int_<17497> hashingDistMP; - typedef boost::mpl::int_<1> hashingDistWP; - typedef boost::mpl::int_<1> hashingDistWPRel; +struct ScatterHashConfig : + mallocMC::CreationPolicies::Scatter<>::HashingProperties +{ + static constexpr auto hashingK = 38183; + static constexpr auto hashingDistMP = 17497; + static constexpr auto hashingDistWP = 1; + static constexpr auto hashingDistWPRel = 1; }; // configure the DistributionPolicy "XMallocSIMD" -struct XMallocConfig : mallocMC::DistributionPolicies::XMallocSIMD<>::Properties { - typedef ScatterHeapConfig::pagesize pagesize; +struct XMallocConfig : mallocMC::DistributionPolicies::XMallocSIMD<>::Properties +{ + static constexpr auto pagesize = ScatterHeapConfig::pagesize; }; // configure the AlignmentPolicy "Shrink" -struct ShrinkConfig : mallocMC::AlignmentPolicies::Shrink<>::Properties { - typedef boost::mpl::int_<16> dataAlignment; +struct ShrinkConfig : mallocMC::AlignmentPolicies::Shrink<>::Properties +{ + static constexpr auto dataAlignment = 16; }; // Define a new allocator and call it ScatterAllocator // which resembles the behaviour of ScatterAlloc -typedef mallocMC::Allocator< - mallocMC::CreationPolicies::Scatter, - mallocMC::DistributionPolicies::XMallocSIMD, - mallocMC::OOMPolicies::ReturnNull, - mallocMC::ReservePoolPolicies::SimpleCudaMalloc, - mallocMC::AlignmentPolicies::Shrink - > ScatterAllocator; +using ScatterAllocator = mallocMC::Allocator< + mallocMC::CreationPolicies::Scatter, + mallocMC::DistributionPolicies::XMallocSIMD, + mallocMC::OOMPolicies::ReturnNull, + mallocMC::ReservePoolPolicies::SimpleCudaMalloc, + mallocMC::AlignmentPolicies::Shrink>; diff --git a/examples/mallocMC_example02.cu b/examples/mallocMC_example02.cu index d00d00e7..0dd60600 100644 --- a/examples/mallocMC_example02.cu +++ b/examples/mallocMC_example02.cu @@ -26,84 +26,88 @@ THE SOFTWARE. 
*/ -#include #include -#include -#include - #include -#include -#include +#include +#include +#include /////////////////////////////////////////////////////////////////////////////// // includes for mallocMC /////////////////////////////////////////////////////////////////////////////// // basic files for mallocMC -#include "src/include/mallocMC/mallocMC_hostclass.hpp" +#include // Load all available policies for mallocMC -#include "src/include/mallocMC/CreationPolicies.hpp" -#include "src/include/mallocMC/DistributionPolicies.hpp" -#include "src/include/mallocMC/OOMPolicies.hpp" -#include "src/include/mallocMC/ReservePoolPolicies.hpp" -#include "src/include/mallocMC/AlignmentPolicies.hpp" +#include +#include +#include +#include +#include /////////////////////////////////////////////////////////////////////////////// // Configuration for mallocMC /////////////////////////////////////////////////////////////////////////////// // configurate the CreationPolicy "Scatter" -struct ScatterConfig{ - typedef boost::mpl::int_<4096> pagesize; - typedef boost::mpl::int_<8> accessblocks; - typedef boost::mpl::int_<16> regionsize; - typedef boost::mpl::int_<2> wastefactor; - typedef boost::mpl::bool_ resetfreedpages; +struct ScatterConfig +{ + static constexpr auto pagesize = 4096; + static constexpr auto accessblocks = 8; + static constexpr auto regionsize = 16; + static constexpr auto wastefactor = 2; + static constexpr auto resetfreedpages = false; }; -struct ScatterHashParams{ - typedef boost::mpl::int_<38183> hashingK; - typedef boost::mpl::int_<17497> hashingDistMP; - typedef boost::mpl::int_<1> hashingDistWP; - typedef boost::mpl::int_<1> hashingDistWPRel; +struct ScatterHashParams +{ + static constexpr auto hashingK = 38183; + static constexpr auto hashingDistMP = 17497; + static constexpr auto hashingDistWP = 1; + static constexpr auto hashingDistWPRel = 1; }; // configure the DistributionPolicy "XMallocSIMD" -struct DistributionConfig{ - typedef ScatterConfig::pagesize pagesize; +struct DistributionConfig +{ + static constexpr auto pagesize = ScatterConfig::pagesize; }; // configure the AlignmentPolicy "Shrink" -struct AlignmentConfig{ - typedef boost::mpl::int_<16> dataAlignment; +struct AlignmentConfig +{ + static constexpr auto dataAlignment = 16; }; // Define a new mMCator and call it ScatterAllocator // which resembles the behaviour of ScatterAlloc -typedef mallocMC::Allocator< - mallocMC::CreationPolicies::Scatter, - mallocMC::DistributionPolicies::XMallocSIMD, - mallocMC::OOMPolicies::ReturnNull, - mallocMC::ReservePoolPolicies::SimpleCudaMalloc, - mallocMC::AlignmentPolicies::Shrink - > ScatterAllocator; - +using ScatterAllocator = mallocMC::Allocator< + mallocMC::CreationPolicies::Scatter, + mallocMC::DistributionPolicies::XMallocSIMD, + mallocMC::OOMPolicies::ReturnNull, + mallocMC::ReservePoolPolicies::SimpleCudaMalloc, + mallocMC::AlignmentPolicies::Shrink>; /////////////////////////////////////////////////////////////////////////////// // End of mallocMC configuration /////////////////////////////////////////////////////////////////////////////// - void run(); -int main() +auto main() -> int { - cudaDeviceProp deviceProp; - cudaGetDeviceProperties(&deviceProp, 0); - - if( deviceProp.major < int(2) ) { + int computeCapabilityMajor = 0; + cudaDeviceGetAttribute( + &computeCapabilityMajor, cudaDevAttrComputeCapabilityMajor, 0); + int computeCapabilityMinor = 0; + cudaDeviceGetAttribute( + &computeCapabilityMinor, cudaDevAttrComputeCapabilityMinor, 0); + + if(computeCapabilityMajor < 
int(2)) + { std::cerr << "Error: Compute Capability >= 2.0 required. (is "; - std::cerr << deviceProp.major << "."<< deviceProp.minor << ")" << std::endl; + std::cerr << computeCapabilityMajor << "." << computeCapabilityMinor + << ")" << std::endl; return 1; } @@ -114,106 +118,110 @@ int main() return 0; } +__device__ int ** arA; +__device__ int ** arB; +__device__ int ** arC; -__device__ int** arA; -__device__ int** arB; -__device__ int** arC; - - -__global__ void createArrayPointers(int x, int y, ScatterAllocator::AllocatorHandle mMC){ - arA = (int**) mMC.malloc(sizeof(int*) * x*y); - arB = (int**) mMC.malloc(sizeof(int*) * x*y); - arC = (int**) mMC.malloc(sizeof(int*) * x*y); +__global__ void +createArrayPointers(int x, int y, ScatterAllocator::AllocatorHandle mMC) +{ + arA = (int **)mMC.malloc(sizeof(int *) * x * y); + arB = (int **)mMC.malloc(sizeof(int *) * x * y); + arC = (int **)mMC.malloc(sizeof(int *) * x * y); } +__global__ void +fillArrays(int length, int * d, ScatterAllocator::AllocatorHandle mMC) +{ + int id = threadIdx.x + blockIdx.x * blockDim.x; -__global__ void fillArrays(int length, int* d, ScatterAllocator::AllocatorHandle mMC){ - int id = threadIdx.x + blockIdx.x*blockDim.x; - - arA[id] = (int*) mMC.malloc(sizeof(int)*length); - arB[id] = (int*) mMC.malloc(sizeof(int)*length); - arC[id] = (int*) mMC.malloc(sizeof(int)*length); + arA[id] = (int *)mMC.malloc(sizeof(int) * length); + arB[id] = (int *)mMC.malloc(sizeof(int) * length); + arC[id] = (int *)mMC.malloc(sizeof(int) * length); - for(int i=0 ; i array_sums(block*grid,0); + std::vector array_sums(block * grid, 0); // create arrays of arrays on the device - createArrayPointers<<<1,1>>>(grid, block, mMC ); + createArrayPointers<<<1, 1>>>(grid, block, mMC); // fill 2 of them all with ascending values - fillArrays<<>>(length, d, mMC ); + fillArrays<<>>(length, d, mMC); // add the 2 arrays (vector addition within each thread) // and do a thread-wise reduce to d - addArrays<<>>(length, d); + addArrays<<>>(length, d); - cudaMemcpy(&array_sums[0], d, sizeof(int)*block*grid, cudaMemcpyDeviceToHost); + cudaMemcpy( + &array_sums[0], d, sizeof(int) * block * grid, cudaMemcpyDeviceToHost); int sum = std::accumulate(array_sums.begin(), array_sums.end(), 0); std::cout << "The sum of the arrays on GPU is " << sum << std::endl; - int n = block*grid*length; - int gaussian = n*(n-1); + int n = block * grid * length; + int gaussian = n * (n - 1); std::cout << "The gaussian sum as comparison: " << gaussian << std::endl; // checking the free memory of the allocator - if(mallocMC::Traits::providesAvailableSlots){ + if(mallocMC::Traits::providesAvailableSlots) + { std::cout << "there are "; - std::cout << mMC.getAvailableSlots(1024U*1024U); + std::cout << mMC.getAvailableSlots(1024U * 1024U); std::cout << " Slots of size 1MB available" << std::endl; } - freeArrays<<>>( mMC ); - freeArrayPointers<<<1, 1>>>( mMC ); + freeArrays<<>>(mMC); + freeArrayPointers<<<1, 1>>>(mMC); cudaFree(d); - } diff --git a/examples/mallocMC_example03.cu b/examples/mallocMC_example03.cu index a3a271b9..9ea7dde7 100644 --- a/examples/mallocMC_example03.cu +++ b/examples/mallocMC_example03.cu @@ -26,93 +26,87 @@ THE SOFTWARE. 
*/ -#include #include -#include +#include +#include #include #include - -#include -#include -#include - +#include /////////////////////////////////////////////////////////////////////////////// // includes for mallocMC /////////////////////////////////////////////////////////////////////////////// -#include "src/include/mallocMC/mallocMC_hostclass.hpp" - -#include "src/include/mallocMC/CreationPolicies.hpp" -#include "src/include/mallocMC/DistributionPolicies.hpp" -#include "src/include/mallocMC/OOMPolicies.hpp" -#include "src/include/mallocMC/ReservePoolPolicies.hpp" -#include "src/include/mallocMC/AlignmentPolicies.hpp" - +#include +#include +#include +#include +#include +#include /////////////////////////////////////////////////////////////////////////////// // Configuration for mallocMC /////////////////////////////////////////////////////////////////////////////// // configurate the CreationPolicy "Scatter" -struct ScatterConfig{ - typedef boost::mpl::int_<4096> pagesize; - typedef boost::mpl::int_<8> accessblocks; - typedef boost::mpl::int_<16> regionsize; - typedef boost::mpl::int_<2> wastefactor; - typedef boost::mpl::bool_ resetfreedpages; +struct ScatterConfig +{ + static constexpr auto pagesize = 4096; + static constexpr auto accessblocks = 8; + static constexpr auto regionsize = 16; + static constexpr auto wastefactor = 2; + static constexpr auto resetfreedpages = false; }; -struct ScatterHashParams{ - typedef boost::mpl::int_<38183> hashingK; - typedef boost::mpl::int_<17497> hashingDistMP; - typedef boost::mpl::int_<1> hashingDistWP; - typedef boost::mpl::int_<1> hashingDistWPRel; +struct ScatterHashParams +{ + static constexpr auto hashingK = 38183; + static constexpr auto hashingDistMP = 17497; + static constexpr auto hashingDistWP = 1; + static constexpr auto hashingDistWPRel = 1; }; - // configure the AlignmentPolicy "Shrink" -struct AlignmentConfig{ - typedef boost::mpl::int_<16> dataAlignment; +struct AlignmentConfig +{ + static constexpr auto dataAlignment = 16; }; // Define a new mMCator and call it ScatterAllocator // which resembles the behaviour of ScatterAlloc -typedef mallocMC::Allocator< +using ScatterAllocator = mallocMC::Allocator< mallocMC::CreationPolicies::Scatter, mallocMC::DistributionPolicies::Noop, mallocMC::OOMPolicies::ReturnNull, mallocMC::ReservePoolPolicies::SimpleCudaMalloc, - mallocMC::AlignmentPolicies::Shrink -> ScatterAllocator; + mallocMC::AlignmentPolicies::Shrink>; /////////////////////////////////////////////////////////////////////////////// // End of mallocMC configuration /////////////////////////////////////////////////////////////////////////////// +__device__ int * arA; -__device__ int* arA; - - -__global__ void exampleKernel(ScatterAllocator::AllocatorHandle mMC){ +__global__ void exampleKernel(ScatterAllocator::AllocatorHandle mMC) +{ unsigned x = 42; - if(threadIdx.x==0) - arA = (int*) mMC.malloc(sizeof(int) * 32); + if(threadIdx.x == 0) + arA = (int *)mMC.malloc(sizeof(int) * 32); x = mMC.getAvailableSlots(1); __syncthreads(); arA[threadIdx.x] = threadIdx.x; - printf("tid: %d array: %d slots %d\n", threadIdx.x, arA[threadIdx.x],x); + printf("tid: %d array: %d slots %d\n", threadIdx.x, arA[threadIdx.x], x); if(threadIdx.x == 0) mMC.free(arA); } - -int main() +auto main() -> int { - ScatterAllocator mMC(1U*1024U*1024U*1024U); //1GB for device-side malloc + ScatterAllocator mMC( + 1U * 1024U * 1024U * 1024U); // 1GB for device-side malloc - exampleKernel<<<1,32>>>( mMC ); + exampleKernel<<<1, 32>>>(mMC); std::cout << "Slots from 
Host: " << mMC.getAvailableSlots(1) << std::endl; return 0; diff --git a/src/include/mallocMC/AlignmentPolicies.hpp b/src/include/mallocMC/AlignmentPolicies.hpp index c471b696..ece52367 100644 --- a/src/include/mallocMC/AlignmentPolicies.hpp +++ b/src/include/mallocMC/AlignmentPolicies.hpp @@ -28,9 +28,7 @@ #pragma once -#include "alignmentPolicies/Shrink.hpp" -#include "alignmentPolicies/Shrink_impl.hpp" - #include "alignmentPolicies/Noop.hpp" #include "alignmentPolicies/Noop_impl.hpp" - +#include "alignmentPolicies/Shrink.hpp" +#include "alignmentPolicies/Shrink_impl.hpp" diff --git a/src/include/mallocMC/CreationPolicies.hpp b/src/include/mallocMC/CreationPolicies.hpp index 56f6e23f..d987c18e 100644 --- a/src/include/mallocMC/CreationPolicies.hpp +++ b/src/include/mallocMC/CreationPolicies.hpp @@ -28,8 +28,7 @@ #pragma once -#include "creationPolicies/Scatter.hpp" -#include "creationPolicies/Scatter_impl.hpp" - #include "creationPolicies/OldMalloc.hpp" #include "creationPolicies/OldMalloc_impl.hpp" +#include "creationPolicies/Scatter.hpp" +#include "creationPolicies/Scatter_impl.hpp" diff --git a/src/include/mallocMC/DistributionPolicies.hpp b/src/include/mallocMC/DistributionPolicies.hpp index f534c57d..c601f819 100644 --- a/src/include/mallocMC/DistributionPolicies.hpp +++ b/src/include/mallocMC/DistributionPolicies.hpp @@ -30,6 +30,5 @@ #include "distributionPolicies/Noop.hpp" #include "distributionPolicies/Noop_impl.hpp" - #include "distributionPolicies/XMallocSIMD.hpp" #include "distributionPolicies/XMallocSIMD_impl.hpp" diff --git a/src/include/mallocMC/OOMPolicies.hpp b/src/include/mallocMC/OOMPolicies.hpp index 67eda519..60bcba22 100644 --- a/src/include/mallocMC/OOMPolicies.hpp +++ b/src/include/mallocMC/OOMPolicies.hpp @@ -28,8 +28,7 @@ #pragma once -#include "oOMPolicies/ReturnNull.hpp" -#include "oOMPolicies/ReturnNull_impl.hpp" - #include "oOMPolicies/BadAllocException.hpp" #include "oOMPolicies/BadAllocException_impl.hpp" +#include "oOMPolicies/ReturnNull.hpp" +#include "oOMPolicies/ReturnNull_impl.hpp" diff --git a/src/include/mallocMC/ReservePoolPolicies.hpp b/src/include/mallocMC/ReservePoolPolicies.hpp index 1cc3aa7b..d01bac5d 100644 --- a/src/include/mallocMC/ReservePoolPolicies.hpp +++ b/src/include/mallocMC/ReservePoolPolicies.hpp @@ -28,8 +28,7 @@ #pragma once -#include "reservePoolPolicies/SimpleCudaMalloc.hpp" -#include "reservePoolPolicies/SimpleCudaMalloc_impl.hpp" - #include "reservePoolPolicies/CudaSetLimits.hpp" #include "reservePoolPolicies/CudaSetLimits_impl.hpp" +#include "reservePoolPolicies/SimpleCudaMalloc.hpp" +#include "reservePoolPolicies/SimpleCudaMalloc_impl.hpp" diff --git a/src/include/mallocMC/alignmentPolicies/Noop.hpp b/src/include/mallocMC/alignmentPolicies/Noop.hpp index 3879da6f..bc07ae6b 100644 --- a/src/include/mallocMC/alignmentPolicies/Noop.hpp +++ b/src/include/mallocMC/alignmentPolicies/Noop.hpp @@ -27,16 +27,17 @@ #pragma once -namespace mallocMC{ -namespace AlignmentPolicies{ - - /** - * @brief a policy that does nothing - * - * This AlignmentPolicy will not perform any distribution, but only return - * its input (identity function) - */ - class Noop; - -} //namespace AlignmentPolicies -} //namespace mallocMC +namespace mallocMC +{ + namespace AlignmentPolicies + { + /** + * @brief a policy that does nothing + * + * This AlignmentPolicy will not perform any distribution, but only + * return its input (identity function) + */ + class Noop; + + } // namespace AlignmentPolicies +} // namespace mallocMC diff --git 
a/src/include/mallocMC/alignmentPolicies/Noop_impl.hpp b/src/include/mallocMC/alignmentPolicies/Noop_impl.hpp index e1c034d0..02ad382f 100644 --- a/src/include/mallocMC/alignmentPolicies/Noop_impl.hpp +++ b/src/include/mallocMC/alignmentPolicies/Noop_impl.hpp @@ -27,34 +27,39 @@ #pragma once -#include -#include - -#include "Noop.hpp" #include "../mallocMC_prefixes.hpp" +#include "Noop.hpp" -namespace mallocMC{ -namespace AlignmentPolicies{ - - class Noop{ - typedef boost::uint32_t uint32; - - public: - - static boost::tuple alignPool(void* memory, size_t memsize){ - return boost::make_tuple(memory,memsize); - } - - MAMC_HOST MAMC_ACCELERATOR - static uint32 applyPadding(uint32 bytes){ - return bytes; - } - - static std::string classname(){ - return "Noop"; - } - - }; - -} //namespace AlignmentPolicies -} //namespace mallocMC +#include +#include +#include + +namespace mallocMC +{ + namespace AlignmentPolicies + { + class Noop + { + using uint32 = std::uint32_t; + + public: + static auto alignPool(void * memory, size_t memsize) + -> std::tuple + { + return std::make_tuple(memory, memsize); + } + + MAMC_HOST MAMC_ACCELERATOR static auto applyPadding(uint32 bytes) + -> uint32 + { + return bytes; + } + + static auto classname() -> std::string + { + return "Noop"; + } + }; + + } // namespace AlignmentPolicies +} // namespace mallocMC diff --git a/src/include/mallocMC/alignmentPolicies/Shrink.hpp b/src/include/mallocMC/alignmentPolicies/Shrink.hpp index 4bafeff8..6ed70e49 100644 --- a/src/include/mallocMC/alignmentPolicies/Shrink.hpp +++ b/src/include/mallocMC/alignmentPolicies/Shrink.hpp @@ -31,31 +31,32 @@ #pragma once -#include - -namespace mallocMC{ -namespace AlignmentPolicies{ - -namespace ShrinkConfig{ - struct DefaultShrinkConfig{ - typedef boost::mpl::int_<16> dataAlignment; - }; -} - - /** - * @brief Provides proper alignment of pool and pads memory requests - * - * This AlignmentPolicy is based on ideas from ScatterAlloc - * (http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=6339604). It - * performs alignment operations on big memory pools and requests to allocate - * memory. Memory pools are truncated at the beginning until the pointer to - * the memory fits the alignment. Requests to allocate memory are padded - * until their size is a multiple of the alignment. - * - * @tparam T_Config (optional) The alignment to use - */ - template - class Shrink; - -} //namespace AlignmentPolicies -} //namespace mallocMC +namespace mallocMC +{ + namespace AlignmentPolicies + { + namespace ShrinkConfig + { + struct DefaultShrinkConfig + { + static constexpr auto dataAlignment = 16; + }; + } // namespace ShrinkConfig + + /** + * @brief Provides proper alignment of pool and pads memory requests + * + * This AlignmentPolicy is based on ideas from ScatterAlloc + * (http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=6339604). + * It performs alignment operations on big memory pools and requests to + * allocate memory. Memory pools are truncated at the beginning until + * the pointer to the memory fits the alignment. Requests to allocate + * memory are padded until their size is a multiple of the alignment. 
+ * + * @tparam T_Config (optional) The alignment to use + */ + template + class Shrink; + + } // namespace AlignmentPolicies +} // namespace mallocMC diff --git a/src/include/mallocMC/alignmentPolicies/Shrink_impl.hpp b/src/include/mallocMC/alignmentPolicies/Shrink_impl.hpp index 60757519..783efcc7 100644 --- a/src/include/mallocMC/alignmentPolicies/Shrink_impl.hpp +++ b/src/include/mallocMC/alignmentPolicies/Shrink_impl.hpp @@ -31,34 +31,42 @@ #pragma once -#include -#include -#include -#include -#include -#include - -#include "Shrink.hpp" #include "../mallocMC_prefixes.hpp" +#include "Shrink.hpp" -namespace mallocMC{ -namespace AlignmentPolicies{ - -namespace Shrink2NS{ - - template struct __PointerEquivalent{ typedef unsigned int type;}; - template<> - struct __PointerEquivalent<8>{ typedef unsigned long long int type; }; -}// namespace ShrinkNS - - template - class Shrink{ - public: - typedef T_Config Properties; +#include +#include +#include +#include - private: - typedef boost::uint32_t uint32; - typedef Shrink2NS::__PointerEquivalent::type PointerEquivalent; +namespace mallocMC +{ + namespace AlignmentPolicies + { + namespace Shrink2NS + { + template + struct __PointerEquivalent + { + using type = unsigned int; + }; + template<> + struct __PointerEquivalent<8> + { + using type = unsigned long long; + }; + } // namespace Shrink2NS + + template + class Shrink + { + public: + using Properties = T_Config; + + private: + using uint32 = std::uint32_t; + using PointerEquivalent + = Shrink2NS::__PointerEquivalent::type; /** Allow for a hierarchical validation of parameters: * @@ -70,54 +78,65 @@ namespace Shrink2NS{ * default-struct < template-struct < command-line parameter */ #ifndef MALLOCMC_AP_SHRINK_DATAALIGNMENT -#define MALLOCMC_AP_SHRINK_DATAALIGNMENT Properties::dataAlignment::value +#define MALLOCMC_AP_SHRINK_DATAALIGNMENT (Properties::dataAlignment) #endif - BOOST_STATIC_CONSTEXPR uint32 dataAlignment = MALLOCMC_AP_SHRINK_DATAALIGNMENT; - - // \TODO: The static_cast can be removed once the minimal dependencies of - // this project is are at least CUDA 7.0 and gcc 4.8.2 - BOOST_STATIC_ASSERT(static_cast(dataAlignment) > 0); - //dataAlignment must also be a power of 2! - BOOST_STATIC_ASSERT(dataAlignment && !(dataAlignment & (dataAlignment-1)) ); - - public: - static boost::tuple alignPool(void* memory, size_t memsize){ - PointerEquivalent alignmentstatus = ((PointerEquivalent)memory) & (dataAlignment -1); - if(alignmentstatus != 0) - { - std::cout << "Heap Warning: memory to use not "; - std::cout << dataAlignment << " byte aligned..." 
<< std::endl; - std::cout << "Before:" << std::endl; - std::cout << "dataAlignment: " << dataAlignment << std::endl; - std::cout << "Alignmentstatus: " << alignmentstatus << std::endl; - std::cout << "size_t memsize " << memsize << " byte" << std::endl; - std::cout << "void *memory " << memory << std::endl; - - memory = (void*)(((PointerEquivalent)memory) + dataAlignment - alignmentstatus); - memsize -= (size_t)dataAlignment + (size_t)alignmentstatus; - - std::cout << "Was shrunk automatically to: " << std::endl; - std::cout << "size_t memsize " << memsize << " byte" << std::endl; - std::cout << "void *memory " << memory << std::endl; - } - - return boost::make_tuple(memory,memsize); - } - - MAMC_HOST - MAMC_ACCELERATOR - static uint32 applyPadding(uint32 bytes){ - return (bytes + dataAlignment - 1) & ~(dataAlignment-1); - } - - MAMC_HOST - static std::string classname(){ - std::stringstream ss; - ss << "Shrink[" << dataAlignment << "]"; - return ss.str(); - } - - }; - -} //namespace AlignmentPolicies -} //namespace mallocMC + static constexpr uint32 dataAlignment + = MALLOCMC_AP_SHRINK_DATAALIGNMENT; + + // dataAlignment must be a power of 2! + static_assert( + dataAlignment != 0 + && (dataAlignment & (dataAlignment - 1)) == 0, + "dataAlignment must also be a power of 2"); + + public: + static auto alignPool(void * memory, size_t memsize) + -> std::tuple + { + PointerEquivalent alignmentstatus + = ((PointerEquivalent)memory) & (dataAlignment - 1); + if(alignmentstatus != 0) + { + std::cout << "Heap Warning: memory to use not "; + std::cout << dataAlignment << " byte aligned..." + << std::endl; + std::cout << "Before:" << std::endl; + std::cout << "dataAlignment: " << dataAlignment + << std::endl; + std::cout << "Alignmentstatus: " << alignmentstatus + << std::endl; + std::cout << "size_t memsize " << memsize << " byte" + << std::endl; + std::cout << "void *memory " << memory << std::endl; + + memory + = (void *)(((PointerEquivalent)memory) + dataAlignment - alignmentstatus); + memsize -= (size_t)dataAlignment + (size_t)alignmentstatus; + + std::cout << "Was shrunk automatically to: " << std::endl; + std::cout << "size_t memsize " << memsize << " byte" + << std::endl; + std::cout << "void *memory " << memory << std::endl; + } + + return std::make_tuple(memory, memsize); + } + + MAMC_HOST + MAMC_ACCELERATOR + static auto applyPadding(uint32 bytes) -> uint32 + { + return (bytes + dataAlignment - 1) & ~(dataAlignment - 1); + } + + MAMC_HOST + static auto classname() -> std::string + { + std::stringstream ss; + ss << "Shrink[" << dataAlignment << "]"; + return ss.str(); + } + }; + + } // namespace AlignmentPolicies +} // namespace mallocMC diff --git a/src/include/mallocMC/allocator.hpp b/src/include/mallocMC/allocator.hpp index a811b488..0f7c6561 100644 --- a/src/include/mallocMC/allocator.hpp +++ b/src/include/mallocMC/allocator.hpp @@ -28,67 +28,57 @@ #pragma once -#include "mallocMC_utils.hpp" +#include "device_allocator.hpp" +#include "mallocMC_allocator_handle.hpp" #include "mallocMC_constraints.hpp" #include "mallocMC_prefixes.hpp" #include "mallocMC_traits.hpp" -#include "mallocMC_allocator_handle.hpp" +#include "mallocMC_utils.hpp" -#include -#include -#include +#include #include +#include #include -namespace mallocMC{ - -namespace detail{ - - template< - typename T_Allocator, - bool T_providesAvailableSlots - > - struct GetAvailableSlotsIfAvailHost +namespace mallocMC +{ + namespace detail { - MAMC_HOST static - unsigned - getAvailableSlots( - size_t, - T_Allocator & - ) + template + 
struct GetAvailableSlotsIfAvailHost { - return 0; - } - }; - - template - struct GetAvailableSlotsIfAvailHost - { - MAMC_HOST - static unsigned - getAvailableSlots( - size_t slotSize, - T_Allocator& alloc - ){ - return T_Allocator::CreationPolicy::getAvailableSlotsHost(slotSize, alloc.getAllocatorHandle().devAllocator); - } - }; - -} - + MAMC_HOST static auto getAvailableSlots(size_t, T_Allocator &) + -> unsigned + { + return 0; + } + }; + + template + struct GetAvailableSlotsIfAvailHost + { + MAMC_HOST + static auto getAvailableSlots(size_t slotSize, T_Allocator & alloc) + -> unsigned + { + return T_Allocator::CreationPolicy::getAvailableSlotsHost( + slotSize, alloc.getAllocatorHandle().devAllocator); + } + }; + } // namespace detail struct HeapInfo { - void* p; + void * p; size_t size; }; /** * @brief "HostClass" that combines all policies to a useful allocator * - * This class implements the necessary glue-logic to form an actual allocator - * from the provided policies. It implements the public interface and - * executes some constraint checking based on an instance of the class + * This class implements the necessary glue-logic to form an actual + * allocator from the provided policies. It implements the public interface + * and executes some constraint checking based on an instance of the class * PolicyConstraints. * * @tparam T_CreationPolicy The desired type of a CreationPolicy @@ -98,37 +88,34 @@ namespace detail{ * @tparam T_AlignmentPolicy The desired type of a AlignmentPolicy */ template< - typename T_CreationPolicy, - typename T_DistributionPolicy, - typename T_OOMPolicy, - typename T_ReservePoolPolicy, - typename T_AlignmentPolicy - > + typename T_CreationPolicy, + typename T_DistributionPolicy, + typename T_OOMPolicy, + typename T_ReservePoolPolicy, + typename T_AlignmentPolicy> class Allocator : - public PolicyConstraints< - T_CreationPolicy, - T_DistributionPolicy, - T_OOMPolicy, - T_ReservePoolPolicy, - T_AlignmentPolicy - > + public PolicyConstraints< + T_CreationPolicy, + T_DistributionPolicy, + T_OOMPolicy, + T_ReservePoolPolicy, + T_AlignmentPolicy> { - typedef boost::uint32_t uint32; + using uint32 = std::uint32_t; public: - typedef T_CreationPolicy CreationPolicy; - typedef T_DistributionPolicy DistributionPolicy; - typedef T_OOMPolicy OOMPolicy; - typedef T_ReservePoolPolicy ReservePoolPolicy; - typedef T_AlignmentPolicy AlignmentPolicy; - typedef std::vector< HeapInfo > HeapInfoVector; - typedef DeviceAllocator< + using CreationPolicy = T_CreationPolicy; + using DistributionPolicy = T_DistributionPolicy; + using OOMPolicy = T_OOMPolicy; + using ReservePoolPolicy = T_ReservePoolPolicy; + using AlignmentPolicy = T_AlignmentPolicy; + using HeapInfoVector = std::vector; + using DevAllocator = DeviceAllocator< CreationPolicy, DistributionPolicy, OOMPolicy, - AlignmentPolicy - > DevAllocator; - typedef AllocatorHandleImpl AllocatorHandle; + AlignmentPolicy>; + using AllocatorHandle = AllocatorHandleImpl; private: AllocatorHandle allocatorHandle; @@ -139,29 +126,13 @@ namespace detail{ * @param size number of bytes */ MAMC_HOST - void - alloc( - size_t size - ) + void alloc(size_t size) { - void* pool = ReservePoolPolicy::setMemPool( size ); - boost::tie( - pool, - size - ) = AlignmentPolicy::alignPool( - pool, - size - ); - DevAllocator* devAllocatorPtr; - cudaMalloc( - ( void** ) &devAllocatorPtr, - sizeof( DevAllocator ) - ); - CreationPolicy::initHeap( - devAllocatorPtr, - pool, - size - ); + void * pool = ReservePoolPolicy::setMemPool(size); + std::tie(pool, size) = 
AlignmentPolicy::alignPool(pool, size); + DevAllocator * devAllocatorPtr; + cudaMalloc((void **)&devAllocatorPtr, sizeof(DevAllocator)); + CreationPolicy::initHeap(devAllocatorPtr, pool, size); allocatorHandle.devAllocator = devAllocatorPtr; heapInfos.p = pool; @@ -176,33 +147,28 @@ namespace detail{ MAMC_HOST void free() { - cudaFree( allocatorHandle.devAllocator ); - ReservePoolPolicy::resetMemPool( heapInfos.p ); - allocatorHandle.devAllocator = NULL; + cudaFree(allocatorHandle.devAllocator); + ReservePoolPolicy::resetMemPool(heapInfos.p); + allocatorHandle.devAllocator = nullptr; heapInfos.size = 0; - heapInfos.p = NULL; + heapInfos.p = nullptr; } /* forbid to copy the allocator */ MAMC_HOST - Allocator( const Allocator& ); + Allocator(const Allocator &); public: - - MAMC_HOST - Allocator( - size_t size = 8U * 1024U * 1024U - ) : - allocatorHandle( NULL ) + Allocator(size_t size = 8U * 1024U * 1024U) : allocatorHandle(nullptr) { - alloc( size ); + alloc(size); } MAMC_HOST - ~Allocator( ) + ~Allocator() { - free( ); + free(); } /** destroy current heap data and resize the heap @@ -210,18 +176,14 @@ namespace detail{ * @param size number of bytes */ MAMC_HOST - void - destructiveResize( - size_t size - ) + void destructiveResize(size_t size) { - free( ); - alloc( size ); + free(); + alloc(size); } MAMC_HOST - AllocatorHandle - getAllocatorHandle( ) + auto getAllocatorHandle() -> AllocatorHandle { return allocatorHandle; } @@ -232,45 +194,41 @@ namespace detail{ return getAllocatorHandle(); } - MAMC_HOST static - std::string - info( - std::string linebreak = " " - ) + MAMC_HOST static auto info(std::string linebreak = " ") -> std::string { std::stringstream ss; - ss << "CreationPolicy: " << CreationPolicy::classname( ) << " " << linebreak; - ss << "DistributionPolicy: " << DistributionPolicy::classname( ) << "" << linebreak; - ss << "OOMPolicy: " << OOMPolicy::classname( ) << " " << linebreak; - ss << "ReservePoolPolicy: " << ReservePoolPolicy::classname( ) << " " << linebreak; - ss << "AlignmentPolicy: " << AlignmentPolicy::classname( ) << " " << linebreak; + ss << "CreationPolicy: " << CreationPolicy::classname() + << " " << linebreak; + ss << "DistributionPolicy: " << DistributionPolicy::classname() + << "" << linebreak; + ss << "OOMPolicy: " << OOMPolicy::classname() + << " " << linebreak; + ss << "ReservePoolPolicy: " << ReservePoolPolicy::classname() + << " " << linebreak; + ss << "AlignmentPolicy: " << AlignmentPolicy::classname() + << " " << linebreak; return ss.str(); } - // polymorphism over the availability of getAvailableSlots for calling from the host + // polymorphism over the availability of getAvailableSlots for calling + // from the host MAMC_HOST - unsigned - getAvailableSlots( - size_t slotSize - ) + auto getAvailableSlots(size_t slotSize) -> unsigned { - slotSize = AlignmentPolicy::applyPadding( slotSize ); + slotSize = AlignmentPolicy::applyPadding(slotSize); return detail::GetAvailableSlotsIfAvailHost< Allocator, - Traits::providesAvailableSlots - >::getAvailableSlots( slotSize, *this ); + Traits::providesAvailableSlots>:: + getAvailableSlots(slotSize, *this); } MAMC_HOST - HeapInfoVector - getHeapLocations( ) + auto getHeapLocations() -> HeapInfoVector { - HeapInfoVector v; - v.push_back( heapInfos ); - return v; + HeapInfoVector v; + v.push_back(heapInfos); + return v; } - }; -} //namespace mallocMC - +} // namespace mallocMC diff --git a/src/include/mallocMC/creationPolicies/OldMalloc.hpp b/src/include/mallocMC/creationPolicies/OldMalloc.hpp index 
e48b4890..4b8db31f 100644 --- a/src/include/mallocMC/creationPolicies/OldMalloc.hpp +++ b/src/include/mallocMC/creationPolicies/OldMalloc.hpp @@ -27,18 +27,18 @@ #pragma once - -namespace mallocMC{ -namespace CreationPolicies{ - - /** - * @brief classic malloc/free behaviour known from CUDA - * - * This CreationPolicy implements the classic device-side malloc and free - * system calls that is offered by CUDA-capable accelerator of compute - * capability 2.0 and higher - */ - class OldMalloc; - -} //namespace CreationPolicies -} //namespace mallocMC +namespace mallocMC +{ + namespace CreationPolicies + { + /** + * @brief classic malloc/free behaviour known from CUDA + * + * This CreationPolicy implements the classic device-side malloc and + * free system calls that is offered by CUDA-capable accelerator of + * compute capability 2.0 and higher + */ + class OldMalloc; + + } // namespace CreationPolicies +} // namespace mallocMC diff --git a/src/include/mallocMC/creationPolicies/OldMalloc_impl.hpp b/src/include/mallocMC/creationPolicies/OldMalloc_impl.hpp index 162eb661..6c9b7475 100644 --- a/src/include/mallocMC/creationPolicies/OldMalloc_impl.hpp +++ b/src/include/mallocMC/creationPolicies/OldMalloc_impl.hpp @@ -27,45 +27,47 @@ #pragma once -#include -#include - #include "OldMalloc.hpp" -namespace mallocMC{ -namespace CreationPolicies{ - - class OldMalloc - { - typedef boost::uint32_t uint32; - - public: - typedef boost::mpl::bool_ providesAvailableSlots; - - __device__ void* create(uint32 bytes) - { - return ::malloc(static_cast(bytes)); - } +#include - __device__ void destroy(void* mem) +namespace mallocMC +{ + namespace CreationPolicies { - free(mem); - } - - __device__ bool isOOM(void* p, size_t s){ - return s && (p == NULL); - } - - template < typename T > - static void* initHeap(T* dAlloc, void*, size_t){ - return dAlloc; - } - - static std::string classname(){ - return "OldMalloc"; - } - - }; - -} //namespace CreationPolicies -} //namespace mallocMC + class OldMalloc + { + using uint32 = std::uint32_t; + + public: + static constexpr auto providesAvailableSlots = false; + + __device__ auto create(uint32 bytes) const -> void * + { + return ::malloc(static_cast(bytes)); + } + + __device__ void destroy(void * mem) const + { + ::free(mem); + } + + __device__ auto isOOM(void * p, size_t s) const -> bool + { + return s != 0 && (p == nullptr); + } + + template + static auto initHeap(T * dAlloc, void *, size_t) -> void * + { + return dAlloc; + } + + static auto classname() -> std::string + { + return "OldMalloc"; + } + }; + + } // namespace CreationPolicies +} // namespace mallocMC diff --git a/src/include/mallocMC/creationPolicies/Scatter.hpp b/src/include/mallocMC/creationPolicies/Scatter.hpp index 499bd73e..0db2af43 100644 --- a/src/include/mallocMC/creationPolicies/Scatter.hpp +++ b/src/include/mallocMC/creationPolicies/Scatter.hpp @@ -33,52 +33,52 @@ #pragma once -#include -#include +namespace mallocMC +{ + namespace CreationPolicies + { + namespace ScatterConf + { + struct DefaultScatterConfig + { + static constexpr auto pagesize = 4096; + static constexpr auto accessblocks = 8; + static constexpr auto regionsize = 16; + static constexpr auto wastefactor = 2; + static constexpr auto resetfreedpages = false; + }; -namespace mallocMC{ -namespace CreationPolicies{ -namespace ScatterConf{ - struct DefaultScatterConfig{ - typedef boost::mpl::int_<4096> pagesize; - typedef boost::mpl::int_<8> accessblocks; - typedef boost::mpl::int_<16> regionsize; - typedef boost::mpl::int_<2> wastefactor; - 
typedef boost::mpl::bool_ resetfreedpages; - }; + struct DefaultScatterHashingParams + { + static constexpr auto hashingK = 38183; + static constexpr auto hashingDistMP = 17497; + static constexpr auto hashingDistWP = 1; + static constexpr auto hashingDistWPRel = 1; + }; + } // namespace ScatterConf - struct DefaultScatterHashingParams{ - typedef boost::mpl::int_<38183> hashingK; - typedef boost::mpl::int_<17497> hashingDistMP; - typedef boost::mpl::int_<1> hashingDistWP; - typedef boost::mpl::int_<1> hashingDistWPRel; - }; -} + /** + * @brief fast memory allocation based on ScatterAlloc + * + * This CreationPolicy implements a fast memory allocator that trades + * speed for fragmentation of memory. This is based on the memory + * allocator "ScatterAlloc" + * (http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=6339604), + * and is extended to report free memory slots of a given size (both on + * host and accelerator). To work properly, this policy class requires a + * pre-allocated heap on the accelerator and works only with Nvidia CUDA + * capable accelerators that have at least compute capability 2.0. + * + * @tparam T_Config (optional) configure the heap layout. The + * default can be obtained through Scatter<>::HeapProperties + * @tparam T_Hashing (optional) configure the parameters for + * the hashing formula. The default can be obtained through + * Scatter<>::HashingProperties + */ + template< + class T_Config = ScatterConf::DefaultScatterConfig, + class T_Hashing = ScatterConf::DefaultScatterHashingParams> + class Scatter; - /** - * @brief fast memory allocation based on ScatterAlloc - * - * This CreationPolicy implements a fast memory allocator that trades speed - * for fragmentation of memory. This is based on the memory allocator - * "ScatterAlloc" - * (http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=6339604), and - * is extended to report free memory slots of a given size (both on host and - * accelerator). - * To work properly, this policy class requires a pre-allocated heap on the - * accelerator and works only with Nvidia CUDA capable accelerators that have - * at least compute capability 2.0. - * - * @tparam T_Config (optional) configure the heap layout. The - * default can be obtained through Scatter<>::HeapProperties - * @tparam T_Hashing (optional) configure the parameters for - * the hashing formula. 
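As a sketch of how the migrated interface could be used (hypothetical; the struct name and the chosen values below are made up): with the boost::mpl typedefs replaced by static constexpr members, a custom heap layout for the Scatter policy is simply a plain struct that mirrors ScatterConf::DefaultScatterConfig and is passed as T_Config.

```cpp
// Hypothetical user-side sketch, not part of this patch: a custom heap layout
// for CreationPolicies::Scatter in the new constexpr style.
struct MyScatterConfig
{
    static constexpr auto pagesize = 2 * 1024 * 1024; // larger pages than the 4096-byte default
    static constexpr auto accessblocks = 8;
    static constexpr auto regionsize = 16;
    static constexpr auto wastefactor = 2;
    static constexpr auto resetfreedpages = true;
};

// The policy could then be instantiated as
// mallocMC::CreationPolicies::Scatter<MyScatterConfig>;
// the hashing parameters fall back to ScatterConf::DefaultScatterHashingParams.
```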
The default can be obtained through - * Scatter<>::HashingProperties - */ - template< - class T_Config = ScatterConf::DefaultScatterConfig, - class T_Hashing = ScatterConf::DefaultScatterHashingParams - > - class Scatter; - -}// namespace CreationPolicies -}// namespace mallocMC + } // namespace CreationPolicies +} // namespace mallocMC diff --git a/src/include/mallocMC/creationPolicies/Scatter_impl.hpp b/src/include/mallocMC/creationPolicies/Scatter_impl.hpp index 88e3a611..7bb00ba7 100644 --- a/src/include/mallocMC/creationPolicies/Scatter_impl.hpp +++ b/src/include/mallocMC/creationPolicies/Scatter_impl.hpp @@ -33,57 +33,64 @@ #pragma once -#include -#include /* uint32_t */ -#include -#include -#include -#include -#include - #include "../mallocMC_utils.hpp" #include "Scatter.hpp" -namespace mallocMC{ -namespace CreationPolicies{ - -namespace ScatterKernelDetail{ - template - __global__ void initKernel(T_Allocator* heap, void* heapmem, size_t memsize){ - heap->pool = heapmem; - heap->initDeviceFunction(heapmem, memsize); - } - - - template < typename T_Allocator > - __global__ void getAvailableSlotsKernel(T_Allocator* heap, size_t slotSize, unsigned* slots){ - int gid = threadIdx.x + blockIdx.x*blockDim.x; - int nWorker = gridDim.x * blockDim.x; - unsigned temp = heap->getAvailaibleSlotsDeviceFunction(slotSize, gid, nWorker); - if(temp) atomicAdd(slots, temp); - } - +#include +#include /* uint32_t */ +#include +#include +#include +#include - template - __global__ void finalizeKernel(T_Allocator* heap){ - heap->finalizeDeviceFunction(); - } +namespace mallocMC +{ + namespace CreationPolicies + { + namespace ScatterKernelDetail + { + template + __global__ void + initKernel(T_Allocator * heap, void * heapmem, size_t memsize) + { + heap->pool = heapmem; + heap->initDeviceFunction(heapmem, memsize); + } -} //namespace ScatterKernelDetail + template + __global__ void getAvailableSlotsKernel( + T_Allocator * heap, + size_t slotSize, + unsigned * slots) + { + int gid = threadIdx.x + blockIdx.x * blockDim.x; + int nWorker = gridDim.x * blockDim.x; + unsigned temp = heap->getAvailaibleSlotsDeviceFunction( + slotSize, gid, nWorker); + if(temp) + atomicAdd(slots, temp); + } - template - class Scatter - { + template + __global__ void finalizeKernel(T_Allocator * heap) + { + heap->finalizeDeviceFunction(); + } - public: - typedef T_Config HeapProperties; - typedef T_Hashing HashingProperties; - struct Properties : HeapProperties, HashingProperties{}; - typedef boost::mpl::bool_ providesAvailableSlots; + } // namespace ScatterKernelDetail - private: - typedef boost::uint32_t uint32; + template + class Scatter + { + public: + using HeapProperties = T_Config; + using HashingProperties = T_Hashing; + struct Properties : HeapProperties, HashingProperties + {}; + static constexpr auto providesAvailableSlots = true; + private: + using uint32 = std::uint32_t; /** Allow for a hierarchical validation of parameters: * @@ -95,884 +102,1080 @@ namespace ScatterKernelDetail{ * default-struct < template-struct < command-line parameter */ #ifndef MALLOCMC_CP_SCATTER_PAGESIZE -#define MALLOCMC_CP_SCATTER_PAGESIZE static_cast(HeapProperties::pagesize::value) +#define MALLOCMC_CP_SCATTER_PAGESIZE (HeapProperties::pagesize) #endif - BOOST_STATIC_CONSTEXPR uint32 pagesize = MALLOCMC_CP_SCATTER_PAGESIZE; + static constexpr uint32 pagesize = MALLOCMC_CP_SCATTER_PAGESIZE; #ifndef MALLOCMC_CP_SCATTER_ACCESSBLOCKS -#define MALLOCMC_CP_SCATTER_ACCESSBLOCKS static_cast(HeapProperties::accessblocks::value) +#define 
MALLOCMC_CP_SCATTER_ACCESSBLOCKS (HeapProperties::accessblocks) #endif - BOOST_STATIC_CONSTEXPR uint32 accessblocks = MALLOCMC_CP_SCATTER_ACCESSBLOCKS; + static constexpr uint32 accessblocks + = MALLOCMC_CP_SCATTER_ACCESSBLOCKS; #ifndef MALLOCMC_CP_SCATTER_REGIONSIZE -#define MALLOCMC_CP_SCATTER_REGIONSIZE static_cast(HeapProperties::regionsize::value) +#define MALLOCMC_CP_SCATTER_REGIONSIZE (HeapProperties::regionsize) #endif - BOOST_STATIC_CONSTEXPR uint32 regionsize = MALLOCMC_CP_SCATTER_REGIONSIZE; + static constexpr uint32 regionsize = MALLOCMC_CP_SCATTER_REGIONSIZE; #ifndef MALLOCMC_CP_SCATTER_WASTEFACTOR -#define MALLOCMC_CP_SCATTER_WASTEFACTOR static_cast(HeapProperties::wastefactor::value) +#define MALLOCMC_CP_SCATTER_WASTEFACTOR (HeapProperties::wastefactor) #endif - BOOST_STATIC_CONSTEXPR uint32 wastefactor = MALLOCMC_CP_SCATTER_WASTEFACTOR; + static constexpr uint32 wastefactor + = MALLOCMC_CP_SCATTER_WASTEFACTOR; #ifndef MALLOCMC_CP_SCATTER_RESETFREEDPAGES -#define MALLOCMC_CP_SCATTER_RESETFREEDPAGES static_cast(HeapProperties::resetfreedpages::value) +#define MALLOCMC_CP_SCATTER_RESETFREEDPAGES (HeapProperties::resetfreedpages) #endif - BOOST_STATIC_CONSTEXPR bool resetfreedpages = MALLOCMC_CP_SCATTER_RESETFREEDPAGES; - + static constexpr bool resetfreedpages + = MALLOCMC_CP_SCATTER_RESETFREEDPAGES; - public: - BOOST_STATIC_CONSTEXPR uint32 _pagesize = pagesize; - BOOST_STATIC_CONSTEXPR uint32 _accessblocks = accessblocks; - BOOST_STATIC_CONSTEXPR uint32 _regionsize = regionsize; - BOOST_STATIC_CONSTEXPR uint32 _wastefactor = wastefactor; - BOOST_STATIC_CONSTEXPR bool _resetfreedpages = resetfreedpages; + public: + static constexpr uint32 _pagesize = pagesize; + static constexpr uint32 _accessblocks = accessblocks; + static constexpr uint32 _regionsize = regionsize; + static constexpr uint32 _wastefactor = wastefactor; + static constexpr bool _resetfreedpages = resetfreedpages; - private: + private: #if _DEBUG || ANALYSEHEAP - public: + public: #endif - //BOOST_STATIC_CONSTEXPR uint32 minChunkSize0 = pagesize/(32*32); - BOOST_STATIC_CONSTEXPR uint32 minChunkSize1 = 0x10; - BOOST_STATIC_CONSTEXPR uint32 HierarchyThreshold = (pagesize - 2*sizeof(uint32))/33; - BOOST_STATIC_CONSTEXPR uint32 minSegmentSize = 32*minChunkSize1 + sizeof(uint32); - BOOST_STATIC_CONSTEXPR uint32 tmp_maxOPM = minChunkSize1 > HierarchyThreshold ? 0 : (pagesize + (minSegmentSize-1)) / minSegmentSize; - BOOST_STATIC_CONSTEXPR uint32 maxOnPageMasks = 32 > tmp_maxOPM ? tmp_maxOPM : 32; + // static constexpr uint32 minChunkSize0 = pagesize/(32*32); + static constexpr uint32 minChunkSize1 = 0x10; + static constexpr uint32 HierarchyThreshold + = (pagesize - 2 * sizeof(uint32)) / 33; + static constexpr uint32 minSegmentSize + = 32 * minChunkSize1 + sizeof(uint32); + static constexpr uint32 tmp_maxOPM + = minChunkSize1 > HierarchyThreshold + ? 0 + : (pagesize + (minSegmentSize - 1)) / minSegmentSize; + static constexpr uint32 maxOnPageMasks + = 32 > tmp_maxOPM ? 
tmp_maxOPM : 32; #ifndef MALLOCMC_CP_SCATTER_HASHINGK -#define MALLOCMC_CP_SCATTER_HASHINGK static_cast(HashingProperties::hashingK::value) +#define MALLOCMC_CP_SCATTER_HASHINGK (HashingProperties::hashingK) #endif - BOOST_STATIC_CONSTEXPR uint32 hashingK = MALLOCMC_CP_SCATTER_HASHINGK; + static constexpr uint32 hashingK = MALLOCMC_CP_SCATTER_HASHINGK; #ifndef MALLOCMC_CP_SCATTER_HASHINGDISTMP -#define MALLOCMC_CP_SCATTER_HASHINGDISTMP static_cast(HashingProperties::hashingDistMP::value) +#define MALLOCMC_CP_SCATTER_HASHINGDISTMP (HashingProperties::hashingDistMP) #endif - BOOST_STATIC_CONSTEXPR uint32 hashingDistMP = MALLOCMC_CP_SCATTER_HASHINGDISTMP; + static constexpr uint32 hashingDistMP + = MALLOCMC_CP_SCATTER_HASHINGDISTMP; #ifndef MALLOCMC_CP_SCATTER_HASHINGDISTWP -#define MALLOCMC_CP_SCATTER_HASHINGDISTWP static_cast(HashingProperties::hashingDistWP::value) +#define MALLOCMC_CP_SCATTER_HASHINGDISTWP (HashingProperties::hashingDistWP) #endif - BOOST_STATIC_CONSTEXPR uint32 hashingDistWP = MALLOCMC_CP_SCATTER_HASHINGDISTWP; + static constexpr uint32 hashingDistWP + = MALLOCMC_CP_SCATTER_HASHINGDISTWP; #ifndef MALLOCMC_CP_SCATTER_HASHINGDISTWPREL -#define MALLOCMC_CP_SCATTER_HASHINGDISTWPREL static_cast(HashingProperties::hashingDistWPRel::value) +#define MALLOCMC_CP_SCATTER_HASHINGDISTWPREL \ + (HashingProperties::hashingDistWPRel) #endif - BOOST_STATIC_CONSTEXPR uint32 hashingDistWPRel = MALLOCMC_CP_SCATTER_HASHINGDISTWPREL; + static constexpr uint32 hashingDistWPRel + = MALLOCMC_CP_SCATTER_HASHINGDISTWPREL; + + /** + * Page Table Entry struct + * The PTE holds basic information about each page + */ + struct PTE + { + uint32 chunksize; + uint32 count; + uint32 bitmask; + __device__ void init() + { + chunksize = 0; + count = 0; + bitmask = 0; + } + }; + + /** + * Page struct + * The page struct is used to access the data on the page more + * efficiently and to clear the area on the page, which might hold + * bitsfields later one + */ + struct Page + { + char data[pagesize]; + + /** + * The pages init method + * This method initializes the region on the page which might + * hold bit fields when the page is used for a small chunk size + * @param previous_chunksize the chunksize which was uses for + * the page before + */ + __device__ void init() + { + // clear the entire data which can hold bitfields + uint32 * write + = (uint32 *)(data + pagesize - (int)(sizeof(uint32) * maxOnPageMasks)); + while(write < (uint32 *)(data + pagesize)) *write++ = 0; + } + }; + + // the data used by the allocator + + volatile PTE * _ptes; + volatile uint32 * _regions; + Page * _page; + uint32 _numpages; + size_t _memsize; + uint32 _pagebasedMutex; + volatile uint32 _firstFreePageBased; + volatile uint32 _firstfreeblock; + + /** + * randInit should create an random offset which can be used + * as the initial position in a bitfield + */ + static __device__ inline auto randInit() -> uint32 + { + // start with the laneid offset + return laneid(); + } - /** - * Page Table Entry struct - * The PTE holds basic information about each page - */ - struct PTE - { - uint32 chunksize; - uint32 count; - uint32 bitmask; + /** + * randInextspot delivers the next free spot in a bitfield + * it searches for the next unset bit to the left of spot and + * returns its offset. 
if there are no unset bits to the left + * then it wraps around + * @param bitfield the bitfield to be searched for + * @param spot the spot from which to search to the left + * @param spots number of bits that can be used + * @return next free spot in the bitfield + */ + static __device__ inline auto + nextspot(uint32 bitfield, uint32 spot, uint32 spots) -> uint32 + { + // wrap around the bitfields from the current spot to the left + bitfield = ((bitfield >> (spot + 1)) + | (bitfield << (spots - (spot + 1)))) + & ((1 << spots) - 1); + // compute the step from the current spot in the bitfield + const uint32 step = __ffs(~bitfield); + // and return the new spot + return (spot + step) % spots; + } - __device__ void init() - { - chunksize = 0; - count = 0; - bitmask = 0; - } - }; - - /** - * Page struct - * The page struct is used to access the data on the page more efficiently - * and to clear the area on the page, which might hold bitsfields later one - */ - struct PAGE - { - char data[pagesize]; - - /** - * The pages init method - * This method initializes the region on the page which might hold - * bit fields when the page is used for a small chunk size - * @param previous_chunksize the chunksize which was uses for the page before - */ - __device__ void init() - { - //clear the entire data which can hold bitfields - uint32* write = (uint32*)(data + pagesize - (int)(sizeof(uint32)*maxOnPageMasks)); - while(write < (uint32*)(data + pagesize)) - *write++ = 0; - } - }; - - // the data used by the allocator - - volatile PTE* _ptes; - volatile uint32* _regions; - PAGE* _page; - uint32 _numpages; - size_t _memsize; - uint32 _pagebasedMutex; - volatile uint32 _firstFreePageBased; - volatile uint32 _firstfreeblock; - - - /** - * randInit should create an random offset which can be used - * as the initial position in a bitfield - */ - __device__ inline uint32 randInit() - { - //start with the laneid offset - return laneid(); - } - - /** - * randInextspot delivers the next free spot in a bitfield - * it searches for the next unset bit to the left of spot and - * returns its offset. if there are no unset bits to the left - * then it wraps around - * @param bitfield the bitfield to be searched for - * @param spot the spot from which to search to the left - * @param spots number of bits that can be used - * @return next free spot in the bitfield - */ - __device__ inline uint32 nextspot(uint32 bitfield, uint32 spot, uint32 spots) - { - //wrap around the bitfields from the current spot to the left - bitfield = ((bitfield >> (spot + 1)) | (bitfield << (spots - (spot + 1))))&((1<= spots) - return -1; - spot = nextspot(old, spot, spots); - } - } - - - /** - * calcAdditionalChunks determines the number of chunks that are contained in the last segment of a hierarchical page - * - * The additional checks are necessary to ensure correct results for very large pages and small chunksizes - * - * @param fullsegments the number of segments that can be completely filled in a page. This may NEVER be bigger than 32! - * @param segmentsize the number of bytes that are contained in a completely filled segment (32 chunks) - * @param chunksize the chosen allocation size within the page - * @return the number of additional chunks that will not fit in one of the fullsegments. 
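To make the effect of calcAdditionalChunks concrete, here is a small worked example (illustration only) under the default pagesize of 4096 and a hypothetical chunk size of 16 bytes: each full segment holds 32 chunks plus one 32-bit mask, and 30 extra chunks still fit behind the last full segment.

```cpp
// Worked example, assuming pagesize = 4096 and a 16-byte chunk size.
#include <cstdint>

constexpr std::uint32_t pagesize = 4096;
constexpr std::uint32_t chunksize = 16;

// each full segment: 32 chunks plus one 32-bit bitfield
constexpr std::uint32_t segmentsize = chunksize * 32 + sizeof(std::uint32_t); // 516
constexpr std::uint32_t fullsegments
    = pagesize / segmentsize < 32 ? pagesize / segmentsize : 32; // 7
// chunks that still fit behind the last full segment (fullsegments != 32 here)
constexpr std::uint32_t additional_chunks
    = (pagesize - fullsegments * segmentsize - sizeof(std::uint32_t)) / chunksize; // 30

static_assert(additional_chunks < 32, "the last segment holds fewer than 32 chunks");
```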
For any correct input, this number is smaller than 32 - */ - __device__ inline uint32 calcAdditionalChunks(uint32 fullsegments, uint32 segmentsize, uint32 chunksize){ - if(fullsegments != 32){ - return max(0,(int)pagesize - (int)fullsegments*segmentsize - (int)sizeof(uint32))/chunksize; - }else - return 0; - } - - - /** - * addChunkHierarchy finds a free chunk on a page which uses bit fields on the page - * @param chunksize the chunksize of the page - * @param fullsegments the number of full segments on the page (a 32 bits on the page) - * @param additional_chunks the number of additional chunks in last segment (less than 32 bits on the page) - * @param page the page to use - * @return pointer to a free chunk on the page, 0 if we were unable to obtain a free chunk - */ - __device__ inline void* addChunkHierarchy(uint32 chunksize, uint32 fullsegments, uint32 additional_chunks, uint32 page) - { - uint32 segments = fullsegments + (additional_chunks > 0 ? 1 : 0); - uint32 spot = randInit() % segments; - uint32 mask = _ptes[page].bitmask; - if((mask & (1 << spot)) != 0) - spot = nextspot(mask, spot, segments); - uint32 tries = segments - __popc(mask); - uint32* onpagemasks = onPageMasksPosition(page,segments); - for(uint32 i = 0; i < tries; ++i) - { - int hspot = usespot(onpagemasks + spot, spot < fullsegments ? 32 : additional_chunks); - if(hspot != -1) - return _page[page].data + (32*spot + hspot)*chunksize; - else - atomicOr((uint32*)&_ptes[page].bitmask, 1 << spot); - spot = nextspot(mask, spot, segments); - } - return 0; - } - - /** - * addChunkNoHierarchy finds a free chunk on a page which uses the bit fields of the pte only - * @param chunksize the chunksize of the page - * @param page the page to use - * @param spots the number of chunks which fit on the page - * @return pointer to a free chunk on the page, 0 if we were unable to obtain a free chunk - */ - __device__ inline void* addChunkNoHierarchy(uint32 chunksize, uint32 page, uint32 spots) - { - int spot = usespot((uint32*)&_ptes[page].bitmask, spots); - if(spot == -1) - return 0; //that should be impossible :) - return _page[page].data + spot*chunksize; - } - - /** - * tryUsePage tries to use the page for the allocation request - * @param page the page to use - * @param chunksize the chunksize of the page - * @return pointer to a free chunk on the page, 0 if we were unable to obtain a free chunk - */ - __device__ inline void* tryUsePage(uint32 page, uint32 chunksize) - { - - void* chunk_ptr = NULL; - - //increse the fill level - uint32 filllevel = atomicAdd((uint32*)&(_ptes[page].count), 1); - //recheck chunck size (it could be that the page got freed in the meanwhile...) 
- if(!resetfreedpages || _ptes[page].chunksize == chunksize) - { - if(chunksize <= HierarchyThreshold) - { - //more chunks than can be covered by the pte's single bitfield can be used - uint32 segmentsize = chunksize*32 + sizeof(uint32); - uint32 fullsegments = min(32,pagesize / segmentsize); - uint32 additional_chunks = calcAdditionalChunks(fullsegments, segmentsize, chunksize); - if(filllevel < fullsegments * 32 + additional_chunks) - chunk_ptr = addChunkHierarchy(chunksize, fullsegments, additional_chunks, page); - } - else - { - uint32 chunksinpage = min(pagesize / chunksize, 32); - if(filllevel < chunksinpage) - chunk_ptr = addChunkNoHierarchy(chunksize, page, chunksinpage); - } - } - - //this one is full/not useable - if(chunk_ptr == NULL) - atomicSub((uint32*)&(_ptes[page].count), 1); - - return chunk_ptr; - } - - - /** - * allocChunked tries to allocate the demanded number of bytes on one of the pages - * @param bytes the number of bytes to allocate - * @return pointer to a free chunk on a page, 0 if we were unable to obtain a free chunk - */ - __device__ void* allocChunked(uint32 bytes) - { - uint32 pagesperblock = _numpages/accessblocks; - uint32 reloff = warpSize*bytes / pagesize; - uint32 startpage = (bytes*hashingK + hashingDistMP*smid() + (hashingDistWP+hashingDistWPRel*reloff)*warpid() ) % pagesperblock; - uint32 maxchunksize = min(pagesize,wastefactor*bytes); - uint32 startblock = _firstfreeblock; - uint32 ptetry = startpage + startblock*pagesperblock; - uint32 checklevel = regionsize*3/4; - for(uint32 finder = 0; finder < 2; ++finder) - { - for(uint32 b = startblock; b < accessblocks; ++b) - { - while(ptetry < (b+1)*pagesperblock) + /** + * onPageMasksPosition returns a pointer to the beginning of the + * onpagemasks inside a page. + * @param page the page that holds the masks + * @param the number of hierarchical page tables (bitfields) that + * are used inside this mask. + * @return pointer to the first address inside the page that holds + * metadata bitfields. + */ + __device__ inline auto + onPageMasksPosition(uint32 page, uint32 nMasks) -> uint32 * + { + return ( + uint32 *)(_page[page].data + pagesize - (int)sizeof(uint32) * nMasks); + } + + /** + * usespot marks finds one free spot in the bitfield, marks it and + * returns its offset + * @param bitfield pointer to the bitfield to use + * @param spots overall number of spots the bitfield is responsible + * for + * @return if there is a free spot it returns the spot'S offset, + * otherwise -1 + */ + static __device__ inline auto + usespot(uint32 * bitfield, uint32 spots) -> int { - uint32 region = ptetry/regionsize; - uint32 regionfilllevel = _regions[region]; - if(regionfilllevel < checklevel ) - { - for( ; ptetry < (region+1)*regionsize; ++ptetry) + // get first spot + uint32 spot = randInit() % spots; + for(;;) { - uint32 chunksize = _ptes[ptetry].chunksize; - if(chunksize >= bytes && chunksize <= maxchunksize) - { - void * res = tryUsePage(ptetry, chunksize); - if(res != 0) return res; - } - else if(chunksize == 0) - { - //lets open up a new page - //it is already padded - uint32 new_chunksize = max(bytes,minChunkSize1); - uint32 beforechunksize = atomicCAS((uint32*)&_ptes[ptetry].chunksize, 0, new_chunksize); - if(beforechunksize == 0) + const uint32 mask = 1 << spot; + const uint32 old = atomicOr(bitfield, mask); + if((old & mask) == 0) + return spot; + // note: __popc(old) == spots should be sufficient, + // but if someone corrupts the memory we end up in an + // endless loop in here... 
+ if(__popc(old) >= spots) + return -1; + spot = nextspot(old, spot, spots); + } + } + + /** + * calcAdditionalChunks determines the number of chunks that are + * contained in the last segment of a hierarchical page + * + * The additional checks are necessary to ensure correct results for + * very large pages and small chunksizes + * + * @param fullsegments the number of segments that can be completely + * filled in a page. This may NEVER be bigger than 32! + * @param segmentsize the number of bytes that are contained in a + * completely filled segment (32 chunks) + * @param chunksize the chosen allocation size within the page + * @return the number of additional chunks that will not fit in one + * of the fullsegments. For any correct input, this number is + * smaller than 32 + */ + static __device__ inline auto calcAdditionalChunks( + uint32 fullsegments, + uint32 segmentsize, + uint32 chunksize) -> uint32 + { + if(fullsegments != 32) + return max(0, + (int)pagesize - (int)fullsegments * segmentsize + - (int)sizeof(uint32)) + / chunksize; + else + return 0; + } + + /** + * addChunkHierarchy finds a free chunk on a page which uses bit + * fields on the page + * @param chunksize the chunksize of the page + * @param fullsegments the number of full segments on the page (a 32 + * bits on the page) + * @param additional_chunks the number of additional chunks in last + * segment (less than 32 bits on the page) + * @param page the page to use + * @return pointer to a free chunk on the page, 0 if we were unable + * to obtain a free chunk + */ + __device__ inline auto addChunkHierarchy( + uint32 chunksize, + uint32 fullsegments, + uint32 additional_chunks, + uint32 page) -> void * + { + const uint32 segments + = fullsegments + (additional_chunks > 0 ? 1 : 0); + uint32 spot = randInit() % segments; + const uint32 mask = _ptes[page].bitmask; + if((mask & (1 << spot)) != 0) + spot = nextspot(mask, spot, segments); + const uint32 tries = segments - __popc(mask); + uint32 * onpagemasks = onPageMasksPosition(page, segments); + for(uint32 i = 0; i < tries; ++i) + { + const int hspot = usespot( + &onpagemasks[spot], + spot < fullsegments ? 
32 : additional_chunks); + if(hspot != -1) + return _page[page].data + + (32 * spot + hspot) * chunksize; + atomicOr((uint32 *)&_ptes[page].bitmask, 1 << spot); + spot = nextspot(mask, spot, segments); + } + return 0; + } + + /** + * addChunkNoHierarchy finds a free chunk on a page which uses the + * bit fields of the pte only + * @param chunksize the chunksize of the page + * @param page the page to use + * @param spots the number of chunks which fit on the page + * @return pointer to a free chunk on the page, 0 if we were unable + * to obtain a free chunk + */ + __device__ inline auto + addChunkNoHierarchy(uint32 chunksize, uint32 page, uint32 spots) + -> void * + { + const int spot = usespot((uint32 *)&_ptes[page].bitmask, spots); + if(spot == -1) + return 0; // that should be impossible :) + return _page[page].data + spot * chunksize; + } + + /** + * tryUsePage tries to use the page for the allocation request + * @param page the page to use + * @param chunksize the chunksize of the page + * @return pointer to a free chunk on the page, 0 if we were unable + * to obtain a free chunk + */ + __device__ inline auto tryUsePage(uint32 page, uint32 chunksize) + -> void * + { + void * chunk_ptr = nullptr; + + // increse the fill level + const uint32 filllevel + = atomicAdd((uint32 *)&(_ptes[page].count), 1); + // recheck chunck size (it could be that the page got freed in + // the meanwhile...) + if(!resetfreedpages || _ptes[page].chunksize == chunksize) + { + if(chunksize <= HierarchyThreshold) { - void * res = tryUsePage(ptetry, new_chunksize); - if(res != 0) return res; + // more chunks than can be covered by the pte's single + // bitfield can be used + const uint32 segmentsize + = chunksize * 32 + sizeof(uint32); + const uint32 fullsegments + = min(32, pagesize / segmentsize); + const uint32 additional_chunks = calcAdditionalChunks( + fullsegments, segmentsize, chunksize); + if(filllevel < fullsegments * 32 + additional_chunks) + chunk_ptr = addChunkHierarchy( + chunksize, + fullsegments, + additional_chunks, + page); } - else if(beforechunksize >= bytes && beforechunksize <= maxchunksize) + else { - //someone else aquired the page, but we can also use it - void * res = tryUsePage(ptetry, beforechunksize); - if(res != 0) return res; + const uint32 chunksinpage + = min(pagesize / chunksize, 32); + if(filllevel < chunksinpage) + chunk_ptr = addChunkNoHierarchy( + chunksize, page, chunksinpage); } - } } - //could not alloc in region, tell that - if(regionfilllevel + 1 <= regionsize) - atomicMax((uint32*)(_regions + region), regionfilllevel+1); - } - else - ptetry += regionsize; - //ptetry = (region+1)*regionsize; + + // this one is full/not useable + if(chunk_ptr == nullptr) + atomicSub((uint32 *)&(_ptes[page].count), 1); + + return chunk_ptr; } - //randomize the thread writing the info - //if(warpid() + laneid() == 0) - if(b > startblock) - _firstfreeblock = b; - } - - //we are really full :/ so lets search every page for a spot! 
- startblock = 0; - checklevel = regionsize + 1; - ptetry = 0; - } - return 0; - } - - - /** - * deallocChunked frees the chunk on the page and updates all data accordingly - * @param mem pointer to the chunk - * @param page the page the chunk is on - * @param chunksize the chunksize used for the page - */ - __device__ void deallocChunked(void* mem, uint32 page, uint32 chunksize) - { - uint32 inpage_offset = ((char*)mem - _page[page].data); - if(chunksize <= HierarchyThreshold) - { - //one more level in hierarchy - uint32 segmentsize = chunksize*32 + sizeof(uint32); - uint32 fullsegments = min(32,pagesize / segmentsize); - uint32 additional_chunks = calcAdditionalChunks(fullsegments,segmentsize,chunksize); - uint32 segment = inpage_offset / (chunksize*32); - uint32 withinsegment = (inpage_offset - segment*(chunksize*32))/chunksize; - //mark it as free - uint32 nMasks = fullsegments + (additional_chunks > 0 ? 1 : 0); - uint32* onpagemasks = onPageMasksPosition(page,nMasks); - uint32 old = atomicAnd(onpagemasks + segment, ~(1 << withinsegment)); - - // always do this, since it might fail due to a race-condition with addChunkHierarchy - atomicAnd((uint32*)&_ptes[page].bitmask, ~(1 << segment)); - } - else - { - uint32 segment = inpage_offset / chunksize; - atomicAnd((uint32*)&_ptes[page].bitmask, ~(1 << segment)); - } - //reduce filllevel as free - uint32 oldfilllevel = atomicSub((uint32*)&_ptes[page].count, 1); + /** + * allocChunked tries to allocate the demanded number of bytes on + * one of the pages + * @param bytes the number of bytes to allocate + * @return pointer to a free chunk on a page, 0 if we were unable to + * obtain a free chunk + */ + __device__ auto allocChunked(uint32 bytes) -> void * + { + const uint32 pagesperblock = _numpages / accessblocks; + const uint32 reloff = warpSize * bytes / pagesize; + const uint32 startpage + = (bytes * hashingK + hashingDistMP * smid() + + (hashingDistWP + hashingDistWPRel * reloff) * warpid()) + % pagesperblock; + const uint32 maxchunksize = min(pagesize, wastefactor * bytes); + uint32 startblock = _firstfreeblock; + uint32 ptetry = startpage + startblock * pagesperblock; + uint32 checklevel = regionsize * 3 / 4; + for(uint32 finder = 0; finder < 2; ++finder) + { + for(uint32 b = startblock; b < accessblocks; ++b) + { + while(ptetry < (b + 1) * pagesperblock) + { + const uint32 region = ptetry / regionsize; + const uint32 regionfilllevel = _regions[region]; + if(regionfilllevel < checklevel) + { + for(; ptetry < (region + 1) * regionsize; + ++ptetry) + { + const uint32 chunksize + = _ptes[ptetry].chunksize; + if(chunksize >= bytes + && chunksize <= maxchunksize) + { + void * res + = tryUsePage(ptetry, chunksize); + if(res != 0) + return res; + } + else if(chunksize == 0) + { + // lets open up a new page + // it is already padded + const uint32 new_chunksize + = max(bytes, minChunkSize1); + const uint32 beforechunksize + = atomicCAS( + (uint32 *)&_ptes[ptetry] + .chunksize, + 0, + new_chunksize); + if(beforechunksize == 0) + { + void * res = tryUsePage( + ptetry, new_chunksize); + if(res != 0) + return res; + } + else if( + beforechunksize >= bytes + && beforechunksize <= maxchunksize) + { + // someone else aquired the page, + // but we can also use it + void * res = tryUsePage( + ptetry, beforechunksize); + if(res != 0) + return res; + } + } + } + // could not alloc in region, tell that + if(regionfilllevel + 1 <= regionsize) + atomicMax( + (uint32 *)(_regions + region), + regionfilllevel + 1); + } + else + ptetry += regionsize; + // ptetry 
= (region+1)*regionsize; + } + // randomize the thread writing the info + // if(warpid() + laneid() == 0) + if(b > startblock) + _firstfreeblock = b; + } + + // we are really full :/ so lets search every page for a + // spot! + startblock = 0; + checklevel = regionsize + 1; + ptetry = 0; + } + return 0; + } - if(resetfreedpages) - { - if(oldfilllevel == 1) - { - //this page now got free! - // -> try lock it - uint32 old = atomicCAS((uint32*)&_ptes[page].count, 0, pagesize); - if(old == 0) + /** + * deallocChunked frees the chunk on the page and updates all data + * accordingly + * @param mem pointer to the chunk + * @param page the page the chunk is on + * @param chunksize the chunksize used for the page + */ + __device__ void + deallocChunked(void * mem, uint32 page, uint32 chunksize) { - //clean the bits for the hierarchy - _page[page].init(); - //remove chunk information - _ptes[page].chunksize = 0; - __threadfence(); - //unlock it - atomicSub((uint32*)&_ptes[page].count, pagesize); + const uint32 inpage_offset = ((char *)mem - _page[page].data); + if(chunksize <= HierarchyThreshold) + { + // one more level in hierarchy + const uint32 segmentsize = chunksize * 32 + sizeof(uint32); + const uint32 fullsegments = min(32, pagesize / segmentsize); + const uint32 additional_chunks = calcAdditionalChunks( + fullsegments, segmentsize, chunksize); + const uint32 segment = inpage_offset / (chunksize * 32); + const uint32 withinsegment + = (inpage_offset - segment * (chunksize * 32)) + / chunksize; + // mark it as free + const uint32 nMasks + = fullsegments + (additional_chunks > 0 ? 1 : 0); + uint32 * onpagemasks = onPageMasksPosition(page, nMasks); + uint32 old = atomicAnd( + &onpagemasks[segment], ~(1 << withinsegment)); + + // always do this, since it might fail due to a + // race-condition with addChunkHierarchy + atomicAnd((uint32 *)&_ptes[page].bitmask, ~(1 << segment)); + } + else + { + const uint32 segment = inpage_offset / chunksize; + atomicAnd((uint32 *)&_ptes[page].bitmask, ~(1 << segment)); + } + // reduce filllevel as free + const uint32 oldfilllevel + = atomicSub((uint32 *)&_ptes[page].count, 1); + + if(resetfreedpages) + { + if(oldfilllevel == 1) + { + // this page now got free! + // -> try lock it + const uint32 old = atomicCAS( + (uint32 *)&_ptes[page].count, 0, pagesize); + if(old == 0) + { + // clean the bits for the hierarchy + _page[page].init(); + // remove chunk information + _ptes[page].chunksize = 0; + __threadfence(); + // unlock it + atomicSub((uint32 *)&_ptes[page].count, pagesize); + } + } + } + + // meta information counters ... should not be changed by too + // many threads, so.. + if(oldfilllevel == pagesize / 2 / chunksize) + { + const uint32 region = page / regionsize; + _regions[region] = 0; + const uint32 block + = region * regionsize * accessblocks / _numpages; + if(warpid() + laneid() == 0) + atomicMin((uint32 *)&_firstfreeblock, block); + } } - } - } - //meta information counters ... should not be changed by too many threads, so.. 
- if(oldfilllevel == pagesize / 2 / chunksize) - { - uint32 region = page / regionsize; - _regions[region] = 0; - uint32 block = region * regionsize * accessblocks / _numpages ; - if(warpid() + laneid() == 0) - atomicMin((uint32*)&_firstfreeblock, block); - } - } - - /** - * markpages markes a fixed number of pages as used - * @param startpage first page to mark - * @param pages number of pages to mark - * @param bytes number of overall bytes to mark pages for - * @return true on success, false if one of the pages is not free - */ - __device__ bool markpages(uint32 startpage, uint32 pages, uint32 bytes) - { - int abord = -1; - for(uint32 trypage = startpage; trypage < startpage + pages; ++trypage) - { - uint32 old = atomicCAS((uint32*)&_ptes[trypage].chunksize, 0, bytes); - if(old != 0) - { - abord = trypage; - break; - } - } - if(abord == -1) - return true; - for(uint32 trypage = startpage; trypage < abord; ++trypage) - atomicCAS((uint32*)&_ptes[trypage].chunksize, bytes, 0); - return false; - } - - /** - * allocPageBasedSingleRegion tries to allocate the demanded number of bytes on a continues sequence of pages - * @param startpage first page to be used - * @param endpage last page to be used - * @param bytes number of overall bytes to mark pages for - * @return pointer to the first page to use, 0 if we were unable to use all the requested pages - */ - __device__ void* allocPageBasedSingleRegion(uint32 startpage, uint32 endpage, uint32 bytes) - { - uint32 pagestoalloc = divup(bytes, pagesize); - uint32 freecount = 0; - bool left_free = false; - for(uint32 search_page = startpage+1; search_page > endpage; ) - { - --search_page; - if(_ptes[search_page].chunksize == 0) - { - if(++freecount == pagestoalloc) + /** + * markpages markes a fixed number of pages as used + * @param startpage first page to mark + * @param pages number of pages to mark + * @param bytes number of overall bytes to mark pages for + * @return true on success, false if one of the pages is not free + */ + __device__ auto + markpages(uint32 startpage, uint32 pages, uint32 bytes) -> bool { - //try filling it up - if(markpages(search_page, pagestoalloc, bytes)) - { - //mark that we filled up everything up to here - if(!left_free) - atomicCAS((uint32*)&_firstFreePageBased, startpage, search_page - 1); - return _page[search_page].data; - } + int abord = -1; + for(uint32 trypage = startpage; trypage < startpage + pages; + ++trypage) + { + const uint32 old = atomicCAS( + (uint32 *)&_ptes[trypage].chunksize, 0, bytes); + if(old != 0) + { + abord = trypage; + break; + } + } + if(abord == -1) + return true; + for(uint32 trypage = startpage; trypage < abord; ++trypage) + atomicCAS((uint32 *)&_ptes[trypage].chunksize, bytes, 0); + return false; } - } - else - { - left_free = true; - freecount = 0; - } - } - return 0; - } - - /** - * allocPageBasedSingle tries to allocate the demanded number of bytes on a continues sequence of pages - * @param bytes number of overall bytes to mark pages for - * @return pointer to the first page to use, 0 if we were unable to use all the requested pages - * @pre only a single thread of a warp is allowed to call the function concurrently - */ - __device__ void* allocPageBasedSingle(uint32 bytes) - { - //acquire mutex - while(atomicExch(&_pagebasedMutex,1) != 0); - //search for free spot from the back - uint32 spage = _firstFreePageBased; - void* res = allocPageBasedSingleRegion(spage, 0, bytes); - if(res == 0) - //also check the rest of the pages - res = allocPageBasedSingleRegion(_numpages, spage, 
bytes); - - //free mutex - atomicExch(&_pagebasedMutex,0); - return res; - } - /** - * allocPageBased tries to allocate the demanded number of bytes on a continues sequence of pages - * @param bytes number of overall bytes to mark pages for - * @return pointer to the first page to use, 0 if we were unable to use all the requested pages - */ - __device__ void* allocPageBased(uint32 bytes) - { - //this is rather slow, but we dont expect that to happen often anyway - - //only one thread per warp can acquire the mutex - void* res = 0; - for( + + /** + * allocPageBasedSingleRegion tries to allocate the demanded number + * of bytes on a continues sequence of pages + * @param startpage first page to be used + * @param endpage last page to be used + * @param bytes number of overall bytes to mark pages for + * @return pointer to the first page to use, 0 if we were unable to + * use all the requested pages + */ + __device__ auto allocPageBasedSingleRegion( + uint32 startpage, + uint32 endpage, + uint32 bytes) -> void * + { + const uint32 pagestoalloc = divup(bytes, pagesize); + uint32 freecount = 0; + bool left_free = false; + for(uint32 search_page = startpage + 1; search_page > endpage;) + { + --search_page; + if(_ptes[search_page].chunksize == 0) + { + if(++freecount == pagestoalloc) + { + // try filling it up + if(markpages(search_page, pagestoalloc, bytes)) + { + // mark that we filled up everything up to here + if(!left_free) + atomicCAS( + (uint32 *)&_firstFreePageBased, + startpage, + search_page - 1); + return _page[search_page].data; + } + } + } + else + { + left_free = true; + freecount = 0; + } + } + return 0; + } + + /** + * allocPageBasedSingle tries to allocate the demanded number of + * bytes on a continues sequence of pages + * @param bytes number of overall bytes to mark pages for + * @return pointer to the first page to use, 0 if we were unable to + * use all the requested pages + * @pre only a single thread of a warp is allowed to call the + * function concurrently + */ + __device__ auto allocPageBasedSingle(uint32 bytes) -> void * + { + // acquire mutex + while(atomicExch(&_pagebasedMutex, 1) != 0) + ; + // search for free spot from the back + const uint32 spage = _firstFreePageBased; + void * res = allocPageBasedSingleRegion(spage, 0, bytes); + if(res == 0) + // also check the rest of the pages + res = allocPageBasedSingleRegion(_numpages, spage, bytes); + + // free mutex + atomicExch(&_pagebasedMutex, 0); + return res; + } + /** + * allocPageBased tries to allocate the demanded number of bytes on + * a continues sequence of pages + * @param bytes number of overall bytes to mark pages for + * @return pointer to the first page to use, 0 if we were unable to + * use all the requested pages + */ + __device__ auto allocPageBased(uint32 bytes) -> void * + { + // this is rather slow, but we dont expect that to happen often + // anyway + + // only one thread per warp can acquire the mutex + void * res = 0; #if(__CUDACC_VER_MAJOR__ >= 9) - unsigned int __mask = __activemask(), + const unsigned int mask = __activemask(); #else - unsigned int __mask = __ballot(1), + const unsigned int mask = __ballot(1); #endif - __num = __popc(__mask), - __lanemask = mallocMC::lanemask_lt(), - __local_id = __popc(__lanemask & __mask), - __active = 0; - __active < __num; - ++__active - ) - if (__active == __local_id) - res = allocPageBasedSingle(bytes); - return res; - } - - /** - * deallocPageBased frees the memory placed on a sequence of pages - * @param mem pointer to the first page - * @param page 
the first page - * @param bytes the number of bytes to be freed - */ - __device__ void deallocPageBased(void* mem, uint32 page, uint32 bytes) - { - uint32 pages = divup(bytes,pagesize); - for(uint32 p = page; p < page+pages; ++p) - _page[p].init(); - __threadfence(); - for(uint32 p = page; p < page+pages; ++p) - atomicCAS((uint32*)&_ptes[p].chunksize, bytes, 0); - atomicMax((uint32*)&_firstFreePageBased, page+pages-1); - } - - - public: - /** - * create allocates the requested number of bytes via the heap. Coalescing has to be done before by another policy. - * @param bytes number of bytes to allocate - * @return pointer to the allocated memory - */ - __device__ void* create(uint32 bytes) - { - if(bytes == 0) - return 0; - //take care of padding - //bytes = (bytes + dataAlignment - 1) & ~(dataAlignment-1); // in alignment-policy - if(bytes < pagesize) - //chunck based - return allocChunked(bytes); - else - //allocate a range of pages - return allocPageBased(bytes); - } - - /** - * destroy frees the memory regions previously acllocted via create - * @param mempointer to the memory region to free - */ - __device__ void destroy(void* mem) - { - if(mem == 0) - return; - //lets see on which page we are on - uint32 page = ((char*)mem - (char*)_page)/pagesize; - uint32 chunksize = _ptes[page].chunksize; - - //is the pointer the beginning of a chunk? - uint32 inpage_offset = ((char*)mem - _page[page].data); - uint32 block = inpage_offset/chunksize; - uint32 inblockoffset = inpage_offset - block*chunksize; - if(inblockoffset != 0) - { - uint32* counter = (uint32*)(_page[page].data + block*chunksize); - //coalesced mem free - uint32 old = atomicSub(counter, 1); - if(old != 1) - return; - mem = (void*) counter; - } - - if(chunksize < pagesize) - deallocChunked(mem, page, chunksize); - else - deallocPageBased(mem, page, chunksize); - } - - /** - * init inits the heap data structures - * the init method must be called before the heap can be used. the method can be called - * with an arbitrary number of threads, which will increase the inits efficiency - * @param memory pointer to the memory used for the heap - * @param memsize size of the memory in bytes - */ - __device__ void initDeviceFunction(void* memory, size_t memsize) - { - uint32 linid = threadIdx.x + blockDim.x*(threadIdx.y + threadIdx.z*blockDim.y); - uint32 threads = blockDim.x*blockDim.y*blockDim.z; - uint32 linblockid = blockIdx.x + gridDim.x*(blockIdx.y + blockIdx.z*gridDim.y); - uint32 blocks = gridDim.x*gridDim.y*gridDim.z; - linid = linid + linblockid*threads; - - uint32 numregions = ((unsigned long long)memsize)/( ((unsigned long long)regionsize)*(sizeof(PTE)+pagesize)+sizeof(uint32)); - uint32 numpages = numregions*regionsize; - //pointer is copied (copy is called page) - PAGE* page = (PAGE*)(memory); - //sec check for alignment - //copy is checked - //PointerEquivalent alignmentstatus = ((PointerEquivalent)page) & (16 -1); - //if(alignmentstatus != 0) - //{ - // if(linid == 0){ - // printf("c Before:\n"); - // printf("c dataAlignment: %d\n",16); - // printf("c Alignmentstatus: %d\n",alignmentstatus); - // printf("c size_t memsize %llu byte\n", memsize); - // printf("c void *memory %p\n", page); - // } - // //copy is adjusted, potentially pointer to higher address now. 
- // page =(PAGE*)(((PointerEquivalent)page) + 16 - alignmentstatus); - // if(linid == 0) printf("c Heap Warning: memory to use not 16 byte aligned...\n"); - //} - PTE* ptes = (PTE*)(page + numpages); - uint32* regions = (uint32*)(ptes + numpages); - //sec check for mem size - //this check refers to the original memory-pointer, which was not adjusted! - if( (void*)(regions + numregions) > (((char*)memory) + memsize) ) - { - --numregions; - numpages = min(numregions*regionsize,numpages); - if(linid == 0) printf("c Heap Warning: needed to reduce number of regions to stay within memory limit\n"); - } - //if(linid == 0) printf("Heap info: wasting %d bytes\n",(((POINTEREQUIVALENT)memory) + memsize) - (POINTEREQUIVALENT)(regions + numregions)); - - //if(linid == 0 && alignmentstatus != 0){ - // printf("c Was shrinked automatically to:\n"); - // printf("c size_t memsize %llu byte\n", memsize); - // printf("c void *memory %p\n", page); - //} - threads = threads*blocks; - - for(uint32 i = linid; i < numpages; i+= threads) - { - ptes[i].init(); - page[i].init(); - } - for(uint32 i = linid; i < numregions; i+= threads) - regions[i] = 0; + const unsigned int num = __popc(mask); + const unsigned int lanemask = mallocMC::lanemask_lt(); + const unsigned int local_id = __popc(lanemask & mask); + for(unsigned int active = 0; active < num; ++active) + if(active == local_id) + res = allocPageBasedSingle(bytes); + return res; + } - if(linid == 0) - { - _memsize = memsize; - _numpages = numpages; - _ptes = (volatile PTE*)ptes; - _page = page; - _regions = regions; - _firstfreeblock = 0; - _pagebasedMutex = 0; - _firstFreePageBased = numpages-1; - - if( (char*) (_page+numpages) > (char*)(memory) + memsize) - printf("error in heap alloc: numpages too high\n"); - } - - } - - __device__ bool isOOM(void* p, size_t s){ - // one thread that requested memory returned null - return s && (p == NULL); - } - - - template < typename T_DeviceAllocator > - static void* initHeap( T_DeviceAllocator* heap, void* pool, size_t memsize){ - if( pool == NULL && memsize != 0 ) - { - throw std::invalid_argument( - "Scatter policy cannot use NULL for non-empty memory pools. " - "Maybe you are using an incompatible ReservePoolPolicy or AlignmentPolicy." - ); - } - ScatterKernelDetail::initKernel<<<1,256>>>(heap, pool, memsize); - return heap; - } - - /** counts how many elements of a size fit inside a given page - * - * Examines a (potentially already used) page to find how many elements - * of size chunksize still fit on the page. This includes hierarchically - * organized pages and empty pages. The algorithm determines the number - * of chunks in the page in a manner similar to the allocation algorithm - * of CreationPolicies::Scatter. - * - * @param page the number of the page to examine. The page needs to be - * formatted with a chunksize and potentially a hierarchy. - * @param chunksize the size of element that should be placed inside the - * page. This size must be appropriate to the formatting of the - * page. 
- */ - __device__ unsigned countFreeChunksInPage(uint32 page, uint32 chunksize){ - uint32 filledChunks = _ptes[page].count; - if(chunksize <= HierarchyThreshold) - { - uint32 segmentsize = chunksize*32 + sizeof(uint32); //each segment can hold 32 2nd-level chunks - uint32 fullsegments = min(32,pagesize / segmentsize); //there might be space for more than 32 segments with 32 2nd-level chunks - uint32 additional_chunks = calcAdditionalChunks(fullsegments, segmentsize, chunksize); - uint32 level2Chunks = fullsegments * 32 + additional_chunks; - return level2Chunks - filledChunks; - }else{ - uint32 chunksinpage = min(pagesize / chunksize, 32); //without hierarchy, there can not be more than 32 chunks - return chunksinpage - filledChunks; - } - } - - - /** counts the number of available slots inside the heap - * - * Searches the heap for all possible locations of an element with size - * slotSize. The used traversal algorithms are similar to the allocation - * strategy of CreationPolicies::Scatter, to ensure comparable results. - * There are 3 different algorithms, based on the size of the requested - * slot: 1 slot spans over multiple pages, 1 slot fits in one chunk - * within a page, 1 slot fits in a fraction of a chunk. - * - * @param slotSize the amount of bytes that a single slot accounts for - * @param gid the id of the thread. this id does not have to correspond - * with threadId.x, but there must be a continous range of ids - * beginning from 0. - * @param stride the stride should be equal to the number of different - * gids (and therefore of value max(gid)-1) - */ - __device__ unsigned getAvailaibleSlotsDeviceFunction(size_t slotSize, int gid, int stride) - { - unsigned slotcount = 0; - if(slotSize < pagesize){ // multiple slots per page - for(uint32 currentpage = gid; currentpage < _numpages; currentpage += stride){ - uint32 maxchunksize = min(pagesize, wastefactor*(uint32)slotSize); - uint32 region = currentpage/regionsize; - uint32 regionfilllevel = _regions[region]; - - uint32 chunksize = _ptes[currentpage].chunksize; - if(chunksize >= slotSize && chunksize <= maxchunksize){ //how many chunks left? (each chunk is big enough) - slotcount += countFreeChunksInPage(currentpage, chunksize); - }else if(chunksize == 0){ - chunksize = max((uint32)slotSize, minChunkSize1); //ensure minimum chunk size - slotcount += countFreeChunksInPage(currentpage, chunksize); //how many chunks fit in one page? - }else{ - continue; //the chunks on this page are too small for the request :( + /** + * deallocPageBased frees the memory placed on a sequence of pages + * @param mem pointer to the first page + * @param page the first page + * @param bytes the number of bytes to be freed + */ + __device__ void + deallocPageBased(void * mem, uint32 page, uint32 bytes) + { + const uint32 pages = divup(bytes, pagesize); + for(uint32 p = page; p < page + pages; ++p) _page[p].init(); + __threadfence(); + for(uint32 p = page; p < page + pages; ++p) + atomicCAS((uint32 *)&_ptes[p].chunksize, bytes, 0); + atomicMax((uint32 *)&_firstFreePageBased, page + pages - 1); + } + + public: + /** + * create allocates the requested number of bytes via the heap. + * Coalescing has to be done before by another policy. 
+ * @param bytes number of bytes to allocate + * @return pointer to the allocated memory + */ + __device__ auto create(uint32 bytes) -> void * + { + if(bytes == 0) + return 0; + // take care of padding + // bytes = (bytes + dataAlignment - 1) & ~(dataAlignment-1); // + // in alignment-policy + if(bytes < pagesize) + // chunck based + return allocChunked(bytes); + else + // allocate a range of pages + return allocPageBased(bytes); } - } - }else{ // 1 slot needs multiple pages - if(gid > 0) return 0; //do this serially - uint32 pagestoalloc = divup((uint32)slotSize, pagesize); - uint32 freecount = 0; - for(uint32 currentpage = _numpages; currentpage > 0;){ //this already includes all superblocks - --currentpage; - if(_ptes[currentpage].chunksize == 0){ - if(++freecount == pagestoalloc){ - freecount = 0; - ++slotcount; - } - }else{ // the sequence of free pages was interrupted - freecount = 0; + + /** + * destroy frees the memory regions previously acllocted via create + * @param mempointer to the memory region to free + */ + __device__ void destroy(void * mem) + { + if(mem == 0) + return; + // lets see on which page we are on + const uint32 page = ((char *)mem - (char *)_page) / pagesize; + const uint32 chunksize = _ptes[page].chunksize; + + // is the pointer the beginning of a chunk? + const uint32 inpage_offset = ((char *)mem - _page[page].data); + const uint32 block = inpage_offset / chunksize; + const uint32 inblockoffset = inpage_offset - block * chunksize; + if(inblockoffset != 0) + { + uint32 * counter + = (uint32 *)(_page[page].data + block * chunksize); + // coalesced mem free + uint32 old = atomicSub(counter, 1); + if(old != 1) + return; + mem = (void *)counter; + } + + if(chunksize < pagesize) + deallocChunked(mem, page, chunksize); + else + deallocPageBased(mem, page, chunksize); } - } - } - return slotcount; - } - - - /** Count, how many elements can be allocated at maximum - * - * Takes an input size and determines, how many elements of this size can - * be allocated with the CreationPolicy Scatter. This will return the - * maximum number of free slots of the indicated size. It is not - * guaranteed where these slots are (regarding fragmentation). Therefore, - * the practically usable number of slots might be smaller. This function - * is executed in parallel. Speedup can possibly increased by a higher - * amount ofparallel workers. - * - * @param slotSize the size of allocatable elements to count - * @param obj a reference to the allocator instance (host-side) - */ - public: - template - static unsigned getAvailableSlotsHost(size_t const slotSize, T_DeviceAllocator* heap){ - unsigned h_slots = 0; - unsigned* d_slots; - cudaMalloc((void**) &d_slots, sizeof(unsigned)); - cudaMemcpy(d_slots, &h_slots, sizeof(unsigned), cudaMemcpyHostToDevice); - - ScatterKernelDetail::getAvailableSlotsKernel<<<64,256>>>(heap, slotSize, d_slots); - - cudaMemcpy(&h_slots, d_slots, sizeof(unsigned), cudaMemcpyDeviceToHost); - cudaFree(d_slots); - return h_slots; - } - - - /** Count, how many elements can be allocated at maximum - * - * Takes an input size and determines, how many elements of this size can - * be allocated with the CreationPolicy Scatter. This will return the - * maximum number of free slots of the indicated size. It is not - * guaranteed where these slots are (regarding fragmentation). Therefore, - * the practically usable number of slots might be smaller. This function - * is executed separately for each warp and does not cooperate with other - * warps. 
Maximum speed is expected if every thread in the warp executes - * the function. - * Uses 256 byte of shared memory. - * - * @param slotSize the size of allocatable elements to count - */ - __device__ unsigned getAvailableSlotsAccelerator(size_t slotSize){ - int linearId; - int wId = warpid_withinblock(); //do not use warpid-function, since this value is not guaranteed to be stable across warp lifetime + + /** + * init inits the heap data structures + * the init method must be called before the heap can be used. the + * method can be called with an arbitrary number of threads, which + * will increase the inits efficiency + * @param memory pointer to the memory used for the heap + * @param memsize size of the memory in bytes + */ + __device__ void initDeviceFunction(void * memory, size_t memsize) + { + uint32 linid = threadIdx.x + + blockDim.x * (threadIdx.y + threadIdx.z * blockDim.y); + uint32 threads = blockDim.x * blockDim.y * blockDim.z; + const uint32 linblockid = blockIdx.x + + gridDim.x * (blockIdx.y + blockIdx.z * gridDim.y); + uint32 blocks = gridDim.x * gridDim.y * gridDim.z; + linid += linblockid * threads; + + uint32 numregions = ((unsigned long long)memsize) + / (((unsigned long long)regionsize) + * (sizeof(PTE) + pagesize) + + sizeof(uint32)); + + uint32 numpages = numregions * regionsize; + // pointer is copied (copy is called page) + Page * page = (Page *)memory; + // sec check for alignment + // copy is checked + // PointerEquivalent alignmentstatus = ((PointerEquivalent)page) + // & (16 -1); if(alignmentstatus != 0) + //{ + // if(linid == 0){ + // printf("c Before:\n"); + // printf("c dataAlignment: %d\n",16); + // printf("c Alignmentstatus: %d\n",alignmentstatus); + // printf("c size_t memsize %llu byte\n", memsize); + // printf("c void *memory %p\n", page); + // } + // //copy is adjusted, potentially pointer to higher address + // now. page =(Page*)(((PointerEquivalent)page) + 16 - + // alignmentstatus); if(linid == 0) printf("c Heap Warning: + // memory to use not 16 byte aligned...\n"); + //} + PTE * ptes = (PTE *)(page + numpages); + uint32 * regions = (uint32 *)(ptes + numpages); + // sec check for mem size + // this check refers to the original memory-pointer, which was + // not adjusted! 
+ if((char *)(regions + numregions) + > (((char *)memory) + memsize)) + { + --numregions; + numpages = min(numregions * regionsize, numpages); + if(linid == 0) + printf( + "c Heap Warning: needed to reduce number of " + "regions to stay within memory limit\n"); + } + // if(linid == 0) printf("Heap info: wasting %d + // bytes\n",(((POINTEREQUIVALENT)memory) + memsize) - + // (POINTEREQUIVALENT)(regions + numregions)); + + // if(linid == 0 && alignmentstatus != 0){ + // printf("c Was shrinked automatically to:\n"); + // printf("c size_t memsize %llu byte\n", memsize); + // printf("c void *memory %p\n", page); + //} + threads *= blocks; + + for(uint32 i = linid; i < numpages; i += threads) + { + ptes[i].init(); + page[i].init(); + } + for(uint32 i = linid; i < numregions; i += threads) + regions[i] = 0; + + if(linid == 0) + { + _memsize = memsize; + _numpages = numpages; + _ptes = (volatile PTE *)ptes; + _page = page; + _regions = regions; + _firstfreeblock = 0; + _pagebasedMutex = 0; + _firstFreePageBased = numpages - 1; + + if((char *)&_page[numpages] > (char *)memory + memsize) + printf("error in heap alloc: numpages too high\n"); + } + } + + static __device__ auto isOOM(void * p, size_t s) -> bool + { + // one thread that requested memory returned null + return s && (p == nullptr); + } + + template + static auto + initHeap(T_DeviceAllocator * heap, void * pool, size_t memsize) + -> void * + { + if(pool == nullptr && memsize != 0) + { + throw std::invalid_argument( + "Scatter policy cannot use nullptr for non-empty " + "memory pools. " + "Maybe you are using an incompatible ReservePoolPolicy " + "or AlignmentPolicy."); + } + ScatterKernelDetail::initKernel<<<1, 256>>>( + heap, pool, memsize); + return heap; + } + + /** counts how many elements of a size fit inside a given page + * + * Examines a (potentially already used) page to find how many + * elements of size chunksize still fit on the page. This includes + * hierarchically organized pages and empty pages. The algorithm + * determines the number of chunks in the page in a manner similar + * to the allocation algorithm of CreationPolicies::Scatter. + * + * @param page the number of the page to examine. The page needs to + * be formatted with a chunksize and potentially a hierarchy. + * @param chunksize the size of element that should be placed inside + * the page. This size must be appropriate to the formatting of the + * page. + */ + __device__ auto countFreeChunksInPage(uint32 page, uint32 chunksize) + -> unsigned + { + const uint32 filledChunks = _ptes[page].count; + if(chunksize <= HierarchyThreshold) + { + const uint32 segmentsize = chunksize * 32 + + sizeof(uint32); // each segment can hold 32 2nd-level + // chunks + const uint32 fullsegments = min( + 32, pagesize / segmentsize); // there might be space for + // more than 32 segments + // with 32 2nd-level chunks + const uint32 additional_chunks = calcAdditionalChunks( + fullsegments, segmentsize, chunksize); + const uint32 level2Chunks + = fullsegments * 32 + additional_chunks; + return level2Chunks - filledChunks; + } + else + { + const uint32 chunksinpage = min( + pagesize / chunksize, + 32); // without hierarchy, there can not be more than 32 + // chunks + return chunksinpage - filledChunks; + } + } + + /** counts the number of available slots inside the heap + * + * Searches the heap for all possible locations of an element with + * size slotSize. 
The used traversal algorithms are similar to the + * allocation strategy of CreationPolicies::Scatter, to ensure + * comparable results. There are 3 different algorithms, based on + * the size of the requested slot: 1 slot spans over multiple pages, + * 1 slot fits in one chunk within a page, 1 slot fits in a fraction + * of a chunk. + * + * @param slotSize the amount of bytes that a single slot accounts + * for + * @param gid the id of the thread. This id does not have to + * correspond with threadIdx.x, but there must be a continuous range + * of ids beginning from 0. + * @param stride the stride should be equal to the number of + * different gids (and therefore of value max(gid)+1) + */ + __device__ auto getAvailaibleSlotsDeviceFunction( + size_t slotSize, + int gid, + int stride) -> unsigned + { + unsigned slotcount = 0; + if(slotSize < pagesize) + { // multiple slots per page + for(uint32 currentpage = gid; currentpage < _numpages; + currentpage += stride) + { + const uint32 maxchunksize + = min(pagesize, wastefactor * (uint32)slotSize); + const uint32 region = currentpage / regionsize; + const uint32 regionfilllevel = _regions[region]; + + uint32 chunksize = _ptes[currentpage].chunksize; + if(chunksize >= slotSize && chunksize <= maxchunksize) + { // how many chunks left? (each chunk is big enough) + slotcount += countFreeChunksInPage( + currentpage, chunksize); + } + else if(chunksize == 0) + { + chunksize = max( + (uint32)slotSize, + minChunkSize1); // ensure minimum chunk size + slotcount += countFreeChunksInPage( + currentpage, + chunksize); // how many chunks fit in one page? + } + else + { + continue; // the chunks on this page are too small + // for the request :( + } + } + } + else + { // 1 slot needs multiple pages + if(gid > 0) + return 0; // do this serially + const uint32 pagestoalloc + = divup((uint32)slotSize, pagesize); + uint32 freecount = 0; + for(uint32 currentpage = _numpages; currentpage > 0;) + { // this already includes all superblocks + --currentpage; + if(_ptes[currentpage].chunksize == 0) + { + if(++freecount == pagestoalloc) + { + freecount = 0; + ++slotcount; + } + } + else + { // the sequence of free pages was interrupted + freecount = 0; + } + } + } + return slotcount; + } + + /** Count, how many elements can be allocated at maximum + * + * Takes an input size and determines, how many elements of this + * size can be allocated with the CreationPolicy Scatter. This will + * return the maximum number of free slots of the indicated size. It + * is not guaranteed where these slots are (regarding + * fragmentation). Therefore, the practically usable number of slots + * might be smaller. This function is executed in parallel. Speedup + * can possibly be increased by a higher number of parallel workers.
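Stripped of the PTE bookkeeping, the two counting branches of getAvailaibleSlotsDeviceFunction above reduce to simple arithmetic. The following host-side sketch mirrors that arithmetic for a completely empty heap; it ignores the chunk hierarchy below HierarchyThreshold and uses assumed example values for pagesize and the page count.

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>

constexpr std::uint32_t pagesize = 4096; // assumed example values
constexpr std::uint32_t numpages = 1024;

// upper bound of slots of a given size in a completely empty heap
std::uint32_t slotsFor(std::uint32_t slotSize)
{
    if(slotSize < pagesize)
    {
        // multiple slots per page: at most 32 chunks are tracked per page
        const auto chunksPerPage
            = std::min<std::uint32_t>(pagesize / slotSize, 32u);
        return numpages * chunksPerPage;
    }
    // one slot spans several pages: ceil-divide (the device code uses divup),
    // then count how many such groups of contiguous free pages fit
    const std::uint32_t pagesPerSlot = (slotSize + pagesize - 1) / pagesize;
    return numpages / pagesPerSlot;
}

int main()
{
    std::printf("128 B slots: %u\n", slotsFor(128));    // chunked case
    std::printf("16 KiB slots: %u\n", slotsFor(16384)); // multi-page case
}
```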
+ * + * @param slotSize the size of allocatable elements to count + * @param obj a reference to the allocator instance (host-side) + */ + public: + template + static auto getAvailableSlotsHost( + size_t const slotSize, + T_DeviceAllocator * heap) -> unsigned + { + unsigned h_slots = 0; + unsigned * d_slots; + cudaMalloc((void **)&d_slots, sizeof(unsigned)); + cudaMemcpy( + d_slots, + &h_slots, + sizeof(unsigned), + cudaMemcpyHostToDevice); + + ScatterKernelDetail::getAvailableSlotsKernel<<<64, 256>>>( + heap, slotSize, d_slots); + + cudaMemcpy( + &h_slots, + d_slots, + sizeof(unsigned), + cudaMemcpyDeviceToHost); + cudaFree(d_slots); + return h_slots; + } + + /** Count, how many elements can be allocated at maximum + * + * Takes an input size and determines, how many elements of this + * size can be allocated with the CreationPolicy Scatter. This will + * return the maximum number of free slots of the indicated size. It + * is not guaranteed where these slots are (regarding + * fragmentation). Therefore, the practically usable number of slots + * might be smaller. This function is executed separately for each + * warp and does not cooperate with other warps. Maximum speed is + * expected if every thread in the warp executes the function. Uses + * 256 byte of shared memory. + * + * @param slotSize the size of allocatable elements to count + */ + __device__ auto getAvailableSlotsAccelerator(size_t slotSize) + -> unsigned + { + const int wId + = warpid_withinblock(); // do not use warpid-function, since + // this value is not guaranteed to + // be stable across warp lifetime #if(__CUDACC_VER_MAJOR__ >= 9) - uint32 activeThreads = __popc(__activemask()); + const uint32 activeThreads = __popc(__activemask()); #else - uint32 activeThreads = __popc(__ballot(true)); + const uint32 activeThreads = __popc(__ballot(true)); #endif - __shared__ uint32 activePerWarp[MaxThreadsPerBlock::value / WarpSize::value]; //maximum number of warps in a block - __shared__ unsigned warpResults[MaxThreadsPerBlock::value / WarpSize::value]; - warpResults[wId] = 0; - activePerWarp[wId] = 0; - - // the active threads obtain an id from 0 to activeThreads-1 - if(slotSize>0) linearId = atomicAdd(&activePerWarp[wId], 1); - else return 0; - - //printf("Block %d, id %d: activeThreads=%d linearId=%d\n",blockIdx.x,threadIdx.x,activeThreads,linearId); - unsigned temp = getAvailaibleSlotsDeviceFunction(slotSize, linearId, activeThreads); - if(temp) atomicAdd(&warpResults[wId], temp); - __threadfence_block(); - return warpResults[wId]; - } - - - static std::string classname(){ - std::stringstream ss; - ss << "Scatter["; - ss << pagesize << ","; - ss << accessblocks << ","; - ss << regionsize << ","; - ss << wastefactor << ","; - ss << resetfreedpages << ","; - ss << hashingK << ","; - ss << hashingDistMP << ","; - ss << hashingDistWP << ","; - ss << hashingDistWPRel<< "]"; - return ss.str(); - } - - }; - -} //namespace CreationPolicies -} //namespace mallocMC + __shared__ uint32 activePerWarp + [MaxThreadsPerBlock::value + / WarpSize::value]; // maximum number of warps in a block + __shared__ unsigned + warpResults[MaxThreadsPerBlock::value / WarpSize::value]; + warpResults[wId] = 0; + activePerWarp[wId] = 0; + + // the active threads obtain an id from 0 to activeThreads-1 + if(slotSize == 0) + return 0; + const int linearId = atomicAdd(&activePerWarp[wId], 1); + + // printf("Block %d, id %d: activeThreads=%d + // linearId=%d\n",blockIdx.x,threadIdx.x,activeThreads,linearId); + const unsigned temp = 
getAvailaibleSlotsDeviceFunction( + slotSize, linearId, activeThreads); + if(temp) + atomicAdd(&warpResults[wId], temp); + __threadfence_block(); + return warpResults[wId]; + } + + static auto classname() -> std::string + { + std::stringstream ss; + ss << "Scatter["; + ss << pagesize << ","; + ss << accessblocks << ","; + ss << regionsize << ","; + ss << wastefactor << ","; + ss << resetfreedpages << ","; + ss << hashingK << ","; + ss << hashingDistMP << ","; + ss << hashingDistWP << ","; + ss << hashingDistWPRel << "]"; + return ss.str(); + } + }; + + } // namespace CreationPolicies +} // namespace mallocMC diff --git a/src/include/mallocMC/device_allocator.hpp b/src/include/mallocMC/device_allocator.hpp index 6c24fcfb..cf938be7 100644 --- a/src/include/mallocMC/device_allocator.hpp +++ b/src/include/mallocMC/device_allocator.hpp @@ -28,74 +28,60 @@ #pragma once -#include "mallocMC_utils.hpp" #include "mallocMC_constraints.hpp" #include "mallocMC_prefixes.hpp" #include "mallocMC_traits.hpp" +#include "mallocMC_utils.hpp" -#include -#include - -namespace mallocMC{ - -namespace detail{ +#include +#include - /** - * @brief Template class to call getAvailableSlots[Host|Accelerator] if the CreationPolicy provides it. - * - * Returns 0 else. - * - * @tparam T_Allocator The type of the Allocator to be used - * @tparam T_isHost True for the host call, false for the accelerator call - * @tparam T_providesAvailableSlots If the CreationPolicy provides getAvailableSlots[Host|Accelerator] (auto filled, do not set) - */ - template< - typename T_Allocator, - bool T_providesAvailableSlots - > - struct GetAvailableSlotsIfAvailAcc +namespace mallocMC +{ + namespace detail { - MAMC_ACCELERATOR static - unsigned - getAvailableSlots( - size_t, - T_Allocator & - ) + /** + * @brief Template class to call getAvailableSlots[Host|Accelerator] if + * the CreationPolicy provides it. + * + * Returns 0 else. + * + * @tparam T_Allocator The type of the Allocator to be used + * @tparam T_isHost True for the host call, false for the accelerator + * call + * @tparam T_providesAvailableSlots If the CreationPolicy provides + * getAvailableSlots[Host|Accelerator] (auto filled, do not set) + */ + template + struct GetAvailableSlotsIfAvailAcc { - return 0; - } - - }; - - template< - typename T_Allocator - > - struct GetAvailableSlotsIfAvailAcc< - T_Allocator, - true - >{ - MAMC_ACCELERATOR static - unsigned - getAvailableSlots( - size_t slotSize, - T_Allocator& alloc - ) + MAMC_ACCELERATOR static auto + getAvailableSlots(size_t, T_Allocator &) -> unsigned + { + return 0; + } + }; + + template + struct GetAvailableSlotsIfAvailAcc { - return alloc.T_Allocator::CreationPolicy - ::getAvailableSlotsAccelerator( slotSize ); - } - - }; - -} // namespace detail + MAMC_ACCELERATOR static auto + getAvailableSlots(size_t slotSize, T_Allocator & alloc) -> unsigned + { + return alloc + .T_Allocator::CreationPolicy ::getAvailableSlotsAccelerator( + slotSize); + } + }; + } // namespace detail /** * @brief "HostClass" that combines all policies to a useful allocator * - * This class implements the necessary glue-logic to form an actual allocator - * from the provided policies. It implements the public interface and - * executes some constraint checking based on an instance of the class + * This class implements the necessary glue-logic to form an actual + * allocator from the provided policies. It implements the public interface + * and executes some constraint checking based on an instance of the class * PolicyConstraints. 
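To use the DeviceAllocator described here, the policies touched by this patch have to be combined into one allocator type. The alias below is only a sketch: the policy class names follow the headers in this patch, but the exact host-side Allocator signature and the Shrink alignment policy are assumptions and may differ from the project's own configuration (e.g. tests/verify_heap_config.hpp).

```cpp
// Sketch of a policy combination; the host-side Allocator template and the
// AlignmentPolicies::Shrink name are assumptions, not taken from this diff.
#include <mallocMC/mallocMC.hpp>

using ScatterAllocator = mallocMC::Allocator<
    mallocMC::CreationPolicies::Scatter<>,           // page/chunk heap management
    mallocMC::DistributionPolicies::XMallocSIMD<>,   // warp-level request coalescing
    mallocMC::OOMPolicies::ReturnNull,               // report OOM via nullptr
    mallocMC::ReservePoolPolicies::SimpleCudaMalloc, // pool reserved with cudaMalloc
    mallocMC::AlignmentPolicies::Shrink<>>;          // padding / alignment handling

// host side (sketch): reserve a 512 MiB pool and pass the handle to kernels
// ScatterAllocator mMC(512U * 1024U * 1024U);
// myKernel<<<64, 256>>>(mMC);
```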
* * @tparam T_CreationPolicy The desired type of a CreationPolicy @@ -108,63 +94,51 @@ namespace detail{ typename T_CreationPolicy, typename T_DistributionPolicy, typename T_OOMPolicy, - typename T_AlignmentPolicy - > - class DeviceAllocator : - public T_CreationPolicy + typename T_AlignmentPolicy> + class DeviceAllocator : public T_CreationPolicy { - typedef boost::uint32_t uint32; + using uint32 = std::uint32_t; + public: - typedef T_CreationPolicy CreationPolicy; - typedef T_DistributionPolicy DistributionPolicy; - typedef T_OOMPolicy OOMPolicy; - typedef T_AlignmentPolicy AlignmentPolicy; + using CreationPolicy = T_CreationPolicy; + using DistributionPolicy = T_DistributionPolicy; + using OOMPolicy = T_OOMPolicy; + using AlignmentPolicy = T_AlignmentPolicy; - void* pool; + void * pool; MAMC_ACCELERATOR - void* - malloc( - size_t bytes - ) + auto malloc(size_t bytes) -> void * { DistributionPolicy distributionPolicy; - bytes = AlignmentPolicy::applyPadding( bytes ); - uint32 req_size = distributionPolicy.collect( bytes ); - void* memBlock = CreationPolicy::create( req_size ); - const bool oom = CreationPolicy::isOOM( memBlock, req_size ); - if( oom ) - memBlock = OOMPolicy::handleOOM( memBlock ); - void* myPart = distributionPolicy.distribute( memBlock ); + bytes = AlignmentPolicy::applyPadding(bytes); + uint32 req_size = distributionPolicy.collect(bytes); + void * memBlock = CreationPolicy::create(req_size); + const bool oom = CreationPolicy::isOOM(memBlock, req_size); + if(oom) + memBlock = OOMPolicy::handleOOM(memBlock); + void * myPart = distributionPolicy.distribute(memBlock); return myPart; } MAMC_ACCELERATOR - void - free( - void* p - ) + void free(void * p) { - CreationPolicy::destroy( p ); + CreationPolicy::destroy(p); } - /* polymorphism over the availability of getAvailableSlots for calling * from the accelerator */ MAMC_ACCELERATOR - unsigned - getAvailableSlots( - size_t slotSize - ) + auto getAvailableSlots(size_t slotSize) -> unsigned { - slotSize = AlignmentPolicy::applyPadding( slotSize ); + slotSize = AlignmentPolicy::applyPadding(slotSize); return detail::GetAvailableSlotsIfAvailAcc< DeviceAllocator, - Traits< DeviceAllocator >::providesAvailableSlots - >::getAvailableSlots( slotSize, *this ); + Traits::providesAvailableSlots>:: + getAvailableSlots(slotSize, *this); } - }; } // namespace mallocMC diff --git a/src/include/mallocMC/distributionPolicies/Noop.hpp b/src/include/mallocMC/distributionPolicies/Noop.hpp index 608a5f2f..94efd135 100644 --- a/src/include/mallocMC/distributionPolicies/Noop.hpp +++ b/src/include/mallocMC/distributionPolicies/Noop.hpp @@ -27,17 +27,17 @@ #pragma once - -namespace mallocMC{ -namespace DistributionPolicies{ - - /** - * @brief a policy that does nothing - * - * This DistributionPolicy will not perform any distribution, but only return - * its input (identity function) - */ - class Noop; - -} //namespace DistributionPolicies -} //namespace mallocMC +namespace mallocMC +{ + namespace DistributionPolicies + { + /** + * @brief a policy that does nothing + * + * This DistributionPolicy will not perform any distribution, but only + * return its input (identity function) + */ + class Noop; + + } // namespace DistributionPolicies +} // namespace mallocMC diff --git a/src/include/mallocMC/distributionPolicies/Noop_impl.hpp b/src/include/mallocMC/distributionPolicies/Noop_impl.hpp index 6d55c2b9..ecc13455 100644 --- a/src/include/mallocMC/distributionPolicies/Noop_impl.hpp +++ b/src/include/mallocMC/distributionPolicies/Noop_impl.hpp @@ -27,36 
+27,38 @@ #pragma once -#include -#include - -#include "Noop.hpp" #include "../mallocMC_prefixes.hpp" +#include "Noop.hpp" -namespace mallocMC{ -namespace DistributionPolicies{ - - class Noop - { - typedef boost::uint32_t uint32; - - public: +#include +#include - MAMC_ACCELERATOR - uint32 collect(uint32 bytes){ - return bytes; - } +namespace mallocMC +{ + namespace DistributionPolicies + { + class Noop + { + using uint32 = std::uint32_t; - MAMC_ACCELERATOR - void* distribute(void* allocatedMem){ - return allocatedMem; - } + public: + MAMC_ACCELERATOR + auto collect(uint32 bytes) const -> uint32 + { + return bytes; + } - static std::string classname(){ - return "Noop"; - } + MAMC_ACCELERATOR + auto distribute(void * allocatedMem) const -> void * + { + return allocatedMem; + } - }; + static auto classname() -> std::string + { + return "Noop"; + } + }; -} //namespace DistributionPolicies -} //namespace mallocMC + } // namespace DistributionPolicies +} // namespace mallocMC diff --git a/src/include/mallocMC/distributionPolicies/XMallocSIMD.hpp b/src/include/mallocMC/distributionPolicies/XMallocSIMD.hpp index c80b785a..17528509 100644 --- a/src/include/mallocMC/distributionPolicies/XMallocSIMD.hpp +++ b/src/include/mallocMC/distributionPolicies/XMallocSIMD.hpp @@ -33,39 +33,39 @@ #pragma once -#include +namespace mallocMC +{ + namespace DistributionPolicies + { + namespace XMallocSIMDConf + { + struct DefaultXMallocConfig + { + static constexpr auto pagesize = 4096; + }; + } // namespace XMallocSIMDConf -namespace mallocMC{ -namespace DistributionPolicies{ - - namespace XMallocSIMDConf{ - struct DefaultXMallocConfig{ - typedef boost::mpl::int_<4096> pagesize; - }; - } + /** + * @brief SIMD optimized chunk resizing in the style of XMalloc + * + * This DistributionPolicy can take the memory requests from a group of + * worker threads and combine them, so that only one of the workers will + * allocate the whole request. Later, each worker gets an appropriate + * offset into the allocated chunk. This is beneficial for SIMD + * architectures since only one of the workers has to compete for the + * resource. This algorithm is inspired by the XMalloc memory allocator + * (http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=5577907&tag=1) + * and its implementation in ScatterAlloc + * (http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=6339604) + * XMallocSIMD is intended to be used with Nvidia CUDA capable + * accelerators that support at least compute capability 2.0 + * + * @tparam T_Config (optional) The configuration struct to overwrite + * default configuration. The default can be obtained through + * XMallocSIMD<>::Properties + */ + template + class XMallocSIMD; - /** - * @brief SIMD optimized chunk resizing in the style of XMalloc - * - * This DistributionPolicy can take the memory requests from a group of - * worker threads and combine them, so that only one of the workers will - * allocate the whole request. Later, each worker gets an appropriate offset - * into the allocated chunk. This is beneficial for SIMD architectures since - * only one of the workers has to compete for the resource.
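The coalescing described in this comment boils down to offset bookkeeping within one warp: the sizes of all participating requests are summed on top of a 16 byte header that stores the thread count, a single worker allocates that sum, and every worker resolves its own pointer as base plus its private offset. A host-side illustration with made-up request sizes (not the device implementation) follows.

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

int main()
{
    // requests of one warp that qualify for coalescing (made-up sizes)
    const std::vector<std::uint32_t> bytes = {24, 40, 8, 56};

    // collect(): the shared counter starts at 16 (header that will hold the
    // thread count); each thread reserves its offset with an atomicAdd
    std::uint32_t counter = 16;
    std::vector<std::uint32_t> offset;
    for(const auto b : bytes)
    {
        offset.push_back(counter);
        counter += b;
    }

    // only the thread whose offset equals 16 issues the single allocation of
    // `counter` bytes; distribute() later returns base + offset[i] to each
    std::printf(
        "one request of %u bytes serves %zu threads\n", counter, bytes.size());
    for(std::size_t i = 0; i < bytes.size(); ++i)
        std::printf("thread %zu -> base + %u\n", i, offset[i]);
}
```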
This algorithm - * is inspired by the XMalloc memory allocator - * (http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=5577907&tag=1) and - * its implementation in ScatterAlloc - * (http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=6339604) - * XMallocSIMD is inteded to be used with Nvidia CUDA capable accelerators - * that support at least compute capability 2.0 - * - * @tparam T_Config (optional) The configuration struct to overwrite - * default configuration. The default can be obtained through - * XMallocSIMD<>::Properties - */ - template - class XMallocSIMD; - - -} //namespace DistributionPolicies -} //namespace mallocMC + } // namespace DistributionPolicies +} // namespace mallocMC diff --git a/src/include/mallocMC/distributionPolicies/XMallocSIMD_impl.hpp b/src/include/mallocMC/distributionPolicies/XMallocSIMD_impl.hpp index 37afe7f8..bafd62d0 100644 --- a/src/include/mallocMC/distributionPolicies/XMallocSIMD_impl.hpp +++ b/src/include/mallocMC/distributionPolicies/XMallocSIMD_impl.hpp @@ -33,39 +33,43 @@ #pragma once -#include -#include -#include -#include -#include - -#include "../mallocMC_utils.hpp" #include "../mallocMC_prefixes.hpp" +#include "../mallocMC_utils.hpp" #include "XMallocSIMD.hpp" -namespace mallocMC{ -namespace DistributionPolicies{ - - template - class XMallocSIMD - { - private: - - typedef boost::uint32_t uint32; - bool can_use_coalescing; - uint32 warpid; - uint32 myoffset; - uint32 threadcount; - uint32 req_size; - public: - typedef T_Config Properties; - - MAMC_ACCELERATOR - XMallocSIMD() : can_use_coalescing(false), warpid(warpid_withinblock()), - myoffset(0), threadcount(0), req_size(0) - {} - - private: +#include +#include +#include +#include + +namespace mallocMC +{ + namespace DistributionPolicies + { + template + class XMallocSIMD + { + private: + using uint32 = std::uint32_t; + bool can_use_coalescing; + uint32 warpid; + uint32 myoffset; + uint32 threadcount; + uint32 req_size; + + public: + using Properties = T_Config; + + MAMC_ACCELERATOR + XMallocSIMD() : + can_use_coalescing(false), + warpid(warpid_withinblock()), + myoffset(0), + threadcount(0), + req_size(0) + {} + + private: /** Allow for a hierarchical validation of parameters: * * shipped default-parameters (in the inherited struct) have lowest precedence. 
@@ -76,86 +80,82 @@ namespace DistributionPolicies{ * default-struct < template-struct < command-line parameter */ #ifndef MALLOCMC_DP_XMALLOCSIMD_PAGESIZE -#define MALLOCMC_DP_XMALLOCSIMD_PAGESIZE Properties::pagesize::value +#define MALLOCMC_DP_XMALLOCSIMD_PAGESIZE (Properties::pagesize) #endif - BOOST_STATIC_CONSTEXPR uint32 pagesize = MALLOCMC_DP_XMALLOCSIMD_PAGESIZE; - - //all the properties must be unsigned integers > 0 - BOOST_STATIC_ASSERT(!std::numeric_limits::is_signed); - - // \TODO: The static_cast can be removed once the minimal dependencies of - // this project is are at least CUDA 7.0 and gcc 4.8.2 - BOOST_STATIC_ASSERT(static_cast(pagesize) > 0); - - public: - BOOST_STATIC_CONSTEXPR uint32 _pagesize = pagesize; - - MAMC_ACCELERATOR - uint32 collect(uint32 bytes){ - - can_use_coalescing = false; - myoffset = 0; - threadcount = 0; - - //init with initial counter - __shared__ uint32 warp_sizecounter[MaxThreadsPerBlock::value / WarpSize::value]; - warp_sizecounter[warpid] = 16; - - //second half: make sure that all coalesced allocations can fit within one page - //necessary for offset calculation - bool coalescible = bytes > 0 && bytes < (pagesize / 32); + static constexpr uint32 pagesize = MALLOCMC_DP_XMALLOCSIMD_PAGESIZE; + + public: + static constexpr uint32 _pagesize = pagesize; + + MAMC_ACCELERATOR + auto collect(uint32 bytes) -> uint32 + { + can_use_coalescing = false; + myoffset = 0; + threadcount = 0; + + // init with initial counter + __shared__ uint32 warp_sizecounter + [MaxThreadsPerBlock::value / WarpSize::value]; + warp_sizecounter[warpid] = 16; + + // second half: make sure that all coalesced allocations can fit + // within one page necessary for offset calculation + bool coalescible = bytes > 0 && bytes < (pagesize / 32); #if(__CUDACC_VER_MAJOR__ >= 9) - threadcount = __popc(__ballot_sync(__activemask(), coalescible)); + threadcount + = __popc(__ballot_sync(__activemask(), coalescible)); #else - threadcount = __popc(__ballot(coalescible)); + threadcount = __popc(__ballot(coalescible)); #endif - if (coalescible && threadcount > 1) - { - myoffset = atomicAdd(&warp_sizecounter[warpid], bytes); - can_use_coalescing = true; - } - - req_size = bytes; - if (can_use_coalescing) - req_size = (myoffset == 16) ? warp_sizecounter[warpid] : 0; - - return req_size; - } - - - MAMC_ACCELERATOR - void* distribute(void* allocatedMem){ - __shared__ char* warp_res[MaxThreadsPerBlock::value / WarpSize::value]; - - char* myalloc = (char*) allocatedMem; - if (req_size && can_use_coalescing) - { - warp_res[warpid] = myalloc; - if (myalloc != 0) - *(uint32*)myalloc = threadcount; - } - __threadfence_block(); - - void *myres = myalloc; - if(can_use_coalescing) - { - if(warp_res[warpid] != 0) - myres = warp_res[warpid] + myoffset; - else - myres = 0; - } - return myres; - } - - MAMC_HOST - static std::string classname(){ - std::stringstream ss; - ss << "XMallocSIMD[" << pagesize << "]"; - return ss.str(); - } - - }; - -} //namespace DistributionPolicies - -} //namespace mallocMC + if(coalescible && threadcount > 1) + { + myoffset = atomicAdd(&warp_sizecounter[warpid], bytes); + can_use_coalescing = true; + } + + req_size = bytes; + if(can_use_coalescing) + req_size = (myoffset == 16) ? 
warp_sizecounter[warpid] : 0; + + return req_size; + } + + MAMC_ACCELERATOR + auto distribute(void * allocatedMem) -> void * + { + __shared__ char * + warp_res[MaxThreadsPerBlock::value / WarpSize::value]; + + char * myalloc = (char *)allocatedMem; + if(req_size && can_use_coalescing) + { + warp_res[warpid] = myalloc; + if(myalloc != 0) + *(uint32 *)myalloc = threadcount; + } + __threadfence_block(); + + void * myres = myalloc; + if(can_use_coalescing) + { + if(warp_res[warpid] != 0) + myres = warp_res[warpid] + myoffset; + else + myres = 0; + } + return myres; + } + + MAMC_HOST + static auto classname() -> std::string + { + std::stringstream ss; + ss << "XMallocSIMD[" << pagesize << "]"; + return ss.str(); + } + }; + + } // namespace DistributionPolicies + +} // namespace mallocMC diff --git a/src/include/mallocMC/mallocMC.hpp b/src/include/mallocMC/mallocMC.hpp index 4e421032..9fb7d7aa 100644 --- a/src/include/mallocMC/mallocMC.hpp +++ b/src/include/mallocMC/mallocMC.hpp @@ -43,8 +43,8 @@ #include "mallocMC_hostclass.hpp" // all the policies +#include "AlignmentPolicies.hpp" #include "CreationPolicies.hpp" #include "DistributionPolicies.hpp" -#include "ReservePoolPolicies.hpp" -#include "AlignmentPolicies.hpp" #include "OOMPolicies.hpp" +#include "ReservePoolPolicies.hpp" diff --git a/src/include/mallocMC/mallocMC_allocator_handle.hpp b/src/include/mallocMC/mallocMC_allocator_handle.hpp index 2832496d..9a52e910 100644 --- a/src/include/mallocMC/mallocMC_allocator_handle.hpp +++ b/src/include/mallocMC/mallocMC_allocator_handle.hpp @@ -30,49 +30,34 @@ #include "mallocMC_prefixes.hpp" -namespace mallocMC{ - - template +namespace mallocMC +{ + template struct AllocatorHandleImpl { - typedef typename T_HostAllocator::DevAllocator DevAllocator; + using DevAllocator = typename T_HostAllocator::DevAllocator; - DevAllocator* devAllocator; + DevAllocator * devAllocator; - AllocatorHandleImpl( - DevAllocator* p - ) : - devAllocator( p ) - { - } + explicit AllocatorHandleImpl(DevAllocator * p) : devAllocator(p) {} MAMC_ACCELERATOR - void* - malloc( - size_t size - ) + auto malloc(size_t size) -> void * { - return devAllocator->malloc( size ); + return devAllocator->malloc(size); } MAMC_ACCELERATOR - void - free( - void* p - ) + void free(void * p) { - devAllocator->free( p ); + devAllocator->free(p); } MAMC_ACCELERATOR - unsigned - getAvailableSlots( - size_t slotSize - ) + auto getAvailableSlots(size_t slotSize) -> unsigned { - return devAllocator->getAvailableSlots( slotSize ); + return devAllocator->getAvailableSlots(slotSize); } - }; } // namespace mallocMC diff --git a/src/include/mallocMC/mallocMC_constraints.hpp b/src/include/mallocMC/mallocMC_constraints.hpp index 867e97f7..5f6f4b36 100644 --- a/src/include/mallocMC/mallocMC_constraints.hpp +++ b/src/include/mallocMC/mallocMC_constraints.hpp @@ -30,58 +30,69 @@ #include "creationPolicies/Scatter.hpp" #include "distributionPolicies/XMallocSIMD.hpp" -#include -namespace mallocMC{ - - /** The default PolicyCheckers (do always succeed) - */ - template - class PolicyCheck1{}; - - template - class PolicyCheck2{}; - - template - class PolicyCheck3{}; - - template - class PolicyCheck4{}; - - template - class PolicyCheck5{}; - - - /** Enforces constraints on policies or combinations of polices - * - * Uses template specialization of PolicyChecker - */ - template < - typename T_CreationPolicy, - typename T_DistributionPolicy, - typename T_OOMPolicy, - typename T_GetHeapPolicy, - typename T_AlignmentPolicy - > - - class PolicyConstraints:PolicyCheck2{ 
- - }; - - - /** Scatter and XMallocSIMD need the same pagesize! - * - * This constraint ensures that if the CreationPolicy "Scatter" and the - * DistributionPolicy "XMallocSIMD" are selected, they are configured to use - * the same value for their "pagesize"-parameter. - */ - template - class PolicyCheck2< - typename CreationPolicies::Scatter, - typename DistributionPolicies::XMallocSIMD - >{ - BOOST_MPL_ASSERT_MSG(x::pagesize::value == z::pagesize::value, - Pagesize_must_be_the_same_when_combining_Scatter_and_XMallocSIMD, () ); - }; - -}//namespace mallocMC +namespace mallocMC +{ + /** The default PolicyCheckers (do always succeed) + */ + template + class PolicyCheck1 + {}; + + template + class PolicyCheck2 + {}; + + template + class PolicyCheck3 + {}; + + template< + typename Policy1, + typename Policy2, + typename Policy3, + typename Policy4> + class PolicyCheck4 + {}; + + template< + typename Policy1, + typename Policy2, + typename Policy3, + typename Policy4, + typename Policy5> + class PolicyCheck5 + {}; + + /** Enforces constraints on policies or combinations of polices + * + * Uses template specialization of PolicyChecker + */ + template< + typename T_CreationPolicy, + typename T_DistributionPolicy, + typename T_OOMPolicy, + typename T_GetHeapPolicy, + typename T_AlignmentPolicy> + + class PolicyConstraints : + PolicyCheck2 + {}; + + /** Scatter and XMallocSIMD need the same pagesize! + * + * This constraint ensures that if the CreationPolicy "Scatter" and the + * DistributionPolicy "XMallocSIMD" are selected, they are configured to use + * the same value for their "pagesize"-parameter. + */ + template + class PolicyCheck2< + typename CreationPolicies::Scatter, + typename DistributionPolicies::XMallocSIMD> + { + static_assert( + x::pagesize == z::pagesize, + "Pagesize must be the same when combining Scatter and XMallocSIMD"); + }; + +} // namespace mallocMC diff --git a/src/include/mallocMC/mallocMC_hostclass.hpp b/src/include/mallocMC/mallocMC_hostclass.hpp index c47e6a15..48bc1f74 100644 --- a/src/include/mallocMC/mallocMC_hostclass.hpp +++ b/src/include/mallocMC/mallocMC_hostclass.hpp @@ -28,6 +28,6 @@ #pragma once -#include "mallocMC_traits.hpp" -#include "device_allocator.hpp" #include "allocator.hpp" +#include "device_allocator.hpp" +#include "mallocMC_traits.hpp" diff --git a/src/include/mallocMC/mallocMC_prefixes.hpp b/src/include/mallocMC/mallocMC_prefixes.hpp index 1199e3e1..71c15372 100644 --- a/src/include/mallocMC/mallocMC_prefixes.hpp +++ b/src/include/mallocMC/mallocMC_prefixes.hpp @@ -28,6 +28,7 @@ #pragma once +#include + #define MAMC_HOST __host__ #define MAMC_ACCELERATOR __device__ - diff --git a/src/include/mallocMC/mallocMC_traits.hpp b/src/include/mallocMC/mallocMC_traits.hpp index 0e811d74..d40efa83 100644 --- a/src/include/mallocMC/mallocMC_traits.hpp +++ b/src/include/mallocMC/mallocMC_traits.hpp @@ -28,15 +28,12 @@ #pragma once -#include - - -namespace mallocMC{ - - template - struct Traits{ - BOOST_STATIC_CONSTEXPR bool providesAvailableSlots = T_Allocator::CreationPolicy::providesAvailableSlots::value; +namespace mallocMC +{ + template + struct Traits + { + static constexpr bool providesAvailableSlots + = T_Allocator::CreationPolicy::providesAvailableSlots; }; - -} //namespace mallocMC - +} // namespace mallocMC diff --git a/src/include/mallocMC/mallocMC_utils.hpp b/src/include/mallocMC/mallocMC_utils.hpp index 2353cd37..ebe5ee83 100644 --- a/src/include/mallocMC/mallocMC_utils.hpp +++ b/src/include/mallocMC/mallocMC_utils.hpp @@ -33,204 +33,204 @@ 
#pragma once +#include + #ifdef _MSC_VER #include #endif -#include -#include -#include -#include - #include "mallocMC_prefixes.hpp" +#include +#include +#include +#include namespace CUDA { - class error : public std::runtime_error - { - private: - static std::string genErrorString(cudaError errorValue, const char* file, int line) - { - std::ostringstream msg; - msg << file << '(' << line << "): error: " << cudaGetErrorString(errorValue); - return msg.str(); - } - public: - error(cudaError errorValue, const char* file, int line) - : runtime_error(genErrorString(errorValue, file, line)) + class error : public std::runtime_error + { + private: + static auto + genErrorString(cudaError errorValue, const char * file, int line) + -> std::string + { + std::ostringstream msg; + msg << file << '(' << line + << "): error: " << cudaGetErrorString(errorValue); + return msg.str(); + } + + public: + error(cudaError errorValue, const char * file, int line) : + runtime_error(genErrorString(errorValue, file, line)) + {} + + explicit error(cudaError errorValue) : + runtime_error(cudaGetErrorString(errorValue)) + {} + + explicit error(const std::string & msg) : runtime_error(msg) {} + }; + + inline void checkError(cudaError errorValue, const char * file, int line) { + if(errorValue != cudaSuccess) + throw CUDA::error(errorValue, file, line); } - error(cudaError errorValue) - : runtime_error(cudaGetErrorString(errorValue)) + inline void checkError(const char * file, int line) { + checkError(cudaGetLastError(), file, line); } - error(const std::string& msg) - : runtime_error(msg) + inline void checkError() { + cudaError errorValue = cudaGetLastError(); + if(errorValue != cudaSuccess) + throw CUDA::error(errorValue); } - }; - - inline void checkError(cudaError errorValue, const char* file, int line) - { - if (errorValue != cudaSuccess) - throw CUDA::error(errorValue, file, line); - } - - inline void checkError(const char* file, int line) - { - checkError(cudaGetLastError(), file, line); - } - - inline void checkError() - { - cudaError errorValue = cudaGetLastError(); - if (errorValue != cudaSuccess) - throw CUDA::error(errorValue); - } - -#define MALLOCMC_CUDA_CHECKED_CALL(call) CUDA::checkError(call, __FILE__, __LINE__) -#define MALLOCMC_CUDA_CHECK_ERROR() CUDA::checkError(__FILE__, __LINE__) -} +#define MALLOCMC_CUDA_CHECKED_CALL(call) \ + CUDA::checkError(call, __FILE__, __LINE__) +#define MALLOCMC_CUDA_CHECK_ERROR() CUDA::checkError(__FILE__, __LINE__) +} // namespace CUDA namespace mallocMC { + template + class __PointerEquivalent + { + public: + using type = unsigned int; + }; + template<> + class __PointerEquivalent<8> + { + public: + using type = unsigned long long; + }; + + using PointerEquivalent + = mallocMC::__PointerEquivalent::type; - template - class __PointerEquivalent - { - public: - typedef unsigned int type; - }; - template<> - class __PointerEquivalent<8> - { - public: - typedef unsigned long long int type; - }; - - typedef mallocMC::__PointerEquivalent::type PointerEquivalent; - - - MAMC_ACCELERATOR inline boost::uint32_t laneid() - { - boost::uint32_t mylaneid; - asm("mov.u32 %0, %%laneid;" : "=r" (mylaneid)); - return mylaneid; - } - - /** warp index within a multiprocessor - * - * Index of the warp within the multiprocessor at the moment of the query. - * The result is volatile and can be different with each query. 
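The MALLOCMC_CUDA_CHECKED_CALL and MALLOCMC_CUDA_CHECK_ERROR macros defined above convert CUDA error codes into CUDA::error exceptions. A minimal host-side sketch of the intended usage, to be compiled with nvcc and with the include path assumed to point at src/include:

```cpp
#include <mallocMC/mallocMC_utils.hpp>

#include <cuda_runtime.h>
#include <iostream>

int main()
{
    try
    {
        void * p = nullptr;
        MALLOCMC_CUDA_CHECKED_CALL(cudaMalloc(&p, 1024));
        MALLOCMC_CUDA_CHECKED_CALL(cudaFree(p));
        MALLOCMC_CUDA_CHECK_ERROR(); // re-check the last error explicitly
    }
    catch(CUDA::error const & e)
    {
        // message format: "<file>(<line>): error: <cuda error string>"
        std::cerr << e.what() << '\n';
        return 1;
    }
    return 0;
}
```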
- * - * @return current index of the warp - */ - MAMC_ACCELERATOR inline boost::uint32_t warpid() - { - boost::uint32_t mywarpid; - asm("mov.u32 %0, %%warpid;" : "=r" (mywarpid)); - return mywarpid; - } - - /** maximum number of warps on a multiprocessor - * - * @return maximum number of warps on a multiprocessor - */ - MAMC_ACCELERATOR inline boost::uint32_t nwarpid() - { - boost::uint32_t mynwarpid; - asm("mov.u32 %0, %%nwarpid;" : "=r" (mynwarpid)); - return mynwarpid; - } - - MAMC_ACCELERATOR inline boost::uint32_t smid() - { - boost::uint32_t mysmid; - asm("mov.u32 %0, %%smid;" : "=r" (mysmid)); - return mysmid; - } - - MAMC_ACCELERATOR inline boost::uint32_t nsmid() - { - boost::uint32_t mynsmid; - asm("mov.u32 %0, %%nsmid;" : "=r" (mynsmid)); - return mynsmid; - } - MAMC_ACCELERATOR inline boost::uint32_t lanemask() - { - boost::uint32_t lanemask; - asm("mov.u32 %0, %%lanemask_eq;" : "=r" (lanemask)); - return lanemask; - } - - MAMC_ACCELERATOR inline boost::uint32_t lanemask_le() - { - boost::uint32_t lanemask; - asm("mov.u32 %0, %%lanemask_le;" : "=r" (lanemask)); - return lanemask; - } - - MAMC_ACCELERATOR inline boost::uint32_t lanemask_lt() - { - boost::uint32_t lanemask; - asm("mov.u32 %0, %%lanemask_lt;" : "=r" (lanemask)); - return lanemask; - } - - MAMC_ACCELERATOR inline boost::uint32_t lanemask_ge() - { - boost::uint32_t lanemask; - asm("mov.u32 %0, %%lanemask_ge;" : "=r" (lanemask)); - return lanemask; - } - - MAMC_ACCELERATOR inline boost::uint32_t lanemask_gt() - { - boost::uint32_t lanemask; - asm("mov.u32 %0, %%lanemask_gt;" : "=r" (lanemask)); - return lanemask; - } - - template - MAMC_HOST MAMC_ACCELERATOR inline T divup(T a, T b) { return (a + b - 1)/b; } - - /** the maximal number threads per block - * - * https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities - */ - struct MaxThreadsPerBlock - { - // valid for sm_2.X - sm_7.5 - BOOST_STATIC_CONSTEXPR uint32_t value = 1024; - }; - - /** number of threads within a warp - * - * https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities - */ - struct WarpSize - { - // valid for sm_2.X - sm_7.5 - BOOST_STATIC_CONSTEXPR uint32_t value = 32; - }; - - /** warp id within a cuda block - * - * The id is constant over the lifetime of the thread. - * The id is not equal to warpid(). - * - * @return warp id within the block - */ - MAMC_ACCELERATOR inline boost::uint32_t warpid_withinblock() - { - return ( - threadIdx.z * blockDim.y * blockDim.x + - threadIdx.y * blockDim.x + - threadIdx.x - ) / WarpSize::value; - } -} + MAMC_ACCELERATOR inline auto laneid() -> std::uint32_t + { + std::uint32_t mylaneid; + asm("mov.u32 %0, %%laneid;" : "=r"(mylaneid)); + return mylaneid; + } + + /** warp index within a multiprocessor + * + * Index of the warp within the multiprocessor at the moment of the query. + * The result is volatile and can be different with each query. 
+ * + * @return current index of the warp + */ + MAMC_ACCELERATOR inline auto warpid() -> std::uint32_t + { + std::uint32_t mywarpid; + asm("mov.u32 %0, %%warpid;" : "=r"(mywarpid)); + return mywarpid; + } + + /** maximum number of warps on a multiprocessor + * + * @return maximum number of warps on a multiprocessor + */ + MAMC_ACCELERATOR inline auto nwarpid() -> std::uint32_t + { + std::uint32_t mynwarpid; + asm("mov.u32 %0, %%nwarpid;" : "=r"(mynwarpid)); + return mynwarpid; + } + + MAMC_ACCELERATOR inline auto smid() -> std::uint32_t + { + std::uint32_t mysmid; + asm("mov.u32 %0, %%smid;" : "=r"(mysmid)); + return mysmid; + } + + MAMC_ACCELERATOR inline auto nsmid() -> std::uint32_t + { + std::uint32_t mynsmid; + asm("mov.u32 %0, %%nsmid;" : "=r"(mynsmid)); + return mynsmid; + } + MAMC_ACCELERATOR inline auto lanemask() -> std::uint32_t + { + std::uint32_t lanemask; + asm("mov.u32 %0, %%lanemask_eq;" : "=r"(lanemask)); + return lanemask; + } + + MAMC_ACCELERATOR inline auto lanemask_le() -> std::uint32_t + { + std::uint32_t lanemask; + asm("mov.u32 %0, %%lanemask_le;" : "=r"(lanemask)); + return lanemask; + } + + MAMC_ACCELERATOR inline auto lanemask_lt() -> std::uint32_t + { + std::uint32_t lanemask; + asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask)); + return lanemask; + } + + MAMC_ACCELERATOR inline auto lanemask_ge() -> std::uint32_t + { + std::uint32_t lanemask; + asm("mov.u32 %0, %%lanemask_ge;" : "=r"(lanemask)); + return lanemask; + } + + MAMC_ACCELERATOR inline auto lanemask_gt() -> std::uint32_t + { + std::uint32_t lanemask; + asm("mov.u32 %0, %%lanemask_gt;" : "=r"(lanemask)); + return lanemask; + } + + template + MAMC_HOST MAMC_ACCELERATOR inline auto divup(T a, T b) -> T + { + return (a + b - 1) / b; + } + + /** the maximal number threads per block + * + * https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities + */ + struct MaxThreadsPerBlock + { + // valid for sm_2.X - sm_7.5 + static constexpr uint32_t value = 1024; + }; + + /** number of threads within a warp + * + * https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities + */ + struct WarpSize + { + // valid for sm_2.X - sm_7.5 + static constexpr uint32_t value = 32; + }; + + /** warp id within a cuda block + * + * The id is constant over the lifetime of the thread. + * The id is not equal to warpid(). + * + * @return warp id within the block + */ + MAMC_ACCELERATOR inline auto warpid_withinblock() -> std::uint32_t + { + return (threadIdx.z * blockDim.y * blockDim.x + threadIdx.y * blockDim.x + + threadIdx.x) + / WarpSize::value; + } +} // namespace mallocMC diff --git a/src/include/mallocMC/oOMPolicies/BadAllocException.hpp b/src/include/mallocMC/oOMPolicies/BadAllocException.hpp index 9f25ecae..26a13418 100644 --- a/src/include/mallocMC/oOMPolicies/BadAllocException.hpp +++ b/src/include/mallocMC/oOMPolicies/BadAllocException.hpp @@ -27,19 +27,21 @@ #pragma once -namespace mallocMC{ -namespace OOMPolicies{ - - /** - * @brief Throws a std::bad_alloc exception on OutOfMemory - * - * This OOMPolicy will throw a std::bad_alloc exception, if the accelerator - * supports it. Currently, Nvidia CUDA does not support any form of exception - * handling, therefore handleOOM() does not have any effect on these - * accelerators. Using this policy on other types of accelerators that do not - * support exceptions results in undefined behaviour. 
- */ - struct BadAllocException; - -} //namespace OOMPolicies -} //namespace mallocMC +namespace mallocMC +{ + namespace OOMPolicies + { + /** + * @brief Throws a std::bad_alloc exception on OutOfMemory + * + * This OOMPolicy will throw a std::bad_alloc exception, if the + * accelerator supports it. Currently, Nvidia CUDA does not support any + * form of exception handling, therefore handleOOM() does not have any + * effect on these accelerators. Using this policy on other types of + * accelerators that do not support exceptions results in undefined + * behaviour. + */ + struct BadAllocException; + + } // namespace OOMPolicies +} // namespace mallocMC diff --git a/src/include/mallocMC/oOMPolicies/BadAllocException_impl.hpp b/src/include/mallocMC/oOMPolicies/BadAllocException_impl.hpp index fb180370..05bd6626 100644 --- a/src/include/mallocMC/oOMPolicies/BadAllocException_impl.hpp +++ b/src/include/mallocMC/oOMPolicies/BadAllocException_impl.hpp @@ -27,19 +27,21 @@ #pragma once -#include -#include - -#include "BadAllocException.hpp" #include "../mallocMC_prefixes.hpp" +#include "BadAllocException.hpp" -namespace mallocMC{ -namespace OOMPolicies{ +#include +#include - struct BadAllocException - { - MAMC_ACCELERATOR - static void* handleOOM(void* mem){ +namespace mallocMC +{ + namespace OOMPolicies + { + struct BadAllocException + { + MAMC_ACCELERATOR + static auto handleOOM(void * mem) -> void * + { #ifdef __CUDACC__ //#if __CUDA_ARCH__ < 350 #define PM_EXCEPTIONS_NOT_SUPPORTED_HERE @@ -48,18 +50,18 @@ namespace OOMPolicies{ #ifdef PM_EXCEPTIONS_NOT_SUPPORTED_HERE #undef PM_EXCEPTIONS_NOT_SUPPORTED_HERE - assert(false); + assert(false); #else - std::bad_alloc exception; - throw exception; + throw std::bad_alloc{}; #endif - return mem; - } + return mem; + } - static std::string classname(){ - return "BadAllocException"; - } - }; + static auto classname() -> std::string + { + return "BadAllocException"; + } + }; -} //namespace OOMPolicies -} //namespace mallocMC + } // namespace OOMPolicies +} // namespace mallocMC diff --git a/src/include/mallocMC/oOMPolicies/ReturnNull.hpp b/src/include/mallocMC/oOMPolicies/ReturnNull.hpp index 2db8568a..1d723133 100644 --- a/src/include/mallocMC/oOMPolicies/ReturnNull.hpp +++ b/src/include/mallocMC/oOMPolicies/ReturnNull.hpp @@ -27,15 +27,16 @@ #pragma once -namespace mallocMC{ -namespace OOMPolicies{ - - /** - * @brief Returns a NULL pointer on OutOfMemory conditions - * - * This OOMPolicy will return NULL, if handleOOM() is called. - */ - class ReturnNull; - -} //namespace OOMPolicies -} //namespace mallocMC +namespace mallocMC +{ + namespace OOMPolicies + { + /** + * @brief Returns a nullptr pointer on OutOfMemory conditions + * + * This OOMPolicy will return nullptr, if handleOOM() is called. 
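Since ReturnNull reports an out-of-memory condition only through the returned pointer, device code has to test every allocation, as the allocAll kernel in tests/verify_heap.cu does further down in this patch. A reduced sketch, with the allocator handle type left as a template parameter because the concrete ScatterAllocator::AllocatorHandle type comes from the test configuration:

```cpp
// Sketch of handling the ReturnNull policy in a kernel. T_AllocHandle stands
// for an allocator handle such as ScatterAllocator::AllocatorHandle; any type
// exposing a device-side malloc()/free() pair would work here.
template<typename T_AllocHandle>
__global__ void fillUntilFull(
    int ** slots,
    unsigned long long * nSlots,
    T_AllocHandle mMC)
{
    while(true)
    {
        int * p = (int *)mMC.malloc(256 * sizeof(int));
        if(p == nullptr) // with OOMPolicies::ReturnNull this means "heap full"
            break;
        slots[atomicAdd(nSlots, 1ull)] = p;
    }
}
```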
+ */ + class ReturnNull; + + } // namespace OOMPolicies +} // namespace mallocMC diff --git a/src/include/mallocMC/oOMPolicies/ReturnNull_impl.hpp b/src/include/mallocMC/oOMPolicies/ReturnNull_impl.hpp index f18bd0e2..3e44b18f 100644 --- a/src/include/mallocMC/oOMPolicies/ReturnNull_impl.hpp +++ b/src/include/mallocMC/oOMPolicies/ReturnNull_impl.hpp @@ -27,26 +27,29 @@ #pragma once -#include - -#include "ReturnNull.hpp" #include "../mallocMC_prefixes.hpp" +#include "ReturnNull.hpp" -namespace mallocMC{ -namespace OOMPolicies{ - - class ReturnNull - { - public: - MAMC_ACCELERATOR - static void* handleOOM(void* mem){ - return NULL; - } - - static std::string classname(){ - return "ReturnNull"; - } - }; +#include -} //namespace OOMPolicies -} //namespace mallocMC +namespace mallocMC +{ + namespace OOMPolicies + { + class ReturnNull + { + public: + MAMC_ACCELERATOR + static auto handleOOM(void * mem) -> void * + { + return nullptr; + } + + static auto classname() -> std::string + { + return "ReturnNull"; + } + }; + + } // namespace OOMPolicies +} // namespace mallocMC diff --git a/src/include/mallocMC/reservePoolPolicies/CudaSetLimits.hpp b/src/include/mallocMC/reservePoolPolicies/CudaSetLimits.hpp index cfbceea2..1cee0a5b 100644 --- a/src/include/mallocMC/reservePoolPolicies/CudaSetLimits.hpp +++ b/src/include/mallocMC/reservePoolPolicies/CudaSetLimits.hpp @@ -27,18 +27,19 @@ #pragma once -namespace mallocMC{ -namespace ReservePoolPolicies{ - - /** - * @brief set CUDA internal heap for device-side malloc calls - * - * This ReservePoolPolicy is intended for use with CUDA capable accelerators - * that support at least compute capability 2.0. It should be used in - * conjunction with a CreationPolicy that actually requires the CUDA-internal - * heap to be sized by calls to cudaDeviceSetLimit() - */ - struct CudaSetLimits; - -} //namespace ReservePoolPolicies -} //namespace mallocMC +namespace mallocMC +{ + namespace ReservePoolPolicies + { + /** + * @brief set CUDA internal heap for device-side malloc calls + * + * This ReservePoolPolicy is intended for use with CUDA capable + * accelerators that support at least compute capability 2.0. 
It should + * be used in conjunction with a CreationPolicy that actually requires + * the CUDA-internal heap to be sized by calls to cudaDeviceSetLimit() + */ + struct CudaSetLimits; + + } // namespace ReservePoolPolicies +} // namespace mallocMC diff --git a/src/include/mallocMC/reservePoolPolicies/CudaSetLimits_impl.hpp b/src/include/mallocMC/reservePoolPolicies/CudaSetLimits_impl.hpp index eb4ddfa2..baed9ad6 100644 --- a/src/include/mallocMC/reservePoolPolicies/CudaSetLimits_impl.hpp +++ b/src/include/mallocMC/reservePoolPolicies/CudaSetLimits_impl.hpp @@ -27,29 +27,33 @@ #pragma once -#include -#include - #include "CudaSetLimits.hpp" -namespace mallocMC{ -namespace ReservePoolPolicies{ - - struct CudaSetLimits{ - static void* setMemPool(size_t memsize){ - cudaDeviceSetLimit(cudaLimitMallocHeapSize, memsize); - return NULL; - } - - static void resetMemPool(void *p=NULL){ - cudaDeviceSetLimit(cudaLimitMallocHeapSize, 8192U); - } - - static std::string classname(){ - return "CudaSetLimits"; - } - - }; +#include +#include -} //namespace ReservePoolPolicies -} //namespace mallocMC +namespace mallocMC +{ + namespace ReservePoolPolicies + { + struct CudaSetLimits + { + static auto setMemPool(size_t memsize) -> void * + { + cudaDeviceSetLimit(cudaLimitMallocHeapSize, memsize); + return nullptr; + } + + static void resetMemPool(void * p = nullptr) + { + cudaDeviceSetLimit(cudaLimitMallocHeapSize, 8192U); + } + + static auto classname() -> std::string + { + return "CudaSetLimits"; + } + }; + + } // namespace ReservePoolPolicies +} // namespace mallocMC diff --git a/src/include/mallocMC/reservePoolPolicies/SimpleCudaMalloc.hpp b/src/include/mallocMC/reservePoolPolicies/SimpleCudaMalloc.hpp index 1bb386f7..c98197d1 100644 --- a/src/include/mallocMC/reservePoolPolicies/SimpleCudaMalloc.hpp +++ b/src/include/mallocMC/reservePoolPolicies/SimpleCudaMalloc.hpp @@ -27,18 +27,19 @@ #pragma once -namespace mallocMC{ -namespace ReservePoolPolicies{ - - /** - * @brief creates/allocates a fixed memory pool on the accelerator - * - * This ReservePoolPolicy will create a memory pool of a fixed size on the - * accelerator by using a host-side call to cudaMalloc(). The pool is later - * freed through cudaFree(). This can only be used with accelerators that - * support CUDA and compute capability 2.0 or higher. - */ - struct SimpleCudaMalloc; - -} //namespace ReservePoolPolicies -} //namespace mallocMC +namespace mallocMC +{ + namespace ReservePoolPolicies + { + /** + * @brief creates/allocates a fixed memory pool on the accelerator + * + * This ReservePoolPolicy will create a memory pool of a fixed size on + * the accelerator by using a host-side call to cudaMalloc(). The pool + * is later freed through cudaFree(). This can only be used with + * accelerators that support CUDA and compute capability 2.0 or higher. 
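Both ReservePoolPolicies expose the same pair of static functions, setMemPool() and resetMemPool(); they differ only in where the memory comes from. A host-side sketch of the SimpleCudaMalloc flow, which is normally driven by the host-side allocator rather than called directly; the include path is assumed to be relative to src/include:

```cpp
#include <mallocMC/reservePoolPolicies/SimpleCudaMalloc_impl.hpp>

#include <cstdio>

int main()
{
    using Pool = mallocMC::ReservePoolPolicies::SimpleCudaMalloc;

    // reserve a 64 MiB pool on the device via cudaMalloc()
    void * pool = Pool::setMemPool(64U * 1024U * 1024U);
    std::printf("reserved pool at %p\n", pool);

    // ... normally the pool pointer is handed to the CreationPolicy's
    // initHeap() here ...

    // release the pool again via cudaFree()
    Pool::resetMemPool(pool);
    return 0;
}
```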
+ */ + struct SimpleCudaMalloc; + + } // namespace ReservePoolPolicies +} // namespace mallocMC diff --git a/src/include/mallocMC/reservePoolPolicies/SimpleCudaMalloc_impl.hpp b/src/include/mallocMC/reservePoolPolicies/SimpleCudaMalloc_impl.hpp index 23563d97..71b297e2 100644 --- a/src/include/mallocMC/reservePoolPolicies/SimpleCudaMalloc_impl.hpp +++ b/src/include/mallocMC/reservePoolPolicies/SimpleCudaMalloc_impl.hpp @@ -27,31 +27,34 @@ #pragma once -#include - #include "../mallocMC_utils.hpp" - #include "SimpleCudaMalloc.hpp" -namespace mallocMC{ -namespace ReservePoolPolicies{ - - struct SimpleCudaMalloc{ - static void* setMemPool(size_t memsize){ - void* pool = NULL; - MALLOCMC_CUDA_CHECKED_CALL(cudaMalloc(&pool, memsize)); - return pool; - } - - static void resetMemPool(void* p){ - MALLOCMC_CUDA_CHECKED_CALL(cudaFree(p)); - } - - static std::string classname(){ - return "SimpleCudaMalloc"; - } - - }; +#include -} //namespace ReservePoolPolicies -} //namespace mallocMC +namespace mallocMC +{ + namespace ReservePoolPolicies + { + struct SimpleCudaMalloc + { + static auto setMemPool(size_t memsize) -> void * + { + void * pool = nullptr; + MALLOCMC_CUDA_CHECKED_CALL(cudaMalloc(&pool, memsize)); + return pool; + } + + static void resetMemPool(void * p) + { + MALLOCMC_CUDA_CHECKED_CALL(cudaFree(p)); + } + + static auto classname() -> std::string + { + return "SimpleCudaMalloc"; + } + }; + + } // namespace ReservePoolPolicies +} // namespace mallocMC diff --git a/src/include/mallocMC/version.hpp b/src/include/mallocMC/version.hpp index f2c15e1e..8a81728c 100644 --- a/src/include/mallocMC/version.hpp +++ b/src/include/mallocMC/version.hpp @@ -38,8 +38,8 @@ /** the mallocMC version: major API changes should be reflected here */ #define MALLOCMC_VERSION_MAJOR 2 -#define MALLOCMC_VERSION_MINOR 3 -#define MALLOCMC_VERSION_PATCH 1 +#define MALLOCMC_VERSION_MINOR 4 +#define MALLOCMC_VERSION_PATCH 0 /** the mallocMC flavor is used to differentiate the releases of the * Computational Radiation Physics group (crp) from other releases diff --git a/tests/verify_heap.cu b/tests/verify_heap.cu index d5a3004f..3ee214ab 100644 --- a/tests/verify_heap.cu +++ b/tests/verify_heap.cu @@ -26,57 +26,75 @@ THE SOFTWARE. */ - // get a CUDA error and print it nicely -#define CUDA_CHECK(cmd) {cudaError_t error = cmd; \ - if(error!=cudaSuccess){\ - printf("<%s>:%i ",__FILE__,__LINE__);\ - printf("[CUDA] Error: %s\n", cudaGetErrorString(error));}} +#define CUDA_CHECK(cmd) \ + { \ + cudaError_t error = cmd; \ + if(error != cudaSuccess) \ + { \ + printf("<%s>:%i ", __FILE__, __LINE__); \ + printf("[CUDA] Error: %s\n", cudaGetErrorString(error)); \ + } \ + } // start kernel, wait for finish and check errors -#define CUDA_CHECK_KERNEL_SYNC(...) __VA_ARGS__;CUDA_CHECK(cudaDeviceSynchronize()) +#define CUDA_CHECK_KERNEL_SYNC(...) 
\ + __VA_ARGS__; \ + CUDA_CHECK(cudaDeviceSynchronize()) // each pointer in the datastructure will point to this many // elements of type allocElem_t -#define ELEMS_PER_SLOT 750 +constexpr auto ELEMS_PER_SLOT = 750; +#include #include #include -#include +#include #include #include -//include the Heap with the arguments given in the config -#include "src/include/mallocMC/mallocMC_utils.hpp" +// include the Heap with the arguments given in the config #include "verify_heap_config.hpp" +#include + // global variable for verbosity, might change due to user input '--verbose' bool verbose = false; // the type of the elements to allocate -typedef unsigned long long allocElem_t; - -bool run_heap_verification(const size_t, const unsigned, const unsigned, const bool); -void parse_cmdline(const int, char**, size_t*, unsigned*, unsigned*, bool*); -void print_help(char**); +using allocElem_t = unsigned long long; +auto run_heap_verification( + const size_t, + const unsigned, + const unsigned, + const bool) -> bool; +void parse_cmdline( + const int, + char **, + size_t *, + unsigned *, + unsigned *, + bool *); +void print_help(char **); // used to create an empty stream for non-verbose output -struct nullstream : std::ostream { - nullstream() : std::ostream(0) { } +struct nullstream : std::ostream +{ + nullstream() : std::ostream(0) {} }; -// uses global verbosity to switch between std::cout and a NULL-output -std::ostream& dout() { - static nullstream n; - return verbose ? std::cout : n; +// uses global verbosity to switch between std::cout and a nullptr-output +auto dout() -> std::ostream & +{ + static nullstream n; + return verbose ? std::cout : n; } // define some defaults -BOOST_STATIC_CONSTEXPR unsigned threads_default = 128; -BOOST_STATIC_CONSTEXPR unsigned blocks_default = 64; -BOOST_STATIC_CONSTEXPR size_t heapInMB_default = 1024; // 1GB - +static constexpr unsigned threads_default = 128; +static constexpr unsigned blocks_default = 64; +static constexpr size_t heapInMB_default = 1024; // 1GB /** * will do a basic verification of scatterAlloc. @@ -87,39 +105,50 @@ BOOST_STATIC_CONSTEXPR size_t heapInMB_default = 1024; // 1GB * @return will return 0 if the verification was successful, * otherwise returns 1 */ -int main(int argc, char** argv){ - bool correct = false; - bool machine_readable = false; - size_t heapInMB = heapInMB_default; - unsigned threads = threads_default; - unsigned blocks = blocks_default; - - parse_cmdline(argc, argv, &heapInMB, &threads, &blocks, &machine_readable); - - cudaDeviceProp deviceProp; - cudaGetDeviceProperties(&deviceProp, 0); - - if( deviceProp.major < 2 ) { - std::cerr << "Error: Compute Capability >= 2.0 required. 
(is "; - std::cerr << deviceProp.major << "."<< deviceProp.minor << ")" << std::endl; - return 1; - } - - cudaSetDevice(0); - correct = run_heap_verification(heapInMB, threads, blocks, machine_readable); - cudaDeviceReset(); - - if(!machine_readable || verbose){ - if(correct){ - std::cout << "\033[0;32mverification successful ✔\033[0m" << std::endl; - return 0; - }else{ - std::cerr << "\033[0;31mverification failed\033[0m" << std::endl; - return 1; +auto main(int argc, char ** argv) -> int +{ + bool correct = false; + bool machine_readable = false; + size_t heapInMB = heapInMB_default; + unsigned threads = threads_default; + unsigned blocks = blocks_default; + + parse_cmdline(argc, argv, &heapInMB, &threads, &blocks, &machine_readable); + + int computeCapabilityMajor = 0; + cudaDeviceGetAttribute( + &computeCapabilityMajor, cudaDevAttrComputeCapabilityMajor, 0); + int computeCapabilityMinor = 0; + cudaDeviceGetAttribute( + &computeCapabilityMinor, cudaDevAttrComputeCapabilityMinor, 0); + + if(computeCapabilityMajor < 2) + { + std::cerr << "Error: Compute Capability >= 2.0 required. (is "; + std::cerr << computeCapabilityMajor << "." << computeCapabilityMinor + << ")\n"; + return 1; } - } -} + cudaSetDevice(0); + correct + = run_heap_verification(heapInMB, threads, blocks, machine_readable); + cudaDeviceReset(); + + if(!machine_readable || verbose) + { + if(correct) + { + std::cout << "\033[0;32mverification successful ✔\033[0m\n"; + return 0; + } + else + { + std::cerr << "\033[0;31mverification failed\033[0m\n"; + return 1; + } + } +} /** * will parse command line arguments @@ -134,98 +163,106 @@ int main(int argc, char** argv){ */ void parse_cmdline( const int argc, - char**argv, - size_t *heapInMB, - unsigned *threads, - unsigned *blocks, - bool *machine_readable - ){ - - std::vector > parameters; - - // Parse Commandline, tokens are shaped like ARG=PARAM or ARG - // This requires to use '=', if you want to supply a value with a parameter - for (int i = 1; i < argc; ++i) { - char* pos = strtok(argv[i], "="); - std::pair < std::string, std::string > p(std::string(pos), std::string("")); - pos = strtok(NULL, "="); - if (pos != NULL) { - p.second = std::string(pos); + char ** argv, + size_t * heapInMB, + unsigned * threads, + unsigned * blocks, + bool * machine_readable) +{ + std::vector> parameters; + + // Parse Commandline, tokens are shaped like ARG=PARAM or ARG + // This requires to use '=', if you want to supply a value with a parameter + for(int i = 1; i < argc; ++i) + { + char * pos = strtok(argv[i], "="); + std::pair p( + std::string(pos), std::string("")); + pos = strtok(nullptr, "="); + if(pos != nullptr) + { + p.second = std::string(pos); + } + parameters.push_back(p); } - parameters.push_back(p); - } - - // go through all parameters that were found - for (unsigned i = 0; i < parameters.size(); ++i) { - std::pair < std::string, std::string > p = parameters.at(i); - if (p.first == "-v" || p.first == "--verbose") { - verbose = true; + // go through all parameters that were found + for(unsigned i = 0; i < parameters.size(); ++i) + { + std::pair p = parameters.at(i); + + if(p.first == "-v" || p.first == "--verbose") + { + verbose = true; + } + + if(p.first == "--threads") + { + *threads = atoi(p.second.c_str()); + } + + if(p.first == "--blocks") + { + *blocks = atoi(p.second.c_str()); + } + + if(p.first == "--heapsize") + { + *heapInMB = size_t(atoi(p.second.c_str())); + } + + if(p.first == "-h" || p.first == "--help") + { + print_help(argv); + exit(0); + } + + if(p.first == "-m" || 
p.first == "--machine_readable") + { + *machine_readable = true; + } } - - if (p.first == "--threads") { - *threads = atoi(p.second.c_str()); - } - - if (p.first == "--blocks") { - *blocks = atoi(p.second.c_str()); - } - - if(p.first == "--heapsize") { - *heapInMB = size_t(atoi(p.second.c_str())); - } - - if(p.first == "-h" || p.first == "--help"){ - print_help(argv); - exit(0); - } - - if(p.first == "-m" || p.first == "--machine_readable"){ - *machine_readable = true; - } - } } - /** * prints a helpful message about program use * * @param argv the argv-parameter from main, used to find the program name */ -void print_help(char** argv){ - std::stringstream s; - - s << "SYNOPSIS:" << std::endl; - s << argv[0] << " [OPTIONS]" << std::endl; - s << "" << std::endl; - s << "OPTIONS:" << std::endl; - s << " -h, --help" << std::endl; - s << " Print this help message and exit" << std::endl; - s << "" << std::endl; - s << " -v, --verbose" << std::endl; - s << " Print information about parameters and progress" << std::endl; - s << "" << std::endl; - s << " -m, --machine_readable" << std::endl; - s << " Print all relevant parameters as CSV. This will" << std::endl; - s << " suppress all other output unless explicitly" << std::endl; - s << " requested with --verbose or -v" << std::endl; - s << "" << std::endl; - s << " --threads=N" << std::endl; - s << " Set the number of threads per block (default " ; - s << threads_default << "128)" << std::endl; - s << "" << std::endl; - s << " --blocks=N" << std::endl; - s << " Set the number of blocks in the grid (default " ; - s << blocks_default << ")" << std::endl; - s << "" << std::endl; - s << " --heapsize=N" << std::endl; - s << " Set the heapsize to N Megabyte (default " ; - s << heapInMB_default << "1024)" << std::endl; - - std::cout << s.str(); +void print_help(char ** argv) +{ + std::stringstream s; + + s << "SYNOPSIS:" << '\n'; + s << argv[0] << " [OPTIONS]" << '\n'; + s << "" << '\n'; + s << "OPTIONS:" << '\n'; + s << " -h, --help" << '\n'; + s << " Print this help message and exit" << '\n'; + s << "" << '\n'; + s << " -v, --verbose" << '\n'; + s << " Print information about parameters and progress" << '\n'; + s << "" << '\n'; + s << " -m, --machine_readable" << '\n'; + s << " Print all relevant parameters as CSV. 
This will" << '\n'; + s << " suppress all other output unless explicitly" << '\n'; + s << " requested with --verbose or -v" << '\n'; + s << "" << '\n'; + s << " --threads=N" << '\n'; + s << " Set the number of threads per block (default "; + s << threads_default << "128)" << '\n'; + s << "" << '\n'; + s << " --blocks=N" << '\n'; + s << " Set the number of blocks in the grid (default "; + s << blocks_default << ")" << '\n'; + s << "" << '\n'; + s << " --heapsize=N" << '\n'; + s << " Set the heapsize to N Megabyte (default "; + s << heapInMB_default << "1024)" << '\n'; + + std::cout << s.str() << std::flush; } - /** * checks validity of memory for each single cell * @@ -245,31 +282,37 @@ void print_help(char** argv){ * Will change to 0, if there was a value that didn't match */ __global__ void check_content( - allocElem_t** data, - unsigned long long *counter, - unsigned long long* globalSum, + allocElem_t ** data, + unsigned long long * counter, + unsigned long long * globalSum, const size_t nSlots, - int* correct - ){ - - unsigned long long sum=0; - while(true){ - size_t pos = atomicAdd(counter,1); - if(pos >= nSlots){break;} - const size_t offset = pos*ELEMS_PER_SLOT; - for(size_t i=0;i(data[pos][i]) != static_cast(offset+i)){ - //printf("\nError in Kernel: data[%llu][%llu] is %#010x (should be %#010x)\n", - // pos,i,static_cast(data[pos][i]),allocElem_t(offset+i)); - atomicAnd(correct,0); - } - sum += static_cast(data[pos][i]); + int * correct) +{ + unsigned long long sum = 0; + while(true) + { + const size_t pos = atomicAdd(counter, 1); + if(pos >= nSlots) + { + break; + } + const size_t offset = pos * ELEMS_PER_SLOT; + for(size_t i = 0; i < ELEMS_PER_SLOT; ++i) + { + if(static_cast(data[pos][i]) + != static_cast(offset + i)) + { + // printf("\nError in Kernel: data[%llu][%llu] is %#010x (should + // be %#010x)\n", + // pos,i,static_cast(data[pos][i]),allocElem_t(offset+i)); + atomicAnd(correct, 0); + } + sum += static_cast(data[pos][i]); + } } - } - atomicAdd(globalSum,sum); + atomicAdd(globalSum, sum); } - /** * checks validity of memory for each single cell * @@ -285,27 +328,32 @@ __global__ void check_content( * Will change to 0, if there was a value that didn't match */ __global__ void check_content_fast( - allocElem_t** data, - unsigned long long *counter, + allocElem_t ** data, + unsigned long long * counter, const size_t nSlots, - int* correct - ){ - - int c = 1; - while(true){ - size_t pos = atomicAdd(counter,1); - if(pos >= nSlots){break;} - const size_t offset = pos*ELEMS_PER_SLOT; - for(size_t i=0;i(data[pos][i]) != static_cast(offset+i)){ - c=0; - } + int * correct) +{ + int c = 1; + while(true) + { + size_t pos = atomicAdd(counter, 1); + if(pos >= nSlots) + { + break; + } + const size_t offset = pos * ELEMS_PER_SLOT; + for(size_t i = 0; i < ELEMS_PER_SLOT; ++i) + { + if(static_cast(data[pos][i]) + != static_cast(offset + i)) + { + c = 0; + } + } } - } - atomicAnd(correct,c); + atomicAnd(correct, c); } - /** * allocate a lot of small arrays and fill them * @@ -320,30 +368,32 @@ __global__ void check_content_fast( * allocated structures (for verification purposes) */ __global__ void allocAll( - allocElem_t** data, - unsigned long long* counter, - unsigned long long* globalSum, - ScatterAllocator::AllocatorHandle mMC - ){ - - unsigned long long sum=0; - while(true){ - allocElem_t* p = (allocElem_t*) mMC.malloc(sizeof(allocElem_t) * ELEMS_PER_SLOT); - if(p == NULL) break; - - size_t pos = atomicAdd(counter,1); - const size_t offset = pos*ELEMS_PER_SLOT; - for(size_t 
i=0;i(offset + i); - sum += static_cast(p[i]); + allocElem_t ** data, + unsigned long long * counter, + unsigned long long * globalSum, + ScatterAllocator::AllocatorHandle mMC) +{ + unsigned long long sum = 0; + while(true) + { + allocElem_t * p + = (allocElem_t *)mMC.malloc(sizeof(allocElem_t) * ELEMS_PER_SLOT); + if(p == nullptr) + break; + + size_t pos = atomicAdd(counter, 1); + const size_t offset = pos * ELEMS_PER_SLOT; + for(size_t i = 0; i < ELEMS_PER_SLOT; ++i) + { + p[i] = static_cast(offset + i); + sum += static_cast(p[i]); + } + data[pos] = p; } - data[pos] = p; - } - atomicAdd(globalSum,sum); + atomicAdd(globalSum, sum); } - /** * free all the values again * @@ -353,20 +403,20 @@ __global__ void allocAll( * @param max the maximum number of elements to free */ __global__ void deallocAll( - allocElem_t** data, - unsigned long long* counter, + allocElem_t ** data, + unsigned long long * counter, const size_t nSlots, - ScatterAllocator::AllocatorHandle mMC - ){ - - while(true){ - size_t pos = atomicAdd(counter,1); - if(pos >= nSlots) break; - mMC.free(data[pos]); - } + ScatterAllocator::AllocatorHandle mMC) +{ + while(true) + { + size_t pos = atomicAdd(counter, 1); + if(pos >= nSlots) + break; + mMC.free(data[pos]); + } } - /** * damages one element in the data * @@ -376,11 +426,11 @@ __global__ void deallocAll( * * @param data the datastructure to damage */ -__global__ void damageElement(allocElem_t** data){ - data[1][0] = static_cast(5*ELEMS_PER_SLOT - 1); +__global__ void damageElement(allocElem_t ** data) +{ + data[1][0] = static_cast(5 * ELEMS_PER_SLOT - 1); } - /** * wrapper function to allocate memory on device * @@ -396,35 +446,43 @@ __global__ void damageElement(allocElem_t** data){ * @param threads the number of CUDA threads per block */ void allocate( - allocElem_t** d_testData, - unsigned long long* h_nSlots, - unsigned long long* h_sum, + allocElem_t ** d_testData, + unsigned long long * h_nSlots, + unsigned long long * h_sum, const unsigned blocks, const unsigned threads, - ScatterAllocator* mMC - ){ - - dout() << "allocating on device..."; - - unsigned long long zero = 0; - unsigned long long *d_sum; - unsigned long long *d_nSlots; - - MALLOCMC_CUDA_CHECKED_CALL(cudaMalloc((void**) &d_sum,sizeof(unsigned long long))); - MALLOCMC_CUDA_CHECKED_CALL(cudaMalloc((void**) &d_nSlots, sizeof(unsigned long long))); - MALLOCMC_CUDA_CHECKED_CALL(cudaMemcpy(d_sum,&zero,sizeof(unsigned long long),cudaMemcpyHostToDevice)); - MALLOCMC_CUDA_CHECKED_CALL(cudaMemcpy(d_nSlots,&zero,sizeof(unsigned long long),cudaMemcpyHostToDevice)); - - CUDA_CHECK_KERNEL_SYNC(allocAll<<>>(d_testData, d_nSlots, d_sum, *mMC )); - - MALLOCMC_CUDA_CHECKED_CALL(cudaMemcpy(h_sum,d_sum,sizeof(unsigned long long),cudaMemcpyDeviceToHost)); - MALLOCMC_CUDA_CHECKED_CALL(cudaMemcpy(h_nSlots,d_nSlots,sizeof(unsigned long long),cudaMemcpyDeviceToHost)); - cudaFree(d_sum); - cudaFree(d_nSlots); - dout() << "done" << std::endl; + ScatterAllocator * mMC) +{ + dout() << "allocating on device..."; + + unsigned long long zero = 0; + unsigned long long * d_sum; + unsigned long long * d_nSlots; + + MALLOCMC_CUDA_CHECKED_CALL( + cudaMalloc((void **)&d_sum, sizeof(unsigned long long))); + MALLOCMC_CUDA_CHECKED_CALL( + cudaMalloc((void **)&d_nSlots, sizeof(unsigned long long))); + MALLOCMC_CUDA_CHECKED_CALL(cudaMemcpy( + d_sum, &zero, sizeof(unsigned long long), cudaMemcpyHostToDevice)); + MALLOCMC_CUDA_CHECKED_CALL(cudaMemcpy( + d_nSlots, &zero, sizeof(unsigned long long), cudaMemcpyHostToDevice)); + + 
CUDA_CHECK_KERNEL_SYNC( + allocAll<<>>(d_testData, d_nSlots, d_sum, *mMC)); + + MALLOCMC_CUDA_CHECKED_CALL(cudaMemcpy( + h_sum, d_sum, sizeof(unsigned long long), cudaMemcpyDeviceToHost)); + MALLOCMC_CUDA_CHECKED_CALL(cudaMemcpy( + h_nSlots, + d_nSlots, + sizeof(unsigned long long), + cudaMemcpyDeviceToHost)); + cudaFree(d_sum); + cudaFree(d_nSlots); + dout() << "done\n"; } - /** * Wrapper function to verify allocation on device * @@ -438,72 +496,74 @@ void allocate( * @param threads the number of CUDA threads per block * @return true if the verification was successful, false otherwise */ -bool verify( - allocElem_t **d_testData, +auto verify( + allocElem_t ** d_testData, const unsigned long long nSlots, const unsigned blocks, - const unsigned threads - ){ - - dout() << "verifying on device... "; - - const unsigned long long zero = 0; - int h_correct = 1; - int* d_correct; - unsigned long long *d_sum; - unsigned long long *d_counter; - - MALLOCMC_CUDA_CHECKED_CALL(cudaMalloc((void**) &d_sum, sizeof(unsigned long long))); - MALLOCMC_CUDA_CHECKED_CALL(cudaMalloc((void**) &d_counter, sizeof(unsigned long long))); - MALLOCMC_CUDA_CHECKED_CALL(cudaMalloc((void**) &d_correct, sizeof(int))); - MALLOCMC_CUDA_CHECKED_CALL(cudaMemcpy(d_sum,&zero,sizeof(unsigned long long),cudaMemcpyHostToDevice)); - MALLOCMC_CUDA_CHECKED_CALL(cudaMemcpy(d_counter,&zero,sizeof(unsigned long long),cudaMemcpyHostToDevice)); - MALLOCMC_CUDA_CHECKED_CALL(cudaMemcpy(d_correct,&h_correct,sizeof(int),cudaMemcpyHostToDevice)); - - // can be replaced by a call to check_content_fast, - // if the gaussian sum (see below) is not used and you - // want to be a bit faster - CUDA_CHECK_KERNEL_SYNC(check_content<<>>( - d_testData, - d_counter, - d_sum, - static_cast(nSlots), - d_correct - )); - MALLOCMC_CUDA_CHECKED_CALL(cudaMemcpy(&h_correct,d_correct,sizeof(int),cudaMemcpyDeviceToHost)); - - // This only works, if the type "allocElem_t" - // can hold all the IDs (usually unsigned long long) - /* - dout() << "verifying on host..."; - unsigned long long h_sum, h_counter; - unsigned long long gaussian_sum = (ELEMS_PER_SLOT*nSlots * (ELEMS_PER_SLOT*nSlots-1))/2; - MALLOCMC_CUDA_CHECKED_CALL(cudaMemcpy(&h_sum,d_sum,sizeof(unsigned long long),cudaMemcpyDeviceToHost)); - MALLOCMC_CUDA_CHECKED_CALL(cudaMemcpy(&h_counter,d_counter,sizeof(unsigned long long),cudaMemcpyDeviceToHost)); - if(gaussian_sum != h_sum){ - dout() << "\nGaussian Sum doesn't match: is " << h_sum; - dout() << " (should be " << gaussian_sum << ")" << std::endl; - h_correct=false; - } - if(nSlots != h_counter-(blocks*threads)){ - dout() << "\nallocated number of elements doesn't match: is " << h_counter; - dout() << " (should be " << nSlots << ")" << std::endl; - h_correct=false; - } - */ - - if(h_correct){ - dout() << "done" << std::endl; - }else{ - dout() << "failed" << std::endl; - } - - cudaFree(d_correct); - cudaFree(d_sum); - cudaFree(d_counter); - return static_cast(h_correct); -} + const unsigned threads) -> bool +{ + dout() << "verifying on device... 
"; + + const unsigned long long zero = 0; + int h_correct = 1; + int * d_correct; + unsigned long long * d_sum; + unsigned long long * d_counter; + + MALLOCMC_CUDA_CHECKED_CALL( + cudaMalloc((void **)&d_sum, sizeof(unsigned long long))); + MALLOCMC_CUDA_CHECKED_CALL( + cudaMalloc((void **)&d_counter, sizeof(unsigned long long))); + MALLOCMC_CUDA_CHECKED_CALL(cudaMalloc((void **)&d_correct, sizeof(int))); + MALLOCMC_CUDA_CHECKED_CALL(cudaMemcpy( + d_sum, &zero, sizeof(unsigned long long), cudaMemcpyHostToDevice)); + MALLOCMC_CUDA_CHECKED_CALL(cudaMemcpy( + d_counter, &zero, sizeof(unsigned long long), cudaMemcpyHostToDevice)); + MALLOCMC_CUDA_CHECKED_CALL( + cudaMemcpy(d_correct, &h_correct, sizeof(int), cudaMemcpyHostToDevice)); + + // can be replaced by a call to check_content_fast, + // if the gaussian sum (see below) is not used and you + // want to be a bit faster + CUDA_CHECK_KERNEL_SYNC(check_content<<>>( + d_testData, d_counter, d_sum, static_cast(nSlots), d_correct)); + MALLOCMC_CUDA_CHECKED_CALL( + cudaMemcpy(&h_correct, d_correct, sizeof(int), cudaMemcpyDeviceToHost)); + + // This only works, if the type "allocElem_t" + // can hold all the IDs (usually unsigned long long) + /* + dout() << "verifying on host..."; + unsigned long long h_sum, h_counter; + unsigned long long gaussian_sum = (ELEMS_PER_SLOT*nSlots * + (ELEMS_PER_SLOT*nSlots-1))/2; + MALLOCMC_CUDA_CHECKED_CALL(cudaMemcpy(&h_sum,d_sum,sizeof(unsigned long + long),cudaMemcpyDeviceToHost)); + MALLOCMC_CUDA_CHECKED_CALL(cudaMemcpy(&h_counter,d_counter,sizeof(unsigned + long long),cudaMemcpyDeviceToHost)); if(gaussian_sum != h_sum){ dout() << + "\nGaussian Sum doesn't match: is " << h_sum; dout() << " (should be " << + gaussian_sum << ")\n"; h_correct=false; + } + if(nSlots != h_counter-(blocks*threads)){ + dout() << "\nallocated number of elements doesn't match: is " << + h_counter; dout() << " (should be " << nSlots << ")\n"; h_correct=false; + } + */ + + if(h_correct) + { + dout() << "done\n"; + } + else + { + dout() << "failed\n"; + } + cudaFree(d_correct); + cudaFree(d_sum); + cudaFree(d_counter); + return static_cast(h_correct); +} /** * prints all parameters machine readable @@ -511,81 +571,79 @@ bool verify( * for params, see run_heap_verification-internal parameters */ void print_machine_readable( - const unsigned pagesize, - const unsigned accessblocks, - const unsigned regionsize, - const unsigned wastefactor, - const bool resetfreedpages, - const unsigned blocks, - const unsigned threads, - const unsigned elemsPerSlot, - const size_t allocElemSize, - const size_t heapSize, - const size_t maxSpace, - const size_t maxSlots, - const unsigned long long usedSlots, - const float allocFrac, - const size_t wasted, - const bool correct - ){ - - std::string sep = ","; - std::stringstream h; - std::stringstream v; + const unsigned pagesize, + const unsigned accessblocks, + const unsigned regionsize, + const unsigned wastefactor, + const bool resetfreedpages, + const unsigned blocks, + const unsigned threads, + const unsigned elemsPerSlot, + const size_t allocElemSize, + const size_t heapSize, + const size_t maxSpace, + const size_t maxSlots, + const unsigned long long usedSlots, + const float allocFrac, + const size_t wasted, + const bool correct) +{ + std::string sep = ","; + std::stringstream h; + std::stringstream v; - h << "PagesizeByte" << sep; - v << pagesize << sep; + h << "PagesizeByte" << sep; + v << pagesize << sep; - h << "Accessblocks" << sep; - v << accessblocks << sep; + h << "Accessblocks" << sep; + v << 
accessblocks << sep; - h << "Regionsize" << sep; - v << regionsize << sep; + h << "Regionsize" << sep; + v << regionsize << sep; - h << "Wastefactor" << sep; - v << wasted << sep; + h << "Wastefactor" << sep; + v << wasted << sep; - h << "ResetFreedPage" << sep; - v << resetfreedpages << sep; + h << "ResetFreedPage" << sep; + v << resetfreedpages << sep; - h << "Gridsize" << sep; - v << blocks << sep; + h << "Gridsize" << sep; + v << blocks << sep; - h << "Blocksize" << sep; - v << threads << sep; + h << "Blocksize" << sep; + v << threads << sep; - h << "ELEMS_PER_SLOT" << sep; - v << elemsPerSlot << sep; + h << "ELEMS_PER_SLOT" << sep; + v << elemsPerSlot << sep; - h << "allocElemByte" << sep; - v << allocElemSize << sep; + h << "allocElemByte" << sep; + v << allocElemSize << sep; - h << "heapsizeByte" << sep; - v << heapSize << sep; + h << "heapsizeByte" << sep; + v << heapSize << sep; - h << "maxSpaceByte" << sep; - v << maxSpace << sep; + h << "maxSpaceByte" << sep; + v << maxSpace << sep; - h << "maxSlots" << sep; - v << maxSlots << sep; + h << "maxSlots" << sep; + v << maxSlots << sep; - h << "usedSlots" << sep; - v << usedSlots << sep; + h << "usedSlots" << sep; + v << usedSlots << sep; - h << "allocFraction" << sep; - v << allocFrac << sep; + h << "allocFraction" << sep; + v << allocFrac << sep; - h << "wastedBytes" << sep; - v << wasted << sep; + h << "wastedBytes" << sep; + v << wasted << sep; - h << "correct" ; - v << correct ; + h << "correct"; + v << correct; - std::cout << h.str() << std::endl; - std::cout << v.str() << std::endl; + std::cout << h.str() << '\n'; + std::cout << v.str() << '\n'; } - /** * Verify the heap allocation of mallocMC * @@ -600,102 +658,111 @@ void print_machine_readable( * @return true if the verification was successful, * false otherwise */ -bool run_heap_verification( +auto run_heap_verification( const size_t heapMB, const unsigned blocks, const unsigned threads, - const bool machine_readable - ){ - - cudaSetDeviceFlags(cudaDeviceMapHost); - - const size_t heapSize = size_t(1024U*1024U) * heapMB; - const size_t slotSize = sizeof(allocElem_t)*ELEMS_PER_SLOT; - const size_t nPointers = ceil(static_cast(heapSize) / slotSize); - const size_t maxSlots = heapSize/slotSize; - const size_t maxSpace = maxSlots*slotSize + nPointers*sizeof(allocElem_t*); - bool correct = true; - const unsigned long long zero = 0; - - dout() << "CreationPolicy Arguments:" << std::endl; - dout() << "Pagesize: " << ScatterConfig::pagesize::value << std::endl; - dout() << "Accessblocks: " << ScatterConfig::accessblocks::value << std::endl; - dout() << "Regionsize: " << ScatterConfig::regionsize::value << std::endl; - dout() << "Wastefactor: " << ScatterConfig::wastefactor::value << std::endl; - dout() << "ResetFreedPages " << ScatterConfig::resetfreedpages::value << std::endl; - dout() << "" << std::endl; - dout() << "Gridsize: " << blocks << std::endl; - dout() << "Blocksize: " << threads << std::endl; - dout() << "Allocated elements: " << ELEMS_PER_SLOT << " x " << sizeof(allocElem_t); - dout() << " Byte (" << slotSize << " Byte)" << std::endl; - dout() << "Heap: " << heapSize << " Byte"; - dout() << " (" << heapSize/pow(1024,2) << " MByte)" << std::endl; - dout() << "max space w/ pointers: " << maxSpace << " Byte"; - dout() << " (" << maxSpace/pow(1024,2) << " MByte)" << std::endl; - dout() << "maximum of elements: " << maxSlots << std::endl; - - // initializing the heap - ScatterAllocator* mMC = new ScatterAllocator(heapSize); - allocElem_t** d_testData; - 
MALLOCMC_CUDA_CHECKED_CALL(cudaMalloc((void**) &d_testData, nPointers*sizeof(allocElem_t*))); - - // allocating with mallocMC - unsigned long long usedSlots = 0; - unsigned long long sumAllocElems = 0; - allocate(d_testData, &usedSlots, &sumAllocElems, blocks, threads, mMC); - - const float allocFrac = static_cast(usedSlots)*100/maxSlots; - const size_t wasted = heapSize - static_cast(usedSlots) * slotSize; - dout() << "allocated elements: " << usedSlots; - dout() << " (" << allocFrac << "%)" << std::endl; - dout() << "wasted heap space: " << wasted << " Byte"; - dout() << " (" << wasted/pow(1024,2) << " MByte)" << std::endl; - - // verifying on device - correct = correct && verify(d_testData,usedSlots,blocks,threads); - - // damaging one cell - dout() << "damaging of element... "; - CUDA_CHECK_KERNEL_SYNC(damageElement<<<1,1>>>(d_testData)); - dout() << "done" << std::endl; - - // verifying on device - // THIS SHOULD FAIL (damage was done before!). Therefore, we must inverse the logic - correct = correct && !verify(d_testData,usedSlots,blocks,threads); - - - // release all memory - dout() << "deallocation... "; - unsigned long long* d_dealloc_counter; - MALLOCMC_CUDA_CHECKED_CALL(cudaMalloc((void**) &d_dealloc_counter, sizeof(unsigned long long))); - MALLOCMC_CUDA_CHECKED_CALL(cudaMemcpy(d_dealloc_counter,&zero,sizeof(unsigned long long),cudaMemcpyHostToDevice)); - CUDA_CHECK_KERNEL_SYNC(deallocAll<<>>(d_testData,d_dealloc_counter,static_cast(usedSlots), *mMC )); - cudaFree(d_dealloc_counter); - cudaFree(d_testData); - delete mMC; - - dout() << "done "<< std::endl; - - if(machine_readable){ - print_machine_readable( - ScatterConfig::pagesize::value, - ScatterConfig::accessblocks::value, - ScatterConfig::regionsize::value, - ScatterConfig::wastefactor::value, - ScatterConfig::resetfreedpages::value, - blocks, - threads, - ELEMS_PER_SLOT, - sizeof(allocElem_t), - heapSize, - maxSpace, - maxSlots, - usedSlots, - allocFrac, - wasted, - correct - ); - } - - return correct; + const bool machine_readable) -> bool +{ + cudaSetDeviceFlags(cudaDeviceMapHost); + + const size_t heapSize = size_t(1024U * 1024U) * heapMB; + const size_t slotSize = sizeof(allocElem_t) * ELEMS_PER_SLOT; + const size_t nPointers = (heapSize + slotSize - 1) / slotSize; + const size_t maxSlots = heapSize / slotSize; + const size_t maxSpace + = maxSlots * slotSize + nPointers * sizeof(allocElem_t *); + bool correct = true; + const unsigned long long zero = 0; + + dout() << "CreationPolicy Arguments:\n"; + dout() << "Pagesize: " << ScatterConfig::pagesize << '\n'; + dout() << "Accessblocks: " << ScatterConfig::accessblocks << '\n'; + dout() << "Regionsize: " << ScatterConfig::regionsize << '\n'; + dout() << "Wastefactor: " << ScatterConfig::wastefactor << '\n'; + dout() << "ResetFreedPages " << ScatterConfig::resetfreedpages + << '\n'; + dout() << "\n"; + dout() << "Gridsize: " << blocks << '\n'; + dout() << "Blocksize: " << threads << '\n'; + dout() << "Allocated elements: " << ELEMS_PER_SLOT << " x " + << sizeof(allocElem_t); + dout() << " Byte (" << slotSize << " Byte)\n"; + dout() << "Heap: " << heapSize << " Byte"; + dout() << " (" << heapSize / pow(1024, 2) << " MByte)\n"; + dout() << "max space w/ pointers: " << maxSpace << " Byte"; + dout() << " (" << maxSpace / pow(1024, 2) << " MByte)\n"; + dout() << "maximum of elements: " << maxSlots << '\n'; + + // initializing the heap + ScatterAllocator * mMC = new ScatterAllocator(heapSize); + allocElem_t ** d_testData; + MALLOCMC_CUDA_CHECKED_CALL( + cudaMalloc((void 
**)&d_testData, nPointers * sizeof(allocElem_t *)));
+
+    // allocating with mallocMC
+    unsigned long long usedSlots = 0;
+    unsigned long long sumAllocElems = 0;
+    allocate(d_testData, &usedSlots, &sumAllocElems, blocks, threads, mMC);
+
+    const float allocFrac = static_cast<float>(usedSlots) * 100 / maxSlots;
+    const size_t wasted = heapSize - static_cast<size_t>(usedSlots) * slotSize;
+    dout() << "allocated elements: " << usedSlots;
+    dout() << " (" << allocFrac << "%)\n";
+    dout() << "wasted heap space: " << wasted << " Byte";
+    dout() << " (" << wasted / pow(1024, 2) << " MByte)\n";
+
+    // verifying on device
+    correct = correct && verify(d_testData, usedSlots, blocks, threads);
+
+    // damaging one cell
+    dout() << "damaging of element... ";
+    CUDA_CHECK_KERNEL_SYNC(damageElement<<<1, 1>>>(d_testData));
+    dout() << "done\n";
+
+    // verifying on device
+    // THIS SHOULD FAIL (damage was done before!). Therefore, we must inverse
+    // the logic
+    correct = correct && !verify(d_testData, usedSlots, blocks, threads);
+
+    // release all memory
+    dout() << "deallocation... ";
+    unsigned long long * d_dealloc_counter;
+    MALLOCMC_CUDA_CHECKED_CALL(
+        cudaMalloc((void **)&d_dealloc_counter, sizeof(unsigned long long)));
+    MALLOCMC_CUDA_CHECKED_CALL(cudaMemcpy(
+        d_dealloc_counter,
+        &zero,
+        sizeof(unsigned long long),
+        cudaMemcpyHostToDevice));
+    CUDA_CHECK_KERNEL_SYNC(deallocAll<<<blocks, threads>>>(
+        d_testData, d_dealloc_counter, static_cast<size_t>(usedSlots), *mMC));
+    cudaFree(d_dealloc_counter);
+    cudaFree(d_testData);
+    delete mMC;
+
+    dout() << "done \n";
+
+    if(machine_readable)
+    {
+        print_machine_readable(
+            ScatterConfig::pagesize,
+            ScatterConfig::accessblocks,
+            ScatterConfig::regionsize,
+            ScatterConfig::wastefactor,
+            ScatterConfig::resetfreedpages,
+            blocks,
+            threads,
+            ELEMS_PER_SLOT,
+            sizeof(allocElem_t),
+            heapSize,
+            maxSpace,
+            maxSlots,
+            usedSlots,
+            allocFrac,
+            wasted,
+            correct);
+    }
+
+    return correct;
 }
diff --git a/tests/verify_heap_config.hpp b/tests/verify_heap_config.hpp
index e76337a6..cdaa306e 100644
--- a/tests/verify_heap_config.hpp
+++ b/tests/verify_heap_config.hpp
@@ -28,52 +28,51 @@
 
 #pragma once
 
-#include <boost/mpl/int.hpp>
-#include <boost/mpl/bool.hpp>
-
 // basic files for mallocMC
-#include "src/include/mallocMC/mallocMC_hostclass.hpp"
+#include <mallocMC/mallocMC.hpp>
 
 // Load all available policies for mallocMC
-#include "src/include/mallocMC/CreationPolicies.hpp"
-#include "src/include/mallocMC/DistributionPolicies.hpp"
-#include "src/include/mallocMC/OOMPolicies.hpp"
-#include "src/include/mallocMC/ReservePoolPolicies.hpp"
-#include "src/include/mallocMC/AlignmentPolicies.hpp"
-
+#include <mallocMC/AlignmentPolicies.hpp>
+#include <mallocMC/CreationPolicies.hpp>
+#include <mallocMC/DistributionPolicies.hpp>
+#include <mallocMC/OOMPolicies.hpp>
+#include <mallocMC/ReservePoolPolicies.hpp>
 
 // configurate the CreationPolicy "Scatter"
-struct ScatterConfig{
-    typedef boost::mpl::int_<4096> pagesize;
-    typedef boost::mpl::int_<8> accessblocks;
-    typedef boost::mpl::int_<16> regionsize;
-    typedef boost::mpl::int_<2> wastefactor;
-    typedef boost::mpl::bool_<false> resetfreedpages;
+struct ScatterConfig
+{
+    static constexpr auto pagesize = 4096;
+    static constexpr auto accessblocks = 8;
+    static constexpr auto regionsize = 16;
+    static constexpr auto wastefactor = 2;
+    static constexpr auto resetfreedpages = false;
 };
 
-struct ScatterHashParams{
-    typedef boost::mpl::int_<38183> hashingK;
-    typedef boost::mpl::int_<17497> hashingDistMP;
-    typedef boost::mpl::int_<1> hashingDistWP;
-    typedef boost::mpl::int_<1> hashingDistWPRel;
+struct ScatterHashParams
+{
+    static constexpr auto hashingK = 38183;
+    static constexpr auto hashingDistMP = 17497;
+    static constexpr auto hashingDistWP = 1;
+    static constexpr auto hashingDistWPRel = 1;
 };
 
 // configure the DistributionPolicy "XMallocSIMD"
-struct DistributionConfig{
-    typedef ScatterConfig::pagesize pagesize;
+struct DistributionConfig
+{
+    static constexpr auto pagesize = ScatterConfig::pagesize;
 };
 
 // configure the AlignmentPolicy "Shrink"
-struct AlignmentConfig{
-    typedef boost::mpl::int_<16> dataAlignment;
+struct AlignmentConfig
+{
+    static constexpr auto dataAlignment = 16;
 };
 
 // Define a new allocator and call it ScatterAllocator
 // which resembles the behaviour of ScatterAlloc
-typedef mallocMC::Allocator<
-    mallocMC::CreationPolicies::Scatter<ScatterConfig, ScatterHashParams>,
-    mallocMC::DistributionPolicies::XMallocSIMD<DistributionConfig>,
-    mallocMC::OOMPolicies::ReturnNull,
-    mallocMC::ReservePoolPolicies::SimpleCudaMalloc,
-    mallocMC::AlignmentPolicies::Shrink<AlignmentConfig>
-    > ScatterAllocator;
+using ScatterAllocator = mallocMC::Allocator<
+    mallocMC::CreationPolicies::Scatter<ScatterConfig, ScatterHashParams>,
+    mallocMC::DistributionPolicies::XMallocSIMD<DistributionConfig>,
+    mallocMC::OOMPolicies::ReturnNull,
+    mallocMC::ReservePoolPolicies::SimpleCudaMalloc,
+    mallocMC::AlignmentPolicies::Shrink<AlignmentConfig>>;
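
Usage note (not part of the patch): the constexpr-based ScatterAllocator above is consumed the same way the verify_heap test does it, passing the host-side allocator to a kernel parameter of type ScatterAllocator::AllocatorHandle and calling malloc/free on that handle in device code. The following minimal sketch assumes this pattern; the kernel name, array size, and launch configuration are made up for illustration, and error checking is omitted.

// illustrative_usage.cu -- hypothetical example only
#include <cuda_runtime.h>

#include "verify_heap_config.hpp" // defines ScatterAllocator (see above)

// every thread grabs a small array from the mallocMC heap and frees it again
__global__ void exampleKernel(ScatterAllocator::AllocatorHandle mMC)
{
    int * p = (int *)mMC.malloc(4 * sizeof(int));
    if(p == nullptr)
        return;
    for(int i = 0; i < 4; ++i)
        p[i] = i;
    mMC.free(p);
}

auto main() -> int
{
    ScatterAllocator mMC(512U * 1024U * 1024U); // 512 MiB heap (example value)
    exampleKernel<<<64, 256>>>(mMC); // allocator converts to its device handle
    cudaDeviceSynchronize();
    return 0;
}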