diff --git a/.clang-format b/.clang-format new file mode 100644 index 00000000..5708a4fb --- /dev/null +++ b/.clang-format @@ -0,0 +1,77 @@ +--- +AccessModifierOffset: -4 +AlignAfterOpenBracket: AlwaysBreak +AlignConsecutiveAssignments: false +AlignConsecutiveDeclarations: false +AlignEscapedNewlines: DontAlign +AlignOperands: false +AlignTrailingComments: false +AllowAllParametersOfDeclarationOnNextLine: false +AllowShortBlocksOnASingleLine: false +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: Empty +AllowShortIfStatementsOnASingleLine: false +AllowShortLoopsOnASingleLine: true +AlwaysBreakAfterReturnType: None +AlwaysBreakBeforeMultilineStrings: true +AlwaysBreakTemplateDeclarations: Yes +BinPackArguments: false +BinPackParameters: false +BreakBeforeBraces: Custom +BraceWrapping: + AfterClass: true + AfterControlStatement: true + AfterEnum: true + AfterFunction: true + AfterNamespace: true + AfterStruct: true + AfterUnion: true + AfterExternBlock: true + BeforeCatch: true + BeforeElse: true + IndentBraces: false + SplitEmptyFunction: false + SplitEmptyRecord: false + SplitEmptyNamespace: false +BreakBeforeBinaryOperators: All +BreakBeforeTernaryOperators: true +BreakConstructorInitializers: AfterColon +BreakInheritanceList: AfterColon +BreakStringLiterals: true +ColumnLimit: 80 +CompactNamespaces: false +ConstructorInitializerAllOnOneLineOrOnePerLine: true +ConstructorInitializerIndentWidth: 8 +ContinuationIndentWidth: 4 +Cpp11BracedListStyle: true +DerivePointerAlignment: false +FixNamespaceComments: false +IncludeBlocks: Regroup +IndentCaseLabels: false +IndentPPDirectives: None +IndentWidth: 4 +IndentWrappedFunctionNames: false +KeepEmptyLinesAtTheStartOfBlocks: false +Language: Cpp +NamespaceIndentation: All +PointerAlignment: Middle +ReflowComments: true +SortIncludes: true +SortUsingDeclarations: true +SpaceAfterCStyleCast: false +SpaceAfterTemplateKeyword: false +SpaceBeforeAssignmentOperators: true +SpaceBeforeCpp11BracedList: false +SpaceBeforeCtorInitializerColon: true +SpaceBeforeInheritanceColon: true +SpaceBeforeParens: Never +SpaceBeforeRangeBasedForLoopColon: true +SpaceInEmptyParentheses: false +SpacesInAngles: false +SpacesInCStyleCastParentheses: false +SpacesInContainerLiterals: false +SpacesInParentheses: false +SpacesInSquareBrackets: false +Standard: Cpp11 +UseTab: Never +... 
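As a rough illustration only (not part of the patch, and using invented names), the snippet below sketches the layout this .clang-format configuration produces: 4-space indentation with fully indented namespaces, opening braces on their own lines, access specifiers outdented by four columns, no space before parentheses, and an 80-column limit — the same style visible in the reformatted headers further down in this diff.

```c++
// Invented example, formatted in the style configured above
namespace mallocMC
{
    namespace ExamplePolicies
    {
        class ExamplePadder
        {
            static constexpr unsigned dataAlignment = 16;

        public:
            // round a request up to the next multiple of dataAlignment
            // (which must be a power of two for this bit trick to work)
            static auto applyPadding(unsigned bytes) -> unsigned
            {
                return (bytes + dataAlignment - 1) & ~(dataAlignment - 1);
            }
        };
    } // namespace ExamplePolicies
} // namespace mallocMC
```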
diff --git a/.clang-tidy b/.clang-tidy new file mode 100644 index 00000000..3292a65b --- /dev/null +++ b/.clang-tidy @@ -0,0 +1,3 @@ +--- +Checks: '*,-llvm-header-guard,-fuchsia-default-arguments-declarations,-cppcoreguidelines-no-malloc,-cppcoreguidelines-owning-memory,-misc-non-private-member-variables-in-classes' +HeaderFilterRegex: '.*' diff --git a/.gitignore b/.gitignore index b8056488..9a640b01 100644 --- a/.gitignore +++ b/.gitignore @@ -14,3 +14,5 @@ *~ /nbproject +/.vs +/build diff --git a/.travis.yml b/.travis.yml index 8bf68ced..4a169b7a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,7 +2,7 @@ language: cpp sudo: required -dist: trusty +dist: bionic compiler: - gcc @@ -14,6 +14,7 @@ env: script: - mkdir build_tmp && cd build_tmp + - CXX=g++-5 && CC=gcc-5 - cmake -DCMAKE_INSTALL_PREFIX=$INSTALL_DIR $TRAVIS_BUILD_DIR - make - make install @@ -28,12 +29,12 @@ before_script: - sudo apt-get install -f -qq - sudo dpkg --get-selections | grep hold || { echo "All packages OK."; } - sudo apt-get install -q -y cmake-data cmake - - sudo apt-get install -qq build-essential - - gcc --version && g++ --version # 4.8 + - sudo apt-get install -qq build-essential g++-5 + - gcc-5 --version && g++-5 --version # 5.5.0 - apt-cache search nvidia-* - sudo apt-get install -qq nvidia-common - - sudo apt-get install -qq nvidia-cuda-dev nvidia-cuda-toolkit # 5.5 - - sudo apt-get install -qq libboost-dev # 1.54.0 + - sudo apt-get install -qq nvidia-cuda-dev nvidia-cuda-toolkit # 9.1.85 + - sudo apt-get install -qq libboost-dev # 1.65.1 - sudo find /usr/ -name libcuda*.so after_script: diff --git a/CHANGELOG.md b/CHANGELOG.md index 928f3015..3b3e5025 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,27 @@ Change Log / Release Log for mallocMC ================================================================ +2.4.0crp +-------- +**Date:** 2020-05-28 + +This release removes the Boost dependency and switches to C++11. + +### Changes to mallocMC 2.3.1crp + +**Features** + - Cleaning, remove Boost dependency & C++11 Migration #169 + +**Bug fixes** + - Choose the value for the -arch nvcc flag depending on CUDA version #164 #165 + +**Misc:** + - Travis CI: GCC 5.5.0 + CUDA 9.1.85 #170 + - Adding headers to projects and applied clang-tidy #171 + - clang-format #172 + +Thanks to Sergei Bastrakov, Bernhard Manfred Gruber and Axel Huebl for contributing to this release! 
+ 2.3.1crp -------- **Date:** 2019-02-14 diff --git a/CMakeLists.txt b/CMakeLists.txt index 4376bd13..ded0f630 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,7 +1,12 @@ -project(mallocMC) -cmake_minimum_required(VERSION 2.8.12.2) +project(mallocMC LANGUAGES CUDA CXX) +cmake_minimum_required(VERSION 3.8) + +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD_REQUIRED ON) # helper for libs and packages +set(CMAKE_CUDA_STANDARD 11) +set(CMAKE_CUDA_STANDARD_REQUIRED ON) set(CMAKE_PREFIX_PATH "/usr/lib/x86_64-linux-gnu/" "$ENV{CUDA_ROOT}" "$ENV{BOOST_ROOT}") @@ -14,64 +19,37 @@ set(CMAKE_PREFIX_PATH "/usr/lib/x86_64-linux-gnu/" ################################################################################ if(POLICY CMP0074) - cmake_policy(SET CMP0074 NEW) + cmake_policy(SET CMP0074 NEW) endif() ############################################################################### # CUDA ############################################################################### -find_package(CUDA REQUIRED) -set(CUDA_NVCC_FLAGS "-arch=sm_20;-use_fast_math;") -set(CUDA_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}) -include_directories(${CUDA_INCLUDE_DIRS}) -cuda_include_directories(${CUDA_INCLUDE_DIRS}) +if(NOT DEFINED COMPUTE_CAPABILITY) + set(COMPUTE_CAPABILITY "30") +endif() +set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -arch=sm_${COMPUTE_CAPABILITY} -use_fast_math") OPTION(CUDA_OUTPUT_INTERMEDIATE_CODE "Output ptx code" OFF) if(CUDA_OUTPUT_INTERMEDIATE_CODE) -set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-Xptxas;-v;--keep") -endif(CUDA_OUTPUT_INTERMEDIATE_CODE) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xptxas -v --keep") +endif() SET(CUDA_OPTIMIZATION_TYPE "unset" CACHE STRING "CUDA Optimization") set_property(CACHE CUDA_OPTIMIZATION_TYPE PROPERTY STRINGS "unset;-G0;-O0;-O1;-O2;-O3") -if(NOT ${CUDA_OPTIMIZATION_TYPE} STREQUAL "unset") - set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};${CUDA_OPTIMIZATION_TYPE}") +if(NOT ${CUDA_OPTIMIZATION_TYPE} STREQUAL "unset") + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${CUDA_OPTIMIZATION_TYPE}") endif() -############################################################################### -# Boost -############################################################################### -find_package(Boost 1.48.0 REQUIRED) -include_directories(SYSTEM ${Boost_INCLUDE_DIRS}) -set(LIBS ${LIBS} ${Boost_LIBRARIES}) - -# nvcc + boost 1.55 work around -if(Boost_VERSION EQUAL 105500) - set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} \"-DBOOST_NOINLINE=__attribute__((noinline))\" ") -endif(Boost_VERSION EQUAL 105500) - - ################################################################################ # Warnings ################################################################################ -# GNU if(CMAKE_COMPILER_IS_GNUCXX) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wshadow") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unknown-pragmas") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wextra") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-parameter") - # new warning in gcc 4.8 (flag ignored in previous version) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-local-typedefs") -# ICC + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wshadow -Wno-unknown-pragmas -Wextra -Wno-unused-parameter -Wno-unused-local-typedefs") elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wshadow") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DBOOST_NO_VARIADIC_TEMPLATES") - 
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DBOOST_NO_CXX11_VARIADIC_TEMPLATES") -    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DBOOST_NO_FENV_H") -# PGI +    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wshadow") elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "PGI")     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Minform=inform") endif() @@ -87,28 +65,19 @@ INSTALL(   DESTINATION include   PATTERN ".git" EXCLUDE   PATTERN "mallocMC_config.hpp" EXCLUDE -  ) +) ############################################################################### # Executables ############################################################################### +file(GLOB_RECURSE headers src/include/**) +add_custom_target(mallocMC SOURCES ${headers}) # create a target with the header files for IDE projects +source_group(TREE ${CMAKE_CURRENT_LIST_DIR}/src/include FILES ${headers}) + +include_directories(${CMAKE_CURRENT_LIST_DIR}/src/include) +add_executable(mallocMC_Example01 EXCLUDE_FROM_ALL examples/mallocMC_example01.cu examples/mallocMC_example01_config.hpp) +add_executable(mallocMC_Example02 EXCLUDE_FROM_ALL examples/mallocMC_example02.cu) +add_executable(mallocMC_Example03 EXCLUDE_FROM_ALL examples/mallocMC_example03.cu) +add_executable(VerifyHeap EXCLUDE_FROM_ALL tests/verify_heap.cu tests/verify_heap_config.hpp) add_custom_target(examples DEPENDS mallocMC_Example01 mallocMC_Example02 mallocMC_Example03 VerifyHeap) - -cuda_add_executable(mallocMC_Example01 -  EXCLUDE_FROM_ALL -  examples/mallocMC_example01.cu ) -cuda_add_executable(mallocMC_Example02 -  EXCLUDE_FROM_ALL -  examples/mallocMC_example02.cu ) -cuda_add_executable(mallocMC_Example03 -  EXCLUDE_FROM_ALL -  examples/mallocMC_example03.cu ) -cuda_add_executable(VerifyHeap -  EXCLUDE_FROM_ALL -  tests/verify_heap.cu ) - -target_link_libraries(mallocMC_Example01 ${LIBS}) -target_link_libraries(mallocMC_Example02 ${LIBS}) -target_link_libraries(mallocMC_Example03 ${LIBS}) -target_link_libraries(VerifyHeap ${LIBS}) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 00000000..84b2d7d5 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,15 @@ +# Contributing + +## Formatting + +Please format your code before opening pull requests using clang-format and the .clang-format file placed in the repository root. + +### Visual Studio and CLion +Support for clang-format is built in since Visual Studio 2017 15.7 and CLion 2019.1. +The .clang-format file in the repository will be automatically detected and formatting is done as you type, or triggered when pressing the format hotkey. + +### Bash +First install clang-format; installation instructions can be found on the web. To format, you can run this command in bash: +``` +find -iname "*.cu" -o -iname "*.hpp" | xargs clang-format-10 -i +``` diff --git a/README.md b/README.md index d1c3afac..ab984f72 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,11 @@ mallocMC is header-only, but requires a few other C++ libraries to be available. Our installation notes can be found in [INSTALL.md](INSTALL.md). +Contributing +------------ + +Rules for contributions are found in [CONTRIBUTING.md](CONTRIBUTING.md). 
+ On the ScatterAlloc Algorithm ----------------------------- diff --git a/Usage.md b/Usage.md index 8d25057a..3f8049ea 100644 --- a/Usage.md +++ b/Usage.md @@ -19,15 +19,15 @@ Currently, there are the following policy classes available: |Policy | Policy Classes (implementations) | description | |------- |----------------------------------| ----------- | -|**CreationPolicy** | Scatter`` | A scattered allocation to tradeoff fragmentation for allocation time, as proposed in [ScatterAlloc](http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=6339604). `conf1` configures the heap layout, `conf2` determines the hashing parameters| +|**CreationPolicy** | Scatter`` | A scattered allocation to tradeoff fragmentation for allocation time, as proposed in [ScatterAlloc](http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=6339604). `conf1` configures the heap layout, `conf2` determines the hashing parameters| | | OldMalloc | device-side malloc/new and free/delete syscalls as implemented on NVidia CUDA graphics cards with compute capability sm_20 and higher | -|**DistributionPolicy** | XMallocSIMD`` | SIMD optimization for warp-wide allocation on NVIDIA CUDA accelerators, as proposed by [XMalloc](http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=5577907). `conf` is used to determine the pagesize. If used in combination with *Scatter*, the pagesizes must match | +|**DistributionPolicy** | XMallocSIMD`` | SIMD optimization for warp-wide allocation on NVIDIA CUDA accelerators, as proposed by [XMalloc](http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=5577907). `conf` is used to determine the pagesize. If used in combination with *Scatter*, the pagesizes must match | | | Noop | no workload distribution at all | -|**OOMPolicy** | ReturnNull | pointers will be *NULL*, if the request could not be fulfilled | +|**OOMPolicy** | ReturnNull | pointers will be *nullptr*, if the request could not be fulfilled | | | ~~BadAllocException~~ | will throw a `std::bad_alloc` exception. The accelerator has to support exceptions | |**ReservePoolPolicy** | SimpleCudaMalloc | allocate a fixed heap with `CudaMalloc` | | | CudaSetLimits | call to `CudaSetLimits` to increase the available Heap (e.g. when using *OldMalloc*) | -|**AlignmentPolicy** | Shrink`` | shrinks the pool so that the starting pointer is well aligned, applies padding to requested memory chunks. `conf` is used to determine the alignment| +|**AlignmentPolicy** | Shrink`` | shrinks the pool so that the starting pointer is well aligned, applies padding to requested memory chunks. `conf` is used to determine the alignment| | | Noop | no alignment at all | The user has to choose one of each policy that will form a useful allocator @@ -45,7 +45,7 @@ to the policy class: ```c++ // configure the AlignmentPolicy "Shrink" struct ShrinkConfig : mallocMC::AlignmentPolicies::Shrink<>::Properties { - typedef boost::mpl::int_<16> dataAlignment; + static constexpr auto dataAlignment = 16; }; ``` @@ -57,29 +57,29 @@ parameters to create the desired allocator type: ```c++ using namespace mallocMC; -typedef mallocMC::Allocator< +using Allocator1 = mallocMC::Allocator< CreationPolicy::OldMalloc, DistributionPolicy::Noop, OOMPolicy::ReturnNull, ReservePoolPolicy::CudaSetLimits, AlignmentPolicy::Noop -> Allocator1; +>; ``` `Allocator1` will resemble the behaviour of classical device-side allocation known from NVIDIA CUDA since compute capability sm_20. 
To get a more novel allocator, one -could create the following typedef instead: +could create the following alias instead: ```c++ using namespace mallocMC; -typedef mallocMC::Allocator< +using ScatterAllocator = mallocMC::Allocator< CreationPolicies::Scatter<>, DistributionPolicies::XMallocSIMD<>, OOMPolicies::ReturnNull, ReservePoolPolicies::SimpleCudaMalloc, AlignmentPolicies::Shrink -> ScatterAllocator; +>; ``` Notice, how the policy classes `Scatter` and `XMallocSIMD` are instantiated without @@ -122,13 +122,13 @@ A simplistic example would look like this: namespace mallocMC = MC; -typedef MC::Allocator< +using ScatterAllocator = MC::Allocator< MC::CreationPolicies::Scatter<>, MC::DistributionPolicies::XMallocSIMD<>, MC::OOMPolicies::ReturnNull, MC::ReservePoolPolicies::SimpleCudaMalloc, MC::AlignmentPolicies::Shrink - > ScatterAllocator; +>; __global__ exampleKernel(ScatterAllocator::AllocatorHandle sah) { diff --git a/examples/mallocMC_example01.cu b/examples/mallocMC_example01.cu index 7ac9c4ef..d0b8c898 100644 --- a/examples/mallocMC_example01.cu +++ b/examples/mallocMC_example01.cu @@ -26,131 +26,140 @@ THE SOFTWARE. */ -#include -#include -#include -#include +#include "mallocMC_example01_config.hpp" +#include #include -#include "mallocMC_example01_config.cu" +#include +#include +#include void run(); -int main() +auto main() -> int { - cudaDeviceProp deviceProp; - cudaGetDeviceProperties(&deviceProp, 0); - - if( deviceProp.major < 2 ) { - std::cerr << "Error: Compute Capability >= 2.0 required. (is "; - std::cerr << deviceProp.major << "."<< deviceProp.minor << ")" << std::endl; - return 1; - } - - cudaSetDevice(0); - run(); - cudaDeviceReset(); - - return 0; + int computeCapabilityMajor = 0; + cudaDeviceGetAttribute( + &computeCapabilityMajor, cudaDevAttrComputeCapabilityMajor, 0); + int computeCapabilityMinor = 0; + cudaDeviceGetAttribute( + &computeCapabilityMinor, cudaDevAttrComputeCapabilityMinor, 0); + + if(computeCapabilityMajor < 2) + { + std::cerr << "Error: Compute Capability >= 2.0 required. (is "; + std::cerr << computeCapabilityMajor << "." 
<< computeCapabilityMinor + << ")" << std::endl; + return 1; + } + + cudaSetDevice(0); + run(); + cudaDeviceReset(); + + return 0; } +__device__ int ** arA; +__device__ int ** arB; +__device__ int ** arC; -__device__ int** arA; -__device__ int** arB; -__device__ int** arC; - - -__global__ void createArrayPointers(int x, int y, ScatterAllocator::AllocatorHandle mMC){ - arA = (int**) mMC.malloc(sizeof(int*) * x*y); - arB = (int**) mMC.malloc(sizeof(int*) * x*y); - arC = (int**) mMC.malloc(sizeof(int*) * x*y); +__global__ void +createArrayPointers(int x, int y, ScatterAllocator::AllocatorHandle mMC) +{ + arA = (int **)mMC.malloc(sizeof(int *) * x * y); + arB = (int **)mMC.malloc(sizeof(int *) * x * y); + arC = (int **)mMC.malloc(sizeof(int *) * x * y); } +__global__ void +fillArrays(int length, int * d, ScatterAllocator::AllocatorHandle mMC) +{ + int id = threadIdx.x + blockIdx.x * blockDim.x; -__global__ void fillArrays(int length, int* d, ScatterAllocator::AllocatorHandle mMC){ - int id = threadIdx.x + blockIdx.x*blockDim.x; - - arA[id] = (int*) mMC.malloc(length*sizeof(int)); - arB[id] = (int*) mMC.malloc(length*sizeof(int)); - arC[id] = (int*) mMC.malloc(sizeof(int)*length); + arA[id] = (int *)mMC.malloc(length * sizeof(int)); + arB[id] = (int *)mMC.malloc(length * sizeof(int)); + arC[id] = (int *)mMC.malloc(sizeof(int) * length); - for(int i=0 ; i array_sums(block*grid,0); + // device-side pointers + int * d; + cudaMalloc((void **)&d, sizeof(int) * block * grid); - // create arrays of arrays on the device - createArrayPointers<<<1,1>>>(grid,block, mMC ); + // host-side pointers + std::vector array_sums(block * grid, 0); - // fill 2 of them all with ascending values - fillArrays<<>>(length, d, mMC ); + // create arrays of arrays on the device + createArrayPointers<<<1, 1>>>(grid, block, mMC); - // add the 2 arrays (vector addition within each thread) - // and do a thread-wise reduce to d - addArrays<<>>(length, d); + // fill 2 of them all with ascending values + fillArrays<<>>(length, d, mMC); - cudaMemcpy(&array_sums[0],d,sizeof(int)*block*grid,cudaMemcpyDeviceToHost); + // add the 2 arrays (vector addition within each thread) + // and do a thread-wise reduce to d + addArrays<<>>(length, d); - mMC.getAvailableSlots(1024U*1024U); //get available megabyte-sized slots + cudaMemcpy( + &array_sums[0], d, sizeof(int) * block * grid, cudaMemcpyDeviceToHost); - int sum = std::accumulate(array_sums.begin(),array_sums.end(),0); - std::cout << "The sum of the arrays on GPU is " << sum << std::endl; + mMC.getAvailableSlots(1024U * 1024U); // get available megabyte-sized slots - int n = block*grid*length; - int gaussian = n*(n-1); - std::cout << "The gaussian sum as comparison: " << gaussian << std::endl; + int sum = std::accumulate(array_sums.begin(), array_sums.end(), 0); + std::cout << "The sum of the arrays on GPU is " << sum << std::endl; - freeArrays<<>>( mMC ); - freeArrayPointers<<<1,1>>>( mMC ); - cudaFree(d); + int n = block * grid * length; + int gaussian = n * (n - 1); + std::cout << "The gaussian sum as comparison: " << gaussian << std::endl; + freeArrays<<>>(mMC); + freeArrayPointers<<<1, 1>>>(mMC); + cudaFree(d); } diff --git a/examples/mallocMC_example01_config.cu b/examples/mallocMC_example01_config.hpp similarity index 57% rename from examples/mallocMC_example01_config.cu rename to examples/mallocMC_example01_config.hpp index 14ebf743..7b424b0b 100644 --- a/examples/mallocMC_example01_config.cu +++ b/examples/mallocMC_example01_config.hpp @@ -28,53 +28,52 @@ #pragma once -#include 
-#include - // basic files for mallocMC -#include "src/include/mallocMC/mallocMC_hostclass.hpp" +#include // Load all available policies for mallocMC -#include "src/include/mallocMC/CreationPolicies.hpp" -#include "src/include/mallocMC/DistributionPolicies.hpp" -#include "src/include/mallocMC/OOMPolicies.hpp" -#include "src/include/mallocMC/ReservePoolPolicies.hpp" -#include "src/include/mallocMC/AlignmentPolicies.hpp" - - +#include +#include +#include +#include +#include // configurate the CreationPolicy "Scatter" to modify the default behaviour -struct ScatterHeapConfig : mallocMC::CreationPolicies::Scatter<>::HeapProperties{ - typedef boost::mpl::int_<4096> pagesize; - typedef boost::mpl::int_<8> accessblocks; - typedef boost::mpl::int_<16> regionsize; - typedef boost::mpl::int_<2> wastefactor; - typedef boost::mpl::bool_ resetfreedpages; +struct ScatterHeapConfig : mallocMC::CreationPolicies::Scatter<>::HeapProperties +{ + static constexpr auto pagesize = 4096; + static constexpr auto accessblocks = 8; + static constexpr auto regionsize = 16; + static constexpr auto wastefactor = 2; + static constexpr auto resetfreedpages = false; }; -struct ScatterHashConfig : mallocMC::CreationPolicies::Scatter<>::HashingProperties{ - typedef boost::mpl::int_<38183> hashingK; - typedef boost::mpl::int_<17497> hashingDistMP; - typedef boost::mpl::int_<1> hashingDistWP; - typedef boost::mpl::int_<1> hashingDistWPRel; +struct ScatterHashConfig : + mallocMC::CreationPolicies::Scatter<>::HashingProperties +{ + static constexpr auto hashingK = 38183; + static constexpr auto hashingDistMP = 17497; + static constexpr auto hashingDistWP = 1; + static constexpr auto hashingDistWPRel = 1; }; // configure the DistributionPolicy "XMallocSIMD" -struct XMallocConfig : mallocMC::DistributionPolicies::XMallocSIMD<>::Properties { - typedef ScatterHeapConfig::pagesize pagesize; +struct XMallocConfig : mallocMC::DistributionPolicies::XMallocSIMD<>::Properties +{ + static constexpr auto pagesize = ScatterHeapConfig::pagesize; }; // configure the AlignmentPolicy "Shrink" -struct ShrinkConfig : mallocMC::AlignmentPolicies::Shrink<>::Properties { - typedef boost::mpl::int_<16> dataAlignment; +struct ShrinkConfig : mallocMC::AlignmentPolicies::Shrink<>::Properties +{ + static constexpr auto dataAlignment = 16; }; // Define a new allocator and call it ScatterAllocator // which resembles the behaviour of ScatterAlloc -typedef mallocMC::Allocator< - mallocMC::CreationPolicies::Scatter, - mallocMC::DistributionPolicies::XMallocSIMD, - mallocMC::OOMPolicies::ReturnNull, - mallocMC::ReservePoolPolicies::SimpleCudaMalloc, - mallocMC::AlignmentPolicies::Shrink - > ScatterAllocator; +using ScatterAllocator = mallocMC::Allocator< + mallocMC::CreationPolicies::Scatter, + mallocMC::DistributionPolicies::XMallocSIMD, + mallocMC::OOMPolicies::ReturnNull, + mallocMC::ReservePoolPolicies::SimpleCudaMalloc, + mallocMC::AlignmentPolicies::Shrink>; diff --git a/examples/mallocMC_example02.cu b/examples/mallocMC_example02.cu index d00d00e7..0dd60600 100644 --- a/examples/mallocMC_example02.cu +++ b/examples/mallocMC_example02.cu @@ -26,84 +26,88 @@ THE SOFTWARE. 
*/ -#include #include -#include -#include - #include -#include -#include +#include +#include +#include /////////////////////////////////////////////////////////////////////////////// // includes for mallocMC /////////////////////////////////////////////////////////////////////////////// // basic files for mallocMC -#include "src/include/mallocMC/mallocMC_hostclass.hpp" +#include // Load all available policies for mallocMC -#include "src/include/mallocMC/CreationPolicies.hpp" -#include "src/include/mallocMC/DistributionPolicies.hpp" -#include "src/include/mallocMC/OOMPolicies.hpp" -#include "src/include/mallocMC/ReservePoolPolicies.hpp" -#include "src/include/mallocMC/AlignmentPolicies.hpp" +#include +#include +#include +#include +#include /////////////////////////////////////////////////////////////////////////////// // Configuration for mallocMC /////////////////////////////////////////////////////////////////////////////// // configurate the CreationPolicy "Scatter" -struct ScatterConfig{ - typedef boost::mpl::int_<4096> pagesize; - typedef boost::mpl::int_<8> accessblocks; - typedef boost::mpl::int_<16> regionsize; - typedef boost::mpl::int_<2> wastefactor; - typedef boost::mpl::bool_ resetfreedpages; +struct ScatterConfig +{ + static constexpr auto pagesize = 4096; + static constexpr auto accessblocks = 8; + static constexpr auto regionsize = 16; + static constexpr auto wastefactor = 2; + static constexpr auto resetfreedpages = false; }; -struct ScatterHashParams{ - typedef boost::mpl::int_<38183> hashingK; - typedef boost::mpl::int_<17497> hashingDistMP; - typedef boost::mpl::int_<1> hashingDistWP; - typedef boost::mpl::int_<1> hashingDistWPRel; +struct ScatterHashParams +{ + static constexpr auto hashingK = 38183; + static constexpr auto hashingDistMP = 17497; + static constexpr auto hashingDistWP = 1; + static constexpr auto hashingDistWPRel = 1; }; // configure the DistributionPolicy "XMallocSIMD" -struct DistributionConfig{ - typedef ScatterConfig::pagesize pagesize; +struct DistributionConfig +{ + static constexpr auto pagesize = ScatterConfig::pagesize; }; // configure the AlignmentPolicy "Shrink" -struct AlignmentConfig{ - typedef boost::mpl::int_<16> dataAlignment; +struct AlignmentConfig +{ + static constexpr auto dataAlignment = 16; }; // Define a new mMCator and call it ScatterAllocator // which resembles the behaviour of ScatterAlloc -typedef mallocMC::Allocator< - mallocMC::CreationPolicies::Scatter, - mallocMC::DistributionPolicies::XMallocSIMD, - mallocMC::OOMPolicies::ReturnNull, - mallocMC::ReservePoolPolicies::SimpleCudaMalloc, - mallocMC::AlignmentPolicies::Shrink - > ScatterAllocator; - +using ScatterAllocator = mallocMC::Allocator< + mallocMC::CreationPolicies::Scatter, + mallocMC::DistributionPolicies::XMallocSIMD, + mallocMC::OOMPolicies::ReturnNull, + mallocMC::ReservePoolPolicies::SimpleCudaMalloc, + mallocMC::AlignmentPolicies::Shrink>; /////////////////////////////////////////////////////////////////////////////// // End of mallocMC configuration /////////////////////////////////////////////////////////////////////////////// - void run(); -int main() +auto main() -> int { - cudaDeviceProp deviceProp; - cudaGetDeviceProperties(&deviceProp, 0); - - if( deviceProp.major < int(2) ) { + int computeCapabilityMajor = 0; + cudaDeviceGetAttribute( + &computeCapabilityMajor, cudaDevAttrComputeCapabilityMajor, 0); + int computeCapabilityMinor = 0; + cudaDeviceGetAttribute( + &computeCapabilityMinor, cudaDevAttrComputeCapabilityMinor, 0); + + if(computeCapabilityMajor < 
int(2)) + { std::cerr << "Error: Compute Capability >= 2.0 required. (is "; - std::cerr << deviceProp.major << "."<< deviceProp.minor << ")" << std::endl; + std::cerr << computeCapabilityMajor << "." << computeCapabilityMinor + << ")" << std::endl; return 1; } @@ -114,106 +118,110 @@ int main() return 0; } +__device__ int ** arA; +__device__ int ** arB; +__device__ int ** arC; -__device__ int** arA; -__device__ int** arB; -__device__ int** arC; - - -__global__ void createArrayPointers(int x, int y, ScatterAllocator::AllocatorHandle mMC){ - arA = (int**) mMC.malloc(sizeof(int*) * x*y); - arB = (int**) mMC.malloc(sizeof(int*) * x*y); - arC = (int**) mMC.malloc(sizeof(int*) * x*y); +__global__ void +createArrayPointers(int x, int y, ScatterAllocator::AllocatorHandle mMC) +{ + arA = (int **)mMC.malloc(sizeof(int *) * x * y); + arB = (int **)mMC.malloc(sizeof(int *) * x * y); + arC = (int **)mMC.malloc(sizeof(int *) * x * y); } +__global__ void +fillArrays(int length, int * d, ScatterAllocator::AllocatorHandle mMC) +{ + int id = threadIdx.x + blockIdx.x * blockDim.x; -__global__ void fillArrays(int length, int* d, ScatterAllocator::AllocatorHandle mMC){ - int id = threadIdx.x + blockIdx.x*blockDim.x; - - arA[id] = (int*) mMC.malloc(sizeof(int)*length); - arB[id] = (int*) mMC.malloc(sizeof(int)*length); - arC[id] = (int*) mMC.malloc(sizeof(int)*length); + arA[id] = (int *)mMC.malloc(sizeof(int) * length); + arB[id] = (int *)mMC.malloc(sizeof(int) * length); + arC[id] = (int *)mMC.malloc(sizeof(int) * length); - for(int i=0 ; i array_sums(block*grid,0); + std::vector array_sums(block * grid, 0); // create arrays of arrays on the device - createArrayPointers<<<1,1>>>(grid, block, mMC ); + createArrayPointers<<<1, 1>>>(grid, block, mMC); // fill 2 of them all with ascending values - fillArrays<<>>(length, d, mMC ); + fillArrays<<>>(length, d, mMC); // add the 2 arrays (vector addition within each thread) // and do a thread-wise reduce to d - addArrays<<>>(length, d); + addArrays<<>>(length, d); - cudaMemcpy(&array_sums[0], d, sizeof(int)*block*grid, cudaMemcpyDeviceToHost); + cudaMemcpy( + &array_sums[0], d, sizeof(int) * block * grid, cudaMemcpyDeviceToHost); int sum = std::accumulate(array_sums.begin(), array_sums.end(), 0); std::cout << "The sum of the arrays on GPU is " << sum << std::endl; - int n = block*grid*length; - int gaussian = n*(n-1); + int n = block * grid * length; + int gaussian = n * (n - 1); std::cout << "The gaussian sum as comparison: " << gaussian << std::endl; // checking the free memory of the allocator - if(mallocMC::Traits::providesAvailableSlots){ + if(mallocMC::Traits::providesAvailableSlots) + { std::cout << "there are "; - std::cout << mMC.getAvailableSlots(1024U*1024U); + std::cout << mMC.getAvailableSlots(1024U * 1024U); std::cout << " Slots of size 1MB available" << std::endl; } - freeArrays<<>>( mMC ); - freeArrayPointers<<<1, 1>>>( mMC ); + freeArrays<<>>(mMC); + freeArrayPointers<<<1, 1>>>(mMC); cudaFree(d); - } diff --git a/examples/mallocMC_example03.cu b/examples/mallocMC_example03.cu index a3a271b9..9ea7dde7 100644 --- a/examples/mallocMC_example03.cu +++ b/examples/mallocMC_example03.cu @@ -26,93 +26,87 @@ THE SOFTWARE. 
*/ -#include #include -#include +#include +#include #include #include - -#include -#include -#include - +#include /////////////////////////////////////////////////////////////////////////////// // includes for mallocMC /////////////////////////////////////////////////////////////////////////////// -#include "src/include/mallocMC/mallocMC_hostclass.hpp" - -#include "src/include/mallocMC/CreationPolicies.hpp" -#include "src/include/mallocMC/DistributionPolicies.hpp" -#include "src/include/mallocMC/OOMPolicies.hpp" -#include "src/include/mallocMC/ReservePoolPolicies.hpp" -#include "src/include/mallocMC/AlignmentPolicies.hpp" - +#include +#include +#include +#include +#include +#include /////////////////////////////////////////////////////////////////////////////// // Configuration for mallocMC /////////////////////////////////////////////////////////////////////////////// // configurate the CreationPolicy "Scatter" -struct ScatterConfig{ - typedef boost::mpl::int_<4096> pagesize; - typedef boost::mpl::int_<8> accessblocks; - typedef boost::mpl::int_<16> regionsize; - typedef boost::mpl::int_<2> wastefactor; - typedef boost::mpl::bool_ resetfreedpages; +struct ScatterConfig +{ + static constexpr auto pagesize = 4096; + static constexpr auto accessblocks = 8; + static constexpr auto regionsize = 16; + static constexpr auto wastefactor = 2; + static constexpr auto resetfreedpages = false; }; -struct ScatterHashParams{ - typedef boost::mpl::int_<38183> hashingK; - typedef boost::mpl::int_<17497> hashingDistMP; - typedef boost::mpl::int_<1> hashingDistWP; - typedef boost::mpl::int_<1> hashingDistWPRel; +struct ScatterHashParams +{ + static constexpr auto hashingK = 38183; + static constexpr auto hashingDistMP = 17497; + static constexpr auto hashingDistWP = 1; + static constexpr auto hashingDistWPRel = 1; }; - // configure the AlignmentPolicy "Shrink" -struct AlignmentConfig{ - typedef boost::mpl::int_<16> dataAlignment; +struct AlignmentConfig +{ + static constexpr auto dataAlignment = 16; }; // Define a new mMCator and call it ScatterAllocator // which resembles the behaviour of ScatterAlloc -typedef mallocMC::Allocator< +using ScatterAllocator = mallocMC::Allocator< mallocMC::CreationPolicies::Scatter, mallocMC::DistributionPolicies::Noop, mallocMC::OOMPolicies::ReturnNull, mallocMC::ReservePoolPolicies::SimpleCudaMalloc, - mallocMC::AlignmentPolicies::Shrink -> ScatterAllocator; + mallocMC::AlignmentPolicies::Shrink>; /////////////////////////////////////////////////////////////////////////////// // End of mallocMC configuration /////////////////////////////////////////////////////////////////////////////// +__device__ int * arA; -__device__ int* arA; - - -__global__ void exampleKernel(ScatterAllocator::AllocatorHandle mMC){ +__global__ void exampleKernel(ScatterAllocator::AllocatorHandle mMC) +{ unsigned x = 42; - if(threadIdx.x==0) - arA = (int*) mMC.malloc(sizeof(int) * 32); + if(threadIdx.x == 0) + arA = (int *)mMC.malloc(sizeof(int) * 32); x = mMC.getAvailableSlots(1); __syncthreads(); arA[threadIdx.x] = threadIdx.x; - printf("tid: %d array: %d slots %d\n", threadIdx.x, arA[threadIdx.x],x); + printf("tid: %d array: %d slots %d\n", threadIdx.x, arA[threadIdx.x], x); if(threadIdx.x == 0) mMC.free(arA); } - -int main() +auto main() -> int { - ScatterAllocator mMC(1U*1024U*1024U*1024U); //1GB for device-side malloc + ScatterAllocator mMC( + 1U * 1024U * 1024U * 1024U); // 1GB for device-side malloc - exampleKernel<<<1,32>>>( mMC ); + exampleKernel<<<1, 32>>>(mMC); std::cout << "Slots from 
Host: " << mMC.getAvailableSlots(1) << std::endl; return 0; diff --git a/src/include/mallocMC/AlignmentPolicies.hpp b/src/include/mallocMC/AlignmentPolicies.hpp index c471b696..ece52367 100644 --- a/src/include/mallocMC/AlignmentPolicies.hpp +++ b/src/include/mallocMC/AlignmentPolicies.hpp @@ -28,9 +28,7 @@ #pragma once -#include "alignmentPolicies/Shrink.hpp" -#include "alignmentPolicies/Shrink_impl.hpp" - #include "alignmentPolicies/Noop.hpp" #include "alignmentPolicies/Noop_impl.hpp" - +#include "alignmentPolicies/Shrink.hpp" +#include "alignmentPolicies/Shrink_impl.hpp" diff --git a/src/include/mallocMC/CreationPolicies.hpp b/src/include/mallocMC/CreationPolicies.hpp index 56f6e23f..d987c18e 100644 --- a/src/include/mallocMC/CreationPolicies.hpp +++ b/src/include/mallocMC/CreationPolicies.hpp @@ -28,8 +28,7 @@ #pragma once -#include "creationPolicies/Scatter.hpp" -#include "creationPolicies/Scatter_impl.hpp" - #include "creationPolicies/OldMalloc.hpp" #include "creationPolicies/OldMalloc_impl.hpp" +#include "creationPolicies/Scatter.hpp" +#include "creationPolicies/Scatter_impl.hpp" diff --git a/src/include/mallocMC/DistributionPolicies.hpp b/src/include/mallocMC/DistributionPolicies.hpp index f534c57d..c601f819 100644 --- a/src/include/mallocMC/DistributionPolicies.hpp +++ b/src/include/mallocMC/DistributionPolicies.hpp @@ -30,6 +30,5 @@ #include "distributionPolicies/Noop.hpp" #include "distributionPolicies/Noop_impl.hpp" - #include "distributionPolicies/XMallocSIMD.hpp" #include "distributionPolicies/XMallocSIMD_impl.hpp" diff --git a/src/include/mallocMC/OOMPolicies.hpp b/src/include/mallocMC/OOMPolicies.hpp index 67eda519..60bcba22 100644 --- a/src/include/mallocMC/OOMPolicies.hpp +++ b/src/include/mallocMC/OOMPolicies.hpp @@ -28,8 +28,7 @@ #pragma once -#include "oOMPolicies/ReturnNull.hpp" -#include "oOMPolicies/ReturnNull_impl.hpp" - #include "oOMPolicies/BadAllocException.hpp" #include "oOMPolicies/BadAllocException_impl.hpp" +#include "oOMPolicies/ReturnNull.hpp" +#include "oOMPolicies/ReturnNull_impl.hpp" diff --git a/src/include/mallocMC/ReservePoolPolicies.hpp b/src/include/mallocMC/ReservePoolPolicies.hpp index 1cc3aa7b..d01bac5d 100644 --- a/src/include/mallocMC/ReservePoolPolicies.hpp +++ b/src/include/mallocMC/ReservePoolPolicies.hpp @@ -28,8 +28,7 @@ #pragma once -#include "reservePoolPolicies/SimpleCudaMalloc.hpp" -#include "reservePoolPolicies/SimpleCudaMalloc_impl.hpp" - #include "reservePoolPolicies/CudaSetLimits.hpp" #include "reservePoolPolicies/CudaSetLimits_impl.hpp" +#include "reservePoolPolicies/SimpleCudaMalloc.hpp" +#include "reservePoolPolicies/SimpleCudaMalloc_impl.hpp" diff --git a/src/include/mallocMC/alignmentPolicies/Noop.hpp b/src/include/mallocMC/alignmentPolicies/Noop.hpp index 3879da6f..bc07ae6b 100644 --- a/src/include/mallocMC/alignmentPolicies/Noop.hpp +++ b/src/include/mallocMC/alignmentPolicies/Noop.hpp @@ -27,16 +27,17 @@ #pragma once -namespace mallocMC{ -namespace AlignmentPolicies{ - - /** - * @brief a policy that does nothing - * - * This AlignmentPolicy will not perform any distribution, but only return - * its input (identity function) - */ - class Noop; - -} //namespace AlignmentPolicies -} //namespace mallocMC +namespace mallocMC +{ + namespace AlignmentPolicies + { + /** + * @brief a policy that does nothing + * + * This AlignmentPolicy will not perform any distribution, but only + * return its input (identity function) + */ + class Noop; + + } // namespace AlignmentPolicies +} // namespace mallocMC diff --git 
a/src/include/mallocMC/alignmentPolicies/Noop_impl.hpp b/src/include/mallocMC/alignmentPolicies/Noop_impl.hpp index e1c034d0..02ad382f 100644 --- a/src/include/mallocMC/alignmentPolicies/Noop_impl.hpp +++ b/src/include/mallocMC/alignmentPolicies/Noop_impl.hpp @@ -27,34 +27,39 @@ #pragma once -#include -#include - -#include "Noop.hpp" #include "../mallocMC_prefixes.hpp" +#include "Noop.hpp" -namespace mallocMC{ -namespace AlignmentPolicies{ - - class Noop{ - typedef boost::uint32_t uint32; - - public: - - static boost::tuple alignPool(void* memory, size_t memsize){ - return boost::make_tuple(memory,memsize); - } - - MAMC_HOST MAMC_ACCELERATOR - static uint32 applyPadding(uint32 bytes){ - return bytes; - } - - static std::string classname(){ - return "Noop"; - } - - }; - -} //namespace AlignmentPolicies -} //namespace mallocMC +#include +#include +#include + +namespace mallocMC +{ + namespace AlignmentPolicies + { + class Noop + { + using uint32 = std::uint32_t; + + public: + static auto alignPool(void * memory, size_t memsize) + -> std::tuple + { + return std::make_tuple(memory, memsize); + } + + MAMC_HOST MAMC_ACCELERATOR static auto applyPadding(uint32 bytes) + -> uint32 + { + return bytes; + } + + static auto classname() -> std::string + { + return "Noop"; + } + }; + + } // namespace AlignmentPolicies +} // namespace mallocMC diff --git a/src/include/mallocMC/alignmentPolicies/Shrink.hpp b/src/include/mallocMC/alignmentPolicies/Shrink.hpp index 4bafeff8..6ed70e49 100644 --- a/src/include/mallocMC/alignmentPolicies/Shrink.hpp +++ b/src/include/mallocMC/alignmentPolicies/Shrink.hpp @@ -31,31 +31,32 @@ #pragma once -#include - -namespace mallocMC{ -namespace AlignmentPolicies{ - -namespace ShrinkConfig{ - struct DefaultShrinkConfig{ - typedef boost::mpl::int_<16> dataAlignment; - }; -} - - /** - * @brief Provides proper alignment of pool and pads memory requests - * - * This AlignmentPolicy is based on ideas from ScatterAlloc - * (http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=6339604). It - * performs alignment operations on big memory pools and requests to allocate - * memory. Memory pools are truncated at the beginning until the pointer to - * the memory fits the alignment. Requests to allocate memory are padded - * until their size is a multiple of the alignment. - * - * @tparam T_Config (optional) The alignment to use - */ - template - class Shrink; - -} //namespace AlignmentPolicies -} //namespace mallocMC +namespace mallocMC +{ + namespace AlignmentPolicies + { + namespace ShrinkConfig + { + struct DefaultShrinkConfig + { + static constexpr auto dataAlignment = 16; + }; + } // namespace ShrinkConfig + + /** + * @brief Provides proper alignment of pool and pads memory requests + * + * This AlignmentPolicy is based on ideas from ScatterAlloc + * (http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=6339604). + * It performs alignment operations on big memory pools and requests to + * allocate memory. Memory pools are truncated at the beginning until + * the pointer to the memory fits the alignment. Requests to allocate + * memory are padded until their size is a multiple of the alignment. 
+ * + * @tparam T_Config (optional) The alignment to use + */ + template + class Shrink; + + } // namespace AlignmentPolicies +} // namespace mallocMC diff --git a/src/include/mallocMC/alignmentPolicies/Shrink_impl.hpp b/src/include/mallocMC/alignmentPolicies/Shrink_impl.hpp index 60757519..783efcc7 100644 --- a/src/include/mallocMC/alignmentPolicies/Shrink_impl.hpp +++ b/src/include/mallocMC/alignmentPolicies/Shrink_impl.hpp @@ -31,34 +31,42 @@ #pragma once -#include -#include -#include -#include -#include -#include - -#include "Shrink.hpp" #include "../mallocMC_prefixes.hpp" +#include "Shrink.hpp" -namespace mallocMC{ -namespace AlignmentPolicies{ - -namespace Shrink2NS{ - - template struct __PointerEquivalent{ typedef unsigned int type;}; - template<> - struct __PointerEquivalent<8>{ typedef unsigned long long int type; }; -}// namespace ShrinkNS - - template - class Shrink{ - public: - typedef T_Config Properties; +#include +#include +#include +#include - private: - typedef boost::uint32_t uint32; - typedef Shrink2NS::__PointerEquivalent::type PointerEquivalent; +namespace mallocMC +{ + namespace AlignmentPolicies + { + namespace Shrink2NS + { + template + struct __PointerEquivalent + { + using type = unsigned int; + }; + template<> + struct __PointerEquivalent<8> + { + using type = unsigned long long; + }; + } // namespace Shrink2NS + + template + class Shrink + { + public: + using Properties = T_Config; + + private: + using uint32 = std::uint32_t; + using PointerEquivalent + = Shrink2NS::__PointerEquivalent::type; /** Allow for a hierarchical validation of parameters: * @@ -70,54 +78,65 @@ namespace Shrink2NS{ * default-struct < template-struct < command-line parameter */ #ifndef MALLOCMC_AP_SHRINK_DATAALIGNMENT -#define MALLOCMC_AP_SHRINK_DATAALIGNMENT Properties::dataAlignment::value +#define MALLOCMC_AP_SHRINK_DATAALIGNMENT (Properties::dataAlignment) #endif - BOOST_STATIC_CONSTEXPR uint32 dataAlignment = MALLOCMC_AP_SHRINK_DATAALIGNMENT; - - // \TODO: The static_cast can be removed once the minimal dependencies of - // this project is are at least CUDA 7.0 and gcc 4.8.2 - BOOST_STATIC_ASSERT(static_cast(dataAlignment) > 0); - //dataAlignment must also be a power of 2! - BOOST_STATIC_ASSERT(dataAlignment && !(dataAlignment & (dataAlignment-1)) ); - - public: - static boost::tuple alignPool(void* memory, size_t memsize){ - PointerEquivalent alignmentstatus = ((PointerEquivalent)memory) & (dataAlignment -1); - if(alignmentstatus != 0) - { - std::cout << "Heap Warning: memory to use not "; - std::cout << dataAlignment << " byte aligned..." 
<< std::endl; - std::cout << "Before:" << std::endl; - std::cout << "dataAlignment: " << dataAlignment << std::endl; - std::cout << "Alignmentstatus: " << alignmentstatus << std::endl; - std::cout << "size_t memsize " << memsize << " byte" << std::endl; - std::cout << "void *memory " << memory << std::endl; - - memory = (void*)(((PointerEquivalent)memory) + dataAlignment - alignmentstatus); - memsize -= (size_t)dataAlignment + (size_t)alignmentstatus; - - std::cout << "Was shrunk automatically to: " << std::endl; - std::cout << "size_t memsize " << memsize << " byte" << std::endl; - std::cout << "void *memory " << memory << std::endl; - } - - return boost::make_tuple(memory,memsize); - } - - MAMC_HOST - MAMC_ACCELERATOR - static uint32 applyPadding(uint32 bytes){ - return (bytes + dataAlignment - 1) & ~(dataAlignment-1); - } - - MAMC_HOST - static std::string classname(){ - std::stringstream ss; - ss << "Shrink[" << dataAlignment << "]"; - return ss.str(); - } - - }; - -} //namespace AlignmentPolicies -} //namespace mallocMC + static constexpr uint32 dataAlignment + = MALLOCMC_AP_SHRINK_DATAALIGNMENT; + + // dataAlignment must be a power of 2! + static_assert( + dataAlignment != 0 + && (dataAlignment & (dataAlignment - 1)) == 0, + "dataAlignment must also be a power of 2"); + + public: + static auto alignPool(void * memory, size_t memsize) + -> std::tuple + { + PointerEquivalent alignmentstatus + = ((PointerEquivalent)memory) & (dataAlignment - 1); + if(alignmentstatus != 0) + { + std::cout << "Heap Warning: memory to use not "; + std::cout << dataAlignment << " byte aligned..." + << std::endl; + std::cout << "Before:" << std::endl; + std::cout << "dataAlignment: " << dataAlignment + << std::endl; + std::cout << "Alignmentstatus: " << alignmentstatus + << std::endl; + std::cout << "size_t memsize " << memsize << " byte" + << std::endl; + std::cout << "void *memory " << memory << std::endl; + + memory + = (void *)(((PointerEquivalent)memory) + dataAlignment - alignmentstatus); + memsize -= (size_t)dataAlignment + (size_t)alignmentstatus; + + std::cout << "Was shrunk automatically to: " << std::endl; + std::cout << "size_t memsize " << memsize << " byte" + << std::endl; + std::cout << "void *memory " << memory << std::endl; + } + + return std::make_tuple(memory, memsize); + } + + MAMC_HOST + MAMC_ACCELERATOR + static auto applyPadding(uint32 bytes) -> uint32 + { + return (bytes + dataAlignment - 1) & ~(dataAlignment - 1); + } + + MAMC_HOST + static auto classname() -> std::string + { + std::stringstream ss; + ss << "Shrink[" << dataAlignment << "]"; + return ss.str(); + } + }; + + } // namespace AlignmentPolicies +} // namespace mallocMC diff --git a/src/include/mallocMC/allocator.hpp b/src/include/mallocMC/allocator.hpp index a811b488..0f7c6561 100644 --- a/src/include/mallocMC/allocator.hpp +++ b/src/include/mallocMC/allocator.hpp @@ -28,67 +28,57 @@ #pragma once -#include "mallocMC_utils.hpp" +#include "device_allocator.hpp" +#include "mallocMC_allocator_handle.hpp" #include "mallocMC_constraints.hpp" #include "mallocMC_prefixes.hpp" #include "mallocMC_traits.hpp" -#include "mallocMC_allocator_handle.hpp" +#include "mallocMC_utils.hpp" -#include -#include -#include +#include #include +#include #include -namespace mallocMC{ - -namespace detail{ - - template< - typename T_Allocator, - bool T_providesAvailableSlots - > - struct GetAvailableSlotsIfAvailHost +namespace mallocMC +{ + namespace detail { - MAMC_HOST static - unsigned - getAvailableSlots( - size_t, - T_Allocator & - ) + template + 
struct GetAvailableSlotsIfAvailHost { - return 0; - } - }; - - template - struct GetAvailableSlotsIfAvailHost - { - MAMC_HOST - static unsigned - getAvailableSlots( - size_t slotSize, - T_Allocator& alloc - ){ - return T_Allocator::CreationPolicy::getAvailableSlotsHost(slotSize, alloc.getAllocatorHandle().devAllocator); - } - }; - -} - + MAMC_HOST static auto getAvailableSlots(size_t, T_Allocator &) + -> unsigned + { + return 0; + } + }; + + template + struct GetAvailableSlotsIfAvailHost + { + MAMC_HOST + static auto getAvailableSlots(size_t slotSize, T_Allocator & alloc) + -> unsigned + { + return T_Allocator::CreationPolicy::getAvailableSlotsHost( + slotSize, alloc.getAllocatorHandle().devAllocator); + } + }; + } // namespace detail struct HeapInfo { - void* p; + void * p; size_t size; }; /** * @brief "HostClass" that combines all policies to a useful allocator * - * This class implements the necessary glue-logic to form an actual allocator - * from the provided policies. It implements the public interface and - * executes some constraint checking based on an instance of the class + * This class implements the necessary glue-logic to form an actual + * allocator from the provided policies. It implements the public interface + * and executes some constraint checking based on an instance of the class * PolicyConstraints. * * @tparam T_CreationPolicy The desired type of a CreationPolicy @@ -98,37 +88,34 @@ namespace detail{ * @tparam T_AlignmentPolicy The desired type of a AlignmentPolicy */ template< - typename T_CreationPolicy, - typename T_DistributionPolicy, - typename T_OOMPolicy, - typename T_ReservePoolPolicy, - typename T_AlignmentPolicy - > + typename T_CreationPolicy, + typename T_DistributionPolicy, + typename T_OOMPolicy, + typename T_ReservePoolPolicy, + typename T_AlignmentPolicy> class Allocator : - public PolicyConstraints< - T_CreationPolicy, - T_DistributionPolicy, - T_OOMPolicy, - T_ReservePoolPolicy, - T_AlignmentPolicy - > + public PolicyConstraints< + T_CreationPolicy, + T_DistributionPolicy, + T_OOMPolicy, + T_ReservePoolPolicy, + T_AlignmentPolicy> { - typedef boost::uint32_t uint32; + using uint32 = std::uint32_t; public: - typedef T_CreationPolicy CreationPolicy; - typedef T_DistributionPolicy DistributionPolicy; - typedef T_OOMPolicy OOMPolicy; - typedef T_ReservePoolPolicy ReservePoolPolicy; - typedef T_AlignmentPolicy AlignmentPolicy; - typedef std::vector< HeapInfo > HeapInfoVector; - typedef DeviceAllocator< + using CreationPolicy = T_CreationPolicy; + using DistributionPolicy = T_DistributionPolicy; + using OOMPolicy = T_OOMPolicy; + using ReservePoolPolicy = T_ReservePoolPolicy; + using AlignmentPolicy = T_AlignmentPolicy; + using HeapInfoVector = std::vector; + using DevAllocator = DeviceAllocator< CreationPolicy, DistributionPolicy, OOMPolicy, - AlignmentPolicy - > DevAllocator; - typedef AllocatorHandleImpl AllocatorHandle; + AlignmentPolicy>; + using AllocatorHandle = AllocatorHandleImpl; private: AllocatorHandle allocatorHandle; @@ -139,29 +126,13 @@ namespace detail{ * @param size number of bytes */ MAMC_HOST - void - alloc( - size_t size - ) + void alloc(size_t size) { - void* pool = ReservePoolPolicy::setMemPool( size ); - boost::tie( - pool, - size - ) = AlignmentPolicy::alignPool( - pool, - size - ); - DevAllocator* devAllocatorPtr; - cudaMalloc( - ( void** ) &devAllocatorPtr, - sizeof( DevAllocator ) - ); - CreationPolicy::initHeap( - devAllocatorPtr, - pool, - size - ); + void * pool = ReservePoolPolicy::setMemPool(size); + std::tie(pool, size) = 
AlignmentPolicy::alignPool(pool, size); + DevAllocator * devAllocatorPtr; + cudaMalloc((void **)&devAllocatorPtr, sizeof(DevAllocator)); + CreationPolicy::initHeap(devAllocatorPtr, pool, size); allocatorHandle.devAllocator = devAllocatorPtr; heapInfos.p = pool; @@ -176,33 +147,28 @@ namespace detail{ MAMC_HOST void free() { - cudaFree( allocatorHandle.devAllocator ); - ReservePoolPolicy::resetMemPool( heapInfos.p ); - allocatorHandle.devAllocator = NULL; + cudaFree(allocatorHandle.devAllocator); + ReservePoolPolicy::resetMemPool(heapInfos.p); + allocatorHandle.devAllocator = nullptr; heapInfos.size = 0; - heapInfos.p = NULL; + heapInfos.p = nullptr; } /* forbid to copy the allocator */ MAMC_HOST - Allocator( const Allocator& ); + Allocator(const Allocator &); public: - - MAMC_HOST - Allocator( - size_t size = 8U * 1024U * 1024U - ) : - allocatorHandle( NULL ) + Allocator(size_t size = 8U * 1024U * 1024U) : allocatorHandle(nullptr) { - alloc( size ); + alloc(size); } MAMC_HOST - ~Allocator( ) + ~Allocator() { - free( ); + free(); } /** destroy current heap data and resize the heap @@ -210,18 +176,14 @@ namespace detail{ * @param size number of bytes */ MAMC_HOST - void - destructiveResize( - size_t size - ) + void destructiveResize(size_t size) { - free( ); - alloc( size ); + free(); + alloc(size); } MAMC_HOST - AllocatorHandle - getAllocatorHandle( ) + auto getAllocatorHandle() -> AllocatorHandle { return allocatorHandle; } @@ -232,45 +194,41 @@ namespace detail{ return getAllocatorHandle(); } - MAMC_HOST static - std::string - info( - std::string linebreak = " " - ) + MAMC_HOST static auto info(std::string linebreak = " ") -> std::string { std::stringstream ss; - ss << "CreationPolicy: " << CreationPolicy::classname( ) << " " << linebreak; - ss << "DistributionPolicy: " << DistributionPolicy::classname( ) << "" << linebreak; - ss << "OOMPolicy: " << OOMPolicy::classname( ) << " " << linebreak; - ss << "ReservePoolPolicy: " << ReservePoolPolicy::classname( ) << " " << linebreak; - ss << "AlignmentPolicy: " << AlignmentPolicy::classname( ) << " " << linebreak; + ss << "CreationPolicy: " << CreationPolicy::classname() + << " " << linebreak; + ss << "DistributionPolicy: " << DistributionPolicy::classname() + << "" << linebreak; + ss << "OOMPolicy: " << OOMPolicy::classname() + << " " << linebreak; + ss << "ReservePoolPolicy: " << ReservePoolPolicy::classname() + << " " << linebreak; + ss << "AlignmentPolicy: " << AlignmentPolicy::classname() + << " " << linebreak; return ss.str(); } - // polymorphism over the availability of getAvailableSlots for calling from the host + // polymorphism over the availability of getAvailableSlots for calling + // from the host MAMC_HOST - unsigned - getAvailableSlots( - size_t slotSize - ) + auto getAvailableSlots(size_t slotSize) -> unsigned { - slotSize = AlignmentPolicy::applyPadding( slotSize ); + slotSize = AlignmentPolicy::applyPadding(slotSize); return detail::GetAvailableSlotsIfAvailHost< Allocator, - Traits::providesAvailableSlots - >::getAvailableSlots( slotSize, *this ); + Traits::providesAvailableSlots>:: + getAvailableSlots(slotSize, *this); } MAMC_HOST - HeapInfoVector - getHeapLocations( ) + auto getHeapLocations() -> HeapInfoVector { - HeapInfoVector v; - v.push_back( heapInfos ); - return v; + HeapInfoVector v; + v.push_back(heapInfos); + return v; } - }; -} //namespace mallocMC - +} // namespace mallocMC diff --git a/src/include/mallocMC/creationPolicies/OldMalloc.hpp b/src/include/mallocMC/creationPolicies/OldMalloc.hpp index 
e48b4890..4b8db31f 100644 --- a/src/include/mallocMC/creationPolicies/OldMalloc.hpp +++ b/src/include/mallocMC/creationPolicies/OldMalloc.hpp @@ -27,18 +27,18 @@ #pragma once - -namespace mallocMC{ -namespace CreationPolicies{ - - /** - * @brief classic malloc/free behaviour known from CUDA - * - * This CreationPolicy implements the classic device-side malloc and free - * system calls that is offered by CUDA-capable accelerator of compute - * capability 2.0 and higher - */ - class OldMalloc; - -} //namespace CreationPolicies -} //namespace mallocMC +namespace mallocMC +{ + namespace CreationPolicies + { + /** + * @brief classic malloc/free behaviour known from CUDA + * + * This CreationPolicy implements the classic device-side malloc and + * free system calls that is offered by CUDA-capable accelerator of + * compute capability 2.0 and higher + */ + class OldMalloc; + + } // namespace CreationPolicies +} // namespace mallocMC diff --git a/src/include/mallocMC/creationPolicies/OldMalloc_impl.hpp b/src/include/mallocMC/creationPolicies/OldMalloc_impl.hpp index 162eb661..6c9b7475 100644 --- a/src/include/mallocMC/creationPolicies/OldMalloc_impl.hpp +++ b/src/include/mallocMC/creationPolicies/OldMalloc_impl.hpp @@ -27,45 +27,47 @@ #pragma once -#include -#include - #include "OldMalloc.hpp" -namespace mallocMC{ -namespace CreationPolicies{ - - class OldMalloc - { - typedef boost::uint32_t uint32; - - public: - typedef boost::mpl::bool_ providesAvailableSlots; - - __device__ void* create(uint32 bytes) - { - return ::malloc(static_cast(bytes)); - } +#include - __device__ void destroy(void* mem) +namespace mallocMC +{ + namespace CreationPolicies { - free(mem); - } - - __device__ bool isOOM(void* p, size_t s){ - return s && (p == NULL); - } - - template < typename T > - static void* initHeap(T* dAlloc, void*, size_t){ - return dAlloc; - } - - static std::string classname(){ - return "OldMalloc"; - } - - }; - -} //namespace CreationPolicies -} //namespace mallocMC + class OldMalloc + { + using uint32 = std::uint32_t; + + public: + static constexpr auto providesAvailableSlots = false; + + __device__ auto create(uint32 bytes) const -> void * + { + return ::malloc(static_cast(bytes)); + } + + __device__ void destroy(void * mem) const + { + ::free(mem); + } + + __device__ auto isOOM(void * p, size_t s) const -> bool + { + return s != 0 && (p == nullptr); + } + + template + static auto initHeap(T * dAlloc, void *, size_t) -> void * + { + return dAlloc; + } + + static auto classname() -> std::string + { + return "OldMalloc"; + } + }; + + } // namespace CreationPolicies +} // namespace mallocMC diff --git a/src/include/mallocMC/creationPolicies/Scatter.hpp b/src/include/mallocMC/creationPolicies/Scatter.hpp index 499bd73e..0db2af43 100644 --- a/src/include/mallocMC/creationPolicies/Scatter.hpp +++ b/src/include/mallocMC/creationPolicies/Scatter.hpp @@ -33,52 +33,52 @@ #pragma once -#include -#include +namespace mallocMC +{ + namespace CreationPolicies + { + namespace ScatterConf + { + struct DefaultScatterConfig + { + static constexpr auto pagesize = 4096; + static constexpr auto accessblocks = 8; + static constexpr auto regionsize = 16; + static constexpr auto wastefactor = 2; + static constexpr auto resetfreedpages = false; + }; -namespace mallocMC{ -namespace CreationPolicies{ -namespace ScatterConf{ - struct DefaultScatterConfig{ - typedef boost::mpl::int_<4096> pagesize; - typedef boost::mpl::int_<8> accessblocks; - typedef boost::mpl::int_<16> regionsize; - typedef boost::mpl::int_<2> wastefactor; - 
typedef boost::mpl::bool_ resetfreedpages; - }; + struct DefaultScatterHashingParams + { + static constexpr auto hashingK = 38183; + static constexpr auto hashingDistMP = 17497; + static constexpr auto hashingDistWP = 1; + static constexpr auto hashingDistWPRel = 1; + }; + } // namespace ScatterConf - struct DefaultScatterHashingParams{ - typedef boost::mpl::int_<38183> hashingK; - typedef boost::mpl::int_<17497> hashingDistMP; - typedef boost::mpl::int_<1> hashingDistWP; - typedef boost::mpl::int_<1> hashingDistWPRel; - }; -} + /** + * @brief fast memory allocation based on ScatterAlloc + * + * This CreationPolicy implements a fast memory allocator that trades + * speed for fragmentation of memory. This is based on the memory + * allocator "ScatterAlloc" + * (http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=6339604), + * and is extended to report free memory slots of a given size (both on + * host and accelerator). To work properly, this policy class requires a + * pre-allocated heap on the accelerator and works only with Nvidia CUDA + * capable accelerators that have at least compute capability 2.0. + * + * @tparam T_Config (optional) configure the heap layout. The + * default can be obtained through Scatter<>::HeapProperties + * @tparam T_Hashing (optional) configure the parameters for + * the hashing formula. The default can be obtained through + * Scatter<>::HashingProperties + */ + template< + class T_Config = ScatterConf::DefaultScatterConfig, + class T_Hashing = ScatterConf::DefaultScatterHashingParams> + class Scatter; - /** - * @brief fast memory allocation based on ScatterAlloc - * - * This CreationPolicy implements a fast memory allocator that trades speed - * for fragmentation of memory. This is based on the memory allocator - * "ScatterAlloc" - * (http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=6339604), and - * is extended to report free memory slots of a given size (both on host and - * accelerator). - * To work properly, this policy class requires a pre-allocated heap on the - * accelerator and works only with Nvidia CUDA capable accelerators that have - * at least compute capability 2.0. - * - * @tparam T_Config (optional) configure the heap layout. The - * default can be obtained through Scatter<>::HeapProperties - * @tparam T_Hashing (optional) configure the parameters for - * the hashing formula. 
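As a sketch of how the migrated interface could be used (hypothetical; the struct name and the chosen values below are made up): with the boost::mpl typedefs replaced by static constexpr members, a custom heap layout for the Scatter policy is simply a plain struct that mirrors ScatterConf::DefaultScatterConfig and is passed as T_Config.

```cpp
// Hypothetical user-side sketch, not part of this patch: a custom heap layout
// for CreationPolicies::Scatter in the new constexpr style.
struct MyScatterConfig
{
    static constexpr auto pagesize = 2 * 1024 * 1024; // larger pages than the 4096-byte default
    static constexpr auto accessblocks = 8;
    static constexpr auto regionsize = 16;
    static constexpr auto wastefactor = 2;
    static constexpr auto resetfreedpages = true;
};

// The policy could then be instantiated as
// mallocMC::CreationPolicies::Scatter<MyScatterConfig>;
// the hashing parameters fall back to ScatterConf::DefaultScatterHashingParams.
```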
The default can be obtained through - * Scatter<>::HashingProperties - */ - template< - class T_Config = ScatterConf::DefaultScatterConfig, - class T_Hashing = ScatterConf::DefaultScatterHashingParams - > - class Scatter; - -}// namespace CreationPolicies -}// namespace mallocMC + } // namespace CreationPolicies +} // namespace mallocMC diff --git a/src/include/mallocMC/creationPolicies/Scatter_impl.hpp b/src/include/mallocMC/creationPolicies/Scatter_impl.hpp index 88e3a611..7bb00ba7 100644 --- a/src/include/mallocMC/creationPolicies/Scatter_impl.hpp +++ b/src/include/mallocMC/creationPolicies/Scatter_impl.hpp @@ -33,57 +33,64 @@ #pragma once -#include -#include /* uint32_t */ -#include -#include -#include -#include -#include - #include "../mallocMC_utils.hpp" #include "Scatter.hpp" -namespace mallocMC{ -namespace CreationPolicies{ - -namespace ScatterKernelDetail{ - template - __global__ void initKernel(T_Allocator* heap, void* heapmem, size_t memsize){ - heap->pool = heapmem; - heap->initDeviceFunction(heapmem, memsize); - } - - - template < typename T_Allocator > - __global__ void getAvailableSlotsKernel(T_Allocator* heap, size_t slotSize, unsigned* slots){ - int gid = threadIdx.x + blockIdx.x*blockDim.x; - int nWorker = gridDim.x * blockDim.x; - unsigned temp = heap->getAvailaibleSlotsDeviceFunction(slotSize, gid, nWorker); - if(temp) atomicAdd(slots, temp); - } - +#include +#include /* uint32_t */ +#include +#include +#include +#include - template - __global__ void finalizeKernel(T_Allocator* heap){ - heap->finalizeDeviceFunction(); - } +namespace mallocMC +{ + namespace CreationPolicies + { + namespace ScatterKernelDetail + { + template + __global__ void + initKernel(T_Allocator * heap, void * heapmem, size_t memsize) + { + heap->pool = heapmem; + heap->initDeviceFunction(heapmem, memsize); + } -} //namespace ScatterKernelDetail + template + __global__ void getAvailableSlotsKernel( + T_Allocator * heap, + size_t slotSize, + unsigned * slots) + { + int gid = threadIdx.x + blockIdx.x * blockDim.x; + int nWorker = gridDim.x * blockDim.x; + unsigned temp = heap->getAvailaibleSlotsDeviceFunction( + slotSize, gid, nWorker); + if(temp) + atomicAdd(slots, temp); + } - template - class Scatter - { + template + __global__ void finalizeKernel(T_Allocator * heap) + { + heap->finalizeDeviceFunction(); + } - public: - typedef T_Config HeapProperties; - typedef T_Hashing HashingProperties; - struct Properties : HeapProperties, HashingProperties{}; - typedef boost::mpl::bool_ providesAvailableSlots; + } // namespace ScatterKernelDetail - private: - typedef boost::uint32_t uint32; + template + class Scatter + { + public: + using HeapProperties = T_Config; + using HashingProperties = T_Hashing; + struct Properties : HeapProperties, HashingProperties + {}; + static constexpr auto providesAvailableSlots = true; + private: + using uint32 = std::uint32_t; /** Allow for a hierarchical validation of parameters: * @@ -95,884 +102,1080 @@ namespace ScatterKernelDetail{ * default-struct < template-struct < command-line parameter */ #ifndef MALLOCMC_CP_SCATTER_PAGESIZE -#define MALLOCMC_CP_SCATTER_PAGESIZE static_cast(HeapProperties::pagesize::value) +#define MALLOCMC_CP_SCATTER_PAGESIZE (HeapProperties::pagesize) #endif - BOOST_STATIC_CONSTEXPR uint32 pagesize = MALLOCMC_CP_SCATTER_PAGESIZE; + static constexpr uint32 pagesize = MALLOCMC_CP_SCATTER_PAGESIZE; #ifndef MALLOCMC_CP_SCATTER_ACCESSBLOCKS -#define MALLOCMC_CP_SCATTER_ACCESSBLOCKS static_cast(HeapProperties::accessblocks::value) +#define 
MALLOCMC_CP_SCATTER_ACCESSBLOCKS (HeapProperties::accessblocks) #endif - BOOST_STATIC_CONSTEXPR uint32 accessblocks = MALLOCMC_CP_SCATTER_ACCESSBLOCKS; + static constexpr uint32 accessblocks + = MALLOCMC_CP_SCATTER_ACCESSBLOCKS; #ifndef MALLOCMC_CP_SCATTER_REGIONSIZE -#define MALLOCMC_CP_SCATTER_REGIONSIZE static_cast(HeapProperties::regionsize::value) +#define MALLOCMC_CP_SCATTER_REGIONSIZE (HeapProperties::regionsize) #endif - BOOST_STATIC_CONSTEXPR uint32 regionsize = MALLOCMC_CP_SCATTER_REGIONSIZE; + static constexpr uint32 regionsize = MALLOCMC_CP_SCATTER_REGIONSIZE; #ifndef MALLOCMC_CP_SCATTER_WASTEFACTOR -#define MALLOCMC_CP_SCATTER_WASTEFACTOR static_cast(HeapProperties::wastefactor::value) +#define MALLOCMC_CP_SCATTER_WASTEFACTOR (HeapProperties::wastefactor) #endif - BOOST_STATIC_CONSTEXPR uint32 wastefactor = MALLOCMC_CP_SCATTER_WASTEFACTOR; + static constexpr uint32 wastefactor + = MALLOCMC_CP_SCATTER_WASTEFACTOR; #ifndef MALLOCMC_CP_SCATTER_RESETFREEDPAGES -#define MALLOCMC_CP_SCATTER_RESETFREEDPAGES static_cast(HeapProperties::resetfreedpages::value) +#define MALLOCMC_CP_SCATTER_RESETFREEDPAGES (HeapProperties::resetfreedpages) #endif - BOOST_STATIC_CONSTEXPR bool resetfreedpages = MALLOCMC_CP_SCATTER_RESETFREEDPAGES; - + static constexpr bool resetfreedpages + = MALLOCMC_CP_SCATTER_RESETFREEDPAGES; - public: - BOOST_STATIC_CONSTEXPR uint32 _pagesize = pagesize; - BOOST_STATIC_CONSTEXPR uint32 _accessblocks = accessblocks; - BOOST_STATIC_CONSTEXPR uint32 _regionsize = regionsize; - BOOST_STATIC_CONSTEXPR uint32 _wastefactor = wastefactor; - BOOST_STATIC_CONSTEXPR bool _resetfreedpages = resetfreedpages; + public: + static constexpr uint32 _pagesize = pagesize; + static constexpr uint32 _accessblocks = accessblocks; + static constexpr uint32 _regionsize = regionsize; + static constexpr uint32 _wastefactor = wastefactor; + static constexpr bool _resetfreedpages = resetfreedpages; - private: + private: #if _DEBUG || ANALYSEHEAP - public: + public: #endif - //BOOST_STATIC_CONSTEXPR uint32 minChunkSize0 = pagesize/(32*32); - BOOST_STATIC_CONSTEXPR uint32 minChunkSize1 = 0x10; - BOOST_STATIC_CONSTEXPR uint32 HierarchyThreshold = (pagesize - 2*sizeof(uint32))/33; - BOOST_STATIC_CONSTEXPR uint32 minSegmentSize = 32*minChunkSize1 + sizeof(uint32); - BOOST_STATIC_CONSTEXPR uint32 tmp_maxOPM = minChunkSize1 > HierarchyThreshold ? 0 : (pagesize + (minSegmentSize-1)) / minSegmentSize; - BOOST_STATIC_CONSTEXPR uint32 maxOnPageMasks = 32 > tmp_maxOPM ? tmp_maxOPM : 32; + // static constexpr uint32 minChunkSize0 = pagesize/(32*32); + static constexpr uint32 minChunkSize1 = 0x10; + static constexpr uint32 HierarchyThreshold + = (pagesize - 2 * sizeof(uint32)) / 33; + static constexpr uint32 minSegmentSize + = 32 * minChunkSize1 + sizeof(uint32); + static constexpr uint32 tmp_maxOPM + = minChunkSize1 > HierarchyThreshold + ? 0 + : (pagesize + (minSegmentSize - 1)) / minSegmentSize; + static constexpr uint32 maxOnPageMasks + = 32 > tmp_maxOPM ? 
tmp_maxOPM : 32; #ifndef MALLOCMC_CP_SCATTER_HASHINGK -#define MALLOCMC_CP_SCATTER_HASHINGK static_cast(HashingProperties::hashingK::value) +#define MALLOCMC_CP_SCATTER_HASHINGK (HashingProperties::hashingK) #endif - BOOST_STATIC_CONSTEXPR uint32 hashingK = MALLOCMC_CP_SCATTER_HASHINGK; + static constexpr uint32 hashingK = MALLOCMC_CP_SCATTER_HASHINGK; #ifndef MALLOCMC_CP_SCATTER_HASHINGDISTMP -#define MALLOCMC_CP_SCATTER_HASHINGDISTMP static_cast(HashingProperties::hashingDistMP::value) +#define MALLOCMC_CP_SCATTER_HASHINGDISTMP (HashingProperties::hashingDistMP) #endif - BOOST_STATIC_CONSTEXPR uint32 hashingDistMP = MALLOCMC_CP_SCATTER_HASHINGDISTMP; + static constexpr uint32 hashingDistMP + = MALLOCMC_CP_SCATTER_HASHINGDISTMP; #ifndef MALLOCMC_CP_SCATTER_HASHINGDISTWP -#define MALLOCMC_CP_SCATTER_HASHINGDISTWP static_cast(HashingProperties::hashingDistWP::value) +#define MALLOCMC_CP_SCATTER_HASHINGDISTWP (HashingProperties::hashingDistWP) #endif - BOOST_STATIC_CONSTEXPR uint32 hashingDistWP = MALLOCMC_CP_SCATTER_HASHINGDISTWP; + static constexpr uint32 hashingDistWP + = MALLOCMC_CP_SCATTER_HASHINGDISTWP; #ifndef MALLOCMC_CP_SCATTER_HASHINGDISTWPREL -#define MALLOCMC_CP_SCATTER_HASHINGDISTWPREL static_cast(HashingProperties::hashingDistWPRel::value) +#define MALLOCMC_CP_SCATTER_HASHINGDISTWPREL \ + (HashingProperties::hashingDistWPRel) #endif - BOOST_STATIC_CONSTEXPR uint32 hashingDistWPRel = MALLOCMC_CP_SCATTER_HASHINGDISTWPREL; + static constexpr uint32 hashingDistWPRel + = MALLOCMC_CP_SCATTER_HASHINGDISTWPREL; + + /** + * Page Table Entry struct + * The PTE holds basic information about each page + */ + struct PTE + { + uint32 chunksize; + uint32 count; + uint32 bitmask; + __device__ void init() + { + chunksize = 0; + count = 0; + bitmask = 0; + } + }; + + /** + * Page struct + * The page struct is used to access the data on the page more + * efficiently and to clear the area on the page, which might hold + * bitsfields later one + */ + struct Page + { + char data[pagesize]; + + /** + * The pages init method + * This method initializes the region on the page which might + * hold bit fields when the page is used for a small chunk size + * @param previous_chunksize the chunksize which was uses for + * the page before + */ + __device__ void init() + { + // clear the entire data which can hold bitfields + uint32 * write + = (uint32 *)(data + pagesize - (int)(sizeof(uint32) * maxOnPageMasks)); + while(write < (uint32 *)(data + pagesize)) *write++ = 0; + } + }; + + // the data used by the allocator + + volatile PTE * _ptes; + volatile uint32 * _regions; + Page * _page; + uint32 _numpages; + size_t _memsize; + uint32 _pagebasedMutex; + volatile uint32 _firstFreePageBased; + volatile uint32 _firstfreeblock; + + /** + * randInit should create an random offset which can be used + * as the initial position in a bitfield + */ + static __device__ inline auto randInit() -> uint32 + { + // start with the laneid offset + return laneid(); + } - /** - * Page Table Entry struct - * The PTE holds basic information about each page - */ - struct PTE - { - uint32 chunksize; - uint32 count; - uint32 bitmask; + /** + * randInextspot delivers the next free spot in a bitfield + * it searches for the next unset bit to the left of spot and + * returns its offset. 
if there are no unset bits to the left + * then it wraps around + * @param bitfield the bitfield to be searched for + * @param spot the spot from which to search to the left + * @param spots number of bits that can be used + * @return next free spot in the bitfield + */ + static __device__ inline auto + nextspot(uint32 bitfield, uint32 spot, uint32 spots) -> uint32 + { + // wrap around the bitfields from the current spot to the left + bitfield = ((bitfield >> (spot + 1)) + | (bitfield << (spots - (spot + 1)))) + & ((1 << spots) - 1); + // compute the step from the current spot in the bitfield + const uint32 step = __ffs(~bitfield); + // and return the new spot + return (spot + step) % spots; + } - __device__ void init() - { - chunksize = 0; - count = 0; - bitmask = 0; - } - }; - - /** - * Page struct - * The page struct is used to access the data on the page more efficiently - * and to clear the area on the page, which might hold bitsfields later one - */ - struct PAGE - { - char data[pagesize]; - - /** - * The pages init method - * This method initializes the region on the page which might hold - * bit fields when the page is used for a small chunk size - * @param previous_chunksize the chunksize which was uses for the page before - */ - __device__ void init() - { - //clear the entire data which can hold bitfields - uint32* write = (uint32*)(data + pagesize - (int)(sizeof(uint32)*maxOnPageMasks)); - while(write < (uint32*)(data + pagesize)) - *write++ = 0; - } - }; - - // the data used by the allocator - - volatile PTE* _ptes; - volatile uint32* _regions; - PAGE* _page; - uint32 _numpages; - size_t _memsize; - uint32 _pagebasedMutex; - volatile uint32 _firstFreePageBased; - volatile uint32 _firstfreeblock; - - - /** - * randInit should create an random offset which can be used - * as the initial position in a bitfield - */ - __device__ inline uint32 randInit() - { - //start with the laneid offset - return laneid(); - } - - /** - * randInextspot delivers the next free spot in a bitfield - * it searches for the next unset bit to the left of spot and - * returns its offset. if there are no unset bits to the left - * then it wraps around - * @param bitfield the bitfield to be searched for - * @param spot the spot from which to search to the left - * @param spots number of bits that can be used - * @return next free spot in the bitfield - */ - __device__ inline uint32 nextspot(uint32 bitfield, uint32 spot, uint32 spots) - { - //wrap around the bitfields from the current spot to the left - bitfield = ((bitfield >> (spot + 1)) | (bitfield << (spots - (spot + 1))))&((1<= spots) - return -1; - spot = nextspot(old, spot, spots); - } - } - - - /** - * calcAdditionalChunks determines the number of chunks that are contained in the last segment of a hierarchical page - * - * The additional checks are necessary to ensure correct results for very large pages and small chunksizes - * - * @param fullsegments the number of segments that can be completely filled in a page. This may NEVER be bigger than 32! - * @param segmentsize the number of bytes that are contained in a completely filled segment (32 chunks) - * @param chunksize the chosen allocation size within the page - * @return the number of additional chunks that will not fit in one of the fullsegments. 
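To make the effect of calcAdditionalChunks concrete, here is a small worked example (illustration only) under the default pagesize of 4096 and a hypothetical chunk size of 16 bytes: each full segment holds 32 chunks plus one 32-bit mask, and 30 extra chunks still fit behind the last full segment.

```cpp
// Worked example, assuming pagesize = 4096 and a 16-byte chunk size.
#include <cstdint>

constexpr std::uint32_t pagesize = 4096;
constexpr std::uint32_t chunksize = 16;

// each full segment: 32 chunks plus one 32-bit bitfield
constexpr std::uint32_t segmentsize = chunksize * 32 + sizeof(std::uint32_t); // 516
constexpr std::uint32_t fullsegments
    = pagesize / segmentsize < 32 ? pagesize / segmentsize : 32; // 7
// chunks that still fit behind the last full segment (fullsegments != 32 here)
constexpr std::uint32_t additional_chunks
    = (pagesize - fullsegments * segmentsize - sizeof(std::uint32_t)) / chunksize; // 30

static_assert(additional_chunks < 32, "the last segment holds fewer than 32 chunks");
```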
For any correct input, this number is smaller than 32 - */ - __device__ inline uint32 calcAdditionalChunks(uint32 fullsegments, uint32 segmentsize, uint32 chunksize){ - if(fullsegments != 32){ - return max(0,(int)pagesize - (int)fullsegments*segmentsize - (int)sizeof(uint32))/chunksize; - }else - return 0; - } - - - /** - * addChunkHierarchy finds a free chunk on a page which uses bit fields on the page - * @param chunksize the chunksize of the page - * @param fullsegments the number of full segments on the page (a 32 bits on the page) - * @param additional_chunks the number of additional chunks in last segment (less than 32 bits on the page) - * @param page the page to use - * @return pointer to a free chunk on the page, 0 if we were unable to obtain a free chunk - */ - __device__ inline void* addChunkHierarchy(uint32 chunksize, uint32 fullsegments, uint32 additional_chunks, uint32 page) - { - uint32 segments = fullsegments + (additional_chunks > 0 ? 1 : 0); - uint32 spot = randInit() % segments; - uint32 mask = _ptes[page].bitmask; - if((mask & (1 << spot)) != 0) - spot = nextspot(mask, spot, segments); - uint32 tries = segments - __popc(mask); - uint32* onpagemasks = onPageMasksPosition(page,segments); - for(uint32 i = 0; i < tries; ++i) - { - int hspot = usespot(onpagemasks + spot, spot < fullsegments ? 32 : additional_chunks); - if(hspot != -1) - return _page[page].data + (32*spot + hspot)*chunksize; - else - atomicOr((uint32*)&_ptes[page].bitmask, 1 << spot); - spot = nextspot(mask, spot, segments); - } - return 0; - } - - /** - * addChunkNoHierarchy finds a free chunk on a page which uses the bit fields of the pte only - * @param chunksize the chunksize of the page - * @param page the page to use - * @param spots the number of chunks which fit on the page - * @return pointer to a free chunk on the page, 0 if we were unable to obtain a free chunk - */ - __device__ inline void* addChunkNoHierarchy(uint32 chunksize, uint32 page, uint32 spots) - { - int spot = usespot((uint32*)&_ptes[page].bitmask, spots); - if(spot == -1) - return 0; //that should be impossible :) - return _page[page].data + spot*chunksize; - } - - /** - * tryUsePage tries to use the page for the allocation request - * @param page the page to use - * @param chunksize the chunksize of the page - * @return pointer to a free chunk on the page, 0 if we were unable to obtain a free chunk - */ - __device__ inline void* tryUsePage(uint32 page, uint32 chunksize) - { - - void* chunk_ptr = NULL; - - //increse the fill level - uint32 filllevel = atomicAdd((uint32*)&(_ptes[page].count), 1); - //recheck chunck size (it could be that the page got freed in the meanwhile...) 
- if(!resetfreedpages || _ptes[page].chunksize == chunksize) - { - if(chunksize <= HierarchyThreshold) - { - //more chunks than can be covered by the pte's single bitfield can be used - uint32 segmentsize = chunksize*32 + sizeof(uint32); - uint32 fullsegments = min(32,pagesize / segmentsize); - uint32 additional_chunks = calcAdditionalChunks(fullsegments, segmentsize, chunksize); - if(filllevel < fullsegments * 32 + additional_chunks) - chunk_ptr = addChunkHierarchy(chunksize, fullsegments, additional_chunks, page); - } - else - { - uint32 chunksinpage = min(pagesize / chunksize, 32); - if(filllevel < chunksinpage) - chunk_ptr = addChunkNoHierarchy(chunksize, page, chunksinpage); - } - } - - //this one is full/not useable - if(chunk_ptr == NULL) - atomicSub((uint32*)&(_ptes[page].count), 1); - - return chunk_ptr; - } - - - /** - * allocChunked tries to allocate the demanded number of bytes on one of the pages - * @param bytes the number of bytes to allocate - * @return pointer to a free chunk on a page, 0 if we were unable to obtain a free chunk - */ - __device__ void* allocChunked(uint32 bytes) - { - uint32 pagesperblock = _numpages/accessblocks; - uint32 reloff = warpSize*bytes / pagesize; - uint32 startpage = (bytes*hashingK + hashingDistMP*smid() + (hashingDistWP+hashingDistWPRel*reloff)*warpid() ) % pagesperblock; - uint32 maxchunksize = min(pagesize,wastefactor*bytes); - uint32 startblock = _firstfreeblock; - uint32 ptetry = startpage + startblock*pagesperblock; - uint32 checklevel = regionsize*3/4; - for(uint32 finder = 0; finder < 2; ++finder) - { - for(uint32 b = startblock; b < accessblocks; ++b) - { - while(ptetry < (b+1)*pagesperblock) + /** + * onPageMasksPosition returns a pointer to the beginning of the + * onpagemasks inside a page. + * @param page the page that holds the masks + * @param the number of hierarchical page tables (bitfields) that + * are used inside this mask. + * @return pointer to the first address inside the page that holds + * metadata bitfields. + */ + __device__ inline auto + onPageMasksPosition(uint32 page, uint32 nMasks) -> uint32 * + { + return ( + uint32 *)(_page[page].data + pagesize - (int)sizeof(uint32) * nMasks); + } + + /** + * usespot marks finds one free spot in the bitfield, marks it and + * returns its offset + * @param bitfield pointer to the bitfield to use + * @param spots overall number of spots the bitfield is responsible + * for + * @return if there is a free spot it returns the spot'S offset, + * otherwise -1 + */ + static __device__ inline auto + usespot(uint32 * bitfield, uint32 spots) -> int { - uint32 region = ptetry/regionsize; - uint32 regionfilllevel = _regions[region]; - if(regionfilllevel < checklevel ) - { - for( ; ptetry < (region+1)*regionsize; ++ptetry) + // get first spot + uint32 spot = randInit() % spots; + for(;;) { - uint32 chunksize = _ptes[ptetry].chunksize; - if(chunksize >= bytes && chunksize <= maxchunksize) - { - void * res = tryUsePage(ptetry, chunksize); - if(res != 0) return res; - } - else if(chunksize == 0) - { - //lets open up a new page - //it is already padded - uint32 new_chunksize = max(bytes,minChunkSize1); - uint32 beforechunksize = atomicCAS((uint32*)&_ptes[ptetry].chunksize, 0, new_chunksize); - if(beforechunksize == 0) + const uint32 mask = 1 << spot; + const uint32 old = atomicOr(bitfield, mask); + if((old & mask) == 0) + return spot; + // note: __popc(old) == spots should be sufficient, + // but if someone corrupts the memory we end up in an + // endless loop in here... 
+ if(__popc(old) >= spots) + return -1; + spot = nextspot(old, spot, spots); + } + } + + /** + * calcAdditionalChunks determines the number of chunks that are + * contained in the last segment of a hierarchical page + * + * The additional checks are necessary to ensure correct results for + * very large pages and small chunksizes + * + * @param fullsegments the number of segments that can be completely + * filled in a page. This may NEVER be bigger than 32! + * @param segmentsize the number of bytes that are contained in a + * completely filled segment (32 chunks) + * @param chunksize the chosen allocation size within the page + * @return the number of additional chunks that will not fit in one + * of the fullsegments. For any correct input, this number is + * smaller than 32 + */ + static __device__ inline auto calcAdditionalChunks( + uint32 fullsegments, + uint32 segmentsize, + uint32 chunksize) -> uint32 + { + if(fullsegments != 32) + return max(0, + (int)pagesize - (int)fullsegments * segmentsize + - (int)sizeof(uint32)) + / chunksize; + else + return 0; + } + + /** + * addChunkHierarchy finds a free chunk on a page which uses bit + * fields on the page + * @param chunksize the chunksize of the page + * @param fullsegments the number of full segments on the page (a 32 + * bits on the page) + * @param additional_chunks the number of additional chunks in last + * segment (less than 32 bits on the page) + * @param page the page to use + * @return pointer to a free chunk on the page, 0 if we were unable + * to obtain a free chunk + */ + __device__ inline auto addChunkHierarchy( + uint32 chunksize, + uint32 fullsegments, + uint32 additional_chunks, + uint32 page) -> void * + { + const uint32 segments + = fullsegments + (additional_chunks > 0 ? 1 : 0); + uint32 spot = randInit() % segments; + const uint32 mask = _ptes[page].bitmask; + if((mask & (1 << spot)) != 0) + spot = nextspot(mask, spot, segments); + const uint32 tries = segments - __popc(mask); + uint32 * onpagemasks = onPageMasksPosition(page, segments); + for(uint32 i = 0; i < tries; ++i) + { + const int hspot = usespot( + &onpagemasks[spot], + spot < fullsegments ? 
32 : additional_chunks); + if(hspot != -1) + return _page[page].data + + (32 * spot + hspot) * chunksize; + atomicOr((uint32 *)&_ptes[page].bitmask, 1 << spot); + spot = nextspot(mask, spot, segments); + } + return 0; + } + + /** + * addChunkNoHierarchy finds a free chunk on a page which uses the + * bit fields of the pte only + * @param chunksize the chunksize of the page + * @param page the page to use + * @param spots the number of chunks which fit on the page + * @return pointer to a free chunk on the page, 0 if we were unable + * to obtain a free chunk + */ + __device__ inline auto + addChunkNoHierarchy(uint32 chunksize, uint32 page, uint32 spots) + -> void * + { + const int spot = usespot((uint32 *)&_ptes[page].bitmask, spots); + if(spot == -1) + return 0; // that should be impossible :) + return _page[page].data + spot * chunksize; + } + + /** + * tryUsePage tries to use the page for the allocation request + * @param page the page to use + * @param chunksize the chunksize of the page + * @return pointer to a free chunk on the page, 0 if we were unable + * to obtain a free chunk + */ + __device__ inline auto tryUsePage(uint32 page, uint32 chunksize) + -> void * + { + void * chunk_ptr = nullptr; + + // increse the fill level + const uint32 filllevel + = atomicAdd((uint32 *)&(_ptes[page].count), 1); + // recheck chunck size (it could be that the page got freed in + // the meanwhile...) + if(!resetfreedpages || _ptes[page].chunksize == chunksize) + { + if(chunksize <= HierarchyThreshold) { - void * res = tryUsePage(ptetry, new_chunksize); - if(res != 0) return res; + // more chunks than can be covered by the pte's single + // bitfield can be used + const uint32 segmentsize + = chunksize * 32 + sizeof(uint32); + const uint32 fullsegments + = min(32, pagesize / segmentsize); + const uint32 additional_chunks = calcAdditionalChunks( + fullsegments, segmentsize, chunksize); + if(filllevel < fullsegments * 32 + additional_chunks) + chunk_ptr = addChunkHierarchy( + chunksize, + fullsegments, + additional_chunks, + page); } - else if(beforechunksize >= bytes && beforechunksize <= maxchunksize) + else { - //someone else aquired the page, but we can also use it - void * res = tryUsePage(ptetry, beforechunksize); - if(res != 0) return res; + const uint32 chunksinpage + = min(pagesize / chunksize, 32); + if(filllevel < chunksinpage) + chunk_ptr = addChunkNoHierarchy( + chunksize, page, chunksinpage); } - } } - //could not alloc in region, tell that - if(regionfilllevel + 1 <= regionsize) - atomicMax((uint32*)(_regions + region), regionfilllevel+1); - } - else - ptetry += regionsize; - //ptetry = (region+1)*regionsize; + + // this one is full/not useable + if(chunk_ptr == nullptr) + atomicSub((uint32 *)&(_ptes[page].count), 1); + + return chunk_ptr; } - //randomize the thread writing the info - //if(warpid() + laneid() == 0) - if(b > startblock) - _firstfreeblock = b; - } - - //we are really full :/ so lets search every page for a spot! 
- startblock = 0; - checklevel = regionsize + 1; - ptetry = 0; - } - return 0; - } - - - /** - * deallocChunked frees the chunk on the page and updates all data accordingly - * @param mem pointer to the chunk - * @param page the page the chunk is on - * @param chunksize the chunksize used for the page - */ - __device__ void deallocChunked(void* mem, uint32 page, uint32 chunksize) - { - uint32 inpage_offset = ((char*)mem - _page[page].data); - if(chunksize <= HierarchyThreshold) - { - //one more level in hierarchy - uint32 segmentsize = chunksize*32 + sizeof(uint32); - uint32 fullsegments = min(32,pagesize / segmentsize); - uint32 additional_chunks = calcAdditionalChunks(fullsegments,segmentsize,chunksize); - uint32 segment = inpage_offset / (chunksize*32); - uint32 withinsegment = (inpage_offset - segment*(chunksize*32))/chunksize; - //mark it as free - uint32 nMasks = fullsegments + (additional_chunks > 0 ? 1 : 0); - uint32* onpagemasks = onPageMasksPosition(page,nMasks); - uint32 old = atomicAnd(onpagemasks + segment, ~(1 << withinsegment)); - - // always do this, since it might fail due to a race-condition with addChunkHierarchy - atomicAnd((uint32*)&_ptes[page].bitmask, ~(1 << segment)); - } - else - { - uint32 segment = inpage_offset / chunksize; - atomicAnd((uint32*)&_ptes[page].bitmask, ~(1 << segment)); - } - //reduce filllevel as free - uint32 oldfilllevel = atomicSub((uint32*)&_ptes[page].count, 1); + /** + * allocChunked tries to allocate the demanded number of bytes on + * one of the pages + * @param bytes the number of bytes to allocate + * @return pointer to a free chunk on a page, 0 if we were unable to + * obtain a free chunk + */ + __device__ auto allocChunked(uint32 bytes) -> void * + { + const uint32 pagesperblock = _numpages / accessblocks; + const uint32 reloff = warpSize * bytes / pagesize; + const uint32 startpage + = (bytes * hashingK + hashingDistMP * smid() + + (hashingDistWP + hashingDistWPRel * reloff) * warpid()) + % pagesperblock; + const uint32 maxchunksize = min(pagesize, wastefactor * bytes); + uint32 startblock = _firstfreeblock; + uint32 ptetry = startpage + startblock * pagesperblock; + uint32 checklevel = regionsize * 3 / 4; + for(uint32 finder = 0; finder < 2; ++finder) + { + for(uint32 b = startblock; b < accessblocks; ++b) + { + while(ptetry < (b + 1) * pagesperblock) + { + const uint32 region = ptetry / regionsize; + const uint32 regionfilllevel = _regions[region]; + if(regionfilllevel < checklevel) + { + for(; ptetry < (region + 1) * regionsize; + ++ptetry) + { + const uint32 chunksize + = _ptes[ptetry].chunksize; + if(chunksize >= bytes + && chunksize <= maxchunksize) + { + void * res + = tryUsePage(ptetry, chunksize); + if(res != 0) + return res; + } + else if(chunksize == 0) + { + // lets open up a new page + // it is already padded + const uint32 new_chunksize + = max(bytes, minChunkSize1); + const uint32 beforechunksize + = atomicCAS( + (uint32 *)&_ptes[ptetry] + .chunksize, + 0, + new_chunksize); + if(beforechunksize == 0) + { + void * res = tryUsePage( + ptetry, new_chunksize); + if(res != 0) + return res; + } + else if( + beforechunksize >= bytes + && beforechunksize <= maxchunksize) + { + // someone else aquired the page, + // but we can also use it + void * res = tryUsePage( + ptetry, beforechunksize); + if(res != 0) + return res; + } + } + } + // could not alloc in region, tell that + if(regionfilllevel + 1 <= regionsize) + atomicMax( + (uint32 *)(_regions + region), + regionfilllevel + 1); + } + else + ptetry += regionsize; + // ptetry 
= (region+1)*regionsize; + } + // randomize the thread writing the info + // if(warpid() + laneid() == 0) + if(b > startblock) + _firstfreeblock = b; + } + + // we are really full :/ so lets search every page for a + // spot! + startblock = 0; + checklevel = regionsize + 1; + ptetry = 0; + } + return 0; + } - if(resetfreedpages) - { - if(oldfilllevel == 1) - { - //this page now got free! - // -> try lock it - uint32 old = atomicCAS((uint32*)&_ptes[page].count, 0, pagesize); - if(old == 0) + /** + * deallocChunked frees the chunk on the page and updates all data + * accordingly + * @param mem pointer to the chunk + * @param page the page the chunk is on + * @param chunksize the chunksize used for the page + */ + __device__ void + deallocChunked(void * mem, uint32 page, uint32 chunksize) { - //clean the bits for the hierarchy - _page[page].init(); - //remove chunk information - _ptes[page].chunksize = 0; - __threadfence(); - //unlock it - atomicSub((uint32*)&_ptes[page].count, pagesize); + const uint32 inpage_offset = ((char *)mem - _page[page].data); + if(chunksize <= HierarchyThreshold) + { + // one more level in hierarchy + const uint32 segmentsize = chunksize * 32 + sizeof(uint32); + const uint32 fullsegments = min(32, pagesize / segmentsize); + const uint32 additional_chunks = calcAdditionalChunks( + fullsegments, segmentsize, chunksize); + const uint32 segment = inpage_offset / (chunksize * 32); + const uint32 withinsegment + = (inpage_offset - segment * (chunksize * 32)) + / chunksize; + // mark it as free + const uint32 nMasks + = fullsegments + (additional_chunks > 0 ? 1 : 0); + uint32 * onpagemasks = onPageMasksPosition(page, nMasks); + uint32 old = atomicAnd( + &onpagemasks[segment], ~(1 << withinsegment)); + + // always do this, since it might fail due to a + // race-condition with addChunkHierarchy + atomicAnd((uint32 *)&_ptes[page].bitmask, ~(1 << segment)); + } + else + { + const uint32 segment = inpage_offset / chunksize; + atomicAnd((uint32 *)&_ptes[page].bitmask, ~(1 << segment)); + } + // reduce filllevel as free + const uint32 oldfilllevel + = atomicSub((uint32 *)&_ptes[page].count, 1); + + if(resetfreedpages) + { + if(oldfilllevel == 1) + { + // this page now got free! + // -> try lock it + const uint32 old = atomicCAS( + (uint32 *)&_ptes[page].count, 0, pagesize); + if(old == 0) + { + // clean the bits for the hierarchy + _page[page].init(); + // remove chunk information + _ptes[page].chunksize = 0; + __threadfence(); + // unlock it + atomicSub((uint32 *)&_ptes[page].count, pagesize); + } + } + } + + // meta information counters ... should not be changed by too + // many threads, so.. + if(oldfilllevel == pagesize / 2 / chunksize) + { + const uint32 region = page / regionsize; + _regions[region] = 0; + const uint32 block + = region * regionsize * accessblocks / _numpages; + if(warpid() + laneid() == 0) + atomicMin((uint32 *)&_firstfreeblock, block); + } } - } - } - //meta information counters ... should not be changed by too many threads, so.. 
- if(oldfilllevel == pagesize / 2 / chunksize) - { - uint32 region = page / regionsize; - _regions[region] = 0; - uint32 block = region * regionsize * accessblocks / _numpages ; - if(warpid() + laneid() == 0) - atomicMin((uint32*)&_firstfreeblock, block); - } - } - - /** - * markpages markes a fixed number of pages as used - * @param startpage first page to mark - * @param pages number of pages to mark - * @param bytes number of overall bytes to mark pages for - * @return true on success, false if one of the pages is not free - */ - __device__ bool markpages(uint32 startpage, uint32 pages, uint32 bytes) - { - int abord = -1; - for(uint32 trypage = startpage; trypage < startpage + pages; ++trypage) - { - uint32 old = atomicCAS((uint32*)&_ptes[trypage].chunksize, 0, bytes); - if(old != 0) - { - abord = trypage; - break; - } - } - if(abord == -1) - return true; - for(uint32 trypage = startpage; trypage < abord; ++trypage) - atomicCAS((uint32*)&_ptes[trypage].chunksize, bytes, 0); - return false; - } - - /** - * allocPageBasedSingleRegion tries to allocate the demanded number of bytes on a continues sequence of pages - * @param startpage first page to be used - * @param endpage last page to be used - * @param bytes number of overall bytes to mark pages for - * @return pointer to the first page to use, 0 if we were unable to use all the requested pages - */ - __device__ void* allocPageBasedSingleRegion(uint32 startpage, uint32 endpage, uint32 bytes) - { - uint32 pagestoalloc = divup(bytes, pagesize); - uint32 freecount = 0; - bool left_free = false; - for(uint32 search_page = startpage+1; search_page > endpage; ) - { - --search_page; - if(_ptes[search_page].chunksize == 0) - { - if(++freecount == pagestoalloc) + /** + * markpages markes a fixed number of pages as used + * @param startpage first page to mark + * @param pages number of pages to mark + * @param bytes number of overall bytes to mark pages for + * @return true on success, false if one of the pages is not free + */ + __device__ auto + markpages(uint32 startpage, uint32 pages, uint32 bytes) -> bool { - //try filling it up - if(markpages(search_page, pagestoalloc, bytes)) - { - //mark that we filled up everything up to here - if(!left_free) - atomicCAS((uint32*)&_firstFreePageBased, startpage, search_page - 1); - return _page[search_page].data; - } + int abord = -1; + for(uint32 trypage = startpage; trypage < startpage + pages; + ++trypage) + { + const uint32 old = atomicCAS( + (uint32 *)&_ptes[trypage].chunksize, 0, bytes); + if(old != 0) + { + abord = trypage; + break; + } + } + if(abord == -1) + return true; + for(uint32 trypage = startpage; trypage < abord; ++trypage) + atomicCAS((uint32 *)&_ptes[trypage].chunksize, bytes, 0); + return false; } - } - else - { - left_free = true; - freecount = 0; - } - } - return 0; - } - - /** - * allocPageBasedSingle tries to allocate the demanded number of bytes on a continues sequence of pages - * @param bytes number of overall bytes to mark pages for - * @return pointer to the first page to use, 0 if we were unable to use all the requested pages - * @pre only a single thread of a warp is allowed to call the function concurrently - */ - __device__ void* allocPageBasedSingle(uint32 bytes) - { - //acquire mutex - while(atomicExch(&_pagebasedMutex,1) != 0); - //search for free spot from the back - uint32 spage = _firstFreePageBased; - void* res = allocPageBasedSingleRegion(spage, 0, bytes); - if(res == 0) - //also check the rest of the pages - res = allocPageBasedSingleRegion(_numpages, spage, 
bytes); - - //free mutex - atomicExch(&_pagebasedMutex,0); - return res; - } - /** - * allocPageBased tries to allocate the demanded number of bytes on a continues sequence of pages - * @param bytes number of overall bytes to mark pages for - * @return pointer to the first page to use, 0 if we were unable to use all the requested pages - */ - __device__ void* allocPageBased(uint32 bytes) - { - //this is rather slow, but we dont expect that to happen often anyway - - //only one thread per warp can acquire the mutex - void* res = 0; - for( + + /** + * allocPageBasedSingleRegion tries to allocate the demanded number + * of bytes on a continues sequence of pages + * @param startpage first page to be used + * @param endpage last page to be used + * @param bytes number of overall bytes to mark pages for + * @return pointer to the first page to use, 0 if we were unable to + * use all the requested pages + */ + __device__ auto allocPageBasedSingleRegion( + uint32 startpage, + uint32 endpage, + uint32 bytes) -> void * + { + const uint32 pagestoalloc = divup(bytes, pagesize); + uint32 freecount = 0; + bool left_free = false; + for(uint32 search_page = startpage + 1; search_page > endpage;) + { + --search_page; + if(_ptes[search_page].chunksize == 0) + { + if(++freecount == pagestoalloc) + { + // try filling it up + if(markpages(search_page, pagestoalloc, bytes)) + { + // mark that we filled up everything up to here + if(!left_free) + atomicCAS( + (uint32 *)&_firstFreePageBased, + startpage, + search_page - 1); + return _page[search_page].data; + } + } + } + else + { + left_free = true; + freecount = 0; + } + } + return 0; + } + + /** + * allocPageBasedSingle tries to allocate the demanded number of + * bytes on a continues sequence of pages + * @param bytes number of overall bytes to mark pages for + * @return pointer to the first page to use, 0 if we were unable to + * use all the requested pages + * @pre only a single thread of a warp is allowed to call the + * function concurrently + */ + __device__ auto allocPageBasedSingle(uint32 bytes) -> void * + { + // acquire mutex + while(atomicExch(&_pagebasedMutex, 1) != 0) + ; + // search for free spot from the back + const uint32 spage = _firstFreePageBased; + void * res = allocPageBasedSingleRegion(spage, 0, bytes); + if(res == 0) + // also check the rest of the pages + res = allocPageBasedSingleRegion(_numpages, spage, bytes); + + // free mutex + atomicExch(&_pagebasedMutex, 0); + return res; + } + /** + * allocPageBased tries to allocate the demanded number of bytes on + * a continues sequence of pages + * @param bytes number of overall bytes to mark pages for + * @return pointer to the first page to use, 0 if we were unable to + * use all the requested pages + */ + __device__ auto allocPageBased(uint32 bytes) -> void * + { + // this is rather slow, but we dont expect that to happen often + // anyway + + // only one thread per warp can acquire the mutex + void * res = 0; #if(__CUDACC_VER_MAJOR__ >= 9) - unsigned int __mask = __activemask(), + const unsigned int mask = __activemask(); #else - unsigned int __mask = __ballot(1), + const unsigned int mask = __ballot(1); #endif - __num = __popc(__mask), - __lanemask = mallocMC::lanemask_lt(), - __local_id = __popc(__lanemask & __mask), - __active = 0; - __active < __num; - ++__active - ) - if (__active == __local_id) - res = allocPageBasedSingle(bytes); - return res; - } - - /** - * deallocPageBased frees the memory placed on a sequence of pages - * @param mem pointer to the first page - * @param page 
the first page - * @param bytes the number of bytes to be freed - */ - __device__ void deallocPageBased(void* mem, uint32 page, uint32 bytes) - { - uint32 pages = divup(bytes,pagesize); - for(uint32 p = page; p < page+pages; ++p) - _page[p].init(); - __threadfence(); - for(uint32 p = page; p < page+pages; ++p) - atomicCAS((uint32*)&_ptes[p].chunksize, bytes, 0); - atomicMax((uint32*)&_firstFreePageBased, page+pages-1); - } - - - public: - /** - * create allocates the requested number of bytes via the heap. Coalescing has to be done before by another policy. - * @param bytes number of bytes to allocate - * @return pointer to the allocated memory - */ - __device__ void* create(uint32 bytes) - { - if(bytes == 0) - return 0; - //take care of padding - //bytes = (bytes + dataAlignment - 1) & ~(dataAlignment-1); // in alignment-policy - if(bytes < pagesize) - //chunck based - return allocChunked(bytes); - else - //allocate a range of pages - return allocPageBased(bytes); - } - - /** - * destroy frees the memory regions previously acllocted via create - * @param mempointer to the memory region to free - */ - __device__ void destroy(void* mem) - { - if(mem == 0) - return; - //lets see on which page we are on - uint32 page = ((char*)mem - (char*)_page)/pagesize; - uint32 chunksize = _ptes[page].chunksize; - - //is the pointer the beginning of a chunk? - uint32 inpage_offset = ((char*)mem - _page[page].data); - uint32 block = inpage_offset/chunksize; - uint32 inblockoffset = inpage_offset - block*chunksize; - if(inblockoffset != 0) - { - uint32* counter = (uint32*)(_page[page].data + block*chunksize); - //coalesced mem free - uint32 old = atomicSub(counter, 1); - if(old != 1) - return; - mem = (void*) counter; - } - - if(chunksize < pagesize) - deallocChunked(mem, page, chunksize); - else - deallocPageBased(mem, page, chunksize); - } - - /** - * init inits the heap data structures - * the init method must be called before the heap can be used. the method can be called - * with an arbitrary number of threads, which will increase the inits efficiency - * @param memory pointer to the memory used for the heap - * @param memsize size of the memory in bytes - */ - __device__ void initDeviceFunction(void* memory, size_t memsize) - { - uint32 linid = threadIdx.x + blockDim.x*(threadIdx.y + threadIdx.z*blockDim.y); - uint32 threads = blockDim.x*blockDim.y*blockDim.z; - uint32 linblockid = blockIdx.x + gridDim.x*(blockIdx.y + blockIdx.z*gridDim.y); - uint32 blocks = gridDim.x*gridDim.y*gridDim.z; - linid = linid + linblockid*threads; - - uint32 numregions = ((unsigned long long)memsize)/( ((unsigned long long)regionsize)*(sizeof(PTE)+pagesize)+sizeof(uint32)); - uint32 numpages = numregions*regionsize; - //pointer is copied (copy is called page) - PAGE* page = (PAGE*)(memory); - //sec check for alignment - //copy is checked - //PointerEquivalent alignmentstatus = ((PointerEquivalent)page) & (16 -1); - //if(alignmentstatus != 0) - //{ - // if(linid == 0){ - // printf("c Before:\n"); - // printf("c dataAlignment: %d\n",16); - // printf("c Alignmentstatus: %d\n",alignmentstatus); - // printf("c size_t memsize %llu byte\n", memsize); - // printf("c void *memory %p\n", page); - // } - // //copy is adjusted, potentially pointer to higher address now. 
- // page =(PAGE*)(((PointerEquivalent)page) + 16 - alignmentstatus); - // if(linid == 0) printf("c Heap Warning: memory to use not 16 byte aligned...\n"); - //} - PTE* ptes = (PTE*)(page + numpages); - uint32* regions = (uint32*)(ptes + numpages); - //sec check for mem size - //this check refers to the original memory-pointer, which was not adjusted! - if( (void*)(regions + numregions) > (((char*)memory) + memsize) ) - { - --numregions; - numpages = min(numregions*regionsize,numpages); - if(linid == 0) printf("c Heap Warning: needed to reduce number of regions to stay within memory limit\n"); - } - //if(linid == 0) printf("Heap info: wasting %d bytes\n",(((POINTEREQUIVALENT)memory) + memsize) - (POINTEREQUIVALENT)(regions + numregions)); - - //if(linid == 0 && alignmentstatus != 0){ - // printf("c Was shrinked automatically to:\n"); - // printf("c size_t memsize %llu byte\n", memsize); - // printf("c void *memory %p\n", page); - //} - threads = threads*blocks; - - for(uint32 i = linid; i < numpages; i+= threads) - { - ptes[i].init(); - page[i].init(); - } - for(uint32 i = linid; i < numregions; i+= threads) - regions[i] = 0; + const unsigned int num = __popc(mask); + const unsigned int lanemask = mallocMC::lanemask_lt(); + const unsigned int local_id = __popc(lanemask & mask); + for(unsigned int active = 0; active < num; ++active) + if(active == local_id) + res = allocPageBasedSingle(bytes); + return res; + } - if(linid == 0) - { - _memsize = memsize; - _numpages = numpages; - _ptes = (volatile PTE*)ptes; - _page = page; - _regions = regions; - _firstfreeblock = 0; - _pagebasedMutex = 0; - _firstFreePageBased = numpages-1; - - if( (char*) (_page+numpages) > (char*)(memory) + memsize) - printf("error in heap alloc: numpages too high\n"); - } - - } - - __device__ bool isOOM(void* p, size_t s){ - // one thread that requested memory returned null - return s && (p == NULL); - } - - - template < typename T_DeviceAllocator > - static void* initHeap( T_DeviceAllocator* heap, void* pool, size_t memsize){ - if( pool == NULL && memsize != 0 ) - { - throw std::invalid_argument( - "Scatter policy cannot use NULL for non-empty memory pools. " - "Maybe you are using an incompatible ReservePoolPolicy or AlignmentPolicy." - ); - } - ScatterKernelDetail::initKernel<<<1,256>>>(heap, pool, memsize); - return heap; - } - - /** counts how many elements of a size fit inside a given page - * - * Examines a (potentially already used) page to find how many elements - * of size chunksize still fit on the page. This includes hierarchically - * organized pages and empty pages. The algorithm determines the number - * of chunks in the page in a manner similar to the allocation algorithm - * of CreationPolicies::Scatter. - * - * @param page the number of the page to examine. The page needs to be - * formatted with a chunksize and potentially a hierarchy. - * @param chunksize the size of element that should be placed inside the - * page. This size must be appropriate to the formatting of the - * page. 
- */ - __device__ unsigned countFreeChunksInPage(uint32 page, uint32 chunksize){ - uint32 filledChunks = _ptes[page].count; - if(chunksize <= HierarchyThreshold) - { - uint32 segmentsize = chunksize*32 + sizeof(uint32); //each segment can hold 32 2nd-level chunks - uint32 fullsegments = min(32,pagesize / segmentsize); //there might be space for more than 32 segments with 32 2nd-level chunks - uint32 additional_chunks = calcAdditionalChunks(fullsegments, segmentsize, chunksize); - uint32 level2Chunks = fullsegments * 32 + additional_chunks; - return level2Chunks - filledChunks; - }else{ - uint32 chunksinpage = min(pagesize / chunksize, 32); //without hierarchy, there can not be more than 32 chunks - return chunksinpage - filledChunks; - } - } - - - /** counts the number of available slots inside the heap - * - * Searches the heap for all possible locations of an element with size - * slotSize. The used traversal algorithms are similar to the allocation - * strategy of CreationPolicies::Scatter, to ensure comparable results. - * There are 3 different algorithms, based on the size of the requested - * slot: 1 slot spans over multiple pages, 1 slot fits in one chunk - * within a page, 1 slot fits in a fraction of a chunk. - * - * @param slotSize the amount of bytes that a single slot accounts for - * @param gid the id of the thread. this id does not have to correspond - * with threadId.x, but there must be a continous range of ids - * beginning from 0. - * @param stride the stride should be equal to the number of different - * gids (and therefore of value max(gid)-1) - */ - __device__ unsigned getAvailaibleSlotsDeviceFunction(size_t slotSize, int gid, int stride) - { - unsigned slotcount = 0; - if(slotSize < pagesize){ // multiple slots per page - for(uint32 currentpage = gid; currentpage < _numpages; currentpage += stride){ - uint32 maxchunksize = min(pagesize, wastefactor*(uint32)slotSize); - uint32 region = currentpage/regionsize; - uint32 regionfilllevel = _regions[region]; - - uint32 chunksize = _ptes[currentpage].chunksize; - if(chunksize >= slotSize && chunksize <= maxchunksize){ //how many chunks left? (each chunk is big enough) - slotcount += countFreeChunksInPage(currentpage, chunksize); - }else if(chunksize == 0){ - chunksize = max((uint32)slotSize, minChunkSize1); //ensure minimum chunk size - slotcount += countFreeChunksInPage(currentpage, chunksize); //how many chunks fit in one page? - }else{ - continue; //the chunks on this page are too small for the request :( + /** + * deallocPageBased frees the memory placed on a sequence of pages + * @param mem pointer to the first page + * @param page the first page + * @param bytes the number of bytes to be freed + */ + __device__ void + deallocPageBased(void * mem, uint32 page, uint32 bytes) + { + const uint32 pages = divup(bytes, pagesize); + for(uint32 p = page; p < page + pages; ++p) _page[p].init(); + __threadfence(); + for(uint32 p = page; p < page + pages; ++p) + atomicCAS((uint32 *)&_ptes[p].chunksize, bytes, 0); + atomicMax((uint32 *)&_firstFreePageBased, page + pages - 1); + } + + public: + /** + * create allocates the requested number of bytes via the heap. + * Coalescing has to be done before by another policy. 
+ * @param bytes number of bytes to allocate + * @return pointer to the allocated memory + */ + __device__ auto create(uint32 bytes) -> void * + { + if(bytes == 0) + return 0; + // take care of padding + // bytes = (bytes + dataAlignment - 1) & ~(dataAlignment-1); // + // in alignment-policy + if(bytes < pagesize) + // chunck based + return allocChunked(bytes); + else + // allocate a range of pages + return allocPageBased(bytes); } - } - }else{ // 1 slot needs multiple pages - if(gid > 0) return 0; //do this serially - uint32 pagestoalloc = divup((uint32)slotSize, pagesize); - uint32 freecount = 0; - for(uint32 currentpage = _numpages; currentpage > 0;){ //this already includes all superblocks - --currentpage; - if(_ptes[currentpage].chunksize == 0){ - if(++freecount == pagestoalloc){ - freecount = 0; - ++slotcount; - } - }else{ // the sequence of free pages was interrupted - freecount = 0; + + /** + * destroy frees the memory regions previously acllocted via create + * @param mempointer to the memory region to free + */ + __device__ void destroy(void * mem) + { + if(mem == 0) + return; + // lets see on which page we are on + const uint32 page = ((char *)mem - (char *)_page) / pagesize; + const uint32 chunksize = _ptes[page].chunksize; + + // is the pointer the beginning of a chunk? + const uint32 inpage_offset = ((char *)mem - _page[page].data); + const uint32 block = inpage_offset / chunksize; + const uint32 inblockoffset = inpage_offset - block * chunksize; + if(inblockoffset != 0) + { + uint32 * counter + = (uint32 *)(_page[page].data + block * chunksize); + // coalesced mem free + uint32 old = atomicSub(counter, 1); + if(old != 1) + return; + mem = (void *)counter; + } + + if(chunksize < pagesize) + deallocChunked(mem, page, chunksize); + else + deallocPageBased(mem, page, chunksize); } - } - } - return slotcount; - } - - - /** Count, how many elements can be allocated at maximum - * - * Takes an input size and determines, how many elements of this size can - * be allocated with the CreationPolicy Scatter. This will return the - * maximum number of free slots of the indicated size. It is not - * guaranteed where these slots are (regarding fragmentation). Therefore, - * the practically usable number of slots might be smaller. This function - * is executed in parallel. Speedup can possibly increased by a higher - * amount ofparallel workers. - * - * @param slotSize the size of allocatable elements to count - * @param obj a reference to the allocator instance (host-side) - */ - public: - template - static unsigned getAvailableSlotsHost(size_t const slotSize, T_DeviceAllocator* heap){ - unsigned h_slots = 0; - unsigned* d_slots; - cudaMalloc((void**) &d_slots, sizeof(unsigned)); - cudaMemcpy(d_slots, &h_slots, sizeof(unsigned), cudaMemcpyHostToDevice); - - ScatterKernelDetail::getAvailableSlotsKernel<<<64,256>>>(heap, slotSize, d_slots); - - cudaMemcpy(&h_slots, d_slots, sizeof(unsigned), cudaMemcpyDeviceToHost); - cudaFree(d_slots); - return h_slots; - } - - - /** Count, how many elements can be allocated at maximum - * - * Takes an input size and determines, how many elements of this size can - * be allocated with the CreationPolicy Scatter. This will return the - * maximum number of free slots of the indicated size. It is not - * guaranteed where these slots are (regarding fragmentation). Therefore, - * the practically usable number of slots might be smaller. This function - * is executed separately for each warp and does not cooperate with other - * warps. 
Maximum speed is expected if every thread in the warp executes - * the function. - * Uses 256 byte of shared memory. - * - * @param slotSize the size of allocatable elements to count - */ - __device__ unsigned getAvailableSlotsAccelerator(size_t slotSize){ - int linearId; - int wId = warpid_withinblock(); //do not use warpid-function, since this value is not guaranteed to be stable across warp lifetime + + /** + * init inits the heap data structures + * the init method must be called before the heap can be used. the + * method can be called with an arbitrary number of threads, which + * will increase the inits efficiency + * @param memory pointer to the memory used for the heap + * @param memsize size of the memory in bytes + */ + __device__ void initDeviceFunction(void * memory, size_t memsize) + { + uint32 linid = threadIdx.x + + blockDim.x * (threadIdx.y + threadIdx.z * blockDim.y); + uint32 threads = blockDim.x * blockDim.y * blockDim.z; + const uint32 linblockid = blockIdx.x + + gridDim.x * (blockIdx.y + blockIdx.z * gridDim.y); + uint32 blocks = gridDim.x * gridDim.y * gridDim.z; + linid += linblockid * threads; + + uint32 numregions = ((unsigned long long)memsize) + / (((unsigned long long)regionsize) + * (sizeof(PTE) + pagesize) + + sizeof(uint32)); + + uint32 numpages = numregions * regionsize; + // pointer is copied (copy is called page) + Page * page = (Page *)memory; + // sec check for alignment + // copy is checked + // PointerEquivalent alignmentstatus = ((PointerEquivalent)page) + // & (16 -1); if(alignmentstatus != 0) + //{ + // if(linid == 0){ + // printf("c Before:\n"); + // printf("c dataAlignment: %d\n",16); + // printf("c Alignmentstatus: %d\n",alignmentstatus); + // printf("c size_t memsize %llu byte\n", memsize); + // printf("c void *memory %p\n", page); + // } + // //copy is adjusted, potentially pointer to higher address + // now. page =(Page*)(((PointerEquivalent)page) + 16 - + // alignmentstatus); if(linid == 0) printf("c Heap Warning: + // memory to use not 16 byte aligned...\n"); + //} + PTE * ptes = (PTE *)(page + numpages); + uint32 * regions = (uint32 *)(ptes + numpages); + // sec check for mem size + // this check refers to the original memory-pointer, which was + // not adjusted! 
+ if((char *)(regions + numregions) + > (((char *)memory) + memsize)) + { + --numregions; + numpages = min(numregions * regionsize, numpages); + if(linid == 0) + printf( + "c Heap Warning: needed to reduce number of " + "regions to stay within memory limit\n"); + } + // if(linid == 0) printf("Heap info: wasting %d + // bytes\n",(((POINTEREQUIVALENT)memory) + memsize) - + // (POINTEREQUIVALENT)(regions + numregions)); + + // if(linid == 0 && alignmentstatus != 0){ + // printf("c Was shrinked automatically to:\n"); + // printf("c size_t memsize %llu byte\n", memsize); + // printf("c void *memory %p\n", page); + //} + threads *= blocks; + + for(uint32 i = linid; i < numpages; i += threads) + { + ptes[i].init(); + page[i].init(); + } + for(uint32 i = linid; i < numregions; i += threads) + regions[i] = 0; + + if(linid == 0) + { + _memsize = memsize; + _numpages = numpages; + _ptes = (volatile PTE *)ptes; + _page = page; + _regions = regions; + _firstfreeblock = 0; + _pagebasedMutex = 0; + _firstFreePageBased = numpages - 1; + + if((char *)&_page[numpages] > (char *)memory + memsize) + printf("error in heap alloc: numpages too high\n"); + } + } + + static __device__ auto isOOM(void * p, size_t s) -> bool + { + // one thread that requested memory returned null + return s && (p == nullptr); + } + + template + static auto + initHeap(T_DeviceAllocator * heap, void * pool, size_t memsize) + -> void * + { + if(pool == nullptr && memsize != 0) + { + throw std::invalid_argument( + "Scatter policy cannot use nullptr for non-empty " + "memory pools. " + "Maybe you are using an incompatible ReservePoolPolicy " + "or AlignmentPolicy."); + } + ScatterKernelDetail::initKernel<<<1, 256>>>( + heap, pool, memsize); + return heap; + } + + /** counts how many elements of a size fit inside a given page + * + * Examines a (potentially already used) page to find how many + * elements of size chunksize still fit on the page. This includes + * hierarchically organized pages and empty pages. The algorithm + * determines the number of chunks in the page in a manner similar + * to the allocation algorithm of CreationPolicies::Scatter. + * + * @param page the number of the page to examine. The page needs to + * be formatted with a chunksize and potentially a hierarchy. + * @param chunksize the size of element that should be placed inside + * the page. This size must be appropriate to the formatting of the + * page. + */ + __device__ auto countFreeChunksInPage(uint32 page, uint32 chunksize) + -> unsigned + { + const uint32 filledChunks = _ptes[page].count; + if(chunksize <= HierarchyThreshold) + { + const uint32 segmentsize = chunksize * 32 + + sizeof(uint32); // each segment can hold 32 2nd-level + // chunks + const uint32 fullsegments = min( + 32, pagesize / segmentsize); // there might be space for + // more than 32 segments + // with 32 2nd-level chunks + const uint32 additional_chunks = calcAdditionalChunks( + fullsegments, segmentsize, chunksize); + const uint32 level2Chunks + = fullsegments * 32 + additional_chunks; + return level2Chunks - filledChunks; + } + else + { + const uint32 chunksinpage = min( + pagesize / chunksize, + 32); // without hierarchy, there can not be more than 32 + // chunks + return chunksinpage - filledChunks; + } + } + + /** counts the number of available slots inside the heap + * + * Searches the heap for all possible locations of an element with + * size slotSize. 
The used traversal algorithms are similar to the + * allocation strategy of CreationPolicies::Scatter, to ensure + * comparable results. There are 3 different algorithms, based on + * the size of the requested slot: 1 slot spans over multiple pages, + * 1 slot fits in one chunk within a page, 1 slot fits in a fraction + * of a chunk. + * + * @param slotSize the amount of bytes that a single slot accounts + * for + * @param gid the id of the thread. This id does not have to + * correspond with threadIdx.x, but there must be a continuous range + * of ids beginning from 0. + * @param stride the stride should be equal to the number of + * different gids (and therefore of value max(gid)+1) + */ + __device__ auto getAvailaibleSlotsDeviceFunction( + size_t slotSize, + int gid, + int stride) -> unsigned + { + unsigned slotcount = 0; + if(slotSize < pagesize) + { // multiple slots per page + for(uint32 currentpage = gid; currentpage < _numpages; + currentpage += stride) + { + const uint32 maxchunksize + = min(pagesize, wastefactor * (uint32)slotSize); + const uint32 region = currentpage / regionsize; + const uint32 regionfilllevel = _regions[region]; + + uint32 chunksize = _ptes[currentpage].chunksize; + if(chunksize >= slotSize && chunksize <= maxchunksize) + { // how many chunks left? (each chunk is big enough) + slotcount += countFreeChunksInPage( + currentpage, chunksize); + } + else if(chunksize == 0) + { + chunksize = max( + (uint32)slotSize, + minChunkSize1); // ensure minimum chunk size + slotcount += countFreeChunksInPage( + currentpage, + chunksize); // how many chunks fit in one page? + } + else + { + continue; // the chunks on this page are too small + // for the request :( + } + } + } + else + { // 1 slot needs multiple pages + if(gid > 0) + return 0; // do this serially + const uint32 pagestoalloc + = divup((uint32)slotSize, pagesize); + uint32 freecount = 0; + for(uint32 currentpage = _numpages; currentpage > 0;) + { // this already includes all superblocks + --currentpage; + if(_ptes[currentpage].chunksize == 0) + { + if(++freecount == pagestoalloc) + { + freecount = 0; + ++slotcount; + } + } + else + { // the sequence of free pages was interrupted + freecount = 0; + } + } + } + return slotcount; + } + + /** Count, how many elements can be allocated at maximum + * + * Takes an input size and determines, how many elements of this + * size can be allocated with the CreationPolicy Scatter. This will + * return the maximum number of free slots of the indicated size. It + * is not guaranteed where these slots are (regarding + * fragmentation). Therefore, the practically usable number of slots + * might be smaller. This function is executed in parallel. Speedup + * can possibly be increased by a higher number of parallel workers.
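Stripped of the PTE bookkeeping, the two counting branches of getAvailaibleSlotsDeviceFunction above reduce to simple arithmetic. The following host-side sketch mirrors that arithmetic for a completely empty heap; it ignores the chunk hierarchy below HierarchyThreshold and uses assumed example values for pagesize and the page count.

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>

constexpr std::uint32_t pagesize = 4096; // assumed example values
constexpr std::uint32_t numpages = 1024;

// upper bound of slots of a given size in a completely empty heap
std::uint32_t slotsFor(std::uint32_t slotSize)
{
    if(slotSize < pagesize)
    {
        // multiple slots per page: at most 32 chunks are tracked per page
        const auto chunksPerPage
            = std::min<std::uint32_t>(pagesize / slotSize, 32u);
        return numpages * chunksPerPage;
    }
    // one slot spans several pages: ceil-divide (the device code uses divup),
    // then count how many such groups of contiguous free pages fit
    const std::uint32_t pagesPerSlot = (slotSize + pagesize - 1) / pagesize;
    return numpages / pagesPerSlot;
}

int main()
{
    std::printf("128 B slots: %u\n", slotsFor(128));    // chunked case
    std::printf("16 KiB slots: %u\n", slotsFor(16384)); // multi-page case
}
```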
+ * + * @param slotSize the size of allocatable elements to count + * @param obj a reference to the allocator instance (host-side) + */ + public: + template + static auto getAvailableSlotsHost( + size_t const slotSize, + T_DeviceAllocator * heap) -> unsigned + { + unsigned h_slots = 0; + unsigned * d_slots; + cudaMalloc((void **)&d_slots, sizeof(unsigned)); + cudaMemcpy( + d_slots, + &h_slots, + sizeof(unsigned), + cudaMemcpyHostToDevice); + + ScatterKernelDetail::getAvailableSlotsKernel<<<64, 256>>>( + heap, slotSize, d_slots); + + cudaMemcpy( + &h_slots, + d_slots, + sizeof(unsigned), + cudaMemcpyDeviceToHost); + cudaFree(d_slots); + return h_slots; + } + + /** Count, how many elements can be allocated at maximum + * + * Takes an input size and determines, how many elements of this + * size can be allocated with the CreationPolicy Scatter. This will + * return the maximum number of free slots of the indicated size. It + * is not guaranteed where these slots are (regarding + * fragmentation). Therefore, the practically usable number of slots + * might be smaller. This function is executed separately for each + * warp and does not cooperate with other warps. Maximum speed is + * expected if every thread in the warp executes the function. Uses + * 256 byte of shared memory. + * + * @param slotSize the size of allocatable elements to count + */ + __device__ auto getAvailableSlotsAccelerator(size_t slotSize) + -> unsigned + { + const int wId + = warpid_withinblock(); // do not use warpid-function, since + // this value is not guaranteed to + // be stable across warp lifetime #if(__CUDACC_VER_MAJOR__ >= 9) - uint32 activeThreads = __popc(__activemask()); + const uint32 activeThreads = __popc(__activemask()); #else - uint32 activeThreads = __popc(__ballot(true)); + const uint32 activeThreads = __popc(__ballot(true)); #endif - __shared__ uint32 activePerWarp[MaxThreadsPerBlock::value / WarpSize::value]; //maximum number of warps in a block - __shared__ unsigned warpResults[MaxThreadsPerBlock::value / WarpSize::value]; - warpResults[wId] = 0; - activePerWarp[wId] = 0; - - // the active threads obtain an id from 0 to activeThreads-1 - if(slotSize>0) linearId = atomicAdd(&activePerWarp[wId], 1); - else return 0; - - //printf("Block %d, id %d: activeThreads=%d linearId=%d\n",blockIdx.x,threadIdx.x,activeThreads,linearId); - unsigned temp = getAvailaibleSlotsDeviceFunction(slotSize, linearId, activeThreads); - if(temp) atomicAdd(&warpResults[wId], temp); - __threadfence_block(); - return warpResults[wId]; - } - - - static std::string classname(){ - std::stringstream ss; - ss << "Scatter["; - ss << pagesize << ","; - ss << accessblocks << ","; - ss << regionsize << ","; - ss << wastefactor << ","; - ss << resetfreedpages << ","; - ss << hashingK << ","; - ss << hashingDistMP << ","; - ss << hashingDistWP << ","; - ss << hashingDistWPRel<< "]"; - return ss.str(); - } - - }; - -} //namespace CreationPolicies -} //namespace mallocMC + __shared__ uint32 activePerWarp + [MaxThreadsPerBlock::value + / WarpSize::value]; // maximum number of warps in a block + __shared__ unsigned + warpResults[MaxThreadsPerBlock::value / WarpSize::value]; + warpResults[wId] = 0; + activePerWarp[wId] = 0; + + // the active threads obtain an id from 0 to activeThreads-1 + if(slotSize == 0) + return 0; + const int linearId = atomicAdd(&activePerWarp[wId], 1); + + // printf("Block %d, id %d: activeThreads=%d + // linearId=%d\n",blockIdx.x,threadIdx.x,activeThreads,linearId); + const unsigned temp = 
getAvailaibleSlotsDeviceFunction( + slotSize, linearId, activeThreads); + if(temp) + atomicAdd(&warpResults[wId], temp); + __threadfence_block(); + return warpResults[wId]; + } + + static auto classname() -> std::string + { + std::stringstream ss; + ss << "Scatter["; + ss << pagesize << ","; + ss << accessblocks << ","; + ss << regionsize << ","; + ss << wastefactor << ","; + ss << resetfreedpages << ","; + ss << hashingK << ","; + ss << hashingDistMP << ","; + ss << hashingDistWP << ","; + ss << hashingDistWPRel << "]"; + return ss.str(); + } + }; + + } // namespace CreationPolicies +} // namespace mallocMC diff --git a/src/include/mallocMC/device_allocator.hpp b/src/include/mallocMC/device_allocator.hpp index 6c24fcfb..cf938be7 100644 --- a/src/include/mallocMC/device_allocator.hpp +++ b/src/include/mallocMC/device_allocator.hpp @@ -28,74 +28,60 @@ #pragma once -#include "mallocMC_utils.hpp" #include "mallocMC_constraints.hpp" #include "mallocMC_prefixes.hpp" #include "mallocMC_traits.hpp" +#include "mallocMC_utils.hpp" -#include -#include - -namespace mallocMC{ - -namespace detail{ +#include +#include - /** - * @brief Template class to call getAvailableSlots[Host|Accelerator] if the CreationPolicy provides it. - * - * Returns 0 else. - * - * @tparam T_Allocator The type of the Allocator to be used - * @tparam T_isHost True for the host call, false for the accelerator call - * @tparam T_providesAvailableSlots If the CreationPolicy provides getAvailableSlots[Host|Accelerator] (auto filled, do not set) - */ - template< - typename T_Allocator, - bool T_providesAvailableSlots - > - struct GetAvailableSlotsIfAvailAcc +namespace mallocMC +{ + namespace detail { - MAMC_ACCELERATOR static - unsigned - getAvailableSlots( - size_t, - T_Allocator & - ) + /** + * @brief Template class to call getAvailableSlots[Host|Accelerator] if + * the CreationPolicy provides it. + * + * Returns 0 else. + * + * @tparam T_Allocator The type of the Allocator to be used + * @tparam T_isHost True for the host call, false for the accelerator + * call + * @tparam T_providesAvailableSlots If the CreationPolicy provides + * getAvailableSlots[Host|Accelerator] (auto filled, do not set) + */ + template + struct GetAvailableSlotsIfAvailAcc { - return 0; - } - - }; - - template< - typename T_Allocator - > - struct GetAvailableSlotsIfAvailAcc< - T_Allocator, - true - >{ - MAMC_ACCELERATOR static - unsigned - getAvailableSlots( - size_t slotSize, - T_Allocator& alloc - ) + MAMC_ACCELERATOR static auto + getAvailableSlots(size_t, T_Allocator &) -> unsigned + { + return 0; + } + }; + + template + struct GetAvailableSlotsIfAvailAcc { - return alloc.T_Allocator::CreationPolicy - ::getAvailableSlotsAccelerator( slotSize ); - } - - }; - -} // namespace detail + MAMC_ACCELERATOR static auto + getAvailableSlots(size_t slotSize, T_Allocator & alloc) -> unsigned + { + return alloc + .T_Allocator::CreationPolicy ::getAvailableSlotsAccelerator( + slotSize); + } + }; + } // namespace detail /** * @brief "HostClass" that combines all policies to a useful allocator * - * This class implements the necessary glue-logic to form an actual allocator - * from the provided policies. It implements the public interface and - * executes some constraint checking based on an instance of the class + * This class implements the necessary glue-logic to form an actual + * allocator from the provided policies. It implements the public interface + * and executes some constraint checking based on an instance of the class * PolicyConstraints. 
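To use the DeviceAllocator described here, the policies touched by this patch have to be combined into one allocator type. The alias below is only a sketch: the policy class names follow the headers in this patch, but the exact host-side Allocator signature and the Shrink alignment policy are assumptions and may differ from the project's own configuration (e.g. tests/verify_heap_config.hpp).

```cpp
// Sketch of a policy combination; the host-side Allocator template and the
// AlignmentPolicies::Shrink name are assumptions, not taken from this diff.
#include <mallocMC/mallocMC.hpp>

using ScatterAllocator = mallocMC::Allocator<
    mallocMC::CreationPolicies::Scatter<>,           // page/chunk heap management
    mallocMC::DistributionPolicies::XMallocSIMD<>,   // warp-level request coalescing
    mallocMC::OOMPolicies::ReturnNull,               // report OOM via nullptr
    mallocMC::ReservePoolPolicies::SimpleCudaMalloc, // pool reserved with cudaMalloc
    mallocMC::AlignmentPolicies::Shrink<>>;          // padding / alignment handling

// host side (sketch): reserve a 512 MiB pool and pass the handle to kernels
// ScatterAllocator mMC(512U * 1024U * 1024U);
// myKernel<<<64, 256>>>(mMC);
```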
* * @tparam T_CreationPolicy The desired type of a CreationPolicy @@ -108,63 +94,51 @@ namespace detail{ typename T_CreationPolicy, typename T_DistributionPolicy, typename T_OOMPolicy, - typename T_AlignmentPolicy - > - class DeviceAllocator : - public T_CreationPolicy + typename T_AlignmentPolicy> + class DeviceAllocator : public T_CreationPolicy { - typedef boost::uint32_t uint32; + using uint32 = std::uint32_t; + public: - typedef T_CreationPolicy CreationPolicy; - typedef T_DistributionPolicy DistributionPolicy; - typedef T_OOMPolicy OOMPolicy; - typedef T_AlignmentPolicy AlignmentPolicy; + using CreationPolicy = T_CreationPolicy; + using DistributionPolicy = T_DistributionPolicy; + using OOMPolicy = T_OOMPolicy; + using AlignmentPolicy = T_AlignmentPolicy; - void* pool; + void * pool; MAMC_ACCELERATOR - void* - malloc( - size_t bytes - ) + auto malloc(size_t bytes) -> void * { DistributionPolicy distributionPolicy; - bytes = AlignmentPolicy::applyPadding( bytes ); - uint32 req_size = distributionPolicy.collect( bytes ); - void* memBlock = CreationPolicy::create( req_size ); - const bool oom = CreationPolicy::isOOM( memBlock, req_size ); - if( oom ) - memBlock = OOMPolicy::handleOOM( memBlock ); - void* myPart = distributionPolicy.distribute( memBlock ); + bytes = AlignmentPolicy::applyPadding(bytes); + uint32 req_size = distributionPolicy.collect(bytes); + void * memBlock = CreationPolicy::create(req_size); + const bool oom = CreationPolicy::isOOM(memBlock, req_size); + if(oom) + memBlock = OOMPolicy::handleOOM(memBlock); + void * myPart = distributionPolicy.distribute(memBlock); return myPart; } MAMC_ACCELERATOR - void - free( - void* p - ) + void free(void * p) { - CreationPolicy::destroy( p ); + CreationPolicy::destroy(p); } - /* polymorphism over the availability of getAvailableSlots for calling * from the accelerator */ MAMC_ACCELERATOR - unsigned - getAvailableSlots( - size_t slotSize - ) + auto getAvailableSlots(size_t slotSize) -> unsigned { - slotSize = AlignmentPolicy::applyPadding( slotSize ); + slotSize = AlignmentPolicy::applyPadding(slotSize); return detail::GetAvailableSlotsIfAvailAcc< DeviceAllocator, - Traits< DeviceAllocator >::providesAvailableSlots - >::getAvailableSlots( slotSize, *this ); + Traits::providesAvailableSlots>:: + getAvailableSlots(slotSize, *this); } - }; } // namespace mallocMC diff --git a/src/include/mallocMC/distributionPolicies/Noop.hpp b/src/include/mallocMC/distributionPolicies/Noop.hpp index 608a5f2f..94efd135 100644 --- a/src/include/mallocMC/distributionPolicies/Noop.hpp +++ b/src/include/mallocMC/distributionPolicies/Noop.hpp @@ -27,17 +27,17 @@ #pragma once - -namespace mallocMC{ -namespace DistributionPolicies{ - - /** - * @brief a policy that does nothing - * - * This DistributionPolicy will not perform any distribution, but only return - * its input (identity function) - */ - class Noop; - -} //namespace DistributionPolicies -} //namespace mallocMC +namespace mallocMC +{ + namespace DistributionPolicies + { + /** + * @brief a policy that does nothing + * + * This DistributionPolicy will not perform any distribution, but only + * return its input (identity function) + */ + class Noop; + + } // namespace DistributionPolicies +} // namespace mallocMC diff --git a/src/include/mallocMC/distributionPolicies/Noop_impl.hpp b/src/include/mallocMC/distributionPolicies/Noop_impl.hpp index 6d55c2b9..ecc13455 100644 --- a/src/include/mallocMC/distributionPolicies/Noop_impl.hpp +++ b/src/include/mallocMC/distributionPolicies/Noop_impl.hpp @@ -27,36 
+27,38 @@ #pragma once -#include -#include - -#include "Noop.hpp" #include "../mallocMC_prefixes.hpp" +#include "Noop.hpp" -namespace mallocMC{ -namespace DistributionPolicies{ - - class Noop - { - typedef boost::uint32_t uint32; - - public: +#include +#include - MAMC_ACCELERATOR - uint32 collect(uint32 bytes){ - return bytes; - } +namespace mallocMC +{ + namespace DistributionPolicies + { + class Noop + { + using uint32 = std::uint32_t; - MAMC_ACCELERATOR - void* distribute(void* allocatedMem){ - return allocatedMem; - } + public: + MAMC_ACCELERATOR + auto collect(uint32 bytes) const -> uint32 + { + return bytes; + } - static std::string classname(){ - return "Noop"; - } + MAMC_ACCELERATOR + auto distribute(void * allocatedMem) const -> void * + { + return allocatedMem; + } - }; + static auto classname() -> std::string + { + return "Noop"; + } + }; -} //namespace DistributionPolicies -} //namespace mallocMC + } // namespace DistributionPolicies +} // namespace mallocMC diff --git a/src/include/mallocMC/distributionPolicies/XMallocSIMD.hpp b/src/include/mallocMC/distributionPolicies/XMallocSIMD.hpp index c80b785a..17528509 100644 --- a/src/include/mallocMC/distributionPolicies/XMallocSIMD.hpp +++ b/src/include/mallocMC/distributionPolicies/XMallocSIMD.hpp @@ -33,39 +33,39 @@ #pragma once -#include +namespace mallocMC +{ + namespace DistributionPolicies + { + namespace XMallocSIMDConf + { + struct DefaultXMallocConfig + { + static constexpr auto pagesize = 4096; + }; + } // namespace XMallocSIMDConf -namespace mallocMC{ -namespace DistributionPolicies{ - - namespace XMallocSIMDConf{ - struct DefaultXMallocConfig{ - typedef boost::mpl::int_<4096> pagesize; - }; - } + /** + * @brief SIMD optimized chunk resizing in the style of XMalloc + * + * This DistributionPolicy can take the memory requests from a group of + * worker threads and combine them, so that only one of the workers will + * allocate the whole request. Later, each worker gets an appropriate + * offset into the allocated chunk. This is beneficial for SIMD + * architectures since only one of the workers has to compete for the + * resource. This algorithm is inspired by the XMalloc memory allocator + * (http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=5577907&tag=1) + * and its implementation in ScatterAlloc + * (http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=6339604) + * XMallocSIMD is intended to be used with Nvidia CUDA capable + * accelerators that support at least compute capability 2.0 + * + * @tparam T_Config (optional) The configuration struct to overwrite + * default configuration. The default can be obtained through + * XMallocSIMD<>::Properties + */ + template + class XMallocSIMD; - /** - * @brief SIMD optimized chunk resizing in the style of XMalloc - * - * This DistributionPolicy can take the memory requests from a group of - * worker threads and combine them, so that only one of the workers will - * allocate the whole request. Later, each worker gets an appropriate offset - * into the allocated chunk. This is beneficial for SIMD architectures since - * only one of the workers has to compete for the resource.
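The coalescing described in this comment boils down to offset bookkeeping within one warp: the sizes of all participating requests are summed on top of a 16 byte header that stores the thread count, a single worker allocates that sum, and every worker resolves its own pointer as base plus its private offset. A host-side illustration with made-up request sizes (not the device implementation) follows.

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

int main()
{
    // requests of one warp that qualify for coalescing (made-up sizes)
    const std::vector<std::uint32_t> bytes = {24, 40, 8, 56};

    // collect(): the shared counter starts at 16 (header that will hold the
    // thread count); each thread reserves its offset with an atomicAdd
    std::uint32_t counter = 16;
    std::vector<std::uint32_t> offset;
    for(const auto b : bytes)
    {
        offset.push_back(counter);
        counter += b;
    }

    // only the thread whose offset equals 16 issues the single allocation of
    // `counter` bytes; distribute() later returns base + offset[i] to each
    std::printf(
        "one request of %u bytes serves %zu threads\n", counter, bytes.size());
    for(std::size_t i = 0; i < bytes.size(); ++i)
        std::printf("thread %zu -> base + %u\n", i, offset[i]);
}
```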
This algorithm - * is inspired by the XMalloc memory allocator - * (http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=5577907&tag=1) and - * its implementation in ScatterAlloc - * (http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=6339604) - * XMallocSIMD is inteded to be used with Nvidia CUDA capable accelerators - * that support at least compute capability 2.0 - * - * @tparam T_Config (optional) The configuration struct to overwrite - * default configuration. The default can be obtained through - * XMallocSIMD<>::Properties - */ - template - class XMallocSIMD; - - -} //namespace DistributionPolicies -} //namespace mallocMC + } // namespace DistributionPolicies +} // namespace mallocMC diff --git a/src/include/mallocMC/distributionPolicies/XMallocSIMD_impl.hpp b/src/include/mallocMC/distributionPolicies/XMallocSIMD_impl.hpp index 37afe7f8..bafd62d0 100644 --- a/src/include/mallocMC/distributionPolicies/XMallocSIMD_impl.hpp +++ b/src/include/mallocMC/distributionPolicies/XMallocSIMD_impl.hpp @@ -33,39 +33,43 @@ #pragma once -#include -#include -#include -#include -#include - -#include "../mallocMC_utils.hpp" #include "../mallocMC_prefixes.hpp" +#include "../mallocMC_utils.hpp" #include "XMallocSIMD.hpp" -namespace mallocMC{ -namespace DistributionPolicies{ - - template - class XMallocSIMD - { - private: - - typedef boost::uint32_t uint32; - bool can_use_coalescing; - uint32 warpid; - uint32 myoffset; - uint32 threadcount; - uint32 req_size; - public: - typedef T_Config Properties; - - MAMC_ACCELERATOR - XMallocSIMD() : can_use_coalescing(false), warpid(warpid_withinblock()), - myoffset(0), threadcount(0), req_size(0) - {} - - private: +#include +#include +#include +#include + +namespace mallocMC +{ + namespace DistributionPolicies + { + template + class XMallocSIMD + { + private: + using uint32 = std::uint32_t; + bool can_use_coalescing; + uint32 warpid; + uint32 myoffset; + uint32 threadcount; + uint32 req_size; + + public: + using Properties = T_Config; + + MAMC_ACCELERATOR + XMallocSIMD() : + can_use_coalescing(false), + warpid(warpid_withinblock()), + myoffset(0), + threadcount(0), + req_size(0) + {} + + private: /** Allow for a hierarchical validation of parameters: * * shipped default-parameters (in the inherited struct) have lowest precedence. 
@@ -76,86 +80,82 @@ namespace DistributionPolicies{ * default-struct < template-struct < command-line parameter */ #ifndef MALLOCMC_DP_XMALLOCSIMD_PAGESIZE -#define MALLOCMC_DP_XMALLOCSIMD_PAGESIZE Properties::pagesize::value +#define MALLOCMC_DP_XMALLOCSIMD_PAGESIZE (Properties::pagesize) #endif - BOOST_STATIC_CONSTEXPR uint32 pagesize = MALLOCMC_DP_XMALLOCSIMD_PAGESIZE; - - //all the properties must be unsigned integers > 0 - BOOST_STATIC_ASSERT(!std::numeric_limits::is_signed); - - // \TODO: The static_cast can be removed once the minimal dependencies of - // this project is are at least CUDA 7.0 and gcc 4.8.2 - BOOST_STATIC_ASSERT(static_cast(pagesize) > 0); - - public: - BOOST_STATIC_CONSTEXPR uint32 _pagesize = pagesize; - - MAMC_ACCELERATOR - uint32 collect(uint32 bytes){ - - can_use_coalescing = false; - myoffset = 0; - threadcount = 0; - - //init with initial counter - __shared__ uint32 warp_sizecounter[MaxThreadsPerBlock::value / WarpSize::value]; - warp_sizecounter[warpid] = 16; - - //second half: make sure that all coalesced allocations can fit within one page - //necessary for offset calculation - bool coalescible = bytes > 0 && bytes < (pagesize / 32); + static constexpr uint32 pagesize = MALLOCMC_DP_XMALLOCSIMD_PAGESIZE; + + public: + static constexpr uint32 _pagesize = pagesize; + + MAMC_ACCELERATOR + auto collect(uint32 bytes) -> uint32 + { + can_use_coalescing = false; + myoffset = 0; + threadcount = 0; + + // init with initial counter + __shared__ uint32 warp_sizecounter + [MaxThreadsPerBlock::value / WarpSize::value]; + warp_sizecounter[warpid] = 16; + + // second half: make sure that all coalesced allocations can fit + // within one page necessary for offset calculation + bool coalescible = bytes > 0 && bytes < (pagesize / 32); #if(__CUDACC_VER_MAJOR__ >= 9) - threadcount = __popc(__ballot_sync(__activemask(), coalescible)); + threadcount + = __popc(__ballot_sync(__activemask(), coalescible)); #else - threadcount = __popc(__ballot(coalescible)); + threadcount = __popc(__ballot(coalescible)); #endif - if (coalescible && threadcount > 1) - { - myoffset = atomicAdd(&warp_sizecounter[warpid], bytes); - can_use_coalescing = true; - } - - req_size = bytes; - if (can_use_coalescing) - req_size = (myoffset == 16) ? warp_sizecounter[warpid] : 0; - - return req_size; - } - - - MAMC_ACCELERATOR - void* distribute(void* allocatedMem){ - __shared__ char* warp_res[MaxThreadsPerBlock::value / WarpSize::value]; - - char* myalloc = (char*) allocatedMem; - if (req_size && can_use_coalescing) - { - warp_res[warpid] = myalloc; - if (myalloc != 0) - *(uint32*)myalloc = threadcount; - } - __threadfence_block(); - - void *myres = myalloc; - if(can_use_coalescing) - { - if(warp_res[warpid] != 0) - myres = warp_res[warpid] + myoffset; - else - myres = 0; - } - return myres; - } - - MAMC_HOST - static std::string classname(){ - std::stringstream ss; - ss << "XMallocSIMD[" << pagesize << "]"; - return ss.str(); - } - - }; - -} //namespace DistributionPolicies - -} //namespace mallocMC + if(coalescible && threadcount > 1) + { + myoffset = atomicAdd(&warp_sizecounter[warpid], bytes); + can_use_coalescing = true; + } + + req_size = bytes; + if(can_use_coalescing) + req_size = (myoffset == 16) ? 
warp_sizecounter[warpid] : 0; + + return req_size; + } + + MAMC_ACCELERATOR + auto distribute(void * allocatedMem) -> void * + { + __shared__ char * + warp_res[MaxThreadsPerBlock::value / WarpSize::value]; + + char * myalloc = (char *)allocatedMem; + if(req_size && can_use_coalescing) + { + warp_res[warpid] = myalloc; + if(myalloc != 0) + *(uint32 *)myalloc = threadcount; + } + __threadfence_block(); + + void * myres = myalloc; + if(can_use_coalescing) + { + if(warp_res[warpid] != 0) + myres = warp_res[warpid] + myoffset; + else + myres = 0; + } + return myres; + } + + MAMC_HOST + static auto classname() -> std::string + { + std::stringstream ss; + ss << "XMallocSIMD[" << pagesize << "]"; + return ss.str(); + } + }; + + } // namespace DistributionPolicies + +} // namespace mallocMC diff --git a/src/include/mallocMC/mallocMC.hpp b/src/include/mallocMC/mallocMC.hpp index 4e421032..9fb7d7aa 100644 --- a/src/include/mallocMC/mallocMC.hpp +++ b/src/include/mallocMC/mallocMC.hpp @@ -43,8 +43,8 @@ #include "mallocMC_hostclass.hpp" // all the policies +#include "AlignmentPolicies.hpp" #include "CreationPolicies.hpp" #include "DistributionPolicies.hpp" -#include "ReservePoolPolicies.hpp" -#include "AlignmentPolicies.hpp" #include "OOMPolicies.hpp" +#include "ReservePoolPolicies.hpp" diff --git a/src/include/mallocMC/mallocMC_allocator_handle.hpp b/src/include/mallocMC/mallocMC_allocator_handle.hpp index 2832496d..9a52e910 100644 --- a/src/include/mallocMC/mallocMC_allocator_handle.hpp +++ b/src/include/mallocMC/mallocMC_allocator_handle.hpp @@ -30,49 +30,34 @@ #include "mallocMC_prefixes.hpp" -namespace mallocMC{ - - template +namespace mallocMC +{ + template struct AllocatorHandleImpl { - typedef typename T_HostAllocator::DevAllocator DevAllocator; + using DevAllocator = typename T_HostAllocator::DevAllocator; - DevAllocator* devAllocator; + DevAllocator * devAllocator; - AllocatorHandleImpl( - DevAllocator* p - ) : - devAllocator( p ) - { - } + explicit AllocatorHandleImpl(DevAllocator * p) : devAllocator(p) {} MAMC_ACCELERATOR - void* - malloc( - size_t size - ) + auto malloc(size_t size) -> void * { - return devAllocator->malloc( size ); + return devAllocator->malloc(size); } MAMC_ACCELERATOR - void - free( - void* p - ) + void free(void * p) { - devAllocator->free( p ); + devAllocator->free(p); } MAMC_ACCELERATOR - unsigned - getAvailableSlots( - size_t slotSize - ) + auto getAvailableSlots(size_t slotSize) -> unsigned { - return devAllocator->getAvailableSlots( slotSize ); + return devAllocator->getAvailableSlots(slotSize); } - }; } // namespace mallocMC diff --git a/src/include/mallocMC/mallocMC_constraints.hpp b/src/include/mallocMC/mallocMC_constraints.hpp index 867e97f7..5f6f4b36 100644 --- a/src/include/mallocMC/mallocMC_constraints.hpp +++ b/src/include/mallocMC/mallocMC_constraints.hpp @@ -30,58 +30,69 @@ #include "creationPolicies/Scatter.hpp" #include "distributionPolicies/XMallocSIMD.hpp" -#include -namespace mallocMC{ - - /** The default PolicyCheckers (do always succeed) - */ - template - class PolicyCheck1{}; - - template - class PolicyCheck2{}; - - template - class PolicyCheck3{}; - - template - class PolicyCheck4{}; - - template - class PolicyCheck5{}; - - - /** Enforces constraints on policies or combinations of polices - * - * Uses template specialization of PolicyChecker - */ - template < - typename T_CreationPolicy, - typename T_DistributionPolicy, - typename T_OOMPolicy, - typename T_GetHeapPolicy, - typename T_AlignmentPolicy - > - - class PolicyConstraints:PolicyCheck2{ 
- - }; - - - /** Scatter and XMallocSIMD need the same pagesize! - * - * This constraint ensures that if the CreationPolicy "Scatter" and the - * DistributionPolicy "XMallocSIMD" are selected, they are configured to use - * the same value for their "pagesize"-parameter. - */ - template - class PolicyCheck2< - typename CreationPolicies::Scatter, - typename DistributionPolicies::XMallocSIMD - >{ - BOOST_MPL_ASSERT_MSG(x::pagesize::value == z::pagesize::value, - Pagesize_must_be_the_same_when_combining_Scatter_and_XMallocSIMD, () ); - }; - -}//namespace mallocMC +namespace mallocMC +{ + /** The default PolicyCheckers (do always succeed) + */ + template + class PolicyCheck1 + {}; + + template + class PolicyCheck2 + {}; + + template + class PolicyCheck3 + {}; + + template< + typename Policy1, + typename Policy2, + typename Policy3, + typename Policy4> + class PolicyCheck4 + {}; + + template< + typename Policy1, + typename Policy2, + typename Policy3, + typename Policy4, + typename Policy5> + class PolicyCheck5 + {}; + + /** Enforces constraints on policies or combinations of polices + * + * Uses template specialization of PolicyChecker + */ + template< + typename T_CreationPolicy, + typename T_DistributionPolicy, + typename T_OOMPolicy, + typename T_GetHeapPolicy, + typename T_AlignmentPolicy> + + class PolicyConstraints : + PolicyCheck2 + {}; + + /** Scatter and XMallocSIMD need the same pagesize! + * + * This constraint ensures that if the CreationPolicy "Scatter" and the + * DistributionPolicy "XMallocSIMD" are selected, they are configured to use + * the same value for their "pagesize"-parameter. + */ + template + class PolicyCheck2< + typename CreationPolicies::Scatter, + typename DistributionPolicies::XMallocSIMD> + { + static_assert( + x::pagesize == z::pagesize, + "Pagesize must be the same when combining Scatter and XMallocSIMD"); + }; + +} // namespace mallocMC diff --git a/src/include/mallocMC/mallocMC_hostclass.hpp b/src/include/mallocMC/mallocMC_hostclass.hpp index c47e6a15..48bc1f74 100644 --- a/src/include/mallocMC/mallocMC_hostclass.hpp +++ b/src/include/mallocMC/mallocMC_hostclass.hpp @@ -28,6 +28,6 @@ #pragma once -#include "mallocMC_traits.hpp" -#include "device_allocator.hpp" #include "allocator.hpp" +#include "device_allocator.hpp" +#include "mallocMC_traits.hpp" diff --git a/src/include/mallocMC/mallocMC_prefixes.hpp b/src/include/mallocMC/mallocMC_prefixes.hpp index 1199e3e1..71c15372 100644 --- a/src/include/mallocMC/mallocMC_prefixes.hpp +++ b/src/include/mallocMC/mallocMC_prefixes.hpp @@ -28,6 +28,7 @@ #pragma once +#include + #define MAMC_HOST __host__ #define MAMC_ACCELERATOR __device__ - diff --git a/src/include/mallocMC/mallocMC_traits.hpp b/src/include/mallocMC/mallocMC_traits.hpp index 0e811d74..d40efa83 100644 --- a/src/include/mallocMC/mallocMC_traits.hpp +++ b/src/include/mallocMC/mallocMC_traits.hpp @@ -28,15 +28,12 @@ #pragma once -#include - - -namespace mallocMC{ - - template - struct Traits{ - BOOST_STATIC_CONSTEXPR bool providesAvailableSlots = T_Allocator::CreationPolicy::providesAvailableSlots::value; +namespace mallocMC +{ + template + struct Traits + { + static constexpr bool providesAvailableSlots + = T_Allocator::CreationPolicy::providesAvailableSlots; }; - -} //namespace mallocMC - +} // namespace mallocMC diff --git a/src/include/mallocMC/mallocMC_utils.hpp b/src/include/mallocMC/mallocMC_utils.hpp index 2353cd37..ebe5ee83 100644 --- a/src/include/mallocMC/mallocMC_utils.hpp +++ b/src/include/mallocMC/mallocMC_utils.hpp @@ -33,204 +33,204 @@ 
#pragma once +#include + #ifdef _MSC_VER #include #endif -#include -#include -#include -#include - #include "mallocMC_prefixes.hpp" +#include +#include +#include +#include namespace CUDA { - class error : public std::runtime_error - { - private: - static std::string genErrorString(cudaError errorValue, const char* file, int line) - { - std::ostringstream msg; - msg << file << '(' << line << "): error: " << cudaGetErrorString(errorValue); - return msg.str(); - } - public: - error(cudaError errorValue, const char* file, int line) - : runtime_error(genErrorString(errorValue, file, line)) + class error : public std::runtime_error + { + private: + static auto + genErrorString(cudaError errorValue, const char * file, int line) + -> std::string + { + std::ostringstream msg; + msg << file << '(' << line + << "): error: " << cudaGetErrorString(errorValue); + return msg.str(); + } + + public: + error(cudaError errorValue, const char * file, int line) : + runtime_error(genErrorString(errorValue, file, line)) + {} + + explicit error(cudaError errorValue) : + runtime_error(cudaGetErrorString(errorValue)) + {} + + explicit error(const std::string & msg) : runtime_error(msg) {} + }; + + inline void checkError(cudaError errorValue, const char * file, int line) { + if(errorValue != cudaSuccess) + throw CUDA::error(errorValue, file, line); } - error(cudaError errorValue) - : runtime_error(cudaGetErrorString(errorValue)) + inline void checkError(const char * file, int line) { + checkError(cudaGetLastError(), file, line); } - error(const std::string& msg) - : runtime_error(msg) + inline void checkError() { + cudaError errorValue = cudaGetLastError(); + if(errorValue != cudaSuccess) + throw CUDA::error(errorValue); } - }; - - inline void checkError(cudaError errorValue, const char* file, int line) - { - if (errorValue != cudaSuccess) - throw CUDA::error(errorValue, file, line); - } - - inline void checkError(const char* file, int line) - { - checkError(cudaGetLastError(), file, line); - } - - inline void checkError() - { - cudaError errorValue = cudaGetLastError(); - if (errorValue != cudaSuccess) - throw CUDA::error(errorValue); - } - -#define MALLOCMC_CUDA_CHECKED_CALL(call) CUDA::checkError(call, __FILE__, __LINE__) -#define MALLOCMC_CUDA_CHECK_ERROR() CUDA::checkError(__FILE__, __LINE__) -} +#define MALLOCMC_CUDA_CHECKED_CALL(call) \ + CUDA::checkError(call, __FILE__, __LINE__) +#define MALLOCMC_CUDA_CHECK_ERROR() CUDA::checkError(__FILE__, __LINE__) +} // namespace CUDA namespace mallocMC { + template + class __PointerEquivalent + { + public: + using type = unsigned int; + }; + template<> + class __PointerEquivalent<8> + { + public: + using type = unsigned long long; + }; + + using PointerEquivalent + = mallocMC::__PointerEquivalent::type; - template - class __PointerEquivalent - { - public: - typedef unsigned int type; - }; - template<> - class __PointerEquivalent<8> - { - public: - typedef unsigned long long int type; - }; - - typedef mallocMC::__PointerEquivalent::type PointerEquivalent; - - - MAMC_ACCELERATOR inline boost::uint32_t laneid() - { - boost::uint32_t mylaneid; - asm("mov.u32 %0, %%laneid;" : "=r" (mylaneid)); - return mylaneid; - } - - /** warp index within a multiprocessor - * - * Index of the warp within the multiprocessor at the moment of the query. - * The result is volatile and can be different with each query. 
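The MALLOCMC_CUDA_CHECKED_CALL and MALLOCMC_CUDA_CHECK_ERROR macros defined above convert CUDA error codes into CUDA::error exceptions. A minimal host-side sketch of the intended usage, to be compiled with nvcc and with the include path assumed to point at src/include:

```cpp
#include <mallocMC/mallocMC_utils.hpp>

#include <cuda_runtime.h>
#include <iostream>

int main()
{
    try
    {
        void * p = nullptr;
        MALLOCMC_CUDA_CHECKED_CALL(cudaMalloc(&p, 1024));
        MALLOCMC_CUDA_CHECKED_CALL(cudaFree(p));
        MALLOCMC_CUDA_CHECK_ERROR(); // re-check the last error explicitly
    }
    catch(CUDA::error const & e)
    {
        // message format: "<file>(<line>): error: <cuda error string>"
        std::cerr << e.what() << '\n';
        return 1;
    }
    return 0;
}
```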
- * - * @return current index of the warp - */ - MAMC_ACCELERATOR inline boost::uint32_t warpid() - { - boost::uint32_t mywarpid; - asm("mov.u32 %0, %%warpid;" : "=r" (mywarpid)); - return mywarpid; - } - - /** maximum number of warps on a multiprocessor - * - * @return maximum number of warps on a multiprocessor - */ - MAMC_ACCELERATOR inline boost::uint32_t nwarpid() - { - boost::uint32_t mynwarpid; - asm("mov.u32 %0, %%nwarpid;" : "=r" (mynwarpid)); - return mynwarpid; - } - - MAMC_ACCELERATOR inline boost::uint32_t smid() - { - boost::uint32_t mysmid; - asm("mov.u32 %0, %%smid;" : "=r" (mysmid)); - return mysmid; - } - - MAMC_ACCELERATOR inline boost::uint32_t nsmid() - { - boost::uint32_t mynsmid; - asm("mov.u32 %0, %%nsmid;" : "=r" (mynsmid)); - return mynsmid; - } - MAMC_ACCELERATOR inline boost::uint32_t lanemask() - { - boost::uint32_t lanemask; - asm("mov.u32 %0, %%lanemask_eq;" : "=r" (lanemask)); - return lanemask; - } - - MAMC_ACCELERATOR inline boost::uint32_t lanemask_le() - { - boost::uint32_t lanemask; - asm("mov.u32 %0, %%lanemask_le;" : "=r" (lanemask)); - return lanemask; - } - - MAMC_ACCELERATOR inline boost::uint32_t lanemask_lt() - { - boost::uint32_t lanemask; - asm("mov.u32 %0, %%lanemask_lt;" : "=r" (lanemask)); - return lanemask; - } - - MAMC_ACCELERATOR inline boost::uint32_t lanemask_ge() - { - boost::uint32_t lanemask; - asm("mov.u32 %0, %%lanemask_ge;" : "=r" (lanemask)); - return lanemask; - } - - MAMC_ACCELERATOR inline boost::uint32_t lanemask_gt() - { - boost::uint32_t lanemask; - asm("mov.u32 %0, %%lanemask_gt;" : "=r" (lanemask)); - return lanemask; - } - - template - MAMC_HOST MAMC_ACCELERATOR inline T divup(T a, T b) { return (a + b - 1)/b; } - - /** the maximal number threads per block - * - * https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities - */ - struct MaxThreadsPerBlock - { - // valid for sm_2.X - sm_7.5 - BOOST_STATIC_CONSTEXPR uint32_t value = 1024; - }; - - /** number of threads within a warp - * - * https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities - */ - struct WarpSize - { - // valid for sm_2.X - sm_7.5 - BOOST_STATIC_CONSTEXPR uint32_t value = 32; - }; - - /** warp id within a cuda block - * - * The id is constant over the lifetime of the thread. - * The id is not equal to warpid(). - * - * @return warp id within the block - */ - MAMC_ACCELERATOR inline boost::uint32_t warpid_withinblock() - { - return ( - threadIdx.z * blockDim.y * blockDim.x + - threadIdx.y * blockDim.x + - threadIdx.x - ) / WarpSize::value; - } -} + MAMC_ACCELERATOR inline auto laneid() -> std::uint32_t + { + std::uint32_t mylaneid; + asm("mov.u32 %0, %%laneid;" : "=r"(mylaneid)); + return mylaneid; + } + + /** warp index within a multiprocessor + * + * Index of the warp within the multiprocessor at the moment of the query. + * The result is volatile and can be different with each query. 
+ * + * @return current index of the warp + */ + MAMC_ACCELERATOR inline auto warpid() -> std::uint32_t + { + std::uint32_t mywarpid; + asm("mov.u32 %0, %%warpid;" : "=r"(mywarpid)); + return mywarpid; + } + + /** maximum number of warps on a multiprocessor + * + * @return maximum number of warps on a multiprocessor + */ + MAMC_ACCELERATOR inline auto nwarpid() -> std::uint32_t + { + std::uint32_t mynwarpid; + asm("mov.u32 %0, %%nwarpid;" : "=r"(mynwarpid)); + return mynwarpid; + } + + MAMC_ACCELERATOR inline auto smid() -> std::uint32_t + { + std::uint32_t mysmid; + asm("mov.u32 %0, %%smid;" : "=r"(mysmid)); + return mysmid; + } + + MAMC_ACCELERATOR inline auto nsmid() -> std::uint32_t + { + std::uint32_t mynsmid; + asm("mov.u32 %0, %%nsmid;" : "=r"(mynsmid)); + return mynsmid; + } + MAMC_ACCELERATOR inline auto lanemask() -> std::uint32_t + { + std::uint32_t lanemask; + asm("mov.u32 %0, %%lanemask_eq;" : "=r"(lanemask)); + return lanemask; + } + + MAMC_ACCELERATOR inline auto lanemask_le() -> std::uint32_t + { + std::uint32_t lanemask; + asm("mov.u32 %0, %%lanemask_le;" : "=r"(lanemask)); + return lanemask; + } + + MAMC_ACCELERATOR inline auto lanemask_lt() -> std::uint32_t + { + std::uint32_t lanemask; + asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask)); + return lanemask; + } + + MAMC_ACCELERATOR inline auto lanemask_ge() -> std::uint32_t + { + std::uint32_t lanemask; + asm("mov.u32 %0, %%lanemask_ge;" : "=r"(lanemask)); + return lanemask; + } + + MAMC_ACCELERATOR inline auto lanemask_gt() -> std::uint32_t + { + std::uint32_t lanemask; + asm("mov.u32 %0, %%lanemask_gt;" : "=r"(lanemask)); + return lanemask; + } + + template + MAMC_HOST MAMC_ACCELERATOR inline auto divup(T a, T b) -> T + { + return (a + b - 1) / b; + } + + /** the maximal number threads per block + * + * https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities + */ + struct MaxThreadsPerBlock + { + // valid for sm_2.X - sm_7.5 + static constexpr uint32_t value = 1024; + }; + + /** number of threads within a warp + * + * https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities + */ + struct WarpSize + { + // valid for sm_2.X - sm_7.5 + static constexpr uint32_t value = 32; + }; + + /** warp id within a cuda block + * + * The id is constant over the lifetime of the thread. + * The id is not equal to warpid(). + * + * @return warp id within the block + */ + MAMC_ACCELERATOR inline auto warpid_withinblock() -> std::uint32_t + { + return (threadIdx.z * blockDim.y * blockDim.x + threadIdx.y * blockDim.x + + threadIdx.x) + / WarpSize::value; + } +} // namespace mallocMC diff --git a/src/include/mallocMC/oOMPolicies/BadAllocException.hpp b/src/include/mallocMC/oOMPolicies/BadAllocException.hpp index 9f25ecae..26a13418 100644 --- a/src/include/mallocMC/oOMPolicies/BadAllocException.hpp +++ b/src/include/mallocMC/oOMPolicies/BadAllocException.hpp @@ -27,19 +27,21 @@ #pragma once -namespace mallocMC{ -namespace OOMPolicies{ - - /** - * @brief Throws a std::bad_alloc exception on OutOfMemory - * - * This OOMPolicy will throw a std::bad_alloc exception, if the accelerator - * supports it. Currently, Nvidia CUDA does not support any form of exception - * handling, therefore handleOOM() does not have any effect on these - * accelerators. Using this policy on other types of accelerators that do not - * support exceptions results in undefined behaviour. 
- */ - struct BadAllocException; - -} //namespace OOMPolicies -} //namespace mallocMC +namespace mallocMC +{ + namespace OOMPolicies + { + /** + * @brief Throws a std::bad_alloc exception on OutOfMemory + * + * This OOMPolicy will throw a std::bad_alloc exception, if the + * accelerator supports it. Currently, Nvidia CUDA does not support any + * form of exception handling, therefore handleOOM() does not have any + * effect on these accelerators. Using this policy on other types of + * accelerators that do not support exceptions results in undefined + * behaviour. + */ + struct BadAllocException; + + } // namespace OOMPolicies +} // namespace mallocMC diff --git a/src/include/mallocMC/oOMPolicies/BadAllocException_impl.hpp b/src/include/mallocMC/oOMPolicies/BadAllocException_impl.hpp index fb180370..05bd6626 100644 --- a/src/include/mallocMC/oOMPolicies/BadAllocException_impl.hpp +++ b/src/include/mallocMC/oOMPolicies/BadAllocException_impl.hpp @@ -27,19 +27,21 @@ #pragma once -#include -#include - -#include "BadAllocException.hpp" #include "../mallocMC_prefixes.hpp" +#include "BadAllocException.hpp" -namespace mallocMC{ -namespace OOMPolicies{ +#include +#include - struct BadAllocException - { - MAMC_ACCELERATOR - static void* handleOOM(void* mem){ +namespace mallocMC +{ + namespace OOMPolicies + { + struct BadAllocException + { + MAMC_ACCELERATOR + static auto handleOOM(void * mem) -> void * + { #ifdef __CUDACC__ //#if __CUDA_ARCH__ < 350 #define PM_EXCEPTIONS_NOT_SUPPORTED_HERE @@ -48,18 +50,18 @@ namespace OOMPolicies{ #ifdef PM_EXCEPTIONS_NOT_SUPPORTED_HERE #undef PM_EXCEPTIONS_NOT_SUPPORTED_HERE - assert(false); + assert(false); #else - std::bad_alloc exception; - throw exception; + throw std::bad_alloc{}; #endif - return mem; - } + return mem; + } - static std::string classname(){ - return "BadAllocException"; - } - }; + static auto classname() -> std::string + { + return "BadAllocException"; + } + }; -} //namespace OOMPolicies -} //namespace mallocMC + } // namespace OOMPolicies +} // namespace mallocMC diff --git a/src/include/mallocMC/oOMPolicies/ReturnNull.hpp b/src/include/mallocMC/oOMPolicies/ReturnNull.hpp index 2db8568a..1d723133 100644 --- a/src/include/mallocMC/oOMPolicies/ReturnNull.hpp +++ b/src/include/mallocMC/oOMPolicies/ReturnNull.hpp @@ -27,15 +27,16 @@ #pragma once -namespace mallocMC{ -namespace OOMPolicies{ - - /** - * @brief Returns a NULL pointer on OutOfMemory conditions - * - * This OOMPolicy will return NULL, if handleOOM() is called. - */ - class ReturnNull; - -} //namespace OOMPolicies -} //namespace mallocMC +namespace mallocMC +{ + namespace OOMPolicies + { + /** + * @brief Returns a nullptr pointer on OutOfMemory conditions + * + * This OOMPolicy will return nullptr, if handleOOM() is called. 
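Since ReturnNull reports an out-of-memory condition only through the returned pointer, device code has to test every allocation, as the allocAll kernel in tests/verify_heap.cu does further down in this patch. A reduced sketch, with the allocator handle type left as a template parameter because the concrete ScatterAllocator::AllocatorHandle type comes from the test configuration:

```cpp
// Sketch of handling the ReturnNull policy in a kernel. T_AllocHandle stands
// for an allocator handle such as ScatterAllocator::AllocatorHandle; any type
// exposing a device-side malloc()/free() pair would work here.
template<typename T_AllocHandle>
__global__ void fillUntilFull(
    int ** slots,
    unsigned long long * nSlots,
    T_AllocHandle mMC)
{
    while(true)
    {
        int * p = (int *)mMC.malloc(256 * sizeof(int));
        if(p == nullptr) // with OOMPolicies::ReturnNull this means "heap full"
            break;
        slots[atomicAdd(nSlots, 1ull)] = p;
    }
}
```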
+ */ + class ReturnNull; + + } // namespace OOMPolicies +} // namespace mallocMC diff --git a/src/include/mallocMC/oOMPolicies/ReturnNull_impl.hpp b/src/include/mallocMC/oOMPolicies/ReturnNull_impl.hpp index f18bd0e2..3e44b18f 100644 --- a/src/include/mallocMC/oOMPolicies/ReturnNull_impl.hpp +++ b/src/include/mallocMC/oOMPolicies/ReturnNull_impl.hpp @@ -27,26 +27,29 @@ #pragma once -#include - -#include "ReturnNull.hpp" #include "../mallocMC_prefixes.hpp" +#include "ReturnNull.hpp" -namespace mallocMC{ -namespace OOMPolicies{ - - class ReturnNull - { - public: - MAMC_ACCELERATOR - static void* handleOOM(void* mem){ - return NULL; - } - - static std::string classname(){ - return "ReturnNull"; - } - }; +#include -} //namespace OOMPolicies -} //namespace mallocMC +namespace mallocMC +{ + namespace OOMPolicies + { + class ReturnNull + { + public: + MAMC_ACCELERATOR + static auto handleOOM(void * mem) -> void * + { + return nullptr; + } + + static auto classname() -> std::string + { + return "ReturnNull"; + } + }; + + } // namespace OOMPolicies +} // namespace mallocMC diff --git a/src/include/mallocMC/reservePoolPolicies/CudaSetLimits.hpp b/src/include/mallocMC/reservePoolPolicies/CudaSetLimits.hpp index cfbceea2..1cee0a5b 100644 --- a/src/include/mallocMC/reservePoolPolicies/CudaSetLimits.hpp +++ b/src/include/mallocMC/reservePoolPolicies/CudaSetLimits.hpp @@ -27,18 +27,19 @@ #pragma once -namespace mallocMC{ -namespace ReservePoolPolicies{ - - /** - * @brief set CUDA internal heap for device-side malloc calls - * - * This ReservePoolPolicy is intended for use with CUDA capable accelerators - * that support at least compute capability 2.0. It should be used in - * conjunction with a CreationPolicy that actually requires the CUDA-internal - * heap to be sized by calls to cudaDeviceSetLimit() - */ - struct CudaSetLimits; - -} //namespace ReservePoolPolicies -} //namespace mallocMC +namespace mallocMC +{ + namespace ReservePoolPolicies + { + /** + * @brief set CUDA internal heap for device-side malloc calls + * + * This ReservePoolPolicy is intended for use with CUDA capable + * accelerators that support at least compute capability 2.0. 
It should + * be used in conjunction with a CreationPolicy that actually requires + * the CUDA-internal heap to be sized by calls to cudaDeviceSetLimit() + */ + struct CudaSetLimits; + + } // namespace ReservePoolPolicies +} // namespace mallocMC diff --git a/src/include/mallocMC/reservePoolPolicies/CudaSetLimits_impl.hpp b/src/include/mallocMC/reservePoolPolicies/CudaSetLimits_impl.hpp index eb4ddfa2..baed9ad6 100644 --- a/src/include/mallocMC/reservePoolPolicies/CudaSetLimits_impl.hpp +++ b/src/include/mallocMC/reservePoolPolicies/CudaSetLimits_impl.hpp @@ -27,29 +27,33 @@ #pragma once -#include -#include - #include "CudaSetLimits.hpp" -namespace mallocMC{ -namespace ReservePoolPolicies{ - - struct CudaSetLimits{ - static void* setMemPool(size_t memsize){ - cudaDeviceSetLimit(cudaLimitMallocHeapSize, memsize); - return NULL; - } - - static void resetMemPool(void *p=NULL){ - cudaDeviceSetLimit(cudaLimitMallocHeapSize, 8192U); - } - - static std::string classname(){ - return "CudaSetLimits"; - } - - }; +#include +#include -} //namespace ReservePoolPolicies -} //namespace mallocMC +namespace mallocMC +{ + namespace ReservePoolPolicies + { + struct CudaSetLimits + { + static auto setMemPool(size_t memsize) -> void * + { + cudaDeviceSetLimit(cudaLimitMallocHeapSize, memsize); + return nullptr; + } + + static void resetMemPool(void * p = nullptr) + { + cudaDeviceSetLimit(cudaLimitMallocHeapSize, 8192U); + } + + static auto classname() -> std::string + { + return "CudaSetLimits"; + } + }; + + } // namespace ReservePoolPolicies +} // namespace mallocMC diff --git a/src/include/mallocMC/reservePoolPolicies/SimpleCudaMalloc.hpp b/src/include/mallocMC/reservePoolPolicies/SimpleCudaMalloc.hpp index 1bb386f7..c98197d1 100644 --- a/src/include/mallocMC/reservePoolPolicies/SimpleCudaMalloc.hpp +++ b/src/include/mallocMC/reservePoolPolicies/SimpleCudaMalloc.hpp @@ -27,18 +27,19 @@ #pragma once -namespace mallocMC{ -namespace ReservePoolPolicies{ - - /** - * @brief creates/allocates a fixed memory pool on the accelerator - * - * This ReservePoolPolicy will create a memory pool of a fixed size on the - * accelerator by using a host-side call to cudaMalloc(). The pool is later - * freed through cudaFree(). This can only be used with accelerators that - * support CUDA and compute capability 2.0 or higher. - */ - struct SimpleCudaMalloc; - -} //namespace ReservePoolPolicies -} //namespace mallocMC +namespace mallocMC +{ + namespace ReservePoolPolicies + { + /** + * @brief creates/allocates a fixed memory pool on the accelerator + * + * This ReservePoolPolicy will create a memory pool of a fixed size on + * the accelerator by using a host-side call to cudaMalloc(). The pool + * is later freed through cudaFree(). This can only be used with + * accelerators that support CUDA and compute capability 2.0 or higher. 
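Both ReservePoolPolicies expose the same pair of static functions, setMemPool() and resetMemPool(); they differ only in where the memory comes from. A host-side sketch of the SimpleCudaMalloc flow, which is normally driven by the host-side allocator rather than called directly; the include path is assumed to be relative to src/include:

```cpp
#include <mallocMC/reservePoolPolicies/SimpleCudaMalloc_impl.hpp>

#include <cstdio>

int main()
{
    using Pool = mallocMC::ReservePoolPolicies::SimpleCudaMalloc;

    // reserve a 64 MiB pool on the device via cudaMalloc()
    void * pool = Pool::setMemPool(64U * 1024U * 1024U);
    std::printf("reserved pool at %p\n", pool);

    // ... normally the pool pointer is handed to the CreationPolicy's
    // initHeap() here ...

    // release the pool again via cudaFree()
    Pool::resetMemPool(pool);
    return 0;
}
```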
+ */ + struct SimpleCudaMalloc; + + } // namespace ReservePoolPolicies +} // namespace mallocMC diff --git a/src/include/mallocMC/reservePoolPolicies/SimpleCudaMalloc_impl.hpp b/src/include/mallocMC/reservePoolPolicies/SimpleCudaMalloc_impl.hpp index 23563d97..71b297e2 100644 --- a/src/include/mallocMC/reservePoolPolicies/SimpleCudaMalloc_impl.hpp +++ b/src/include/mallocMC/reservePoolPolicies/SimpleCudaMalloc_impl.hpp @@ -27,31 +27,34 @@ #pragma once -#include - #include "../mallocMC_utils.hpp" - #include "SimpleCudaMalloc.hpp" -namespace mallocMC{ -namespace ReservePoolPolicies{ - - struct SimpleCudaMalloc{ - static void* setMemPool(size_t memsize){ - void* pool = NULL; - MALLOCMC_CUDA_CHECKED_CALL(cudaMalloc(&pool, memsize)); - return pool; - } - - static void resetMemPool(void* p){ - MALLOCMC_CUDA_CHECKED_CALL(cudaFree(p)); - } - - static std::string classname(){ - return "SimpleCudaMalloc"; - } - - }; +#include -} //namespace ReservePoolPolicies -} //namespace mallocMC +namespace mallocMC +{ + namespace ReservePoolPolicies + { + struct SimpleCudaMalloc + { + static auto setMemPool(size_t memsize) -> void * + { + void * pool = nullptr; + MALLOCMC_CUDA_CHECKED_CALL(cudaMalloc(&pool, memsize)); + return pool; + } + + static void resetMemPool(void * p) + { + MALLOCMC_CUDA_CHECKED_CALL(cudaFree(p)); + } + + static auto classname() -> std::string + { + return "SimpleCudaMalloc"; + } + }; + + } // namespace ReservePoolPolicies +} // namespace mallocMC diff --git a/src/include/mallocMC/version.hpp b/src/include/mallocMC/version.hpp index f2c15e1e..8a81728c 100644 --- a/src/include/mallocMC/version.hpp +++ b/src/include/mallocMC/version.hpp @@ -38,8 +38,8 @@ /** the mallocMC version: major API changes should be reflected here */ #define MALLOCMC_VERSION_MAJOR 2 -#define MALLOCMC_VERSION_MINOR 3 -#define MALLOCMC_VERSION_PATCH 1 +#define MALLOCMC_VERSION_MINOR 4 +#define MALLOCMC_VERSION_PATCH 0 /** the mallocMC flavor is used to differentiate the releases of the * Computational Radiation Physics group (crp) from other releases diff --git a/tests/verify_heap.cu b/tests/verify_heap.cu index d5a3004f..3ee214ab 100644 --- a/tests/verify_heap.cu +++ b/tests/verify_heap.cu @@ -26,57 +26,75 @@ THE SOFTWARE. */ - // get a CUDA error and print it nicely -#define CUDA_CHECK(cmd) {cudaError_t error = cmd; \ - if(error!=cudaSuccess){\ - printf("<%s>:%i ",__FILE__,__LINE__);\ - printf("[CUDA] Error: %s\n", cudaGetErrorString(error));}} +#define CUDA_CHECK(cmd) \ + { \ + cudaError_t error = cmd; \ + if(error != cudaSuccess) \ + { \ + printf("<%s>:%i ", __FILE__, __LINE__); \ + printf("[CUDA] Error: %s\n", cudaGetErrorString(error)); \ + } \ + } // start kernel, wait for finish and check errors -#define CUDA_CHECK_KERNEL_SYNC(...) __VA_ARGS__;CUDA_CHECK(cudaDeviceSynchronize()) +#define CUDA_CHECK_KERNEL_SYNC(...) 
\ + __VA_ARGS__; \ + CUDA_CHECK(cudaDeviceSynchronize()) // each pointer in the datastructure will point to this many // elements of type allocElem_t -#define ELEMS_PER_SLOT 750 +constexpr auto ELEMS_PER_SLOT = 750; +#include #include #include -#include +#include #include #include -//include the Heap with the arguments given in the config -#include "src/include/mallocMC/mallocMC_utils.hpp" +// include the Heap with the arguments given in the config #include "verify_heap_config.hpp" +#include + // global variable for verbosity, might change due to user input '--verbose' bool verbose = false; // the type of the elements to allocate -typedef unsigned long long allocElem_t; - -bool run_heap_verification(const size_t, const unsigned, const unsigned, const bool); -void parse_cmdline(const int, char**, size_t*, unsigned*, unsigned*, bool*); -void print_help(char**); +using allocElem_t = unsigned long long; +auto run_heap_verification( + const size_t, + const unsigned, + const unsigned, + const bool) -> bool; +void parse_cmdline( + const int, + char **, + size_t *, + unsigned *, + unsigned *, + bool *); +void print_help(char **); // used to create an empty stream for non-verbose output -struct nullstream : std::ostream { - nullstream() : std::ostream(0) { } +struct nullstream : std::ostream +{ + nullstream() : std::ostream(0) {} }; -// uses global verbosity to switch between std::cout and a NULL-output -std::ostream& dout() { - static nullstream n; - return verbose ? std::cout : n; +// uses global verbosity to switch between std::cout and a nullptr-output +auto dout() -> std::ostream & +{ + static nullstream n; + return verbose ? std::cout : n; } // define some defaults -BOOST_STATIC_CONSTEXPR unsigned threads_default = 128; -BOOST_STATIC_CONSTEXPR unsigned blocks_default = 64; -BOOST_STATIC_CONSTEXPR size_t heapInMB_default = 1024; // 1GB - +static constexpr unsigned threads_default = 128; +static constexpr unsigned blocks_default = 64; +static constexpr size_t heapInMB_default = 1024; // 1GB /** * will do a basic verification of scatterAlloc. @@ -87,39 +105,50 @@ BOOST_STATIC_CONSTEXPR size_t heapInMB_default = 1024; // 1GB * @return will return 0 if the verification was successful, * otherwise returns 1 */ -int main(int argc, char** argv){ - bool correct = false; - bool machine_readable = false; - size_t heapInMB = heapInMB_default; - unsigned threads = threads_default; - unsigned blocks = blocks_default; - - parse_cmdline(argc, argv, &heapInMB, &threads, &blocks, &machine_readable); - - cudaDeviceProp deviceProp; - cudaGetDeviceProperties(&deviceProp, 0); - - if( deviceProp.major < 2 ) { - std::cerr << "Error: Compute Capability >= 2.0 required. 
(is "; - std::cerr << deviceProp.major << "."<< deviceProp.minor << ")" << std::endl; - return 1; - } - - cudaSetDevice(0); - correct = run_heap_verification(heapInMB, threads, blocks, machine_readable); - cudaDeviceReset(); - - if(!machine_readable || verbose){ - if(correct){ - std::cout << "\033[0;32mverification successful ✔\033[0m" << std::endl; - return 0; - }else{ - std::cerr << "\033[0;31mverification failed\033[0m" << std::endl; - return 1; +auto main(int argc, char ** argv) -> int +{ + bool correct = false; + bool machine_readable = false; + size_t heapInMB = heapInMB_default; + unsigned threads = threads_default; + unsigned blocks = blocks_default; + + parse_cmdline(argc, argv, &heapInMB, &threads, &blocks, &machine_readable); + + int computeCapabilityMajor = 0; + cudaDeviceGetAttribute( + &computeCapabilityMajor, cudaDevAttrComputeCapabilityMajor, 0); + int computeCapabilityMinor = 0; + cudaDeviceGetAttribute( + &computeCapabilityMinor, cudaDevAttrComputeCapabilityMinor, 0); + + if(computeCapabilityMajor < 2) + { + std::cerr << "Error: Compute Capability >= 2.0 required. (is "; + std::cerr << computeCapabilityMajor << "." << computeCapabilityMinor + << ")\n"; + return 1; } - } -} + cudaSetDevice(0); + correct + = run_heap_verification(heapInMB, threads, blocks, machine_readable); + cudaDeviceReset(); + + if(!machine_readable || verbose) + { + if(correct) + { + std::cout << "\033[0;32mverification successful ✔\033[0m\n"; + return 0; + } + else + { + std::cerr << "\033[0;31mverification failed\033[0m\n"; + return 1; + } + } +} /** * will parse command line arguments @@ -134,98 +163,106 @@ int main(int argc, char** argv){ */ void parse_cmdline( const int argc, - char**argv, - size_t *heapInMB, - unsigned *threads, - unsigned *blocks, - bool *machine_readable - ){ - - std::vector > parameters; - - // Parse Commandline, tokens are shaped like ARG=PARAM or ARG - // This requires to use '=', if you want to supply a value with a parameter - for (int i = 1; i < argc; ++i) { - char* pos = strtok(argv[i], "="); - std::pair < std::string, std::string > p(std::string(pos), std::string("")); - pos = strtok(NULL, "="); - if (pos != NULL) { - p.second = std::string(pos); + char ** argv, + size_t * heapInMB, + unsigned * threads, + unsigned * blocks, + bool * machine_readable) +{ + std::vector> parameters; + + // Parse Commandline, tokens are shaped like ARG=PARAM or ARG + // This requires to use '=', if you want to supply a value with a parameter + for(int i = 1; i < argc; ++i) + { + char * pos = strtok(argv[i], "="); + std::pair p( + std::string(pos), std::string("")); + pos = strtok(nullptr, "="); + if(pos != nullptr) + { + p.second = std::string(pos); + } + parameters.push_back(p); } - parameters.push_back(p); - } - - // go through all parameters that were found - for (unsigned i = 0; i < parameters.size(); ++i) { - std::pair < std::string, std::string > p = parameters.at(i); - if (p.first == "-v" || p.first == "--verbose") { - verbose = true; + // go through all parameters that were found + for(unsigned i = 0; i < parameters.size(); ++i) + { + std::pair p = parameters.at(i); + + if(p.first == "-v" || p.first == "--verbose") + { + verbose = true; + } + + if(p.first == "--threads") + { + *threads = atoi(p.second.c_str()); + } + + if(p.first == "--blocks") + { + *blocks = atoi(p.second.c_str()); + } + + if(p.first == "--heapsize") + { + *heapInMB = size_t(atoi(p.second.c_str())); + } + + if(p.first == "-h" || p.first == "--help") + { + print_help(argv); + exit(0); + } + + if(p.first == "-m" || 
p.first == "--machine_readable") + { + *machine_readable = true; + } } - - if (p.first == "--threads") { - *threads = atoi(p.second.c_str()); - } - - if (p.first == "--blocks") { - *blocks = atoi(p.second.c_str()); - } - - if(p.first == "--heapsize") { - *heapInMB = size_t(atoi(p.second.c_str())); - } - - if(p.first == "-h" || p.first == "--help"){ - print_help(argv); - exit(0); - } - - if(p.first == "-m" || p.first == "--machine_readable"){ - *machine_readable = true; - } - } } - /** * prints a helpful message about program use * * @param argv the argv-parameter from main, used to find the program name */ -void print_help(char** argv){ - std::stringstream s; - - s << "SYNOPSIS:" << std::endl; - s << argv[0] << " [OPTIONS]" << std::endl; - s << "" << std::endl; - s << "OPTIONS:" << std::endl; - s << " -h, --help" << std::endl; - s << " Print this help message and exit" << std::endl; - s << "" << std::endl; - s << " -v, --verbose" << std::endl; - s << " Print information about parameters and progress" << std::endl; - s << "" << std::endl; - s << " -m, --machine_readable" << std::endl; - s << " Print all relevant parameters as CSV. This will" << std::endl; - s << " suppress all other output unless explicitly" << std::endl; - s << " requested with --verbose or -v" << std::endl; - s << "" << std::endl; - s << " --threads=N" << std::endl; - s << " Set the number of threads per block (default " ; - s << threads_default << "128)" << std::endl; - s << "" << std::endl; - s << " --blocks=N" << std::endl; - s << " Set the number of blocks in the grid (default " ; - s << blocks_default << ")" << std::endl; - s << "" << std::endl; - s << " --heapsize=N" << std::endl; - s << " Set the heapsize to N Megabyte (default " ; - s << heapInMB_default << "1024)" << std::endl; - - std::cout << s.str(); +void print_help(char ** argv) +{ + std::stringstream s; + + s << "SYNOPSIS:" << '\n'; + s << argv[0] << " [OPTIONS]" << '\n'; + s << "" << '\n'; + s << "OPTIONS:" << '\n'; + s << " -h, --help" << '\n'; + s << " Print this help message and exit" << '\n'; + s << "" << '\n'; + s << " -v, --verbose" << '\n'; + s << " Print information about parameters and progress" << '\n'; + s << "" << '\n'; + s << " -m, --machine_readable" << '\n'; + s << " Print all relevant parameters as CSV. 
This will" << '\n'; + s << " suppress all other output unless explicitly" << '\n'; + s << " requested with --verbose or -v" << '\n'; + s << "" << '\n'; + s << " --threads=N" << '\n'; + s << " Set the number of threads per block (default "; + s << threads_default << "128)" << '\n'; + s << "" << '\n'; + s << " --blocks=N" << '\n'; + s << " Set the number of blocks in the grid (default "; + s << blocks_default << ")" << '\n'; + s << "" << '\n'; + s << " --heapsize=N" << '\n'; + s << " Set the heapsize to N Megabyte (default "; + s << heapInMB_default << "1024)" << '\n'; + + std::cout << s.str() << std::flush; } - /** * checks validity of memory for each single cell * @@ -245,31 +282,37 @@ void print_help(char** argv){ * Will change to 0, if there was a value that didn't match */ __global__ void check_content( - allocElem_t** data, - unsigned long long *counter, - unsigned long long* globalSum, + allocElem_t ** data, + unsigned long long * counter, + unsigned long long * globalSum, const size_t nSlots, - int* correct - ){ - - unsigned long long sum=0; - while(true){ - size_t pos = atomicAdd(counter,1); - if(pos >= nSlots){break;} - const size_t offset = pos*ELEMS_PER_SLOT; - for(size_t i=0;i(data[pos][i]) != static_cast(offset+i)){ - //printf("\nError in Kernel: data[%llu][%llu] is %#010x (should be %#010x)\n", - // pos,i,static_cast(data[pos][i]),allocElem_t(offset+i)); - atomicAnd(correct,0); - } - sum += static_cast(data[pos][i]); + int * correct) +{ + unsigned long long sum = 0; + while(true) + { + const size_t pos = atomicAdd(counter, 1); + if(pos >= nSlots) + { + break; + } + const size_t offset = pos * ELEMS_PER_SLOT; + for(size_t i = 0; i < ELEMS_PER_SLOT; ++i) + { + if(static_cast(data[pos][i]) + != static_cast(offset + i)) + { + // printf("\nError in Kernel: data[%llu][%llu] is %#010x (should + // be %#010x)\n", + // pos,i,static_cast(data[pos][i]),allocElem_t(offset+i)); + atomicAnd(correct, 0); + } + sum += static_cast(data[pos][i]); + } } - } - atomicAdd(globalSum,sum); + atomicAdd(globalSum, sum); } - /** * checks validity of memory for each single cell * @@ -285,27 +328,32 @@ __global__ void check_content( * Will change to 0, if there was a value that didn't match */ __global__ void check_content_fast( - allocElem_t** data, - unsigned long long *counter, + allocElem_t ** data, + unsigned long long * counter, const size_t nSlots, - int* correct - ){ - - int c = 1; - while(true){ - size_t pos = atomicAdd(counter,1); - if(pos >= nSlots){break;} - const size_t offset = pos*ELEMS_PER_SLOT; - for(size_t i=0;i(data[pos][i]) != static_cast(offset+i)){ - c=0; - } + int * correct) +{ + int c = 1; + while(true) + { + size_t pos = atomicAdd(counter, 1); + if(pos >= nSlots) + { + break; + } + const size_t offset = pos * ELEMS_PER_SLOT; + for(size_t i = 0; i < ELEMS_PER_SLOT; ++i) + { + if(static_cast(data[pos][i]) + != static_cast(offset + i)) + { + c = 0; + } + } } - } - atomicAnd(correct,c); + atomicAnd(correct, c); } - /** * allocate a lot of small arrays and fill them * @@ -320,30 +368,32 @@ __global__ void check_content_fast( * allocated structures (for verification purposes) */ __global__ void allocAll( - allocElem_t** data, - unsigned long long* counter, - unsigned long long* globalSum, - ScatterAllocator::AllocatorHandle mMC - ){ - - unsigned long long sum=0; - while(true){ - allocElem_t* p = (allocElem_t*) mMC.malloc(sizeof(allocElem_t) * ELEMS_PER_SLOT); - if(p == NULL) break; - - size_t pos = atomicAdd(counter,1); - const size_t offset = pos*ELEMS_PER_SLOT; - for(size_t 
i=0;i(offset + i); - sum += static_cast(p[i]); + allocElem_t ** data, + unsigned long long * counter, + unsigned long long * globalSum, + ScatterAllocator::AllocatorHandle mMC) +{ + unsigned long long sum = 0; + while(true) + { + allocElem_t * p + = (allocElem_t *)mMC.malloc(sizeof(allocElem_t) * ELEMS_PER_SLOT); + if(p == nullptr) + break; + + size_t pos = atomicAdd(counter, 1); + const size_t offset = pos * ELEMS_PER_SLOT; + for(size_t i = 0; i < ELEMS_PER_SLOT; ++i) + { + p[i] = static_cast(offset + i); + sum += static_cast(p[i]); + } + data[pos] = p; } - data[pos] = p; - } - atomicAdd(globalSum,sum); + atomicAdd(globalSum, sum); } - /** * free all the values again * @@ -353,20 +403,20 @@ __global__ void allocAll( * @param max the maximum number of elements to free */ __global__ void deallocAll( - allocElem_t** data, - unsigned long long* counter, + allocElem_t ** data, + unsigned long long * counter, const size_t nSlots, - ScatterAllocator::AllocatorHandle mMC - ){ - - while(true){ - size_t pos = atomicAdd(counter,1); - if(pos >= nSlots) break; - mMC.free(data[pos]); - } + ScatterAllocator::AllocatorHandle mMC) +{ + while(true) + { + size_t pos = atomicAdd(counter, 1); + if(pos >= nSlots) + break; + mMC.free(data[pos]); + } } - /** * damages one element in the data * @@ -376,11 +426,11 @@ __global__ void deallocAll( * * @param data the datastructure to damage */ -__global__ void damageElement(allocElem_t** data){ - data[1][0] = static_cast(5*ELEMS_PER_SLOT - 1); +__global__ void damageElement(allocElem_t ** data) +{ + data[1][0] = static_cast(5 * ELEMS_PER_SLOT - 1); } - /** * wrapper function to allocate memory on device * @@ -396,35 +446,43 @@ __global__ void damageElement(allocElem_t** data){ * @param threads the number of CUDA threads per block */ void allocate( - allocElem_t** d_testData, - unsigned long long* h_nSlots, - unsigned long long* h_sum, + allocElem_t ** d_testData, + unsigned long long * h_nSlots, + unsigned long long * h_sum, const unsigned blocks, const unsigned threads, - ScatterAllocator* mMC - ){ - - dout() << "allocating on device..."; - - unsigned long long zero = 0; - unsigned long long *d_sum; - unsigned long long *d_nSlots; - - MALLOCMC_CUDA_CHECKED_CALL(cudaMalloc((void**) &d_sum,sizeof(unsigned long long))); - MALLOCMC_CUDA_CHECKED_CALL(cudaMalloc((void**) &d_nSlots, sizeof(unsigned long long))); - MALLOCMC_CUDA_CHECKED_CALL(cudaMemcpy(d_sum,&zero,sizeof(unsigned long long),cudaMemcpyHostToDevice)); - MALLOCMC_CUDA_CHECKED_CALL(cudaMemcpy(d_nSlots,&zero,sizeof(unsigned long long),cudaMemcpyHostToDevice)); - - CUDA_CHECK_KERNEL_SYNC(allocAll<<>>(d_testData, d_nSlots, d_sum, *mMC )); - - MALLOCMC_CUDA_CHECKED_CALL(cudaMemcpy(h_sum,d_sum,sizeof(unsigned long long),cudaMemcpyDeviceToHost)); - MALLOCMC_CUDA_CHECKED_CALL(cudaMemcpy(h_nSlots,d_nSlots,sizeof(unsigned long long),cudaMemcpyDeviceToHost)); - cudaFree(d_sum); - cudaFree(d_nSlots); - dout() << "done" << std::endl; + ScatterAllocator * mMC) +{ + dout() << "allocating on device..."; + + unsigned long long zero = 0; + unsigned long long * d_sum; + unsigned long long * d_nSlots; + + MALLOCMC_CUDA_CHECKED_CALL( + cudaMalloc((void **)&d_sum, sizeof(unsigned long long))); + MALLOCMC_CUDA_CHECKED_CALL( + cudaMalloc((void **)&d_nSlots, sizeof(unsigned long long))); + MALLOCMC_CUDA_CHECKED_CALL(cudaMemcpy( + d_sum, &zero, sizeof(unsigned long long), cudaMemcpyHostToDevice)); + MALLOCMC_CUDA_CHECKED_CALL(cudaMemcpy( + d_nSlots, &zero, sizeof(unsigned long long), cudaMemcpyHostToDevice)); + + 
CUDA_CHECK_KERNEL_SYNC( + allocAll<<>>(d_testData, d_nSlots, d_sum, *mMC)); + + MALLOCMC_CUDA_CHECKED_CALL(cudaMemcpy( + h_sum, d_sum, sizeof(unsigned long long), cudaMemcpyDeviceToHost)); + MALLOCMC_CUDA_CHECKED_CALL(cudaMemcpy( + h_nSlots, + d_nSlots, + sizeof(unsigned long long), + cudaMemcpyDeviceToHost)); + cudaFree(d_sum); + cudaFree(d_nSlots); + dout() << "done\n"; } - /** * Wrapper function to verify allocation on device * @@ -438,72 +496,74 @@ void allocate( * @param threads the number of CUDA threads per block * @return true if the verification was successful, false otherwise */ -bool verify( - allocElem_t **d_testData, +auto verify( + allocElem_t ** d_testData, const unsigned long long nSlots, const unsigned blocks, - const unsigned threads - ){ - - dout() << "verifying on device... "; - - const unsigned long long zero = 0; - int h_correct = 1; - int* d_correct; - unsigned long long *d_sum; - unsigned long long *d_counter; - - MALLOCMC_CUDA_CHECKED_CALL(cudaMalloc((void**) &d_sum, sizeof(unsigned long long))); - MALLOCMC_CUDA_CHECKED_CALL(cudaMalloc((void**) &d_counter, sizeof(unsigned long long))); - MALLOCMC_CUDA_CHECKED_CALL(cudaMalloc((void**) &d_correct, sizeof(int))); - MALLOCMC_CUDA_CHECKED_CALL(cudaMemcpy(d_sum,&zero,sizeof(unsigned long long),cudaMemcpyHostToDevice)); - MALLOCMC_CUDA_CHECKED_CALL(cudaMemcpy(d_counter,&zero,sizeof(unsigned long long),cudaMemcpyHostToDevice)); - MALLOCMC_CUDA_CHECKED_CALL(cudaMemcpy(d_correct,&h_correct,sizeof(int),cudaMemcpyHostToDevice)); - - // can be replaced by a call to check_content_fast, - // if the gaussian sum (see below) is not used and you - // want to be a bit faster - CUDA_CHECK_KERNEL_SYNC(check_content<<>>( - d_testData, - d_counter, - d_sum, - static_cast(nSlots), - d_correct - )); - MALLOCMC_CUDA_CHECKED_CALL(cudaMemcpy(&h_correct,d_correct,sizeof(int),cudaMemcpyDeviceToHost)); - - // This only works, if the type "allocElem_t" - // can hold all the IDs (usually unsigned long long) - /* - dout() << "verifying on host..."; - unsigned long long h_sum, h_counter; - unsigned long long gaussian_sum = (ELEMS_PER_SLOT*nSlots * (ELEMS_PER_SLOT*nSlots-1))/2; - MALLOCMC_CUDA_CHECKED_CALL(cudaMemcpy(&h_sum,d_sum,sizeof(unsigned long long),cudaMemcpyDeviceToHost)); - MALLOCMC_CUDA_CHECKED_CALL(cudaMemcpy(&h_counter,d_counter,sizeof(unsigned long long),cudaMemcpyDeviceToHost)); - if(gaussian_sum != h_sum){ - dout() << "\nGaussian Sum doesn't match: is " << h_sum; - dout() << " (should be " << gaussian_sum << ")" << std::endl; - h_correct=false; - } - if(nSlots != h_counter-(blocks*threads)){ - dout() << "\nallocated number of elements doesn't match: is " << h_counter; - dout() << " (should be " << nSlots << ")" << std::endl; - h_correct=false; - } - */ - - if(h_correct){ - dout() << "done" << std::endl; - }else{ - dout() << "failed" << std::endl; - } - - cudaFree(d_correct); - cudaFree(d_sum); - cudaFree(d_counter); - return static_cast(h_correct); -} + const unsigned threads) -> bool +{ + dout() << "verifying on device... 
"; + + const unsigned long long zero = 0; + int h_correct = 1; + int * d_correct; + unsigned long long * d_sum; + unsigned long long * d_counter; + + MALLOCMC_CUDA_CHECKED_CALL( + cudaMalloc((void **)&d_sum, sizeof(unsigned long long))); + MALLOCMC_CUDA_CHECKED_CALL( + cudaMalloc((void **)&d_counter, sizeof(unsigned long long))); + MALLOCMC_CUDA_CHECKED_CALL(cudaMalloc((void **)&d_correct, sizeof(int))); + MALLOCMC_CUDA_CHECKED_CALL(cudaMemcpy( + d_sum, &zero, sizeof(unsigned long long), cudaMemcpyHostToDevice)); + MALLOCMC_CUDA_CHECKED_CALL(cudaMemcpy( + d_counter, &zero, sizeof(unsigned long long), cudaMemcpyHostToDevice)); + MALLOCMC_CUDA_CHECKED_CALL( + cudaMemcpy(d_correct, &h_correct, sizeof(int), cudaMemcpyHostToDevice)); + + // can be replaced by a call to check_content_fast, + // if the gaussian sum (see below) is not used and you + // want to be a bit faster + CUDA_CHECK_KERNEL_SYNC(check_content<<>>( + d_testData, d_counter, d_sum, static_cast(nSlots), d_correct)); + MALLOCMC_CUDA_CHECKED_CALL( + cudaMemcpy(&h_correct, d_correct, sizeof(int), cudaMemcpyDeviceToHost)); + + // This only works, if the type "allocElem_t" + // can hold all the IDs (usually unsigned long long) + /* + dout() << "verifying on host..."; + unsigned long long h_sum, h_counter; + unsigned long long gaussian_sum = (ELEMS_PER_SLOT*nSlots * + (ELEMS_PER_SLOT*nSlots-1))/2; + MALLOCMC_CUDA_CHECKED_CALL(cudaMemcpy(&h_sum,d_sum,sizeof(unsigned long + long),cudaMemcpyDeviceToHost)); + MALLOCMC_CUDA_CHECKED_CALL(cudaMemcpy(&h_counter,d_counter,sizeof(unsigned + long long),cudaMemcpyDeviceToHost)); if(gaussian_sum != h_sum){ dout() << + "\nGaussian Sum doesn't match: is " << h_sum; dout() << " (should be " << + gaussian_sum << ")\n"; h_correct=false; + } + if(nSlots != h_counter-(blocks*threads)){ + dout() << "\nallocated number of elements doesn't match: is " << + h_counter; dout() << " (should be " << nSlots << ")\n"; h_correct=false; + } + */ + + if(h_correct) + { + dout() << "done\n"; + } + else + { + dout() << "failed\n"; + } + cudaFree(d_correct); + cudaFree(d_sum); + cudaFree(d_counter); + return static_cast(h_correct); +} /** * prints all parameters machine readable @@ -511,81 +571,79 @@ bool verify( * for params, see run_heap_verification-internal parameters */ void print_machine_readable( - const unsigned pagesize, - const unsigned accessblocks, - const unsigned regionsize, - const unsigned wastefactor, - const bool resetfreedpages, - const unsigned blocks, - const unsigned threads, - const unsigned elemsPerSlot, - const size_t allocElemSize, - const size_t heapSize, - const size_t maxSpace, - const size_t maxSlots, - const unsigned long long usedSlots, - const float allocFrac, - const size_t wasted, - const bool correct - ){ - - std::string sep = ","; - std::stringstream h; - std::stringstream v; + const unsigned pagesize, + const unsigned accessblocks, + const unsigned regionsize, + const unsigned wastefactor, + const bool resetfreedpages, + const unsigned blocks, + const unsigned threads, + const unsigned elemsPerSlot, + const size_t allocElemSize, + const size_t heapSize, + const size_t maxSpace, + const size_t maxSlots, + const unsigned long long usedSlots, + const float allocFrac, + const size_t wasted, + const bool correct) +{ + std::string sep = ","; + std::stringstream h; + std::stringstream v; - h << "PagesizeByte" << sep; - v << pagesize << sep; + h << "PagesizeByte" << sep; + v << pagesize << sep; - h << "Accessblocks" << sep; - v << accessblocks << sep; + h << "Accessblocks" << sep; + v << 
accessblocks << sep; - h << "Regionsize" << sep; - v << regionsize << sep; + h << "Regionsize" << sep; + v << regionsize << sep; - h << "Wastefactor" << sep; - v << wasted << sep; + h << "Wastefactor" << sep; + v << wasted << sep; - h << "ResetFreedPage" << sep; - v << resetfreedpages << sep; + h << "ResetFreedPage" << sep; + v << resetfreedpages << sep; - h << "Gridsize" << sep; - v << blocks << sep; + h << "Gridsize" << sep; + v << blocks << sep; - h << "Blocksize" << sep; - v << threads << sep; + h << "Blocksize" << sep; + v << threads << sep; - h << "ELEMS_PER_SLOT" << sep; - v << elemsPerSlot << sep; + h << "ELEMS_PER_SLOT" << sep; + v << elemsPerSlot << sep; - h << "allocElemByte" << sep; - v << allocElemSize << sep; + h << "allocElemByte" << sep; + v << allocElemSize << sep; - h << "heapsizeByte" << sep; - v << heapSize << sep; + h << "heapsizeByte" << sep; + v << heapSize << sep; - h << "maxSpaceByte" << sep; - v << maxSpace << sep; + h << "maxSpaceByte" << sep; + v << maxSpace << sep; - h << "maxSlots" << sep; - v << maxSlots << sep; + h << "maxSlots" << sep; + v << maxSlots << sep; - h << "usedSlots" << sep; - v << usedSlots << sep; + h << "usedSlots" << sep; + v << usedSlots << sep; - h << "allocFraction" << sep; - v << allocFrac << sep; + h << "allocFraction" << sep; + v << allocFrac << sep; - h << "wastedBytes" << sep; - v << wasted << sep; + h << "wastedBytes" << sep; + v << wasted << sep; - h << "correct" ; - v << correct ; + h << "correct"; + v << correct; - std::cout << h.str() << std::endl; - std::cout << v.str() << std::endl; + std::cout << h.str() << '\n'; + std::cout << v.str() << '\n'; } - /** * Verify the heap allocation of mallocMC * @@ -600,102 +658,111 @@ void print_machine_readable( * @return true if the verification was successful, * false otherwise */ -bool run_heap_verification( +auto run_heap_verification( const size_t heapMB, const unsigned blocks, const unsigned threads, - const bool machine_readable - ){ - - cudaSetDeviceFlags(cudaDeviceMapHost); - - const size_t heapSize = size_t(1024U*1024U) * heapMB; - const size_t slotSize = sizeof(allocElem_t)*ELEMS_PER_SLOT; - const size_t nPointers = ceil(static_cast(heapSize) / slotSize); - const size_t maxSlots = heapSize/slotSize; - const size_t maxSpace = maxSlots*slotSize + nPointers*sizeof(allocElem_t*); - bool correct = true; - const unsigned long long zero = 0; - - dout() << "CreationPolicy Arguments:" << std::endl; - dout() << "Pagesize: " << ScatterConfig::pagesize::value << std::endl; - dout() << "Accessblocks: " << ScatterConfig::accessblocks::value << std::endl; - dout() << "Regionsize: " << ScatterConfig::regionsize::value << std::endl; - dout() << "Wastefactor: " << ScatterConfig::wastefactor::value << std::endl; - dout() << "ResetFreedPages " << ScatterConfig::resetfreedpages::value << std::endl; - dout() << "" << std::endl; - dout() << "Gridsize: " << blocks << std::endl; - dout() << "Blocksize: " << threads << std::endl; - dout() << "Allocated elements: " << ELEMS_PER_SLOT << " x " << sizeof(allocElem_t); - dout() << " Byte (" << slotSize << " Byte)" << std::endl; - dout() << "Heap: " << heapSize << " Byte"; - dout() << " (" << heapSize/pow(1024,2) << " MByte)" << std::endl; - dout() << "max space w/ pointers: " << maxSpace << " Byte"; - dout() << " (" << maxSpace/pow(1024,2) << " MByte)" << std::endl; - dout() << "maximum of elements: " << maxSlots << std::endl; - - // initializing the heap - ScatterAllocator* mMC = new ScatterAllocator(heapSize); - allocElem_t** d_testData; - 
MALLOCMC_CUDA_CHECKED_CALL(cudaMalloc((void**) &d_testData, nPointers*sizeof(allocElem_t*))); - - // allocating with mallocMC - unsigned long long usedSlots = 0; - unsigned long long sumAllocElems = 0; - allocate(d_testData, &usedSlots, &sumAllocElems, blocks, threads, mMC); - - const float allocFrac = static_cast(usedSlots)*100/maxSlots; - const size_t wasted = heapSize - static_cast(usedSlots) * slotSize; - dout() << "allocated elements: " << usedSlots; - dout() << " (" << allocFrac << "%)" << std::endl; - dout() << "wasted heap space: " << wasted << " Byte"; - dout() << " (" << wasted/pow(1024,2) << " MByte)" << std::endl; - - // verifying on device - correct = correct && verify(d_testData,usedSlots,blocks,threads); - - // damaging one cell - dout() << "damaging of element... "; - CUDA_CHECK_KERNEL_SYNC(damageElement<<<1,1>>>(d_testData)); - dout() << "done" << std::endl; - - // verifying on device - // THIS SHOULD FAIL (damage was done before!). Therefore, we must inverse the logic - correct = correct && !verify(d_testData,usedSlots,blocks,threads); - - - // release all memory - dout() << "deallocation... "; - unsigned long long* d_dealloc_counter; - MALLOCMC_CUDA_CHECKED_CALL(cudaMalloc((void**) &d_dealloc_counter, sizeof(unsigned long long))); - MALLOCMC_CUDA_CHECKED_CALL(cudaMemcpy(d_dealloc_counter,&zero,sizeof(unsigned long long),cudaMemcpyHostToDevice)); - CUDA_CHECK_KERNEL_SYNC(deallocAll<<>>(d_testData,d_dealloc_counter,static_cast(usedSlots), *mMC )); - cudaFree(d_dealloc_counter); - cudaFree(d_testData); - delete mMC; - - dout() << "done "<< std::endl; - - if(machine_readable){ - print_machine_readable( - ScatterConfig::pagesize::value, - ScatterConfig::accessblocks::value, - ScatterConfig::regionsize::value, - ScatterConfig::wastefactor::value, - ScatterConfig::resetfreedpages::value, - blocks, - threads, - ELEMS_PER_SLOT, - sizeof(allocElem_t), - heapSize, - maxSpace, - maxSlots, - usedSlots, - allocFrac, - wasted, - correct - ); - } - - return correct; + const bool machine_readable) -> bool +{ + cudaSetDeviceFlags(cudaDeviceMapHost); + + const size_t heapSize = size_t(1024U * 1024U) * heapMB; + const size_t slotSize = sizeof(allocElem_t) * ELEMS_PER_SLOT; + const size_t nPointers = (heapSize + slotSize - 1) / slotSize; + const size_t maxSlots = heapSize / slotSize; + const size_t maxSpace + = maxSlots * slotSize + nPointers * sizeof(allocElem_t *); + bool correct = true; + const unsigned long long zero = 0; + + dout() << "CreationPolicy Arguments:\n"; + dout() << "Pagesize: " << ScatterConfig::pagesize << '\n'; + dout() << "Accessblocks: " << ScatterConfig::accessblocks << '\n'; + dout() << "Regionsize: " << ScatterConfig::regionsize << '\n'; + dout() << "Wastefactor: " << ScatterConfig::wastefactor << '\n'; + dout() << "ResetFreedPages " << ScatterConfig::resetfreedpages + << '\n'; + dout() << "\n"; + dout() << "Gridsize: " << blocks << '\n'; + dout() << "Blocksize: " << threads << '\n'; + dout() << "Allocated elements: " << ELEMS_PER_SLOT << " x " + << sizeof(allocElem_t); + dout() << " Byte (" << slotSize << " Byte)\n"; + dout() << "Heap: " << heapSize << " Byte"; + dout() << " (" << heapSize / pow(1024, 2) << " MByte)\n"; + dout() << "max space w/ pointers: " << maxSpace << " Byte"; + dout() << " (" << maxSpace / pow(1024, 2) << " MByte)\n"; + dout() << "maximum of elements: " << maxSlots << '\n'; + + // initializing the heap + ScatterAllocator * mMC = new ScatterAllocator(heapSize); + allocElem_t ** d_testData; + MALLOCMC_CUDA_CHECKED_CALL( + cudaMalloc((void 
**)&d_testData, nPointers * sizeof(allocElem_t *)));
+
+    // allocating with mallocMC
+    unsigned long long usedSlots = 0;
+    unsigned long long sumAllocElems = 0;
+    allocate(d_testData, &usedSlots, &sumAllocElems, blocks, threads, mMC);
+
+    const float allocFrac = static_cast<float>(usedSlots) * 100 / maxSlots;
+    const size_t wasted = heapSize - static_cast<size_t>(usedSlots) * slotSize;
+    dout() << "allocated elements: " << usedSlots;
+    dout() << " (" << allocFrac << "%)\n";
+    dout() << "wasted heap space: " << wasted << " Byte";
+    dout() << " (" << wasted / pow(1024, 2) << " MByte)\n";
+
+    // verifying on device
+    correct = correct && verify(d_testData, usedSlots, blocks, threads);
+
+    // damaging one cell
+    dout() << "damaging of element... ";
+    CUDA_CHECK_KERNEL_SYNC(damageElement<<<1, 1>>>(d_testData));
+    dout() << "done\n";
+
+    // verifying on device
+    // THIS SHOULD FAIL (damage was done before!). Therefore, we must inverse
+    // the logic
+    correct = correct && !verify(d_testData, usedSlots, blocks, threads);
+
+    // release all memory
+    dout() << "deallocation... ";
+    unsigned long long * d_dealloc_counter;
+    MALLOCMC_CUDA_CHECKED_CALL(
+        cudaMalloc((void **)&d_dealloc_counter, sizeof(unsigned long long)));
+    MALLOCMC_CUDA_CHECKED_CALL(cudaMemcpy(
+        d_dealloc_counter,
+        &zero,
+        sizeof(unsigned long long),
+        cudaMemcpyHostToDevice));
+    CUDA_CHECK_KERNEL_SYNC(deallocAll<<<blocks, threads>>>(
+        d_testData, d_dealloc_counter, static_cast<size_t>(usedSlots), *mMC));
+    cudaFree(d_dealloc_counter);
+    cudaFree(d_testData);
+    delete mMC;
+
+    dout() << "done \n";
+
+    if(machine_readable)
+    {
+        print_machine_readable(
+            ScatterConfig::pagesize,
+            ScatterConfig::accessblocks,
+            ScatterConfig::regionsize,
+            ScatterConfig::wastefactor,
+            ScatterConfig::resetfreedpages,
+            blocks,
+            threads,
+            ELEMS_PER_SLOT,
+            sizeof(allocElem_t),
+            heapSize,
+            maxSpace,
+            maxSlots,
+            usedSlots,
+            allocFrac,
+            wasted,
+            correct);
+    }
+
+    return correct;
 }
diff --git a/tests/verify_heap_config.hpp b/tests/verify_heap_config.hpp
index e76337a6..cdaa306e 100644
--- a/tests/verify_heap_config.hpp
+++ b/tests/verify_heap_config.hpp
@@ -28,52 +28,51 @@
 
 #pragma once
 
-#include <boost/mpl/int.hpp>
-#include <boost/mpl/bool.hpp>
-
 // basic files for mallocMC
-#include "src/include/mallocMC/mallocMC_hostclass.hpp"
+#include <mallocMC/mallocMC.hpp>
 
 // Load all available policies for mallocMC
-#include "src/include/mallocMC/CreationPolicies.hpp"
-#include "src/include/mallocMC/DistributionPolicies.hpp"
-#include "src/include/mallocMC/OOMPolicies.hpp"
-#include "src/include/mallocMC/ReservePoolPolicies.hpp"
-#include "src/include/mallocMC/AlignmentPolicies.hpp"
-
+#include <mallocMC/AlignmentPolicies.hpp>
+#include <mallocMC/CreationPolicies.hpp>
+#include <mallocMC/DistributionPolicies.hpp>
+#include <mallocMC/OOMPolicies.hpp>
+#include <mallocMC/ReservePoolPolicies.hpp>
 
 // configurate the CreationPolicy "Scatter"
-struct ScatterConfig{
-    typedef boost::mpl::int_<4096> pagesize;
-    typedef boost::mpl::int_<8> accessblocks;
-    typedef boost::mpl::int_<16> regionsize;
-    typedef boost::mpl::int_<2> wastefactor;
-    typedef boost::mpl::bool_<false> resetfreedpages;
+struct ScatterConfig
+{
+    static constexpr auto pagesize = 4096;
+    static constexpr auto accessblocks = 8;
+    static constexpr auto regionsize = 16;
+    static constexpr auto wastefactor = 2;
+    static constexpr auto resetfreedpages = false;
 };
 
-struct ScatterHashParams{
-    typedef boost::mpl::int_<38183> hashingK;
-    typedef boost::mpl::int_<17497> hashingDistMP;
-    typedef boost::mpl::int_<1> hashingDistWP;
-    typedef boost::mpl::int_<1> hashingDistWPRel;
+struct ScatterHashParams
+{
+    static constexpr auto hashingK = 38183;
+    static constexpr auto hashingDistMP = 17497;
+    static constexpr auto hashingDistWP = 1;
+    static constexpr auto hashingDistWPRel = 1;
 };
 
 // configure the DistributionPolicy "XMallocSIMD"
-struct DistributionConfig{
-    typedef ScatterConfig::pagesize pagesize;
+struct DistributionConfig
+{
+    static constexpr auto pagesize = ScatterConfig::pagesize;
 };
 
 // configure the AlignmentPolicy "Shrink"
-struct AlignmentConfig{
-    typedef boost::mpl::int_<16> dataAlignment;
+struct AlignmentConfig
+{
+    static constexpr auto dataAlignment = 16;
 };
 
 // Define a new allocator and call it ScatterAllocator
 // which resembles the behaviour of ScatterAlloc
-typedef mallocMC::Allocator<
-    mallocMC::CreationPolicies::Scatter<ScatterConfig, ScatterHashParams>,
-    mallocMC::DistributionPolicies::XMallocSIMD<DistributionConfig>,
-    mallocMC::OOMPolicies::ReturnNull,
-    mallocMC::ReservePoolPolicies::SimpleCudaMalloc,
-    mallocMC::AlignmentPolicies::Shrink<AlignmentConfig>
-    > ScatterAllocator;
+using ScatterAllocator = mallocMC::Allocator<
+    mallocMC::CreationPolicies::Scatter<ScatterConfig, ScatterHashParams>,
+    mallocMC::DistributionPolicies::XMallocSIMD<DistributionConfig>,
+    mallocMC::OOMPolicies::ReturnNull,
+    mallocMC::ReservePoolPolicies::SimpleCudaMalloc,
+    mallocMC::AlignmentPolicies::Shrink<AlignmentConfig>>;
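
Usage note (not part of the patch): the constexpr-based ScatterAllocator above is consumed the same way the verify_heap test does it, passing the host-side allocator to a kernel parameter of type ScatterAllocator::AllocatorHandle and calling malloc/free on that handle in device code. The following minimal sketch assumes this pattern; the kernel name, array size, and launch configuration are made up for illustration, and error checking is omitted.

// illustrative_usage.cu -- hypothetical example only
#include <cuda_runtime.h>

#include "verify_heap_config.hpp" // defines ScatterAllocator (see above)

// every thread grabs a small array from the mallocMC heap and frees it again
__global__ void exampleKernel(ScatterAllocator::AllocatorHandle mMC)
{
    int * p = (int *)mMC.malloc(4 * sizeof(int));
    if(p == nullptr)
        return;
    for(int i = 0; i < 4; ++i)
        p[i] = i;
    mMC.free(p);
}

auto main() -> int
{
    ScatterAllocator mMC(512U * 1024U * 1024U); // 512 MiB heap (example value)
    exampleKernel<<<64, 256>>>(mMC); // allocator converts to its device handle
    cudaDeviceSynchronize();
    return 0;
}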