Merge pull request #76 from ComputationalRadiationPhysics/dev

Merge dev into master for release 2.0.1crp
alpaka-group · Jan 15, 2015 · 1314bf2 · 1314bf2
2 parents ddeae86 + 15730e4
commit 1314bf2
Show file tree

Hide file tree

Showing 10 changed files with 130 additions and 40 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,6 +1,29 @@
 Change Log / Release Log for mallocMC
 ================================================================
 
+2.0.1crp
+-------------
+**Date:** 2015-01-13
+
+This release fixes several bugs that occurred after the release of 2.0.0crp.
+We closed all issues documented in
+[Milestone *Bugfixes*](https://github.com/ComputationalRadiationPhysics/mallocMC/issues?milestone=4&state=closed)
+
+### Changes to mallocMC 2.0.0crp
+
+**Bug fixes**
+ - page table metadata was not correctly initialized with 0 #70
+ - freeing pages would not work under certain circumstances #66
+ - the bitmask in a page table entry could be wrong due to a race condition #62
+ - not all regions were initialized correctly #60
+ - getAvailableSlots could sometimes miss blocks #59
+ - the counter for elements in a page could get too high due to a race condition #61
+ - Out of Memory (OOM) Policy sometimes did not recognize allocation failures correctly #67
+
+**Misc:**
+ - See the full changes at https://github.com/ComputationalRadiationPhysics/mallocMC/compare/2.0.0crp...2.0.1crp
+
+
 2.0.0crp
 -------------
 **Date:** 2014-06-02

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -39,6 +39,30 @@ if(Boost_VERSION EQUAL 105500)
   set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} \"-DBOOST_NOINLINE=__attribute__((noinline))\" ")
 endif(Boost_VERSION EQUAL 105500)
 
+
+################################################################################
+# Warnings
+################################################################################
+# GNU
+if(CMAKE_COMPILER_IS_GNUCXX)
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unknown-pragmas")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wextra")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-parameter")
+  # new warning in gcc 4.8 (flag ignored in previous version)
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-local-typedefs")
+  # ICC
+elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DBOOST_NO_VARIADIC_TEMPLATES")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DBOOST_NO_CXX11_VARIADIC_TEMPLATES")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DBOOST_NO_FENV_H")
+  # PGI
+elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "PGI")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Minform=inform")
+endif()
+
+
 ###############################################################################
 # Installation
 ###############################################################################

diff --git a/INSTALL.md b/INSTALL.md
@@ -45,3 +45,47 @@ This is an example how to compile `mallocMC` and test the example code snippets
  - `./mallocMC_Example02`
  - `./VerifyHeap`
   - additional options: see `./VerifyHeap --help`
+
+
+Linking to your Project
+-----------------------
+
+To use mallocMC in your project, you must include the header `mallocMC/mallocMC.hpp` and
+add the correct include path.
+
+Because we are linking to Boost and CUDA, the following **external dependencies** must be linked:
+- `-lboost`, `-lcudart`
+
+If you are using CMake, you can download our `FindmallocMC.cmake` module with
+```bash
+wget https://raw.githubusercontent.com/ComputationalRadiationPhysics/picongpu/dev/src/cmake/FindmallocMC.cmake
+# read the documentation
+cmake -DCMAKE_MODULE_PATH=. --help-module FindmallocMC | less
+```
+
+and use the following lines in your `CMakeLists.txt`:
+```cmake
+# this example will require at least CMake 2.8.5
+cmake_minimum_required(VERSION 2.8.5)
+
+# add the path to FindmallocMC.cmake, e.g. if it is located in the cmake/ directory
+set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_CURRENT_SOURCE_DIR}/cmake/)
+
+# find the packages that are required by mallocMC. This has to be done BEFORE
+# loading mallocMC
+find_package(Boost REQUIRED)
+set(LIBS ${LIBS} ${Boost_LIBRARIES})
+
+find_package(CUDA REQUIRED)
+cuda_include_directories(${CUDA_INCLUDE_DIRS})
+
+# find mallocMC installation
+find_package(mallocMC 2.0.1 REQUIRED)
+
+# where to find headers (-I includes for compiler)
+include_directories(SYSTEM ${mallocMC_INCLUDE_DIRS} ${CUDA_INCLUDE_DIRS} ${Boost_INCLUDE_DIRS})
+
+add_executable(yourBinary ${SOURCES})
+
+target_link_libraries(yourBinary ${LIBS})
+```
diff --git a/src/include/mallocMC/alignmentPolicies/Shrink_impl.hpp b/src/include/mallocMC/alignmentPolicies/Shrink_impl.hpp
@@ -74,7 +74,7 @@ namespace Shrink2NS{
 #endif
     static const uint32 dataAlignment = MALLOCMC_AP_SHRINK_DATAALIGNMENT;
 
-    BOOST_STATIC_ASSERT(dataAlignment > 0); 
+    BOOST_STATIC_ASSERT(static_cast<uint32>(dataAlignment) > 0);
     //dataAlignment must also be a power of 2!
     BOOST_STATIC_ASSERT(dataAlignment && !(dataAlignment & (dataAlignment-1)) ); 
 

diff --git a/src/include/mallocMC/creationPolicies/OldMalloc_impl.hpp b/src/include/mallocMC/creationPolicies/OldMalloc_impl.hpp
@@ -52,8 +52,8 @@ namespace CreationPolicies{
       free(mem);
     }
 
-    __device__ bool isOOM(void* p){
-      return  32 == __popc(__ballot(p == NULL));
+    __device__ bool isOOM(void* p, size_t s){
+      return s && (p == NULL);
     }
 
     template < typename T>

diff --git a/src/include/mallocMC/creationPolicies/Scatter_impl.hpp b/src/include/mallocMC/creationPolicies/Scatter_impl.hpp
@@ -187,15 +187,11 @@ namespace ScatterKernelDetail{
          * bit fields when the page is used for a small chunk size
          * @param previous_chunksize the chunksize which was uses for the page before
          */
-        __device__ void init(uint32 previous_chunksize = 0)
+        __device__ void init()
         {
-          //TODO: we can speed this up for pages being freed, because we know the
-          //chunksize used before (these bits must be zero again) 
-
-          //init the entire data which can hold bitfields 
-          uint32 max_bits = min(32*32,pagesize/minChunkSize1);
-          uint32 max_entries = divup<uint32>(max_bits/8,sizeof(uint32))*sizeof(uint32);
-          uint32* write = (uint32*)(data+(pagesize-max_entries));
+          //clear the entire data which can hold bitfields
+          uint32 first_possible_metadata = 32*HierarchyThreshold;
+          uint32* write = (uint32*)(data+(pagesize-first_possible_metadata));
           while(write < (uint32*)(data + pagesize))
             *write++ = 0;
         }
@@ -319,6 +315,9 @@ namespace ScatterKernelDetail{
        */
       __device__ inline void* tryUsePage(uint32 page, uint32 chunksize)
       {
+
+        void* chunk_ptr = NULL;
+
         //increse the fill level
         uint32 filllevel = atomicAdd((uint32*)&(_ptes[page].count), 1);
         //recheck chunck size (it could be that the page got freed in the meanwhile...)
@@ -333,19 +332,21 @@ namespace ScatterKernelDetail{
             fullsegments = pagesize / segmentsize;
             additional_chunks = max(0,(int)pagesize - (int)fullsegments*segmentsize - (int)sizeof(uint32))/chunksize;
             if(filllevel < fullsegments * 32 + additional_chunks)
-              return addChunkHierarchy(chunksize, fullsegments, additional_chunks, page);
+              chunk_ptr = addChunkHierarchy(chunksize, fullsegments, additional_chunks, page);
           }
           else
           {
             uint32 chunksinpage = min(pagesize / chunksize, 32);
             if(filllevel < chunksinpage)
-              return addChunkNoHierarchy(chunksize, page, chunksinpage);
+              chunk_ptr = addChunkNoHierarchy(chunksize, page, chunksinpage);
           }
         }
 
         //this one is full/not useable
-        atomicSub((uint32*)&(_ptes[page].count), 1);
-        return 0;
+        if(chunk_ptr == NULL)
+          atomicSub((uint32*)&(_ptes[page].count), 1);
+
+        return chunk_ptr;
       }
 
 
@@ -444,9 +445,8 @@ namespace ScatterKernelDetail{
           uint32* onpagemasks = (uint32*)(_page[page].data + chunksize*(fullsegments*32 + additional_chunks));
           uint32 old = atomicAnd(onpagemasks + segment, ~(1 << withinsegment));
 
-          uint32 elementsinsegment = segment < fullsegments ? 32 : additional_chunks;
-          if(__popc(old) == elementsinsegment)
-            atomicAnd((uint32*)&_ptes[page].bitmask, ~(1 << segment));
+          // always do this, since it might fail due to a race-condition with addChunkHierarchy
+          atomicAnd((uint32*)&_ptes[page].bitmask, ~(1 << segment));
         }
         else
         {
@@ -718,7 +718,7 @@ namespace ScatterKernelDetail{
           ptes[i].init();
           page[i].init();
         }
-        for(uint32 i = linid; i < numregions; i+= numregions)
+        for(uint32 i = linid; i < numregions; i+= threads)
           regions[i] = 0;
 
         if(linid == 0)
@@ -777,9 +777,9 @@ namespace ScatterKernelDetail{
         }
       }
 
-      __device__ bool isOOM(void* p){
-        // all threads in a warp return get NULL
-        return  32 == __popc(__ballot(p == NULL));
+      __device__ bool isOOM(void* p, size_t s){
+        // one thread that requested memory returned null
+        return  s && (p == NULL);
       }
 
 
@@ -869,7 +869,8 @@ namespace ScatterKernelDetail{
           if(gid > 0) return 0; //do this serially
           uint32 pagestoalloc = divup((uint32)slotSize, pagesize);
           uint32 freecount = 0;
-          for(uint32 currentpage = _numpages; currentpage > 0; --currentpage){ //this already includes all superblocks
+          for(uint32 currentpage = _numpages; currentpage > 0;){ //this already includes all superblocks
+            --currentpage;
             if(_ptes[currentpage].chunksize == 0){
               if(++freecount == pagestoalloc){
                 freecount = 0;

diff --git a/src/include/mallocMC/distributionPolicies/XMallocSIMD_impl.hpp b/src/include/mallocMC/distributionPolicies/XMallocSIMD_impl.hpp
@@ -77,7 +77,7 @@ namespace DistributionPolicies{
 
       //all the properties must be unsigned integers > 0
       BOOST_STATIC_ASSERT(!std::numeric_limits<typename Properties::pagesize::type>::is_signed);
-      BOOST_STATIC_ASSERT(pagesize > 0);
+      BOOST_STATIC_ASSERT(static_cast<uint32>(pagesize) > 0);
 
     public:
       static const uint32 _pagesize = pagesize;
@@ -97,7 +97,7 @@ namespace DistributionPolicies{
         //second half: make sure that all coalesced allocations can fit within one page
         //necessary for offset calculation
         bool coalescible = bytes > 0 && bytes < (pagesize / 32);
-        uint32 threadcount = __popc(__ballot(coalescible));
+        threadcount = __popc(__ballot(coalescible));
 
         if (coalescible && threadcount > 1)
         {

diff --git a/src/include/mallocMC/mallocMC_constraints.hpp b/src/include/mallocMC/mallocMC_constraints.hpp
@@ -37,19 +37,19 @@ namespace mallocMC{
   /** The default PolicyCheckers (do always succeed)
    */
   template<typename Policy1>
-  struct PolicyCheck1{};
+  class PolicyCheck1{};
 
   template<typename Policy1, typename Policy2>
-  struct PolicyCheck2{};
+  class PolicyCheck2{};
 
   template<typename Policy1, typename Policy2, typename Policy3>
-  struct PolicyCheck3{};
+  class PolicyCheck3{};
 
   template<typename Policy1, typename Policy2, typename Policy3, typename Policy4>
-  struct PolicyCheck4{};
+  class PolicyCheck4{};
 
   template<typename Policy1, typename Policy2, typename Policy3, typename Policy4, typename Policy5>
-  struct PolicyCheck5{};
+  class PolicyCheck5{};
 
 
   /** Enforces constraints on policies or combinations of polices
@@ -63,8 +63,9 @@ namespace mallocMC{
      typename T_GetHeapPolicy,
      typename T_AlignmentPolicy
        >
-  class PolicyConstraints{
-      PolicyCheck2<T_CreationPolicy, T_DistributionPolicy> c;
+
+  class PolicyConstraints:PolicyCheck2<T_CreationPolicy, T_DistributionPolicy>{
+
   };
 
 
@@ -75,7 +76,7 @@ namespace mallocMC{
    * the same value for their "pagesize"-parameter.
    */
   template<typename x, typename y, typename z >
-  struct PolicyCheck2<
+  class PolicyCheck2<
     typename CreationPolicies::Scatter<x,y>,
     typename DistributionPolicies::XMallocSIMD<z> 
   >{

diff --git a/src/include/mallocMC/mallocMC_hostclass.hpp b/src/include/mallocMC/mallocMC_hostclass.hpp
@@ -83,7 +83,8 @@ namespace mallocMC{
     public T_CreationPolicy, 
     public T_OOMPolicy, 
     public T_ReservePoolPolicy,
-    public T_AlignmentPolicy
+    public T_AlignmentPolicy,
+    public PolicyConstraints<T_CreationPolicy,T_DistributionPolicy,T_OOMPolicy,T_ReservePoolPolicy,T_AlignmentPolicy>
   {
     public:
       typedef T_CreationPolicy CreationPolicy;
@@ -96,10 +97,6 @@ namespace mallocMC{
       typedef boost::uint32_t uint32;
       void* pool;
 
-      //Instantiating the constraints checker will execute the check
-      PolicyConstraints<CreationPolicy,DistributionPolicy,
-        OOMPolicy,ReservePoolPolicy,AlignmentPolicy> c;
-
     public:
 
       typedef Allocator<CreationPolicy,DistributionPolicy,
@@ -112,7 +109,7 @@ namespace mallocMC{
         bytes            = AlignmentPolicy::applyPadding(bytes);
         uint32 req_size  = distributionPolicy.collect(bytes);
         void* memBlock   = CreationPolicy::create(req_size);
-        const bool oom   = CreationPolicy::isOOM(memBlock);
+        const bool oom   = CreationPolicy::isOOM(memBlock, req_size);
         if(oom) memBlock = OOMPolicy::handleOOM(memBlock);
         void* myPart     = distributionPolicy.distribute(memBlock);
 

diff --git a/src/include/mallocMC/version.hpp b/src/include/mallocMC/version.hpp
@@ -39,7 +39,7 @@
 /** the mallocMC version: major API changes should be reflected here */
 #define MALLOCMC_VERSION_MAJOR 2
 #define MALLOCMC_VERSION_MINOR 0
-#define MALLOCMC_VERSION_PATCH 0
+#define MALLOCMC_VERSION_PATCH 1
 
 /** the mallocMC flavor is used to differenciate the releases of the
  *  Computational Radiation Physics group (crp) from other releases