Skip to content

Commit

Permalink
Merge pull request #45 from bcumming/gpu
Browse files Browse the repository at this point in the history
Part 1 of the big GPU merge
  • Loading branch information
Sam Yates committed Nov 14, 2016
2 parents 0e4970d + 6cf11d3 commit 1bc18ea
Show file tree
Hide file tree
Showing 213 changed files with 6,391 additions and 37,620 deletions.
15 changes: 3 additions & 12 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -53,18 +53,9 @@ Makefile
# mechanism implementations generated by modparser
include/mechanisms

# external build stuff
external/bin
external/modparser-build
external/modparser-configure
external/modparser-done
external/modparser-download
external/modparser-install
external/modparser-mkdir
external/modparser-patch
external/modparser-update
external/tmp
mechanisms/*.hpp
# mechanisms generated from .mod files
mechanisms/multicore/*.hpp
mechanisms/gpu/*.hpp

# build path
build*
Expand Down
Empty file removed .gitmodules
Empty file.
16 changes: 6 additions & 10 deletions .ycm_extra_conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,17 +50,13 @@
'external',
'-I',
'miniapp',
# '-isystem',
# '/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX10.10.sdk/usr/include/c++/4.2.1',
# '-I',
# '/usr/include/c++/4.9.2',
# '-isystem',
# '/usr/lib/gcc/x86_64-unknown-linux-gnu/4.9.2/include'
# '-isystem',
# '/usr/local/include',
'-I',
'modcc',
'-I',
'/cm/shared/apps/cuda/8.0.44/include',
'-DWITH_CUDA'
]



# Set this to the absolute path to the folder (NOT the file!) containing the
# compile_commands.json file to use that instead of 'flags'. See here for
# more details: http://clang.llvm.org/docs/JSONCompilationDatabase.html
Expand Down
61 changes: 54 additions & 7 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,61 @@ if(WITH_TRACE)
add_definitions("-DWITH_TRACE")
endif()

# TBB support
set(WITH_TBB OFF CACHE BOOL "use TBB for on-node threading" )
if(WITH_TBB)
# list of libraries to be linked against targets
set(EXTERNAL_LIBRARIES "")

# Threading model selection.
# A string cache entry (not option()) because there are three choices; the
# STRINGS property gives ccmake/cmake-gui a drop-down of the valid values.
set(THREADING_MODEL "serial" CACHE STRING "set the threading model, one of serial/tbb/omp")
set_property(CACHE THREADING_MODEL PROPERTY STRINGS serial tbb omp)

# Use STREQUAL, not MATCHES: MATCHES is a regex/substring test, so a value
# such as "tbbx" would silently enable TBB instead of reaching the error.
if(THREADING_MODEL STREQUAL "tbb")
    # TBB support
    find_package(TBB REQUIRED)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TBB_DEFINITIONS}")
    add_definitions(-DWITH_TBB)
    list(APPEND EXTERNAL_LIBRARIES ${TBB_LIBRARIES})

elseif(THREADING_MODEL STREQUAL "omp")
    # OpenMP support
    find_package(OpenMP REQUIRED)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
    add_definitions(-DWITH_OMP)

elseif(THREADING_MODEL STREQUAL "serial")
    # serial back-end: no extra packages, definitions or libraries required

else()
    # message() already prefixes its output; no need for a literal "-- "
    message(FATAL_ERROR "Threading model '${THREADING_MODEL}' not supported, use one of serial/tbb/omp")

endif()

# libunwind for pretty printing stack traces
# Optional dependency: no REQUIRED on find_package, so builds proceed
# without it and stack traces are simply not symbolized.
find_package(Unwind)
if(UNWIND_FOUND)
add_definitions(-DWITH_UNWIND)
include_directories(${UNWIND_INCLUDE_DIR})
# EXTERNAL_LIBRARIES is the project-wide list of link dependencies
list(APPEND EXTERNAL_LIBRARIES ${UNWIND_LIBRARIES})
endif()

# CUDA support
set(WITH_CUDA OFF CACHE BOOL "use CUDA for GPU offload" )
if(WITH_CUDA)
find_package(CUDA REQUIRED)

# Turn off annoying and incorrect warnings generated in the JSON file.
# We also work around the same issue with the intel compiler.
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-Xcudafe \"--diag_suppress=not_used_in_template_function_params\";-Xcudafe \"--diag_suppress=cast_to_qualified_type\")

# set the CUDA target-specific flags
# code regions protected by WITH_CUDA should only be available to the CUDA
# compiler, while regions protected by WITH_GPU are visible to both host
# and device compiler when targeting GPU.
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-DWITH_CUDA)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-DWITH_GPU)
# default device architecture is sm_35; swap for the commented-out
# sm_60 line below to target newer hardware
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-arch=sm_35)
#set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-arch=sm_60)

add_definitions(-DWITH_GPU)
include_directories(SYSTEM ${CUDA_INCLUDE_DIRS})
list(APPEND EXTERNAL_LIBRARIES ${CUDA_LIBRARIES})
endif()

# MPI support
Expand All @@ -50,7 +99,7 @@ if(WITH_MPI)
set_property(DIRECTORY APPEND_STRING PROPERTY COMPILE_OPTIONS "${MPI_C_COMPILE_FLAGS}")
endif()

# Profiler support
# Internal profiler support
set(WITH_PROFILING OFF CACHE BOOL "use built-in profiling of miniapp" )
if(WITH_PROFILING)
add_definitions(-DWITH_PROFILING)
Expand Down Expand Up @@ -115,9 +164,7 @@ else()
set(BUILD_NRN_VALIDATION_DATA TRUE)
endif()


include_directories(${CMAKE_SOURCE_DIR}/tclap/include)
include_directories(${CMAKE_SOURCE_DIR}/vector)
include_directories(${CMAKE_SOURCE_DIR}/tclap)
include_directories(${CMAKE_SOURCE_DIR}/include)
include_directories(${CMAKE_SOURCE_DIR}/src)
include_directories(${CMAKE_SOURCE_DIR}/miniapp)
Expand Down
48 changes: 48 additions & 0 deletions cmake/FindUnwind.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Find the libunwind library
#
# UNWIND_FOUND - True if libunwind was found
# UNWIND_LIBRARIES - The libraries needed to use libunwind
# UNWIND_INCLUDE_DIR - Location of unwind.h and libunwind.h
#
# The environment and cmake variables UNWIND_ROOT and UNWIND_ROOT_DIR
# respectively can be used to help CMake finding the library if it
# is not installed in any of the usual locations.

if(NOT UNWIND_FOUND)
    set(UNWIND_SEARCH_DIR ${UNWIND_ROOT_DIR} $ENV{UNWIND_ROOT})

    find_path(UNWIND_INCLUDE_DIR libunwind.h
        HINTS ${UNWIND_SEARCH_DIR}
        PATH_SUFFIXES include
    )

    # libunwind requires that we link against both libunwind.so/a and a
    # target-specific library libunwind-<target>.so/a.
    # This code sets the "<target>" string in libunwind_arch.
    if(CMAKE_SYSTEM_PROCESSOR MATCHES "^arm")
        set(libunwind_arch "arm")
    elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "amd64")
        set(libunwind_arch "x86_64")
    elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^i.86$")
        set(libunwind_arch "x86")
    endif()

    find_library(unwind_library_generic unwind
        HINTS ${UNWIND_SEARCH_DIR}
        PATH_SUFFIXES lib64 lib
    )

    find_library(unwind_library_target unwind-${libunwind_arch}
        HINTS ${UNWIND_SEARCH_DIR}
        PATH_SUFFIXES lib64 lib
    )

    # Sets UNWIND_FOUND when the header and both libraries were located,
    # and prints the standard found/not-found status message.
    # (The original module never set UNWIND_FOUND, so callers testing
    # if(UNWIND_FOUND) could never see a successful result.)
    include(FindPackageHandleStandardArgs)
    find_package_handle_standard_args(Unwind DEFAULT_MSG
        UNWIND_INCLUDE_DIR unwind_library_generic unwind_library_target)

    if(UNWIND_FOUND)
        set(UNWIND_LIBRARIES ${unwind_library_generic} ${unwind_library_target})
    endif()

    # Only cache entries can be marked advanced; UNWIND_LIBRARIES is a
    # plain variable, so mark the underlying find_* cache entries instead.
    mark_as_advanced(UNWIND_INCLUDE_DIR unwind_library_generic unwind_library_target)

    # clear scratch variables (note: the search dir is UNWIND_SEARCH_DIR,
    # upper case — the original unset the wrong, lower-case name)
    unset(UNWIND_SEARCH_DIR)
    unset(unwind_library_generic)
    unset(unwind_library_target)
    unset(libunwind_arch)
endif()
2 changes: 1 addition & 1 deletion data/test.mod
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ NEURON {
}

STATE {
h
h (nA)
m r
}

Expand Down
38 changes: 37 additions & 1 deletion mechanisms/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,15 @@ set(mechanisms pas hh expsyn exp2syn)
# set the flags for the modcc compiler that converts NMODL
# files to C++/CUDA source.
set(modcc_flags "-t cpu")

if(USE_OPTIMIZED_KERNELS) # generate optimized kernels
set(modcc_flags ${modcc_flags} -O)
endif()

# generate source for each mechanism
foreach(mech ${mechanisms})
set(mod "${CMAKE_CURRENT_SOURCE_DIR}/mod/${mech}.mod")
set(hpp "${CMAKE_CURRENT_SOURCE_DIR}/${mech}.hpp")
set(hpp "${CMAKE_CURRENT_SOURCE_DIR}/multicore/${mech}.hpp")
if(use_external_modcc)
add_custom_command(
OUTPUT "${hpp}"
Expand All @@ -33,3 +34,38 @@ endforeach()
# Fake target to always trigger .mod -> .hpp dependencies because wtf CMake
add_custom_target(build_all_mods DEPENDS ${all_mod_hpps} modcc)

# Repeat the generation steps above to also produce CUDA kernels from the
# same NMODL sources when we are targeting the GPU. The generated headers
# land in gpu/ instead of multicore/.
if(WITH_CUDA)
set(modcc_flags "-t gpu")

# optionally ask modcc for optimized kernels, mirroring the CPU path
if(USE_OPTIMIZED_KERNELS)
set(modcc_flags ${modcc_flags} -O)
endif()

# generate source for each mechanism
foreach(mech ${mechanisms})
set(mod "${CMAKE_CURRENT_SOURCE_DIR}/mod/${mech}.mod")
set(hpp "${CMAKE_CURRENT_SOURCE_DIR}/gpu/${mech}.hpp")
# use_external_modcc selects a pre-installed modcc; otherwise the
# in-tree modparser target must be built first (hence DEPENDS below)
if(use_external_modcc)
add_custom_command(
OUTPUT "${hpp}"
WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}"
COMMAND ${modcc} ${modcc_flags} ${mod} -o ${hpp}
)
else()
add_custom_command(
OUTPUT "${hpp}"
DEPENDS modparser "${mod}"
WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}"
COMMAND ${modcc} ${modcc_flags} ${mod} -o ${hpp}
)
endif()
# mark as GENERATED so CMake does not require the file at configure time
set_source_files_properties("${hpp}" PROPERTIES GENERATED TRUE)
list(APPEND all_gpu_mod_hpps "${hpp}")
endforeach()

# Aggregate target so that depending on build_all_gpu_mods triggers all
# .mod -> .hpp custom commands (custom-command outputs alone do not chain
# across directories).
add_custom_target(build_all_gpu_mods DEPENDS ${all_gpu_mod_hpps} modcc)
endif()
10 changes: 0 additions & 10 deletions mechanisms/generate.sh

This file was deleted.

18 changes: 14 additions & 4 deletions miniapp/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,15 +1,25 @@
set(HEADERS
)
set(MINIAPP_SOURCES
io.cpp
miniapp.cpp
io.cpp
miniapp_recipes.cpp
)
# Sources for the CUDA build: miniapp.cu is a thin wrapper that includes
# miniapp.cpp so the same code is compiled by nvcc.
set(MINIAPP_SOURCES_CUDA
miniapp.cu
io.cpp
miniapp_recipes.cpp
)

# Build the miniapp executable, via cuda_add_executable when GPU support
# is enabled so that .cu files are compiled by nvcc.
if(WITH_CUDA)
cuda_add_executable(miniapp.exe ${MINIAPP_SOURCES_CUDA} ${HEADERS})
target_link_libraries(miniapp.exe LINK_PUBLIC gpu)
else()
add_executable(miniapp.exe ${MINIAPP_SOURCES} ${HEADERS})
endif()

target_link_libraries(miniapp.exe LINK_PUBLIC cellalgo)
target_link_libraries(miniapp.exe LINK_PUBLIC ${TBB_LIBRARIES})
target_link_libraries(miniapp.exe LINK_PUBLIC nestmc)
target_link_libraries(miniapp.exe LINK_PUBLIC ${EXTERNAL_LIBRARIES})

if(WITH_MPI)
target_link_libraries(miniapp.exe LINK_PUBLIC ${MPI_C_LIBRARIES})
Expand Down
28 changes: 18 additions & 10 deletions miniapp/miniapp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,17 @@

#include <json/json.hpp>

#include <backends/fvm.hpp>
#include <common_types.hpp>
#include <cell.hpp>
#include <communication/communicator.hpp>
#include <communication/global_policy.hpp>
#include <cell.hpp>
#include <fvm_multicell.hpp>
#include <io/exporter_spike_file.hpp>
#include <mechanism_catalogue.hpp>
#include <model.hpp>
#include <profiling/profiler.hpp>
#include <threading/threading.hpp>
#include <util/debug.hpp>
#include <util/ioutil.hpp>
#include <util/nop.hpp>
#include <util/optional.hpp>
Expand All @@ -28,17 +29,19 @@
using namespace nest::mc;

using global_policy = communication::global_policy;
using lowered_cell = fvm::fvm_multicell<double, cell_local_size_type>;
//using lowered_cell = fvm::fvm_cell<double, cell_local_size_type>;
#ifdef WITH_CUDA
using lowered_cell = fvm::fvm_multicell<gpu::backend>;
#else
using lowered_cell = fvm::fvm_multicell<multicore::backend>;
#endif
using model_type = model<lowered_cell>;
using time_type = model_type::time_type;
using sample_trace_type = sample_trace<time_type, model_type::value_type>;
using file_export_type = io::exporter_spike_file<time_type, global_policy>;
using sample_trace_type = sample_trace<model_type::time_type, model_type::value_type>;
using file_export_type = io::exporter_spike_file<model_type::time_type, global_policy>;
void banner();
std::unique_ptr<recipe> make_recipe(const io::cl_options&, const probe_distribution&);
std::unique_ptr<sample_trace_type> make_trace(cell_member_type probe_id, probe_spec probe);
std::pair<cell_gid_type, cell_gid_type> distribute_cells(cell_size_type ncells);
using communicator_type = communication::communicator<time_type, communication::global_policy>;
using communicator_type = communication::communicator<model_type::time_type, communication::global_policy>;
using spike_type = typename communicator_type::spike_type;

void write_trace_json(const sample_trace_type& trace, const std::string& prefix = "trace_");
Expand Down Expand Up @@ -84,7 +87,7 @@ int main(int argc, char** argv) {
options.file_extension, options.over_write);
};

// File output is depending on the input arguments
// File output depends on the input arguments
std::unique_ptr<file_export_type> file_exporter;
if (options.spike_file_output) {
if (options.single_file_per_rank) {
Expand Down Expand Up @@ -128,7 +131,7 @@ int main(int argc, char** argv) {

// reset the model
m.reset();
// rest the source spikes
// reset the source spikes
for (auto source : local_sources) {
m.add_artificial_spike({source, 0});
}
Expand Down Expand Up @@ -178,6 +181,11 @@ void banner() {
std::cout << " starting miniapp\n";
std::cout << " - " << threading::description() << " threading support\n";
std::cout << " - communication policy: " << global_policy::name() << "\n";
#ifdef WITH_CUDA
std::cout << " - gpu support: on\n";
#else
std::cout << " - gpu support: off\n";
#endif
std::cout << "====================\n";
}

Expand Down
1 change: 1 addition & 0 deletions miniapp/miniapp.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
#include "miniapp.cpp"
Loading

0 comments on commit 1bc18ea

Please sign in to comment.